From 071e6175dcc130b4c623e849a380d6434289eb66 Mon Sep 17 00:00:00 2001
From: Erik Smistad <ersmistad@gmail.com>
Date: Thu, 24 May 2018 15:47:00 +0200
Subject: [PATCH 001/570] Added the -Thost=x64 flag to cmake build instructions

---
 tensorflow/contrib/cmake/README.md | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 0b79f718d4..5c203b777c 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -106,17 +106,6 @@ Step-by-step Windows build
 
 1. Install the prerequisites detailed above, and set up your environment.
 
-   * The following commands assume that you are using the Windows Command
-     Prompt (`cmd.exe`). You will need to set up your environment to use the
-     appropriate toolchain, i.e. the 64-bit tools. (Some of the binary targets
-     we will build are too large for the 32-bit tools, and they will fail with
-     out-of-memory errors.) The typical command to do set up your
-     environment is:
-
-     ```
-     D:\temp> "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\amd64\vcvarsall.bat"
-     ```
-
    * When building with GPU support after installing the CUDNN zip file from NVidia, append its
      bin directory to your PATH environment variable.
      In case TensorFlow fails to find the CUDA dll's during initialization, check your PATH environment variable.
@@ -168,7 +157,7 @@ Step-by-step Windows build
    and must be the last character on each line.
 
    ```
-   D:\...\build> cmake .. -A x64 -DCMAKE_BUILD_TYPE=Release ^
+   D:\...\build> cmake .. -A x64 -Thost=x64 -DCMAKE_BUILD_TYPE=Release ^
    More? -DSWIG_EXECUTABLE=C:/tools/swigwin-3.0.10/swig.exe ^
    More? -DPYTHON_EXECUTABLE=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/python.exe ^
    More? -DPYTHON_LIBRARIES=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/libs/python35.lib
@@ -197,6 +186,10 @@ Step-by-step Windows build
    not currently supported, because it relies on a `Debug` library for
    Python (`python35d.lib`) that is not distributed by default.
 
+   The `-Thost=x64` flag will ensure that the 64 bit compiler and linker
+   is used when building. Without this flag, MSBuild will use the 32 bit
+   toolchain which is prone to compile errors such as "compiler out of heap space".
+
    There are various options that can be specified when generating the
    solution and project files:
 
@@ -263,6 +256,11 @@ Step-by-step Windows build
 
 4. Invoke MSBuild to build TensorFlow.
 
+   Set up the path to find MSbuild:
+   ```
+   D:\temp> "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\amd64\vcvarsall.bat"
+   ```
+
    To build the C++ example program, which will be created as a `.exe`
    executable in the subdirectory `.\Release`:
 
-- 
GitLab


From 6890731b2693f6b71dedaca6b2eaf8b488226836 Mon Sep 17 00:00:00 2001
From: Erik Smistad <ersmistad@gmail.com>
Date: Thu, 24 May 2018 15:47:22 +0200
Subject: [PATCH 002/570] increase minimum cmake version required to 3.8

---
 tensorflow/contrib/cmake/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 0708d6b7b9..225c5e6227 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -1,5 +1,9 @@
 # Minimum CMake required
-cmake_minimum_required(VERSION 3.5)
+if(WIN32)
+  cmake_minimum_required(VERSION 3.8)
+else()
+  cmake_minimum_required(VERSION 3.5)
+endif()
 
 # Project
 project(tensorflow C CXX)
-- 
GitLab


From 2e436951bb63a0294848b6f6d3746e449a305ad1 Mon Sep 17 00:00:00 2001
From: Stefan Dyulgerov <stefan.dyulgerov@gmail.com>
Date: Tue, 17 Jul 2018 22:37:19 +0300
Subject: [PATCH 003/570] version_info.cc generated only once

version_info.cc in the cmake files is generated every time when we build tensorflow and this forces rebuild of the whole project, since it is in the core library.
added make.bat for windows, which does the same as make.sh to be executed easily from a build machine. the default now is visual studio 17
---
 tensorflow/contrib/cmake/make.bat             | 38 +++++++++++++++++++
 .../contrib/cmake/tf_core_framework.cmake     | 23 +++++++----
 2 files changed, 53 insertions(+), 8 deletions(-)
 create mode 100644 tensorflow/contrib/cmake/make.bat

diff --git a/tensorflow/contrib/cmake/make.bat b/tensorflow/contrib/cmake/make.bat
new file mode 100644
index 0000000000..d52b24e01d
--- /dev/null
+++ b/tensorflow/contrib/cmake/make.bat
@@ -0,0 +1,38 @@
+%echo off
+
+cd /d %~dp0
+
+if exist _build rd /s /q _build
+
+mkdir _build
+chdir _build
+
+
+rem cmake ../ -G "Visual Studio 15 Win64" -DCMAKE_GENERATOR_TOOLSET=v141,host=x64 -DCMAKE_INSTALL_PREFIX:PATH=.\install
+
+CALL :NORMALIZEPATH "..\..\..\.."
+SET SOURCE_DIR=%RETVAL%
+
+echo %SOURCE_DIR%
+
+SET SOURCE_DIR=F:\frameworks\tensorflow\
+
+CALL :NORMALIZEPATH "../../../tools/git/gen_git_source.py"
+SET SOURCE_PYTHON_SCRIPT=%RETVAL%
+
+CALL :NORMALIZEPATH "../../../core/util/version_info.cc"
+SET SOURCE_VERSION_CC=%RETVAL%
+
+python %SOURCE_PYTHON_SCRIPT% --raw_generate %SOURCE_VERSION_CC% --source_dir %SOURCE_DIR% --git_tag_override=
+
+cmake ../ -G "Visual Studio 15 Win64" -DCMAKE_GENERATOR_TOOLSET=v141,host=x64 -DCMAKE_INSTALL_PREFIX:PATH=.\install
+
+EXIT /B
+
+:NORMALIZEPATH
+  SET RETVAL=%~dpfn1
+  EXIT /B
+
+
+
+                                                                              
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 067c299a71..7e806685b8 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -258,14 +258,21 @@ add_dependencies(tf_core_lib ${tensorflow_EXTERNAL_DEPENDENCIES} tf_protos_cc)
 # force_rebuild always runs forcing ${VERSION_INFO_CC} target to run
 # ${VERSION_INFO_CC} would cache, but it depends on a phony never produced
 # target.
-set(VERSION_INFO_CC ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
-add_custom_target(force_rebuild_target ALL DEPENDS ${VERSION_INFO_CC})
-add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
-add_custom_command(OUTPUT
-    ${VERSION_INFO_CC}
-    COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
-    ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir} --git_tag_override=${GIT_TAG_OVERRIDE}
-    DEPENDS __force_rebuild)
+# This code forces rebuild every time, not needed as version from git is fetched only once
+# move to make.bat which mimicks make.sh
+
+if (NOT WIN32)
+
+  set(VERSION_INFO_CC ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
+  add_custom_target(force_rebuild_target ALL DEPENDS ${VERSION_INFO_CC})
+  add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
+  add_custom_command(OUTPUT
+      ${VERSION_INFO_CC}
+      COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
+      ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir} --git_tag_override=${GIT_TAG_OVERRIDE}
+      DEPENDS __force_rebuild)
+endif()
+
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
 ########################################################
-- 
GitLab


From 29f596cf21f0332c1e2ece8798fdd9fefd2ba947 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 4 Jun 2018 14:04:59 +0000
Subject: [PATCH 004/570] Improve the shape function of Bincount

There was not a lot of restriction in shape function
of Bincount and the output shape was unknown.
It is actually possible to get a better shape output
if `size` input is known.
This fix adds enhancement to the shape function of
Bincount.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/math_ops.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 1667c398f4..7d0f29368b 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1416,6 +1416,10 @@ REGISTER_OP("Bincount")
     .Attr("T: {int32, int64, float32, float64}")
     .Output("bins: T")
     .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      // The input `size` must be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+
       c->set_output(0, c->UnknownShapeOfRank(1));
       return Status::OK();
     });
-- 
GitLab


From 740c58b6fa5b6e1c85f688fbda322da0231aa169 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 4 Jun 2018 14:44:44 +0000
Subject: [PATCH 005/570] Return `[size]` shape if size is known for Bincount.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/math_ops.cc | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 7d0f29368b..b57385f63b 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1420,7 +1420,19 @@ REGISTER_OP("Bincount")
       // The input `size` must be a scalar.
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
 
-      c->set_output(0, c->UnknownShapeOfRank(1));
+      const Tensor* size_tensor = c->input_tensor(1);
+      if (size_tensor == nullptr) {
+        // Return unknown shape if size is not known.
+        c->set_output(0, c->UnknownShapeOfRank(1));
+        return Status::OK();
+      }
+
+      // Return `[size]` shape if size is known.
+      int32 size_val = size_tensor->scalar<int32>()();
+      if (size_val < 0) {
+        return errors::InvalidArgument("size (", size_val, ") must be non-negative");
+      }
+      c->set_output(0, c->MakeShape({size_val}));
       return Status::OK();
     });
 
-- 
GitLab


From e6981fc2225a529427391e98f492eee7bb865988 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 11 Aug 2018 18:39:13 +0000
Subject: [PATCH 006/570] Add additional test cases for Bincount Shape
 function, and fix clang-format issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/math_ops.cc               |  3 ++-
 tensorflow/core/ops/math_ops_test.cc          | 12 ++++++++++++
 .../python/kernel_tests/bincount_op_test.py   | 19 +++++++++++++++++++
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index b57385f63b..0ba4a9a005 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1430,7 +1430,8 @@ REGISTER_OP("Bincount")
       // Return `[size]` shape if size is known.
       int32 size_val = size_tensor->scalar<int32>()();
       if (size_val < 0) {
-        return errors::InvalidArgument("size (", size_val, ") must be non-negative");
+        return errors::InvalidArgument("size (", size_val,
+                                       ") must be non-negative");
       }
       c->set_output(0, c->MakeShape({size_val}));
       return Status::OK();
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index 23f1538912..7bf7c476f4 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -558,4 +558,16 @@ TEST(MathOpsTest, QuantizedAdd_ShapeFn) {
   INFER_ERROR("must be rank 0", op, "?;?;?;?;[3];?");
   INFER_ERROR("must be rank 0", op, "?;?;?;?;?;[4]");
 }
+
+TEST(MathOpsTest, Bincount_ShapeFn) {
+  ShapeInferenceTestOp op("Bincount");
+
+  // size should be scalar.
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;[1];?");
+
+  INFER_OK(op, "?;?;?", "[?]");
+  INFER_OK(op, "?;[];?", "[?]");
+  INFER_OK(op, "[?];[];?", "[?]");
+  INFER_OK(op, "[?];[];[?]", "[?]");
+}
 }  // end namespace tensorflow
diff --git a/tensorflow/python/kernel_tests/bincount_op_test.py b/tensorflow/python/kernel_tests/bincount_op_test.py
index 2767df127e..15d9de56db 100644
--- a/tensorflow/python/kernel_tests/bincount_op_test.py
+++ b/tensorflow/python/kernel_tests/bincount_op_test.py
@@ -22,6 +22,8 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import googletest
 
@@ -97,6 +99,23 @@ class BincountTest(test_util.TensorFlowTestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         math_ops.bincount([1, 2, 3, -1, 6, 8]).eval()
 
+  def test_shape_function(self):
+    # size must be scalar.
+    with self.assertRaisesRegexp(
+        ValueError, "Shape must be rank 0 but is rank 1 for 'Bincount'"):
+      gen_math_ops.bincount([1, 2, 3, -1, 6, 8], [1], [])
+    # size must be positive.
+    with self.assertRaisesRegexp(
+        ValueError, "must be non-negative"):
+      gen_math_ops.bincount([1, 2, 3, -1, 6, 8], -5, [])
+    # if size is a constant then the shape is known.
+    v1 = gen_math_ops.bincount([1, 2, 3, -1, 6, 8], 5, [])
+    self.assertAllEqual(v1.get_shape().as_list(), [5])
+    # if size is a placeholder then the shape is unknown.
+    s = array_ops.placeholder(dtype=dtypes.int32)
+    v2 = gen_math_ops.bincount([1, 2, 3, -1, 6, 8], s, [])
+    self.assertAllEqual(v2.get_shape().as_list(), [None])
+
 
 if __name__ == "__main__":
   googletest.main()
-- 
GitLab


From aa25cc078c9b55e5ca3e0f59df43e169bfee8f3c Mon Sep 17 00:00:00 2001
From: Cao Zongyan <zongyan.cao@alibaba-inc.com>
Date: Thu, 16 Aug 2018 19:04:37 +0800
Subject: [PATCH 007/570] Add LeakyRelu C++ Op and its gradient implementation.

LeakyRelu, defined as 'y = { x (x>=0) or alpha*x (x<0) }', was computed
by combined Ops 'max(x, alpha*x)' in current codes. Hence its gradient
calculation for back propagation would contain a serial of element-wise
Ops. This looks really unnecessary for such a simple op and it could be
done within just one Op with less memory accesses.
---
 tensorflow/cc/gradients/nn_grad.cc            |  13 ++
 tensorflow/cc/gradients/nn_grad_test.cc       |  13 ++
 tensorflow/core/kernels/relu_op.cc            | 153 +++++++++++-------
 tensorflow/core/kernels/relu_op.h             |  59 +++++++
 tensorflow/core/kernels/relu_op_functor.h     |  31 ++++
 tensorflow/core/kernels/relu_op_gpu.cu.cc     |  18 ++-
 tensorflow/core/ops/nn_ops.cc                 |  15 ++
 tensorflow/core/ops/ops.pbtxt                 |  68 ++++++++
 tensorflow/python/eager/pywrap_tfe_src.cc     |   2 +
 .../python/kernel_tests/relu_op_test.py       | 113 +++++++++++++
 tensorflow/python/ops/nn_grad.py              |  15 ++
 tensorflow/python/ops/nn_ops.py               |   3 +-
 12 files changed, 432 insertions(+), 71 deletions(-)

diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index 588e96cb19..0fc23d0bf7 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -143,6 +143,19 @@ Status Relu6GradHelper(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Relu6", Relu6GradHelper);
 
+Status LeakyReluGradHelper(const Scope& scope, const Operation& op,
+                           const std::vector<Output>& grad_inputs,
+                           std::vector<Output>* grad_outputs) {
+  float alpha;
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "alpha", &alpha));
+  internal::LeakyReluGrad::Attrs attrs;
+  attrs.Alpha(alpha);
+  auto dx = internal::LeakyReluGrad(scope, grad_inputs[0], op.input(0), attrs);
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("LeakyRelu", LeakyReluGradHelper);
+
 Status EluGradHelper(const Scope& scope, const Operation& op,
                      const std::vector<Output>& grad_inputs,
                      std::vector<Output>* grad_outputs) {
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index aa72cf7ba2..5ebece7b6e 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -41,6 +41,7 @@ using ops::MaxPoolV2;
 using ops::Placeholder;
 using ops::Relu;
 using ops::Relu6;
+using ops::LeakyRelu;
 using ops::Selu;
 using ops::Softmax;
 using ops::Softplus;
@@ -160,6 +161,18 @@ TEST_F(NNGradTest, Relu6Grad) {
   RunTest(x, x_init_value, y, shape);
 }
 
+TEST_F(NNGradTest, LeakyReluGrad) {
+  TensorShape shape({5, 2});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  auto y = LeakyRelu(scope_, x);
+  // Avoid input values where Leaky ReLU gradient is not well defined (around
+  // zero).
+  Tensor x_init_value = test::AsTensor<float>(
+      {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, 0.1f, 0.3f, 0.5f, 0.7f, 0.9f},
+      {5, 2});
+  RunTest(x, x_init_value, y, shape);
+}
+
 TEST_F(NNGradTest, EluGrad) {
   TensorShape shape({5, 2});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc
index d52358737f..c4f2ef5632 100644
--- a/tensorflow/core/kernels/relu_op.cc
+++ b/tensorflow/core/kernels/relu_op.cc
@@ -33,19 +33,25 @@ typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
 
-#define REGISTER_RELU_KERNELS(type)                                   \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Relu").Device(DEVICE_CPU).TypeConstraint<type>("T"),      \
-      ReluOp<CPUDevice, type>);                                       \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("ReluGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"),  \
-      ReluGradOp<CPUDevice, type>);                                   \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Relu6").Device(DEVICE_CPU).TypeConstraint<type>("T"),     \
-      Relu6Op<CPUDevice, type>);                                      \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Relu6Grad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
-      Relu6GradOp<CPUDevice, type>)
+#define REGISTER_RELU_KERNELS(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Relu").Device(DEVICE_CPU).TypeConstraint<type>("T"),          \
+      ReluOp<CPUDevice, type>);                                           \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("ReluGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"),      \
+      ReluGradOp<CPUDevice, type>);                                       \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Relu6").Device(DEVICE_CPU).TypeConstraint<type>("T"),         \
+      Relu6Op<CPUDevice, type>);                                          \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Relu6Grad").Device(DEVICE_CPU).TypeConstraint<type>("T"),     \
+      Relu6GradOp<CPUDevice, type>)                                       \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("LeakyRelu").Device(DEVICE_CPU).TypeConstraint<type>("T"),     \
+      LeakyReluOp<CPUDevice, type>);                                      \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("LeakyReluGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      LeakyReluGradOp<CPUDevice, type>);
 
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_RELU_KERNELS);
 #undef REGISTER_RELU_KERNELS
@@ -99,6 +105,19 @@ namespace functor {
   extern template struct Relu6Grad<GPUDevice, T>;                              \
                                                                                \
   template <>                                                                  \
+  void LeakyRelu<GPUDevice, T>::operator()(                                    \
+      const GPUDevice& d, typename TTypes<T>::ConstTensor features,            \
+      T alpha, typename TTypes<T>::Tensor activations);                        \
+  extern template struct LeakyRelu<GPUDevice, T>;                              \
+                                                                               \
+  template <>                                                                  \
+  void LeakyReluGrad<GPUDevice, T>::operator()(                                \
+      const GPUDevice& d, typename TTypes<T>::ConstTensor gradients,           \
+      typename TTypes<T>::ConstTensor features,                                \
+      T alpha, typename TTypes<T>::Tensor backprops);                          \
+  extern template struct LeakyReluGrad<GPUDevice, T>;                          \
+                                                                               \
+  template <>                                                                  \
   void Elu<GPUDevice, T>::operator()(const GPUDevice& d,                       \
                                      typename TTypes<T>::ConstTensor features, \
                                      typename TTypes<T>::Tensor activations);  \
@@ -128,30 +147,36 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 }  // namespace functor
 
 // Registration of the GPU implementations.
-#define REGISTER_GPU_KERNELS(type)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Relu").Device(DEVICE_GPU).TypeConstraint<type>("T"),      \
-      ReluOp<GPUDevice, type>);                                       \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("ReluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),  \
-      ReluGradOp<GPUDevice, type>);                                   \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Relu6").Device(DEVICE_GPU).TypeConstraint<type>("T"),     \
-      Relu6Op<GPUDevice, type>);                                      \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Relu6Grad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
-      Relu6GradOp<GPUDevice, type>);                                  \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Elu").Device(DEVICE_GPU).TypeConstraint<type>("T"),       \
-      EluOp<GPUDevice, type>);                                        \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("EluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),   \
-      EluGradOp<GPUDevice, type>);                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Selu").Device(DEVICE_GPU).TypeConstraint<type>("T"),      \
-      SeluOp<GPUDevice, type>);                                       \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("SeluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),  \
+#define REGISTER_GPU_KERNELS(type)                                        \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Relu").Device(DEVICE_GPU).TypeConstraint<type>("T"),          \
+      ReluOp<GPUDevice, type>);                                           \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("ReluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),      \
+      ReluGradOp<GPUDevice, type>);                                       \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Relu6").Device(DEVICE_GPU).TypeConstraint<type>("T"),         \
+      Relu6Op<GPUDevice, type>);                                          \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Relu6Grad").Device(DEVICE_GPU).TypeConstraint<type>("T"),     \
+      Relu6GradOp<GPUDevice, type>);                                      \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("LeakyRelu").Device(DEVICE_GPU).TypeConstraint<type>("T"),     \
+      LeakyReluOp<GPUDevice, type>);                                      \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("LeakyReluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+      LeakyReluGradOp<GPUDevice, type>);                                  \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Elu").Device(DEVICE_GPU).TypeConstraint<type>("T"),           \
+      EluOp<GPUDevice, type>);                                            \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("EluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),       \
+      EluGradOp<GPUDevice, type>);                                        \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Selu").Device(DEVICE_GPU).TypeConstraint<type>("T"),          \
+      SeluOp<GPUDevice, type>);                                           \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("SeluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),      \
       SeluGradOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
@@ -161,30 +186,36 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 
 #ifdef TENSORFLOW_USE_SYCL
 // Registration of the GPU implementations.
-#define REGISTER_SYCL_KERNELS(type)                                    \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("Relu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),      \
-      ReluOp<SYCLDevice, type>);                                       \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("ReluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
-      ReluGradOp<SYCLDevice, type>);                                   \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("Relu6").Device(DEVICE_SYCL).TypeConstraint<type>("T"),     \
-      Relu6Op<SYCLDevice, type>);                                      \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("Relu6Grad").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
-      Relu6GradOp<SYCLDevice, type>);                                  \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("Elu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),       \
-      EluOp<SYCLDevice, type>);                                        \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("EluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),   \
-      EluGradOp<SYCLDevice, type>);                                    \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("Selu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),      \
-      SeluOp<SYCLDevice, type>);                                       \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("SeluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
+#define REGISTER_SYCL_KERNELS(type)                                        \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Relu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),          \
+      ReluOp<SYCLDevice, type>);                                           \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("ReluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),      \
+      ReluGradOp<SYCLDevice, type>);                                       \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Relu6").Device(DEVICE_SYCL).TypeConstraint<type>("T"),         \
+      Relu6Op<SYCLDevice, type>);                                          \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Relu6Grad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),     \
+      Relu6GradOp<SYCLDevice, type>);                                      \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("LeakyRelu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),     \
+      LeakyReluOp<SYCLDevice, type>);                                      \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("LeakyReluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      LeakyReluGradOp<SYCLDevice, type>);                                  \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Elu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),           \
+      EluOp<SYCLDevice, type>);                                            \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("EluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),       \
+      EluGradOp<SYCLDevice, type>);                                        \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Selu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),          \
+      SeluOp<SYCLDevice, type>);                                           \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("SeluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),      \
       SeluGradOp<SYCLDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h
index e712b02bd7..c55190065c 100644
--- a/tensorflow/core/kernels/relu_op.h
+++ b/tensorflow/core/kernels/relu_op.h
@@ -131,6 +131,65 @@ void Relu6GradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
           output->flat<T>());
 }
 
+template <typename Device, typename T>
+class LeakyReluOp : public UnaryElementWiseOp<T, LeakyReluOp<Device, T>> {
+ public:
+  explicit LeakyReluOp(OpKernelConstruction* context)
+      : UnaryElementWiseOp<T, LeakyReluOp<Device, T>>(context) {
+    float alpha_tmp;
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_tmp));
+    alpha_ = T(alpha_tmp);
+  }
+
+  void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
+    functor::LeakyRelu<Device, T> functor;
+    functor(context->eigen_device<Device>(), input.flat<T>(),
+            alpha_, output->flat<T>());
+  }
+
+ private:
+  T alpha_;
+};
+
+template <typename Device, typename T>
+class LeakyReluGradOp
+    : public BinaryElementWiseOp<T, LeakyReluGradOp<Device, T>> {
+ public:
+  explicit LeakyReluGradOp(OpKernelConstruction* context)
+      : BinaryElementWiseOp<T, LeakyReluGradOp<Device, T>>(context) {
+    float alpha_tmp;
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_tmp));
+    alpha_ = T(alpha_tmp);
+  }
+
+  void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
+                         const Tensor& a, T alpha, Tensor* output);
+
+  // INPUTS:
+  //   g (gradients): backpropagated gradients
+  //   a (inputs): either the inputs that were passed to LeakyReluOp(), or its
+  //               outputs (using either one yields the same result here).
+  // OUTPUT:
+  //   gradients to backprop
+  template <int NDIMS>
+  void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
+               Tensor* output) {
+    OperateNoTemplate(context, g, a, alpha_, output);
+  }
+
+ private:
+  T alpha_;
+};
+
+template <typename Device, typename T>
+void LeakyReluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
+    const Tensor& g, const Tensor& a, T alpha, Tensor* output) {
+  if (!ReluHelpers::ValidateSameSize(context, g, a)) return;
+  functor::LeakyReluGrad<Device, T> functor;
+  functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(), alpha,
+          output->flat<T>());
+};
+
 template <typename Device, typename T>
 class EluOp : public UnaryElementWiseOp<T, EluOp<Device, T>> {
  public:
diff --git a/tensorflow/core/kernels/relu_op_functor.h b/tensorflow/core/kernels/relu_op_functor.h
index 3bc5ba8a50..7f0951451d 100644
--- a/tensorflow/core/kernels/relu_op_functor.h
+++ b/tensorflow/core/kernels/relu_op_functor.h
@@ -91,6 +91,37 @@ struct Relu6Grad {
   }
 };
 
+
+// Functor used by LeakyReluOp to do the computations.
+template <typename Device, typename T>
+struct LeakyRelu {
+  // Computes LeakyRelu activation.
+  //
+  // features: any shape.
+  // activations: same shape as "features".
+  void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
+                  T alpha, typename TTypes<T>::Tensor activations) {
+    activations.device(d) = features.cwiseMax(features * alpha);
+  }
+};
+
+// Functor used by LeakyReluGradOp to do the computations.
+template <typename Device, typename T>
+struct LeakyReluGrad {
+  // Computes LeakyReluGrad backprops.
+  //
+  // gradients: gradients backpropagated to the LeakyRelu op.
+  // features: either the inputs that were passed to the LeakyRelu or, or its
+  //           outputs (using either one yields the same result here).
+  // backprops: gradients to backpropagate to the LeakyRelu inputs.
+  void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
+                  typename TTypes<T>::ConstTensor features, T alpha,
+                  typename TTypes<T>::Tensor backprops) {
+    backprops.device(d) =
+        (features > static_cast<T>(0)).select(gradients, gradients * alpha);
+  }
+};
+
 // Functor used by EluOp to do the computations.
 template <typename Device, typename T>
 struct Elu {
diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc
index 089ca8ed27..4452f4dcc9 100644
--- a/tensorflow/core/kernels/relu_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc
@@ -114,14 +114,16 @@ struct ReluGrad<Device, Eigen::half> {
 }  // namespace functor
 
 // Definition of the GPU implementations declared in relu_op.cc.
-#define DEFINE_GPU_KERNELS(T)                       \
-  template struct functor::Relu<GPUDevice, T>;      \
-  template struct functor::ReluGrad<GPUDevice, T>;  \
-  template struct functor::Relu6<GPUDevice, T>;     \
-  template struct functor::Relu6Grad<GPUDevice, T>; \
-  template struct functor::Elu<GPUDevice, T>;       \
-  template struct functor::EluGrad<GPUDevice, T>;   \
-  template struct functor::Selu<GPUDevice, T>;      \
+#define DEFINE_GPU_KERNELS(T)                           \
+  template struct functor::Relu<GPUDevice, T>;          \
+  template struct functor::ReluGrad<GPUDevice, T>;      \
+  template struct functor::Relu6<GPUDevice, T>;         \
+  template struct functor::Relu6Grad<GPUDevice, T>;     \
+  template struct functor::LeakyRelu<GPUDevice, T>;     \
+  template struct functor::LeakyReluGrad<GPUDevice, T>; \
+  template struct functor::Elu<GPUDevice, T>;           \
+  template struct functor::EluGrad<GPUDevice, T>;       \
+  template struct functor::Selu<GPUDevice, T>;          \
   template struct functor::SeluGrad<GPUDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index e0f25fb4ef..023f988f80 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -983,6 +983,21 @@ REGISTER_OP("Relu6Grad")
     .Attr("T: realnumbertype")
     .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
 
+REGISTER_OP("LeakyRelu")
+    .Input("features: T")
+    .Output("activations: T")
+    .Attr("alpha: float = 0.2")
+    .Attr("T: {half, float, double} = DT_FLOAT")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("LeakyReluGrad")
+    .Input("gradients: T")
+    .Input("features: T")
+    .Output("backprops: T")
+    .Attr("alpha: float = 0.2")
+    .Attr("T: {half, float, double} = DT_FLOAT")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
+
 REGISTER_OP("Elu")
     .Input("features: T")
     .Output("activations: T")
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index f2595279e0..837e91bc23 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -13604,6 +13604,74 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "LeakyRelu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LeakykReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "LearnedUnigramCandidateSampler"
   input_arg {
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 2d54555cd3..9b3b5fd7aa 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1730,6 +1730,7 @@ bool OpDoesntRequireOutput(const string& op_name) {
           "SoftplusGrad",
           "Softsign",
           "ReluGrad",
+          "LeakyReluGrad",
           "Conv2D",
           "DepthwiseConv2dNative",
           "Dilation2D",
@@ -1799,6 +1800,7 @@ bool OpDoesntRequireInput(const string& op_name) {
           "BiasAdd",
           "Relu",
           "Relu6",
+          "LeakyRelu",
           "Elu",
           "Selu",
           "SparseSoftmaxCrossEntropyWithLogits",
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index 25e947f09e..ccb3a231bb 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -252,6 +252,119 @@ class Relu6Test(test.TestCase):
     self.assertLess(err, 1e-10)
 
 
+class LeakyReluTest(test.TestCase):
+
+  def _npLeakyRelu(self, np_features, alpha=0.1):
+    return np.maximum(np_features, alpha * np_features)
+
+  def testNpLeakyRelu(self):
+    self.assertAllClose(
+        np.array([[-0.09, 0.7, -0.05, 0.3, -0.01],
+                  [0.1, -0.03, 0.5, -0.07, 0.9]]),
+        self._npLeakyRelu(
+            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, 0.9]
+                     ]), alpha=0.1))
+
+  def _testLeakyRelu(self, np_features, alpha, use_gpu=False):
+    np_leaky_relu = self._npLeakyRelu(np_features, alpha)
+    with self.test_session(use_gpu=use_gpu):
+      leaky_relu = nn_ops.leaky_relu(np_features, alpha)
+      tf_leaky_relu = leaky_relu.eval()
+    self.assertAllClose(np_leaky_relu, tf_leaky_relu)
+    self.assertShapeEqual(np_leaky_relu, leaky_relu)
+
+  def testNumbers(self):
+    for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
+      self._testLeakyRelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
+          alpha=0.2, use_gpu=False)
+      if t in [np.float16, np.float32, np.float64]:
+        self._testLeakyRelu(
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
+            alpha=0.1, use_gpu=True)
+
+  # The gradient test for ReLU is a bit tricky as the derivative is not well
+  # defined at around zero and we want to avoid that in terms of input values.
+  def testGradientFloat32(self):
+    with self.test_session():
+      x = constant_op.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5],
+          name="x")
+      y = nn_ops.leaky_relu(x, alpha=0.1, name="leaky_relu")
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float32,
+          order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], y, [2, 5], x_init_value=x_init)
+    print("leaky_relu (float32) gradient err = ", err)
+    self.assertLess(err, 1e-4)
+
+  def testGradientFloat64(self):
+    with self.test_session():
+      x = constant_op.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5],
+          dtype=dtypes.float64,
+          name="x")
+      y = nn_ops.leaky_relu(x, alpha=0.2, name="leaky_relu")
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float64,
+          order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], y, [2, 5], x_init_value=x_init)
+    print("leaky_relu (float64) gradient err = ", err)
+    self.assertLess(err, 1e-10)
+
+  def testGradGradFloat32(self):
+    with self.test_session():
+      x = constant_op.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5],
+          name="x")
+      y = nn_ops.leaky_relu(x, alpha=0.1, name="leaky_relu")
+      z = gradients_impl.gradients(y, x)
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float32,
+          order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+    print("leaky_relu (float32) gradient of gradient err = ", err)
+    self.assertLess(err, 1e-4)
+
+  def testGradGradFloat64(self):
+    with self.test_session():
+      x = constant_op.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5],
+          dtype=dtypes.float64,
+          name="x")
+      y = nn_ops.leaky_relu(x, alpha=0.02, name="leaky_relu")
+      z = gradients_impl.gradients(y, x)
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float64,
+          order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+    print("leaky_relu (float64) gradient of gradient err = ", err)
+    self.assertLess(err, 1e-10)
+
+  def testGradientScalar(self):
+    with self.test_session() as sess:
+      x = variables.Variable(-100.)
+      y = nn_ops.leaky_relu(x, 0.05)
+      loss = y**2
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.2)
+      train_op = optimizer.minimize(loss)
+      sess.run(variables.global_variables_initializer())
+      sess.run(train_op)
+      self.assertAllClose(x.eval(), -99.9)
+
+
 class EluTest(test.TestCase):
 
   def _npElu(self, np_features):
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index df23ac55ce..c2dd58bdf0 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -390,6 +390,21 @@ def _Relu6GradGrad(op, grad):
           array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype))
 
 
+@ops.RegisterGradient("LeakyRelu")
+def _LeakyReluGrad(op, grad):
+  x = op.inputs[0]
+  alpha = op.get_attr("alpha")
+  return gen_nn_ops.leaky_relu_grad(grad, x, alpha=alpha)
+
+
+@ops.RegisterGradient("LeakyReluGrad")
+def _LeakyReluGradGrad(op, grad):
+  x = op.inputs[1]
+  alpha = op.get_attr("alpha")
+  return (gen_nn_ops.leaky_relu_grad(grad, x, alpha=alpha),
+          array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype))
+
+
 @ops.RegisterGradient("Elu")
 def _EluGrad(op, grad):
   return gen_nn_ops.elu_grad(grad, op.outputs[0])
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 6fd1273687..31b8f3945d 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1601,8 +1601,7 @@ def leaky_relu(features, alpha=0.2, name=None):
     features = ops.convert_to_tensor(features, name="features")
     if features.dtype.is_integer:
       features = math_ops.to_float(features)
-    alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha")
-    return math_ops.maximum(alpha * features, features, name=name)
+    return gen_nn_ops.leaky_relu(features, alpha=alpha, name=name)
 
 
 def _flatten_outer_dims(logits):
-- 
GitLab


From cb5c61a3e11a37fb39a246aaf8ed6d02dd9ae9ab Mon Sep 17 00:00:00 2001
From: Cao Zongyan <zongyan.cao@alibaba-inc.com>
Date: Fri, 24 Aug 2018 11:51:34 +0800
Subject: [PATCH 008/570] Refine LeakyRelu codes and update APIs.

---
 .../api_def/base_api/api_def_LeakyRelu.pbtxt  |  4 ++++
 .../base_api/api_def_LeakyReluGrad.pbtxt      | 24 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 |  2 +-
 tensorflow/python/eager/pywrap_tfe_src.cc     |  2 +-
 4 files changed, 30 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_LeakyRelu.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_LeakyReluGrad.pbtxt

diff --git a/tensorflow/core/api_def/base_api/api_def_LeakyRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_LeakyRelu.pbtxt
new file mode 100644
index 0000000000..4a61889f54
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LeakyRelu.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LeakyRelu"
+  summary: "Computes rectified linear: `max(features, features * alpha)`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LeakyReluGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_LeakyReluGrad.pbtxt
new file mode 100644
index 0000000000..e427526602
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LeakyReluGrad.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "LeakyReluGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "gradients"
+    description: <<END
+The backpropagated gradients to the corresponding LeakyRelu operation.
+END
+  }
+  in_arg {
+    name: "features"
+    description: <<END
+The features passed as input to the corresponding LeakyRelu operation,
+OR the outputs of that operation (both work equivalently).
+END
+  }
+  out_arg {
+    name: "backprops"
+    description: <<END
+`gradients * (features > 0) + alpha * gradients * (featurs <= 0)`.
+END
+  }
+  summary: "Computes rectified linear gradients for a LeakyRelu operation."
+}
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 837e91bc23..7693c2d485 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -13637,7 +13637,7 @@ op {
   }
 }
 op {
-  name: "LeakykReluGrad"
+  name: "LeakyReluGrad"
   input_arg {
     name: "gradients"
     type_attr: "T"
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 9b3b5fd7aa..18fafd0de1 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1730,6 +1730,7 @@ bool OpDoesntRequireOutput(const string& op_name) {
           "SoftplusGrad",
           "Softsign",
           "ReluGrad",
+          "LeakyRelu",
           "LeakyReluGrad",
           "Conv2D",
           "DepthwiseConv2dNative",
@@ -1800,7 +1801,6 @@ bool OpDoesntRequireInput(const string& op_name) {
           "BiasAdd",
           "Relu",
           "Relu6",
-          "LeakyRelu",
           "Elu",
           "Selu",
           "SparseSoftmaxCrossEntropyWithLogits",
-- 
GitLab


From 7a54c15804f7bb0d0c40fea5c84b1f4acee58bac Mon Sep 17 00:00:00 2001
From: Stefan Dyulgerov <stefan.dyulgerov@gmail.com>
Date: Sat, 25 Aug 2018 13:18:11 +0300
Subject: [PATCH 009/570] upgraded protobuf to v.3.6.1

---
 tensorflow/contrib/cmake/external/protobuf.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index f56fb35a0f..56a57a2340 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(PROTOBUF_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src)
 set(PROTOBUF_URL https://github.com/google/protobuf.git)
-set(PROTOBUF_TAG v3.6.0)
+set(PROTOBUF_TAG v3.6.1)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
-- 
GitLab


From e93a9f9ccfd9c7a2419bf3fc1d7866765bbcfce3 Mon Sep 17 00:00:00 2001
From: Matt Conley <mconley@nvidia.com>
Date: Tue, 28 Aug 2018 18:55:51 -0700
Subject: [PATCH 010/570] Update GPU occupancy checking to utilize CUDA's
 occupancy calculator functions

-Replace references to the UnqueryableDeviceParams struct with calls to CUDA's built-in occupancy calculation functions
-Update calls to the occupancy checking functions with the new changes
-Changes should provide more long-term reliability and will remove the need to manually update hardcoded data values for new GPU architectures
---
 .../xla/service/gpu/partition_assignment.cc   |   9 +-
 .../stream_executor/cuda/cuda_gpu_executor.cc | 192 ++----------------
 .../stream_executor/device_description.cc     |  98 +++------
 .../stream_executor/device_description.h      |  73 ++-----
 4 files changed, 61 insertions(+), 311 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index cf9f102d31..375f68a159 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -62,13 +62,8 @@ LaunchDimensions CalculateLaunchDimensions(
   //
   //   <num threads per block> * <max blocks per core> = <max threads per core>
 
-  auto threads_per_core = device_desc.threads_per_core_limit();
-  auto blocks_per_core = device_desc.blocks_per_core_limit();
-  int64 threads_per_block;
-  if (threads_per_core != 0 && blocks_per_core != 0) {
-    threads_per_block = device_desc.threads_per_core_limit() /
-                        device_desc.blocks_per_core_limit();
-  } else {
+  int64 threads_per_block = device_desc.threads_per_block_limit();
+  if (threads_per_block == 0) {
     static std::atomic<int64> log_count{0};
     if (log_count.fetch_add(1) < 8) {
       LOG(WARNING) << "Attempting to calculate launch dimensions for GPU "
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index e30f50ea2a..39b0696c93 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -467,33 +467,26 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
     return;
   }
 
+  int block_size = thread_dims.x * thread_dims.y * thread_dims.z;
+
   const DeviceDescription &device_description =
       kernel.parent()->GetDeviceDescription();
 
-  uint64 blocks_per_sm = CalculateOccupancy(
-      device_description, regs_per_thread, smem_per_block, thread_dims);
-  VLOG(2) << "Resident blocks per SM is " << blocks_per_sm;
+  const CUDAKernel* cuda_kernel = AsCUDAKernel(&kernel);
+  CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
 
-  // To increase occupancy, there must be a sufficient number of blocks
-  // available to spread across the sm's at this new improved occupancy level.
-  int multiprocessor_count = device_description.core_count();
-  int block_count = block_dims.x * block_dims.y * block_dims.z;
-  int available_blocks_per_sm =
-      port::MathUtil::CeilOfRatio(block_count, multiprocessor_count);
-  if (available_blocks_per_sm <= static_cast<int64>(blocks_per_sm)) {
-    VLOG(2) << "Occupancy is limited by number of blocks available per sm.";
-    return;
-  }
+  int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
+                                         smem_per_block, thread_dims, cufunc);
+  VLOG(2) << "Resident blocks per SM is " << blocks_per_sm;
 
-  uint64 improved_regs_per_thread = CalculateRegisterLimitForTargetOccupancy(
-      device_description, smem_per_block, thread_dims, blocks_per_sm + 1);
-  if (improved_regs_per_thread != 0) {
-    VLOG(2) << "Reducing register usage from " << regs_per_thread
-            << " to " << improved_regs_per_thread
-            << " could increase resident blocks per SM by one.";
-  } else {
-    VLOG(2) << "Resident blocks per SM cannot be increased by reducing "
-        "register usage.";
+  int suggested_threads =
+      CompareOccupancy(&blocks_per_sm, device_description, regs_per_thread,
+                       smem_per_block, thread_dims, cufunc);
+  if (suggested_threads != 0) {
+    VLOG(2) << "The cuda occupancy calculator reccommends using "
+            << suggested_threads
+            << " threads per block to acheive an occupancy of " << blocks_per_sm
+            << " blocks per SM.";
   }
 }
 
@@ -980,144 +973,6 @@ static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
 #endif
 }
 
-// Set of compute capability specific device parameters that cannot be
-// queried from the driver API.  These values instead are baked into a
-// lookup table indexed by compute capability version.
-struct UnqueryableDeviceParams {
-  int cc_major;
-  int cc_minor;
-  uint64 blocks_per_core_limit;
-  uint64 registers_per_core_limit;
-  uint64 registers_per_thread_limit;
-  uint64 warp_alloc_granularity;
-  uint64 register_alloc_granularity;
-  uint64 shared_memory_alloc_granularity;
-};
-
-// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
-// https://developer.download.nvidia.com/compute/cuda/CUDA_Occupancy_calculator.xls
-static const UnqueryableDeviceParams kAllUnqueryableDeviceParams[] = {
-    {
-        2, 0,       // compute capability (2.0)
-        8,          // blocks_per_core_limit
-        32 * 1024,  // registers_per_core_limit
-        63,         // registers_per_thread_limit
-        2,          // warp_alloc_granularity
-        64,         // register_alloc_granularity
-        128,        // shared_memory_alloc_granularity
-    },
-    {
-        2, 1,       // compute capability (2.1)
-        8,          // blocks_per_core_limit
-        32 * 1024,  // registers_per_core_limit
-        63,         // registers_per_thread_limit
-        2,          // warp_alloc_granularity
-        64,         // register_alloc_granularity
-        128,        // shared_memory_alloc_granularity
-    },
-    {
-        3, 0,       // compute capability (3.0)
-        16,         // blocks_per_core_limit
-        64 * 1024,  // registers_per_core_limit
-        63,         // registers_per_thread_limit
-        4,          // warp_alloc_granularity
-        256,        // register_alloc_granularity
-        256,        // shared_memory_alloc_granularity
-    },
-    {
-        3, 2,       // compute capability (3.2)
-        16,         // blocks_per_core_limit
-        64 * 1024,  // registers_per_core_limit
-        255,        // registers_per_thread_limit
-        4,          // warp_alloc_granularity
-        256,        // register_alloc_granularity
-        256,        // shared_memory_alloc_granularity
-    },
-    {
-        3, 5,       // compute capability (3.5)
-        16,         // blocks_per_core_limit
-        64 * 1024,  // registers_per_core_limit
-        255,        // registers_per_thread_limit
-        4,          // warp_alloc_granularity
-        256,        // register_alloc_granularity
-        256,        // shared_memory_alloc_granularity
-    },
-    {
-        3, 7,        // compute capability (3.7)
-        16,          // blocks_per_core_limit
-        128 * 1024,  // registers_per_core_limit
-        255,         // registers_per_thread_limit
-        4,           // warp_alloc_granularity
-        256,         // register_alloc_granularity
-        256,         // shared_memory_alloc_granularity
-    },
-    {
-        5, 0,       // compute capability (5.0)
-        32,         // blocks_per_core_limit
-        64 * 1024,  // registers_per_core_limit
-        255,        // registers_per_thread_limit
-        4,          // warp_alloc_granularity
-        256,        // register_alloc_granularity
-        256,        // shared_memory_alloc_granularity
-    },
-    {
-        5, 2,       // compute capability (5.2)
-        32,         // blocks_per_core_limit
-        64 * 1024,  // registers_per_core_limit
-        255,        // registers_per_thread_limit
-        4,          // warp_alloc_granularity
-        256,        // register_alloc_granularity
-        256,        // shared_memory_alloc_granularity
-    },
-    {
-        5, 3,       // compute capability (5.3)
-        32,         // blocks_per_core_limit
-        64 * 1024,  // registers_per_core_limit
-        255,        // registers_per_thread_limit
-        4,          // warp_alloc_granularity
-        256,        // register_alloc_granularity
-        256,        // shared_memory_alloc_granularity
-    },
-    {
-        6, 0,       // compute capability (6.0)
-        32,         // blocks_per_core_limit
-        64 * 1024,  // registers_per_core_limit
-        255,        // registers_per_thread_limit
-        2,          // warp_alloc_granularity
-        256,        // register_alloc_granularity
-        256,        // shared_memory_alloc_granularity
-    },
-    {
-        6, 1,       // compute capability (6.1)
-        32,         // blocks_per_core_limit
-        64 * 1024,  // registers_per_core_limit
-        255,        // registers_per_thread_limit
-        4,          // warp_alloc_granularity
-        256,        // register_alloc_granularity
-        256,        // shared_memory_alloc_granularity
-    },
-    {
-        6, 2,       // compute capability (6.2)
-        32,         // blocks_per_core_limit
-        64 * 1024,  // registers_per_core_limit
-        255,        // registers_per_thread_limit
-        4,          // warp_alloc_granularity
-        256,        // register_alloc_granularity
-        256,        // shared_memory_alloc_granularity
-    },
-    // TODO(jlebar): Confirm the alloc granularity values for sm_70.  These are
-    // not published in the spreadsheet linked above.  Currently we guess that
-    // they're the same as sm_60.
-    {
-        7, 0,       // compute capability (7.0)
-        32,         // blocks_per_core_limit
-        64 * 1024,  // registers_per_core_limit
-        255,        // registers_per_thread_limit
-        2,          // warp_alloc_granularity
-        256,        // register_alloc_granularity
-        256,        // shared_memory_alloc_granularity
-    },
-};
 
 DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
   internal::DeviceDescriptionBuilder builder;
@@ -1193,19 +1048,6 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
     builder.set_name(device_name);
   }
 
-  for (size_t i = 0; i < TF_ARRAYSIZE(kAllUnqueryableDeviceParams); i++) {
-    const auto &params = kAllUnqueryableDeviceParams[i];
-    if (params.cc_major == cc_major_ && params.cc_minor == cc_minor_) {
-      builder.set_blocks_per_core_limit(params.blocks_per_core_limit);
-      builder.set_registers_per_core_limit(params.registers_per_core_limit);
-      builder.set_registers_per_thread_limit(params.registers_per_thread_limit);
-      builder.set_warp_alloc_granularity(params.warp_alloc_granularity);
-      builder.set_register_alloc_granularity(params.register_alloc_granularity);
-      builder.set_shared_memory_alloc_granularity(
-          params.shared_memory_alloc_granularity);
-    }
-  }
-
   builder.set_platform_version(
       port::StrCat("Compute Capability ", cc_major_, ".", cc_minor_));
 
@@ -1227,6 +1069,10 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
       CUDADriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
   builder.set_threads_per_warp(
       CUDADriver::GetThreadsPerWarp(device_).ValueOrDie());
+  builder.set_registers_per_core_limit(
+      CUDADriver::GetDeviceAttribute(
+          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
+          .ValueOrDie());
 
   auto built = builder.Build();
   return built.release();
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index 8ca0677f8a..df52ce6cce 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -37,16 +37,11 @@ DeviceDescription::DeviceDescription()
                         kUninitializedUint64),
       block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
                        kUninitializedUint64),
-      blocks_per_core_limit_(kUninitializedUint64),
       threads_per_core_limit_(kUninitializedUint64),
       threads_per_block_limit_(kUninitializedUint64),
       threads_per_warp_(kUninitializedUint64),
       registers_per_core_limit_(kUninitializedUint64),
       registers_per_block_limit_(kUninitializedUint64),
-      registers_per_thread_limit_(kUninitializedUint64),
-      warp_alloc_granularity_(1),
-      register_alloc_granularity_(1),
-      shared_memory_alloc_granularity_(1),
       device_address_bits_(kUninitializedUint64),
       device_memory_size_(kUninitializedUint64),
       memory_bandwidth_(kUninitializedUint64),
@@ -162,75 +157,36 @@ static uint64 RoundDown(uint64 value, uint64 n) {
   return port::MathUtil::FloorOfRatio(value, n) * n;
 }
 
-uint64 CalculateOccupancy(const DeviceDescription &device_description,
-                          uint64 registers_per_thread,
-                          uint64 shared_memory_per_block,
-                          const ThreadDim &thread_dims) {
-  // Don't try to compute occupancy if necessary values are not initialized.
-  uint64 required_fields[] =  { device_description.registers_per_thread_limit(),
-                                device_description.threads_per_warp(),
-                                device_description.warp_alloc_granularity(),
-                                device_description.register_alloc_granularity(),
-                                device_description.registers_per_block_limit(),
-                                device_description.shared_memory_per_core(),
-                                device_description.blocks_per_core_limit() };
-  for (auto value : required_fields) {
-    if (value == kUninitializedUint64) {
-      return 0;
-    }
-  }
-
-  if (registers_per_thread > device_description.registers_per_thread_limit()) {
-    return 0;
-  }
-
-  uint64 warps_per_block =
-      port::MathUtil::CeilOfRatio(thread_dims.x * thread_dims.y * thread_dims.z,
-                                  device_description.threads_per_warp());
-
-  // Warp resources are allocated at a particular granularity.  This value is
-  // the effective number of warps for resource allocation purposes.
-  uint64 alloc_warps_per_block =
-      RoundUp(warps_per_block, device_description.warp_alloc_granularity());
-
-  uint64 alloc_regs_per_warp =
-      RoundUp(device_description.threads_per_warp() * registers_per_thread,
-              device_description.register_alloc_granularity());
-  uint64 regs_per_block = alloc_warps_per_block * alloc_regs_per_warp;
-  uint64 reg_limit =
-      device_description.registers_per_block_limit() / regs_per_block;
-
-  uint64 alloc_smem_per_block = RoundUp(
-      shared_memory_per_block,
-      device_description.shared_memory_alloc_granularity());
-  uint64 smem_limit = alloc_smem_per_block > 0 ?
-      device_description.shared_memory_per_core() / alloc_smem_per_block :
-      device_description.blocks_per_core_limit();
-
-  uint64 thread_limit = device_description.threads_per_core_limit()
-      / (warps_per_block  * device_description.threads_per_warp());
-
-  return std::min({ device_description.blocks_per_core_limit(),
-          reg_limit, smem_limit, thread_limit });
+int CalculateOccupancy(const DeviceDescription& device_description,
+                       uint64 registers_per_thread,
+                       uint64 shared_memory_per_block,
+                       const ThreadDim& thread_dims, CUfunction func) {
+  int suggested_blocks = 0;
+  int suggested_threads = 0;
+  CUresult err =
+      cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
+                                       func, NULL, shared_memory_per_block, 0);
+  CHECK_EQ(err, CUDA_SUCCESS);
+  return suggested_blocks;
 }
 
-uint64 CalculateRegisterLimitForTargetOccupancy(
-    const DeviceDescription &device_description, uint64 shared_memory_per_block,
-    const ThreadDim &thread_dims, uint64 target_blocks_per_core) {
-  // Linear search from maximum number of registers down until the target
-  // blocks per SM is found.
-  // TODO(meheff): Compute this using a closed form solution.
-  int reg_step = device_description.register_alloc_granularity() /
-      device_description.threads_per_warp();
-  for (int r = device_description.registers_per_thread_limit(); r > 0;
-       r = RoundDown(r - 1, reg_step)) {
-    uint64 occupancy = CalculateOccupancy(
-        device_description, r, shared_memory_per_block, thread_dims);
-    if (occupancy >= target_blocks_per_core) {
-      return r;
-    }
+int CompareOccupancy(int* initial_blocks,
+                     const DeviceDescription& device_description,
+                     uint64 registers_per_thread,
+                     uint64 shared_memory_per_block,
+                     const ThreadDim& thread_dims, CUfunction func) {
+  int suggested_blocks = 0;
+  int suggested_threads = 0;
+  CUresult err =
+      cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
+                                       func, NULL, shared_memory_per_block, 0);
+  CHECK_EQ(err, CUDA_SUCCESS);
+  if (suggested_blocks > *initial_blocks) {
+    *initial_blocks = suggested_blocks;
+    return suggested_threads;
+  } else {
+    return 0;
   }
-  return 0;
 }
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index 7f99d81ef3..d335b9b875 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/stream_executor/platform/port.h"
 
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
@@ -79,10 +80,6 @@ class DeviceDescription {
   // legitimate kernel launch request.
   const BlockDim &block_dim_limit() const { return block_dim_limit_; }
 
-  // Returns the limit on the number of simultaneously resident blocks
-  // on a multiprocessor.
-  uint64 blocks_per_core_limit() const { return blocks_per_core_limit_; }
-
   // Returns the limit on the total number of threads that can be launched in a
   // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
   // This limit affects what constitutes a legitimate kernel launch request.
@@ -110,27 +107,6 @@ class DeviceDescription {
     return registers_per_block_limit_;
   }
 
-  // Returns the limit on the total number of registers that can be
-  // allocated to a thread.
-  const uint64 &registers_per_thread_limit() const {
-    return registers_per_thread_limit_;
-  }
-
-  // Returns the granularity at which warps are allocated resources.
-  const uint64 &warp_alloc_granularity() const {
-    return warp_alloc_granularity_;
-  }
-
-  // Returns the granularity at which registers are allocated to warps.
-  const uint64 &register_alloc_granularity() const {
-    return register_alloc_granularity_;
-  }
-
-  // Returns the granularity at which shared memory is allocated to warps.
-  const uint64 &shared_memory_alloc_granularity() const {
-    return shared_memory_alloc_granularity_;
-  }
-
   // Returns the number of address bits available to kernel code running on the
   // platform. This affects things like the maximum allocation size and perhaps
   // types used in kernel code such as size_t.
@@ -200,19 +176,12 @@ class DeviceDescription {
   ThreadDim thread_dim_limit_;
   BlockDim block_dim_limit_;
 
-  uint64 blocks_per_core_limit_;
-
   uint64 threads_per_core_limit_;
   uint64 threads_per_block_limit_;
   uint64 threads_per_warp_;
 
   uint64 registers_per_core_limit_;
   uint64 registers_per_block_limit_;
-  uint64 registers_per_thread_limit_;
-
-  uint64 warp_alloc_granularity_;
-  uint64 register_alloc_granularity_;
-  uint64 shared_memory_alloc_granularity_;
 
   uint64 device_address_bits_;
   uint64 device_memory_size_;
@@ -270,10 +239,6 @@ class DeviceDescriptionBuilder {
     device_description_->block_dim_limit_ = value;
   }
 
-  void set_blocks_per_core_limit(uint64 value) {
-    device_description_->blocks_per_core_limit_ = value;
-  }
-
   void set_threads_per_core_limit(uint64 value) {
     device_description_->threads_per_core_limit_ = value;
   }
@@ -290,19 +255,6 @@ class DeviceDescriptionBuilder {
   void set_registers_per_block_limit(uint64 value) {
     device_description_->registers_per_block_limit_ = value;
   }
-  void set_registers_per_thread_limit(uint64 value) {
-    device_description_->registers_per_thread_limit_ = value;
-  }
-
-  void set_warp_alloc_granularity(uint64 value) {
-    device_description_->warp_alloc_granularity_ = value;
-  }
-  void set_register_alloc_granularity(uint64 value) {
-    device_description_->register_alloc_granularity_ = value;
-  }
-  void set_shared_memory_alloc_granularity(uint64 value) {
-    device_description_->shared_memory_alloc_granularity_ = value;
-  }
 
   void set_device_address_bits(uint64 value) {
     device_description_->device_address_bits_ = value;
@@ -375,17 +327,18 @@ void CalculateDimensionality(const DeviceDescription &device_description,
 // Compute and return maximum blocks per core (occupancy) based on the
 // device description, some kernel characteristics and the number of threads per
 // block.  If unable to compute occupancy, zero is returned.
-uint64 CalculateOccupancy(const DeviceDescription &device_description,
-                          uint64 registers_per_thread,
-                          uint64 shared_memory_per_block,
-                          const ThreadDim &thread_dims);
-
-// Compute and return the maximum number of registers per thread which
-// achieves the target occupancy.  If the target is not possible then
-// zero is returned.
-uint64 CalculateRegisterLimitForTargetOccupancy(
-    const DeviceDescription &device_description, uint64 shared_memory_per_block,
-    const ThreadDim &thread_dims, uint64 target_blocks_per_core);
+int CalculateOccupancy(const DeviceDescription& device_description,
+                       uint64 registers_per_thread,
+                       uint64 shared_memory_per_block,
+                       const ThreadDim& thread_dims, CUfunction func);
+
+// Compute and return the suggested thread count to acheive ideal occupancy.
+// If the provided thread dimensions match this number, zero is returned.
+int CompareOccupancy(int* initial_blocks,
+                     const DeviceDescription& device_description,
+                     uint64 registers_per_thread,
+                     uint64 shared_memory_per_block,
+                     const ThreadDim& thread_dims, CUfunction func);
 
 }  // namespace stream_executor
 
-- 
GitLab


From 4e72dd865a3fc83baa69f6b7c08720a1b546a464 Mon Sep 17 00:00:00 2001
From: Cao Zongyan <zongyan.cao@alibaba-inc.com>
Date: Wed, 29 Aug 2018 17:05:43 +0800
Subject: [PATCH 011/570] Refine LeakyRelu codes.

1. Add C++ gradient of gradient definition of LeakyReLu and revalant UT.
2. Using forward compatibility layer for python code changes.
---
 tensorflow/cc/gradients/nn_grad.cc            | 18 ++++-
 tensorflow/cc/gradients/nn_grad_test.cc       | 16 +++++
 .../python/kernel_tests/relu_op_test.py       | 70 ++++++++++---------
 tensorflow/python/ops/nn_ops.py               |  5 +-
 4 files changed, 73 insertions(+), 36 deletions(-)

diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index 0fc23d0bf7..2a32a2ed6f 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -149,13 +149,27 @@ Status LeakyReluGradHelper(const Scope& scope, const Operation& op,
   float alpha;
   TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "alpha", &alpha));
   internal::LeakyReluGrad::Attrs attrs;
-  attrs.Alpha(alpha);
-  auto dx = internal::LeakyReluGrad(scope, grad_inputs[0], op.input(0), attrs);
+  auto dx = internal::LeakyReluGrad(scope, grad_inputs[0], op.input(0),
+                                    attrs.Alpha(alpha));
   grad_outputs->push_back(dx);
   return scope.status();
 }
 REGISTER_GRADIENT_OP("LeakyRelu", LeakyReluGradHelper);
 
+Status LeakyReluGradGradHelper(const Scope& scope, const Operation& op,
+                               const std::vector<Output>& grad_inputs,
+                               std::vector<Output>* grad_outputs) {
+  float alpha;
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "alpha", &alpha));
+  internal::LeakyReluGrad::Attrs attrs;
+  auto dx = internal::LeakyReluGrad(scope, grad_inputs[0], op.input(1),
+                                    attrs.Alpha(alpha));
+  grad_outputs->push_back(dx);
+  grad_outputs->push_back(NoGradient());
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("LeakyReluGrad", LeakyReluGradGradHelper);
+
 Status EluGradHelper(const Scope& scope, const Operation& op,
                      const std::vector<Output>& grad_inputs,
                      std::vector<Output>* grad_outputs) {
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index 5ebece7b6e..bf0db1f59d 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/cc/framework/gradient_checker.h"
 #include "tensorflow/cc/framework/testutil.h"
 #include "tensorflow/cc/gradients/grad_testutil.h"
+#include "tensorflow/cc/ops/nn_ops_internal.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -173,6 +174,21 @@ TEST_F(NNGradTest, LeakyReluGrad) {
   RunTest(x, x_init_value, y, shape);
 }
 
+TEST_F(NNGradTest, LeakyReluGradGrad) {
+  TensorShape shape({5, 2});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  // Avoid input values where Leaky ReLU gradient is not well defined (around
+  // zero).
+  Tensor x_init_value = test::AsTensor<float>(
+      {2.3f, 1.9f, 1.5f, 1.1f, 0.7f, 0.3f, -0.1f, -0.5f, -0.9f, -1.3f},
+      {5, 2});
+  Tensor features = test::AsTensor<float>(
+      {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, 0.1f, 0.3f, 0.5f, 0.7f, 0.9f},
+      {5, 2});
+  auto y = ops::internal::LeakyReluGrad(scope_, x, features);
+  RunTest(x, x_init_value, y, shape);
+}
+
 TEST_F(NNGradTest, EluGrad) {
   TensorShape shape({5, 2});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index ccb3a231bb..7066f28883 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python.compat import compat
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
@@ -283,8 +284,9 @@ class LeakyReluTest(test.TestCase):
             np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
             alpha=0.1, use_gpu=True)
 
-  # The gradient test for ReLU is a bit tricky as the derivative is not well
-  # defined at around zero and we want to avoid that in terms of input values.
+  # The gradient test for Leaky ReLU is a bit tricky as the derivative is not
+  # well defined at around zero and we want to avoid that in terms of input
+  # values.
   def testGradientFloat32(self):
     with self.test_session():
       x = constant_op.constant(
@@ -319,39 +321,41 @@ class LeakyReluTest(test.TestCase):
     self.assertLess(err, 1e-10)
 
   def testGradGradFloat32(self):
-    with self.test_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.leaky_relu(x, alpha=0.1, name="leaky_relu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
-          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
-          dtype=np.float32,
-          order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
-    print("leaky_relu (float32) gradient of gradient err = ", err)
-    self.assertLess(err, 1e-4)
+    with compat.forward_compatibility_horizon(2018, 10, 2):
+      with self.test_session():
+	x = constant_op.constant(
+	    [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+	    shape=[2, 5],
+	    name="x")
+	y = nn_ops.leaky_relu(x, alpha=0.1, name="leaky_relu")
+	z = gradients_impl.gradients(y, x)
+	x_init = np.asarray(
+	    [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+	    dtype=np.float32,
+	    order="F")
+	err = gradient_checker.compute_gradient_error(
+	    x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      print("leaky_relu (float32) gradient of gradient err = ", err)
+      self.assertLess(err, 1e-4)
 
   def testGradGradFloat64(self):
-    with self.test_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.leaky_relu(x, alpha=0.02, name="leaky_relu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
-          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
-          dtype=np.float64,
-          order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
-    print("leaky_relu (float64) gradient of gradient err = ", err)
-    self.assertLess(err, 1e-10)
+    with compat.forward_compatibility_horizon(2018, 10, 2):
+      with self.test_session():
+	x = constant_op.constant(
+	    [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+	    shape=[2, 5],
+	    dtype=dtypes.float64,
+	    name="x")
+	y = nn_ops.leaky_relu(x, alpha=0.02, name="leaky_relu")
+	z = gradients_impl.gradients(y, x)
+	x_init = np.asarray(
+	    [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+	    dtype=np.float64,
+	    order="F")
+	err = gradient_checker.compute_gradient_error(
+	    x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      print("leaky_relu (float64) gradient of gradient err = ", err)
+      self.assertLess(err, 1e-10)
 
   def testGradientScalar(self):
     with self.test_session() as sess:
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 31b8f3945d..52ea202636 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1601,7 +1601,10 @@ def leaky_relu(features, alpha=0.2, name=None):
     features = ops.convert_to_tensor(features, name="features")
     if features.dtype.is_integer:
       features = math_ops.to_float(features)
-    return gen_nn_ops.leaky_relu(features, alpha=alpha, name=name)
+    if compat.forward_compatible(2018, 10, 1):
+      return gen_nn_ops.leaky_relu(features, alpha=alpha, name=name)
+    alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha")
+    return math_ops.maximum(alpha * features, features, name=name)
 
 
 def _flatten_outer_dims(logits):
-- 
GitLab


From 2586eb3bfeeef3af357e438ae5aff92d2bac12a5 Mon Sep 17 00:00:00 2001
From: Cao Zongyan <zongyan.cao@alibaba-inc.com>
Date: Mon, 3 Sep 2018 11:48:35 +0800
Subject: [PATCH 012/570] Code fix against ci_build error results.

---
 tensorflow/cc/gradients/nn_grad_test.cc       |  3 +-
 tensorflow/core/kernels/relu_op.cc            |  8 +--
 tensorflow/core/kernels/relu_op.h             |  8 +--
 tensorflow/core/kernels/relu_op_functor.h     |  1 -
 .../python/kernel_tests/relu_op_test.py       | 50 +++++++++----------
 .../tools/api/golden/v1/tensorflow.pbtxt      |  4 ++
 6 files changed, 39 insertions(+), 35 deletions(-)

diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index bf0db1f59d..d8c2a1a0fc 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -180,8 +180,7 @@ TEST_F(NNGradTest, LeakyReluGradGrad) {
   // Avoid input values where Leaky ReLU gradient is not well defined (around
   // zero).
   Tensor x_init_value = test::AsTensor<float>(
-      {2.3f, 1.9f, 1.5f, 1.1f, 0.7f, 0.3f, -0.1f, -0.5f, -0.9f, -1.3f},
-      {5, 2});
+      {2.3f, 1.9f, 1.5f, 1.1f, 0.7f, 0.3f, -0.1f, -0.5f, -0.9f, -1.3f}, {5, 2});
   Tensor features = test::AsTensor<float>(
       {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, 0.1f, 0.3f, 0.5f, 0.7f, 0.9f},
       {5, 2});
diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc
index c4f2ef5632..cafa49cbb6 100644
--- a/tensorflow/core/kernels/relu_op.cc
+++ b/tensorflow/core/kernels/relu_op.cc
@@ -106,15 +106,15 @@ namespace functor {
                                                                                \
   template <>                                                                  \
   void LeakyRelu<GPUDevice, T>::operator()(                                    \
-      const GPUDevice& d, typename TTypes<T>::ConstTensor features,            \
-      T alpha, typename TTypes<T>::Tensor activations);                        \
+      const GPUDevice& d, typename TTypes<T>::ConstTensor features, T alpha,   \
+      typename TTypes<T>::Tensor activations);                                 \
   extern template struct LeakyRelu<GPUDevice, T>;                              \
                                                                                \
   template <>                                                                  \
   void LeakyReluGrad<GPUDevice, T>::operator()(                                \
       const GPUDevice& d, typename TTypes<T>::ConstTensor gradients,           \
-      typename TTypes<T>::ConstTensor features,                                \
-      T alpha, typename TTypes<T>::Tensor backprops);                          \
+      typename TTypes<T>::ConstTensor features, T alpha,                       \
+      typename TTypes<T>::Tensor backprops);                                   \
   extern template struct LeakyReluGrad<GPUDevice, T>;                          \
                                                                                \
   template <>                                                                  \
diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h
index c55190065c..fa79ab03ae 100644
--- a/tensorflow/core/kernels/relu_op.h
+++ b/tensorflow/core/kernels/relu_op.h
@@ -143,8 +143,8 @@ class LeakyReluOp : public UnaryElementWiseOp<T, LeakyReluOp<Device, T>> {
 
   void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
     functor::LeakyRelu<Device, T> functor;
-    functor(context->eigen_device<Device>(), input.flat<T>(),
-            alpha_, output->flat<T>());
+    functor(context->eigen_device<Device>(), input.flat<T>(), alpha_,
+            output->flat<T>());
   }
 
  private:
@@ -183,7 +183,9 @@ class LeakyReluGradOp
 
 template <typename Device, typename T>
 void LeakyReluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
-    const Tensor& g, const Tensor& a, T alpha, Tensor* output) {
+                                                   const Tensor& g,
+                                                   const Tensor& a, T alpha,
+                                                   Tensor* output) {
   if (!ReluHelpers::ValidateSameSize(context, g, a)) return;
   functor::LeakyReluGrad<Device, T> functor;
   functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(), alpha,
diff --git a/tensorflow/core/kernels/relu_op_functor.h b/tensorflow/core/kernels/relu_op_functor.h
index 7f0951451d..548d5a277d 100644
--- a/tensorflow/core/kernels/relu_op_functor.h
+++ b/tensorflow/core/kernels/relu_op_functor.h
@@ -91,7 +91,6 @@ struct Relu6Grad {
   }
 };
 
-
 // Functor used by LeakyReluOp to do the computations.
 template <typename Device, typename T>
 struct LeakyRelu {
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index 7066f28883..3e24b8a2c4 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -323,37 +323,37 @@ class LeakyReluTest(test.TestCase):
   def testGradGradFloat32(self):
     with compat.forward_compatibility_horizon(2018, 10, 2):
       with self.test_session():
-	x = constant_op.constant(
-	    [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-	    shape=[2, 5],
-	    name="x")
-	y = nn_ops.leaky_relu(x, alpha=0.1, name="leaky_relu")
-	z = gradients_impl.gradients(y, x)
-	x_init = np.asarray(
-	    [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
-	    dtype=np.float32,
-	    order="F")
-	err = gradient_checker.compute_gradient_error(
-	    x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+        x = constant_op.constant(
+            [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+            shape=[2, 5],
+            name="x")
+        y = nn_ops.leaky_relu(x, alpha=0.1, name="leaky_relu")
+        z = gradients_impl.gradients(y, x)
+        x_init = np.asarray(
+            [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+            dtype=np.float32,
+            order="F")
+        err = gradient_checker.compute_gradient_error(
+            x, [2, 5], z[0], [2, 5], x_init_value=x_init)
       print("leaky_relu (float32) gradient of gradient err = ", err)
       self.assertLess(err, 1e-4)
 
   def testGradGradFloat64(self):
     with compat.forward_compatibility_horizon(2018, 10, 2):
       with self.test_session():
-	x = constant_op.constant(
-	    [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-	    shape=[2, 5],
-	    dtype=dtypes.float64,
-	    name="x")
-	y = nn_ops.leaky_relu(x, alpha=0.02, name="leaky_relu")
-	z = gradients_impl.gradients(y, x)
-	x_init = np.asarray(
-	    [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
-	    dtype=np.float64,
-	    order="F")
-	err = gradient_checker.compute_gradient_error(
-	    x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+        x = constant_op.constant(
+            [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+            shape=[2, 5],
+            dtype=dtypes.float64,
+            name="x")
+        y = nn_ops.leaky_relu(x, alpha=0.02, name="leaky_relu")
+        z = gradients_impl.gradients(y, x)
+        x_init = np.asarray(
+            [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+            dtype=np.float64,
+            order="F")
+        err = gradient_checker.compute_gradient_error(
+            x, [2, 5], z[0], [2, 5], x_init_value=x_init)
       print("leaky_relu (float64) gradient of gradient err = ", err)
       self.assertLess(err, 1e-10)
 
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 4de662fe33..9e8d320f06 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -1324,6 +1324,10 @@ tf_module {
     name: "lbeta"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "leaky_relu"
+    argspec: "args=[\'features\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.2\', \'None\'], "
+  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-- 
GitLab


From d2ad105d2dff3c79d8f49f5fb8ce74c38f424e74 Mon Sep 17 00:00:00 2001
From: Cao Zongyan <zongyan.cao@alibaba-inc.com>
Date: Mon, 3 Sep 2018 12:10:51 +0800
Subject: [PATCH 013/570] Add XLA support for LeakyReluOp.

Code contributed by: Meng Chen <mc119496@alibaba-inc.com>
---
 tensorflow/compiler/tests/binary_ops_test.py  |  7 ++++
 tensorflow/compiler/tests/unary_ops_test.py   |  5 +++
 tensorflow/compiler/tf2xla/kernels/relu_op.cc | 42 +++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 0aafda7fb4..8941dd4e27 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -178,6 +178,13 @@ class BinaryOpsTest(xla_test.XLATestCase):
               [0, 0, 0, 0, 0, 0.1, 0.3, 0.5, 0.7, 0.9, 6.1, 10.0], dtype=dtype),
           expected=np.array([0, 0, 0, 0, 0, 6, 7, 8, 9, 10, 0, 0], dtype=dtype))
 
+      self._testBinary(
+          gen_nn_ops._leaky_relu_grad,
+          np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dtype),
+          np.array(
+              [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9], dtype=dtype),
+          expected=np.array([0.2, 0.4, 0.6, 0.8, 1, 6, 7, 8, 9, 10], dtype=dtype))
+
       self._testBinary(
           gen_nn_ops.softmax_cross_entropy_with_logits,
           np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=dtype),
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 73adb0d243..91f876fa23 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -361,6 +361,11 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array([[-0.05, 6.05, 5]], dtype=dtype),
           expected=np.array([[0, 6, 5]], dtype=dtype))
 
+      self._assertOpOutputMatchesExpected(
+          nn_ops.leaky_relu,
+          np.array([[-1.0, 1.0]], dtype=dtype),
+          expected=np.array([[-0.2, 1.0]], dtype=dtype))
+
       self._assertOpOutputMatchesExpected(
           nn_ops.softmax,
           np.array([1, 2, 3, 4], dtype=dtype),
diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
index d35777ccb1..ec14735884 100644
--- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
@@ -50,6 +50,24 @@ class Relu6Op : public XlaOpKernel {
   }
 };
 
+
+class LeakyReluOp : public XlaOpKernel {
+ public:
+  explicit LeakyReluOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("alpha", &alpha_));
+  }
+  // Compute the max of the input x and alpha*x.
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* builder = ctx->builder();
+    auto alpha = XlaHelpers::FloatLiteral(builder, input_type(0),
+                                          static_cast<double>(alpha_));
+    ctx->SetOutput(0,
+        xla::Max(xla::Mul(alpha, ctx->Input(0)), ctx->Input(0)));
+  }
+ private:
+  float alpha_;
+};
+
 class ReluGradOp : public XlaOpKernel {
  public:
   explicit ReluGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
@@ -84,10 +102,34 @@ class Relu6GradOp : public XlaOpKernel {
   }
 };
 
+class LeakyReluGradOp : public XlaOpKernel {
+ public:
+  explicit LeakyReluGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("alpha", &alpha_));
+  }
+  // Return the lhs (incoming gradient) if the rhs (input feature) > 0,
+  // otherwise return the alpha * lhs.
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* b = ctx->builder();
+    const TensorShape shape = ctx->InputShape(0);
+    const auto zero =
+        xla::Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes());
+    const auto pred = xla::Gt(ctx->Input(1), zero);
+    auto alpha = XlaHelpers::FloatLiteral(b, input_type(0),
+                                          static_cast<double>(alpha_));
+    ctx->SetOutput(0,
+        xla::Select(pred, ctx->Input(0), xla::Mul(alpha, ctx->Input(0))));
+  }
+ private:
+  float alpha_;
+};
+
 REGISTER_XLA_OP(Name("Relu"), ReluOp);
 REGISTER_XLA_OP(Name("Relu6"), Relu6Op);
+REGISTER_XLA_OP(Name("LeakyRelu"), LeakyReluOp);
 REGISTER_XLA_OP(Name("ReluGrad"), ReluGradOp);
 REGISTER_XLA_OP(Name("Relu6Grad"), Relu6GradOp);
+REGISTER_XLA_OP(Name("LeakyReluGrad"), LeakyReluGradOp);
 
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From fa20b59b920233d35bb8da3fbc3c234c369a8291 Mon Sep 17 00:00:00 2001
From: Matt Conley <mconley@nvidia.com>
Date: Tue, 4 Sep 2018 14:20:40 -0700
Subject: [PATCH 014/570] Move CUDA-specific occupancy calculation into proper
 file

-Maintain functionality, just move CalculateOccupancy() and CompareOccupancy() methods from device_description to cuda_gpu_executor
-Remove CUDA requirement in general class device_description
---
 .../stream_executor/cuda/cuda_gpu_executor.cc | 37 +++++++++++++++++++
 .../stream_executor/cuda/cuda_gpu_executor.h  | 11 ++++++
 .../stream_executor/device_description.cc     | 32 ----------------
 .../stream_executor/device_description.h      | 17 ---------
 4 files changed, 48 insertions(+), 49 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 39b0696c93..458c0e3030 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -490,6 +490,43 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
   }
 }
 
+// Compute and return maximum blocks per core (occupancy) based on the
+// device description, some kernel characteristics and the number of threads per
+// block.  If unable to compute occupancy, zero is returned.
+int CalculateOccupancy(const DeviceDescription& device_description,
+                       uint64 registers_per_thread,
+                       uint64 shared_memory_per_block,
+                       const ThreadDim& thread_dims, CUfunction func) {
+  int suggested_blocks = 0;
+  int suggested_threads = 0;
+  CUresult err =
+      cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
+                                       func, NULL, shared_memory_per_block, 0);
+  CHECK_EQ(err, CUDA_SUCCESS);
+  return suggested_blocks;
+}
+
+// Compute and return the suggested thread count to acheive ideal occupancy.
+// If the provided thread dimensions match this number, zero is returned.
+int CompareOccupancy(int* initial_blocks,
+                     const DeviceDescription& device_description,
+                     uint64 registers_per_thread,
+                     uint64 shared_memory_per_block,
+                     const ThreadDim& thread_dims, CUfunction func) {
+  int suggested_blocks = 0;
+  int suggested_threads = 0;
+  CUresult err =
+      cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
+                                       func, NULL, shared_memory_per_block, 0);
+  CHECK_EQ(err, CUDA_SUCCESS);
+  if (suggested_blocks > *initial_blocks) {
+    *initial_blocks = suggested_blocks;
+    return suggested_threads;
+  } else {
+    return 0;
+  }
+}
+
 void *CUDAExecutor::Allocate(uint64 size) {
   return CUDADriver::DeviceAllocate(context_, size);
 }
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 8a954d5461..e8ebbc3220 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -70,6 +70,17 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
               const BlockDim &block_dims, const KernelBase &k,
               const KernelArgsArrayBase &args) override;
 
+  int CalculateOccupancy(const DeviceDescription& device_description,
+                       uint64 registers_per_thread,
+                       uint64 shared_memory_per_block,
+                       const ThreadDim& thread_dims, CUfunction func);
+
+  int CompareOccupancy(int* initial_blocks,
+                     const DeviceDescription& device_description,
+                     uint64 registers_per_thread,
+                     uint64 shared_memory_per_block,
+                     const ThreadDim& thread_dims, CUfunction func);
+
   void *Allocate(uint64 size) override;
 
   void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index df52ce6cce..726c4adf74 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -157,36 +157,4 @@ static uint64 RoundDown(uint64 value, uint64 n) {
   return port::MathUtil::FloorOfRatio(value, n) * n;
 }
 
-int CalculateOccupancy(const DeviceDescription& device_description,
-                       uint64 registers_per_thread,
-                       uint64 shared_memory_per_block,
-                       const ThreadDim& thread_dims, CUfunction func) {
-  int suggested_blocks = 0;
-  int suggested_threads = 0;
-  CUresult err =
-      cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
-                                       func, NULL, shared_memory_per_block, 0);
-  CHECK_EQ(err, CUDA_SUCCESS);
-  return suggested_blocks;
-}
-
-int CompareOccupancy(int* initial_blocks,
-                     const DeviceDescription& device_description,
-                     uint64 registers_per_thread,
-                     uint64 shared_memory_per_block,
-                     const ThreadDim& thread_dims, CUfunction func) {
-  int suggested_blocks = 0;
-  int suggested_threads = 0;
-  CUresult err =
-      cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
-                                       func, NULL, shared_memory_per_block, 0);
-  CHECK_EQ(err, CUDA_SUCCESS);
-  if (suggested_blocks > *initial_blocks) {
-    *initial_blocks = suggested_blocks;
-    return suggested_threads;
-  } else {
-    return 0;
-  }
-}
-
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index d335b9b875..b15ce31216 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -24,7 +24,6 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/stream_executor/platform/port.h"
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
@@ -324,22 +323,6 @@ void CalculateDimensionality(const DeviceDescription &device_description,
                              uint64 element_count, uint64 *threads_per_block,
                              uint64 *block_count);
 
-// Compute and return maximum blocks per core (occupancy) based on the
-// device description, some kernel characteristics and the number of threads per
-// block.  If unable to compute occupancy, zero is returned.
-int CalculateOccupancy(const DeviceDescription& device_description,
-                       uint64 registers_per_thread,
-                       uint64 shared_memory_per_block,
-                       const ThreadDim& thread_dims, CUfunction func);
-
-// Compute and return the suggested thread count to acheive ideal occupancy.
-// If the provided thread dimensions match this number, zero is returned.
-int CompareOccupancy(int* initial_blocks,
-                     const DeviceDescription& device_description,
-                     uint64 registers_per_thread,
-                     uint64 shared_memory_per_block,
-                     const ThreadDim& thread_dims, CUfunction func);
-
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
-- 
GitLab


From cd6597b8fcd82b51ddb47a297972a1614c2a5d78 Mon Sep 17 00:00:00 2001
From: Matt Conley <mconley@nvidia.com>
Date: Tue, 4 Sep 2018 16:17:40 -0700
Subject: [PATCH 015/570] Fixed transition typo

---
 tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 458c0e3030..a961e9a6c4 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -493,7 +493,7 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
 // Compute and return maximum blocks per core (occupancy) based on the
 // device description, some kernel characteristics and the number of threads per
 // block.  If unable to compute occupancy, zero is returned.
-int CalculateOccupancy(const DeviceDescription& device_description,
+int CUDAExecutor::CalculateOccupancy(const DeviceDescription& device_description,
                        uint64 registers_per_thread,
                        uint64 shared_memory_per_block,
                        const ThreadDim& thread_dims, CUfunction func) {
@@ -508,7 +508,7 @@ int CalculateOccupancy(const DeviceDescription& device_description,
 
 // Compute and return the suggested thread count to acheive ideal occupancy.
 // If the provided thread dimensions match this number, zero is returned.
-int CompareOccupancy(int* initial_blocks,
+int CUDAExecutor::CompareOccupancy(int* initial_blocks,
                      const DeviceDescription& device_description,
                      uint64 registers_per_thread,
                      uint64 shared_memory_per_block,
-- 
GitLab


From 475b7715f16ad0f94fa9986a0eefc1b2cf2044bd Mon Sep 17 00:00:00 2001
From: Matt Conley <mconley@nvidia.com>
Date: Tue, 4 Sep 2018 16:31:01 -0700
Subject: [PATCH 016/570] Recommended typo fix

---
 tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index a961e9a6c4..ce2f1ce3ae 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -483,7 +483,7 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
       CompareOccupancy(&blocks_per_sm, device_description, regs_per_thread,
                        smem_per_block, thread_dims, cufunc);
   if (suggested_threads != 0) {
-    VLOG(2) << "The cuda occupancy calculator reccommends using "
+    VLOG(2) << "The cuda occupancy calculator recommends using "
             << suggested_threads
             << " threads per block to acheive an occupancy of " << blocks_per_sm
             << " blocks per SM.";
-- 
GitLab


From a95281ce1b449d8f92a3799ff9c1dbf661b70bc4 Mon Sep 17 00:00:00 2001
From: Cao Zongyan <zongyan.cao@alibaba-inc.com>
Date: Wed, 5 Sep 2018 09:02:40 +0800
Subject: [PATCH 017/570] Avoid golden API file changing.

---
 tensorflow/cc/gradients/nn_grad_test.cc                  | 3 +--
 tensorflow/core/api_def/base_api/api_def_LeakyRelu.pbtxt | 1 +
 tensorflow/tools/api/golden/v1/tensorflow.pbtxt          | 4 ----
 3 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index d8c2a1a0fc..f5a09e09dc 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -42,7 +42,6 @@ using ops::MaxPoolV2;
 using ops::Placeholder;
 using ops::Relu;
 using ops::Relu6;
-using ops::LeakyRelu;
 using ops::Selu;
 using ops::Softmax;
 using ops::Softplus;
@@ -165,7 +164,7 @@ TEST_F(NNGradTest, Relu6Grad) {
 TEST_F(NNGradTest, LeakyReluGrad) {
   TensorShape shape({5, 2});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
-  auto y = LeakyRelu(scope_, x);
+  auto y = ops::internal::LeakyRelu(scope_, x);
   // Avoid input values where Leaky ReLU gradient is not well defined (around
   // zero).
   Tensor x_init_value = test::AsTensor<float>(
diff --git a/tensorflow/core/api_def/base_api/api_def_LeakyRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_LeakyRelu.pbtxt
index 4a61889f54..280148e032 100644
--- a/tensorflow/core/api_def/base_api/api_def_LeakyRelu.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_LeakyRelu.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "LeakyRelu"
+  visibility: HIDDEN
   summary: "Computes rectified linear: `max(features, features * alpha)`."
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 9e8d320f06..4de662fe33 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -1324,10 +1324,6 @@ tf_module {
     name: "lbeta"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "leaky_relu"
-    argspec: "args=[\'features\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.2\', \'None\'], "
-  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-- 
GitLab


From 69d3b8faf41791834301a74a05e288964940427d Mon Sep 17 00:00:00 2001
From: "Wen-Heng (Jack) Chung" <whchung@gmail.com>
Date: Fri, 22 Jun 2018 23:09:43 -0500
Subject: [PATCH 018/570] [ROCm] bazel build system and continuous integration
 logic

The commit contains following components to support TensorFlow on ROCm platform

- bazel build system
- continuous integration logic

Authors:

- Jack Chung: jack.chung@amd.com
- Jeffrey Poznanovic: Jeffrey.Poznanovic@amd.com
- Peng Sun: Peng.Sun@amd.com
---
 configure.py                                  |  20 +
 tensorflow/core/BUILD                         |   4 +-
 tensorflow/core/kernels/BUILD                 |   3 +-
 tensorflow/tensorflow.bzl                     |  67 +-
 tensorflow/tools/ci_build/Dockerfile.rocm     |  97 +++
 .../tools/ci_build/builds/docker_test.sh      |   9 +-
 tensorflow/tools/ci_build/builds/pip.sh       |   4 +-
 .../tools/ci_build/builds/with_the_same_user  |   6 +
 tensorflow/tools/ci_build/ci_build.sh         |  11 +-
 .../tools/ci_build/linux/cpu/run_cc_core.sh   |   1 +
 .../tools/ci_build/linux/cpu/run_py2_core.sh  |   1 +
 .../ci_build/linux/cpu/run_py3_contrib.sh     |   1 +
 .../tools/ci_build/linux/cpu/run_py3_core.sh  |   1 +
 .../tools/ci_build/linux/libtensorflow.sh     |   3 +
 .../tools/ci_build/linux/libtensorflow_cpu.sh |   1 +
 .../ci_build/linux/libtensorflow_docker.sh    |   6 +
 .../ci_build/linux/libtensorflow_rocm.sh      |  22 +
 .../tools/ci_build/linux/rocm/run_cc_core.sh  |  39 ++
 .../tools/ci_build/linux/rocm/run_py3_core.sh |  39 ++
 .../tools/ci_build/osx/cpu/run_py2_cc_core.sh |   1 +
 .../tools/ci_build/osx/libtensorflow_cpu.sh   |   1 +
 .../tools/ci_build/osx/libtensorflow_gpu.sh   |   1 +
 .../tools/ci_build/osx/libtensorflow_rocm.sh  |  36 +
 .../tools/ci_build/xla/linux/rocm/run_py3.sh  |  41 ++
 tensorflow/workspace.bzl                      |   2 +
 .../gpus/crosstool/CROSSTOOL_hipcc.tpl        | 158 +++++
 .../bin/crosstool_wrapper_driver_rocm.tpl     | 241 +++++++
 third_party/gpus/rocm/BUILD                   |   0
 third_party/gpus/rocm/BUILD.tpl               |  99 +++
 third_party/gpus/rocm/build_defs.bzl.tpl      |  32 +
 third_party/gpus/rocm/rocm_config.h.tpl       |  21 +
 third_party/gpus/rocm_configure.bzl           | 663 ++++++++++++++++++
 tools/bazel.rc                                |   3 +
 33 files changed, 1611 insertions(+), 23 deletions(-)
 create mode 100644 tensorflow/tools/ci_build/Dockerfile.rocm
 create mode 100755 tensorflow/tools/ci_build/linux/libtensorflow_rocm.sh
 create mode 100755 tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh
 create mode 100755 tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
 create mode 100755 tensorflow/tools/ci_build/osx/libtensorflow_rocm.sh
 create mode 100755 tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh
 create mode 100644 third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
 create mode 100755 third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
 create mode 100644 third_party/gpus/rocm/BUILD
 create mode 100644 third_party/gpus/rocm/BUILD.tpl
 create mode 100644 third_party/gpus/rocm/build_defs.bzl.tpl
 create mode 100644 third_party/gpus/rocm/rocm_config.h.tpl
 create mode 100644 third_party/gpus/rocm_configure.bzl

diff --git a/configure.py b/configure.py
index 361bd4764d..4f998511aa 100644
--- a/configure.py
+++ b/configure.py
@@ -1521,6 +1521,13 @@ def main():
     else:
       set_trisycl_include_dir(environ_cp)
 
+  set_action_env_var(environ_cp, 'TF_NEED_ROCM', 'ROCm', False)
+  if (environ_cp.get('TF_NEED_ROCM') == '1' and
+      'LD_LIBRARY_PATH' in environ_cp and environ_cp.get(
+      'LD_LIBRARY_PATH') != '1'):
+      write_action_env_to_bazelrc('LD_LIBRARY_PATH',
+                                  environ_cp.get('LD_LIBRARY_PATH'))
+
   set_action_env_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False)
   if (environ_cp.get('TF_NEED_CUDA') == '1' and
       'TF_CUDA_CONFIG_REPO' not in environ_cp):
@@ -1561,6 +1568,19 @@ def main():
       write_to_bazelrc('build --config=download_clang')
       write_to_bazelrc('test --config=download_clang')
 
+  # SYCL / ROCm / CUDA are mutually exclusive.
+  # At most 1 GPU platform can be configured.
+  gpu_platform_count = 0
+  if environ_cp.get('TF_NEED_OPENCL_SYCL') == '1':
+    gpu_platform_count += 1
+  if environ_cp.get('TF_NEED_ROCM') == '1':
+    gpu_platform_count += 1
+  if environ_cp.get('TF_NEED_CUDA') == '1':
+    gpu_platform_count += 1
+  if gpu_platform_count >= 2:
+    raise UserInputError('SYCL / CUDA / ROCm are mututally exclusive. '
+                         'At most 1 GPU platform can be configured.')
+
   set_build_var(environ_cp, 'TF_NEED_MPI', 'MPI', 'with_mpi_support', False)
   if environ_cp.get('TF_NEED_MPI') == '1':
     set_mpi_home(environ_cp)
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index c06fea130f..d5dfb8c813 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -146,7 +146,7 @@ load(
     "if_static",
     "tf_cuda_tests_tags",
 )
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configured")
 load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library")
 load(
     "//third_party/mkl:build_defs.bzl",
@@ -2941,7 +2941,7 @@ tf_cuda_library(
         "platform/device_tracer.h",
     ],
     copts = tf_copts(),
-    cuda_deps = tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps(),
+    cuda_deps = if_cuda_is_configured(tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps()),
     visibility = ["//visibility:private"],
     deps = [
         ":core_cpu_internal",
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 25063ac823..68fa8fa481 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -55,7 +55,8 @@ load(
     "if_mkl_ml",
     "mkl_deps",
 )
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configured")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm", "if_rocm_is_configured")
 
 config_setting(
     # Add "--define tensorflow_xsmm=1" to your build command to use libxsmm for
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index adac895a17..f51a628ca3 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -17,8 +17,15 @@ load(
 )
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
-    "cuda_default_copts",
     "if_cuda",
+    "if_cuda_is_configured",
+    "cuda_default_copts",
+)
+load(
+    "@local_config_rocm//rocm:build_defs.bzl",
+    "if_rocm",
+    "if_rocm_is_configured",
+    "rocm_default_copts",
 )
 load(
     "//third_party/mkl:build_defs.bzl",
@@ -860,12 +867,14 @@ def tf_cuda_only_cc_test(
         srcs = srcs + tf_binary_additional_srcs(),
         size = size,
         args = args,
-        copts = _cuda_copts() + tf_copts(),
+        copts = _cuda_copts() + _rocm_copts() + tf_copts(),
         data = data + tf_binary_dynamic_kernel_dsos(kernels),
-        deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_cuda([
-            clean_dep("//tensorflow/core:cuda"),
-            clean_dep("//tensorflow/core:gpu_lib"),
-        ]),
+        deps = deps + tf_binary_dynamic_kernel_deps(kernels) +
+            if_cuda_is_configured([
+                clean_dep("//tensorflow/core:cuda"),
+                clean_dep("//tensorflow/core:gpu_lib")]) +
+            if_rocm_is_configured([
+                clean_dep("//tensorflow/core:gpu_lib")]),
         linkopts = if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name),
         linkstatic = linkstatic or select({
             # cc_tests with ".so"s in srcs incorrectly link on Darwin
@@ -1000,7 +1009,7 @@ register_extension_info(
     label_regex_for_dep = "{extension_name}",
 )
 
-def _cuda_copts():
+def _cuda_copts(opts = []):
     """Gets the appropriate set of copts for (maybe) CUDA compilation.
 
       If we're doing CUDA compilation, returns copts for our particular CUDA
@@ -1016,13 +1025,31 @@ def _cuda_copts():
         "@local_config_cuda//cuda:using_clang": ([
             "-fcuda-flush-denormals-to-zero",
         ]),
-    })
+    }) + if_cuda_is_configured(opts)
+
+def _rocm_copts(opts = []):
+    """Gets the appropriate set of copts for (maybe) ROCm compilation.
+
+      If we're doing ROCm compilation, returns copts for our particular ROCm
+      compiler.  If we're not doing ROCm compilation, returns an empty list.
+
+      """
+    return rocm_default_copts() + select({
+        "//conditions:default": [],
+        "@local_config_rocm//rocm:using_hipcc": ([
+            "",
+        ])
+    }) + if_rocm_is_configured(opts)
 
 # Build defs for TensorFlow kernels
 
 # When this target is built using --config=cuda, a cc_library is built
 # that passes -DGOOGLE_CUDA=1 and '-x cuda', linking in additional
 # libraries needed by GPU kernels.
+#
+# When this target is built using --config=rocm, a cc_library is built
+# that passes -DTENSORFLOW_USE_ROCM and '-x rocm', linking in additional
+# libraries needed by GPU kernels.
 def tf_gpu_kernel_library(
         srcs,
         copts = [],
@@ -1030,16 +1057,18 @@ def tf_gpu_kernel_library(
         deps = [],
         hdrs = [],
         **kwargs):
-    copts = copts + _cuda_copts() + if_cuda(cuda_copts) + tf_copts()
+    copts = copts + tf_copts() + _cuda_copts(opts = cuda_copts) + _rocm_copts(opts = cuda_copts)
     kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
 
     native.cc_library(
         srcs = srcs,
         hdrs = hdrs,
         copts = copts,
-        deps = deps + if_cuda([
+        deps = deps + if_cuda_is_configured([
             clean_dep("//tensorflow/core:cuda"),
             clean_dep("//tensorflow/core:gpu_lib"),
+        ]) + if_rocm_is_configured([
+            clean_dep("//tensorflow/core:gpu_lib"),
         ]),
         alwayslink = 1,
         **kwargs
@@ -1075,11 +1104,13 @@ def tf_cuda_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs)
 
     kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
     native.cc_library(
-        deps = deps + if_cuda(cuda_deps + [
+        deps = deps + if_cuda_is_configured(cuda_deps + [
             clean_dep("//tensorflow/core:cuda"),
-            "@local_config_cuda//cuda:cuda_headers",
+            "@local_config_cuda//cuda:cuda_headers"
+        ]) + if_rocm_is_configured(cuda_deps + [
+            "@local_config_rocm//rocm:rocm_headers"
         ]),
-        copts = (copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]) +
+        copts = (copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_rocm(["-DTENSORFLOW_USE_ROCM=1"]) + if_mkl(["-DINTEL_MKL=1"]) +
                  if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) +
                  if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
         **kwargs
@@ -1459,6 +1490,9 @@ def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [
         "@local_config_cuda//cuda:cuda_headers",
         "@local_config_cuda//cuda:cudart_static",
     ]
+    rocm_deps = [
+        clean_dep("//tensorflow/core:stream_executor_headers_lib"),
+    ]
     deps = deps + tf_custom_op_library_additional_deps()
     if gpu_srcs:
         basename = name.split(".")[0]
@@ -1467,13 +1501,14 @@ def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [
             srcs = gpu_srcs,
             copts = _cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
             features = if_cuda(["-use_header_modules"]),
-            deps = deps + if_cuda(cuda_deps),
+            deps = deps + if_cuda_is_configured(cuda_deps) + if_rocm_is_configured(rocm_deps)
         )
         cuda_deps.extend([":" + basename + "_gpu"])
+        rocm_deps.extend([":" + basename + "_gpu"])
 
     check_deps(
         name = name + "_check_deps",
-        deps = deps + if_cuda(cuda_deps),
+        deps = deps + if_cuda_is_configured(cuda_deps) + if_rocm_is_configured(rocm_deps),
         disallowed_deps = [
             clean_dep("//tensorflow/core:framework"),
             clean_dep("//tensorflow/core:lib"),
@@ -1482,7 +1517,7 @@ def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [
     tf_cc_shared_object(
         name = name,
         srcs = srcs,
-        deps = deps + if_cuda(cuda_deps),
+        deps = deps + if_cuda_is_configured(cuda_deps) + if_rocm_is_configured(rocm_deps),
         data = if_static([name + "_check_deps"]),
         copts = tf_copts(is_external = True),
         features = ["windows_export_all_symbols"],
diff --git a/tensorflow/tools/ci_build/Dockerfile.rocm b/tensorflow/tools/ci_build/Dockerfile.rocm
new file mode 100644
index 0000000000..aadaa8bac1
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.rocm
@@ -0,0 +1,97 @@
+# This Dockerfile provides a starting point for a ROCm installation of 
+# MIOpen and tensorflow.  
+FROM ubuntu:xenial
+MAINTAINER Jeff Poznanovic <jeffrey.poznanovic@amd.com>
+
+ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/debian/
+ARG ROCM_PATH=/opt/rocm
+
+ENV DEBIAN_FRONTEND noninteractive
+ENV TF_NEED_ROCM 1
+ENV HOME /root/
+RUN apt update && apt install -y wget software-properties-common 
+
+# Add rocm repository
+RUN apt-get clean all
+RUN wget -qO - $DEB_ROCM_REPO/rocm.gpg.key | apt-key add -
+RUN sh -c  "echo deb [arch=amd64] $DEB_ROCM_REPO xenial main > /etc/apt/sources.list.d/rocm.list"
+
+# Install misc pkgs
+RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+  build-essential \
+  clang-3.8 \
+  clang-format-3.8 \
+  clang-tidy-3.8 \
+  cmake \
+  cmake-qt-gui \
+  ssh \
+  curl \
+  apt-utils \
+  pkg-config \
+  g++-multilib \
+  git \
+  libunwind-dev \
+  libfftw3-dev \
+  libelf-dev \
+  libncurses5-dev \
+  libpthread-stubs0-dev \
+  vim \
+  gfortran \
+  libboost-program-options-dev \
+  libssl-dev \
+  libboost-dev \
+  libboost-system-dev \
+  libboost-filesystem-dev \
+  rpm \
+  libnuma-dev \
+  virtualenv \
+  python-pip \
+  python3-pip \
+  wget && \
+  apt-get clean && \
+  rm -rf /var/lib/apt/lists/*
+
+# Install rocm pkgs
+RUN apt-get update --allow-insecure-repositories && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
+    rocm-dev rocm-libs rocm-utils \
+    rocfft miopen-hip miopengemm rocblas hipblas rocrand \
+    rocm-profiler cxlactivitylogger && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN cd ~ && git clone https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP.git
+RUN cd ~/HIP && mkdir -p build && cd build && cmake .. && make package -j && dpkg -i *.deb
+
+ENV HCC_HOME=$ROCM_PATH/hcc
+ENV HIP_PATH=$ROCM_PATH/hip
+ENV OPENCL_ROOT=$ROCM_PATH/opencl
+ENV PATH="$HCC_HOME/bin:$HIP_PATH/bin:${PATH}"
+ENV PATH="$ROCM_PATH/bin:${PATH}"
+ENV PATH="$OPENCL_ROOT/bin:${PATH}"
+
+# Add target file to help determine which device(s) to build for
+RUN echo -e "gfx803\ngfx900" >> /opt/rocm/bin/target.lst
+
+# Setup environment variables, and add those environment variables at the end of ~/.bashrc 
+ARG HCC_HOME=/opt/rocm/hcc
+ARG HIP_PATH=/opt/rocm/hip
+ARG PATH=$HCC_HOME/bin:$HIP_PATH/bin:$PATH
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+ARG DEBIAN_FRONTEND=noninteractive
+RUN /install/install_bootstrap_deb_packages.sh
+RUN add-apt-repository -y ppa:openjdk-r/ppa && \
+    add-apt-repository -y ppa:george-edison55/cmake-3.x
+RUN /install/install_deb_packages.sh
+RUN /install/install_pip_packages.sh
+RUN /install/install_bazel.sh
+RUN /install/install_golang.sh
+
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
+
+# Configure the build for our CUDA configuration.
+ENV TF_NEED_ROCM 1
+
diff --git a/tensorflow/tools/ci_build/builds/docker_test.sh b/tensorflow/tools/ci_build/builds/docker_test.sh
index e337ea4b05..38891b60e5 100755
--- a/tensorflow/tools/ci_build/builds/docker_test.sh
+++ b/tensorflow/tools/ci_build/builds/docker_test.sh
@@ -19,7 +19,7 @@
 #
 # Usage: docker_test.sh <IMAGE_TYPE> <TAG> <WHL_PATH>
 # Arguments:
-#   IMAGE_TYPE : Type of the image: (CPU|GPU)
+#   IMAGE_TYPE : Type of the image: (CPU|GPU|ROCM)
 #   TAG        : Docker image tag
 #   WHL_PATH   : Path to the whl file to be installed inside the docker image
 #
@@ -60,6 +60,8 @@ if [[ "${IMAGE_TYPE}" == "cpu" ]]; then
   DOCKERFILE="tensorflow/tools/docker/Dockerfile"
 elif [[ "${IMAGE_TYPE}" == "gpu" ]]; then
   DOCKERFILE="tensorflow/tools/docker/Dockerfile.gpu"
+elif [[ "${IMAGE_TYPE}" == "rocm" ]]; then
+  DOCKERFILE="tensorflow/tools/docker/Dockerfile.rocm"
 else
   die "Unrecognized image type: $1"
 fi
@@ -106,13 +108,16 @@ if [ "${IMAGE_TYPE}" == "gpu" ]; then
   devices=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
   libs=$(\ls /usr/lib/x86_64-linux-gnu/libcuda.* | xargs -I{} echo '-v {}:{}')
   GPU_EXTRA_PARAMS="${devices} ${libs}"
+elif [ "${IMAGE_TYPE}" == "rocm" ]; then
+  ROCM_EXTRA_PARAMS="--device=/dev/kfd --device=/dev/dri --group-add video"
 else
   GPU_EXTRA_PARAMS=""
+  ROCM_EXTRA_PARAMS=""
 fi
 
 # Run docker image with source directory mapped
 docker run -v ${BASE_DIR}:/tensorflow-src -w /tensorflow-src \
-${GPU_EXTRA_PARAMS} \
+${GPU_EXTRA_PARAMS} ${ROCM_EXTRA_PARAMS} \
 "${DOCKER_IMG_TAG}" \
 /bin/bash -c "tensorflow/tools/ci_build/builds/run_pip_tests.sh && "\
 "tensorflow/tools/ci_build/builds/test_tutorials.sh && "\
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index fef121ab5a..6543779022 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -132,6 +132,7 @@ echo "Using Bazel flags: ${BAZEL_FLAGS}"
 PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package"
 GPU_FLAG=""
 if [[ ${CONTAINER_TYPE} == "cpu" ]] || \
+   [[ ${CONTAINER_TYPE} == "rocm" ]] || \
    [[ ${CONTAINER_TYPE} == "debian.jessie.cpu" ]]; then
   bazel build ${BAZEL_FLAGS} ${PIP_BUILD_TARGET} || \
       die "Build failed."
@@ -255,7 +256,8 @@ if [[ $(uname) == "Linux" ]]; then
       die "ERROR: Cannot find repaired wheel."
     fi
   # Copy and rename for gpu manylinux as we do not want auditwheel to package in libcudart.so
-  elif [[ ${CONTAINER_TYPE} == "gpu" ]]; then
+  elif [[ ${CONTAINER_TYPE} == "gpu" ]] || \
+       [[ ${CONTAINER_TYPE} == "rocm" ]]; then
     WHL_PATH=${AUDITED_WHL_NAME}
     cp ${WHL_DIR}/${WHL_BASE_NAME} ${WHL_PATH}
     echo "Copied manylinx1 wheel file at ${WHL_PATH}"
diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user
index b216e3549f..1cc5aed15d 100755
--- a/tensorflow/tools/ci_build/builds/with_the_same_user
+++ b/tensorflow/tools/ci_build/builds/with_the_same_user
@@ -48,6 +48,12 @@ getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \
 usermod -a -G sudo "${CI_BUILD_USER}"
 echo "${CI_BUILD_USER} ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-nopasswd-sudo
 
+if [[ "${TF_NEED_ROCM}" -eq 1 ]]; then
+  # ROCm requires the video group in order to use the GPU for compute. If it
+  # exists on the host, add it to the container.
+  getent group video || addgroup video && adduser "${CI_BUILD_USER}" video
+fi
+
 if [ -e /root/.bazelrc ]; then
   cp /root/.bazelrc "${CI_BUILD_HOME}/.bazelrc"
   chown "${CI_BUILD_UID}:${CI_BUILD_GID}" "${CI_BUILD_HOME}/.bazelrc"
diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh
index 77265e0f50..eab0616513 100755
--- a/tensorflow/tools/ci_build/ci_build.sh
+++ b/tensorflow/tools/ci_build/ci_build.sh
@@ -18,7 +18,7 @@
 #                    <COMMAND>
 #
 # CONTAINER_TYPE: Type of the docker container used the run the build:
-#                 e.g., (cpu | gpu | android | tensorboard)
+#                 e.g., (cpu | gpu | rocm | android | tensorboard)
 #
 # DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build.
 #                  If this optional value is not supplied (via the
@@ -103,6 +103,14 @@ if [[ "${CONTAINER_TYPE}" != gpu* ]]; then
   GPU_EXTRA_PARAMS=""
 fi
 
+# Add extra params for rocm devices and libraries for ROCm container.
+if [[ "${CONTAINER_TYPE}" == "rocm" ]]; then
+  ROCM_EXTRA_PARAMS="--device=/dev/kfd --device=/dev/dri --group-add video"
+else
+  ROCM_EXTRA_PARAMS=""
+fi
+
+
 # Determine the docker image name
 DOCKER_IMG_NAME="${BUILD_TAG}.${CONTAINER_TYPE}"
 
@@ -159,6 +167,7 @@ ${DOCKER_BINARY} run --rm --pid=host \
     -v ${WORKSPACE}:/workspace \
     -w /workspace \
     ${GPU_EXTRA_PARAMS} \
+    ${ROCM_EXTRA_PARAMS} \
     ${CI_DOCKER_EXTRA_PARAMS[@]} \
     "${DOCKER_IMG_NAME}" \
     ${CI_COMMAND_PREFIX[@]} \
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh
index 8eeddcdb82..3b5c92d148 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh
@@ -26,6 +26,7 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export TF_NEED_ROCM=0
 export CC_OPT_FLAGS='-mavx'
 # Only running cc tests, python version does not matter.
 export PYTHON_BIN_PATH=`which python`
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
index 8eca1987f0..52eff6330f 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
@@ -26,6 +26,7 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export TF_NEED_ROCM=0
 export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=`which python2`
 yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
index f6fa9251d4..d12027599a 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
@@ -26,6 +26,7 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export TF_NEED_ROCM=0
 export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=`which python3`
 yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
index 51eb2cd7e6..7c531a4d68 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
@@ -26,6 +26,7 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export TF_NEED_ROCM=0
 export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=`which python3`
 yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/ci_build/linux/libtensorflow.sh b/tensorflow/tools/ci_build/linux/libtensorflow.sh
index beef8e063b..3b6e15feb9 100755
--- a/tensorflow/tools/ci_build/linux/libtensorflow.sh
+++ b/tensorflow/tools/ci_build/linux/libtensorflow.sh
@@ -27,5 +27,8 @@ SUFFIX="-cpu-linux-"
 if [ "${TF_NEED_CUDA}" == "1" ]; then
   SUFFIX="-gpu-linux-"
 fi
+if [ "${TF_NEED_ROCM}" == "1" ]; then
+  SUFFIX="-rocm-linux-"
+fi
 
 build_libtensorflow_tarball "${SUFFIX}$(uname -m)"
diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/linux/libtensorflow_cpu.sh
index 4bf34dd299..b76262b6e9 100755
--- a/tensorflow/tools/ci_build/linux/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/linux/libtensorflow_cpu.sh
@@ -19,4 +19,5 @@
 set -ex
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 export TF_NEED_CUDA=0
+export TF_NEED_ROCM=0
 "${SCRIPT_DIR}/libtensorflow_docker.sh"
diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
index 60c974c36b..467b8dc808 100755
--- a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
+++ b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
@@ -38,6 +38,11 @@ if [ "${TF_NEED_CUDA}" == "1" ]; then
   DOCKER_BINARY="nvidia-docker"
   DOCKER_FILE="Dockerfile.gpu"
 fi
+if [ "${TF_NEED_ROCM}" == "1" ]; then
+  DOCKER_IMAGE="tf-tensorflow-rocm"
+  DOCKER_BINARY="docker"
+  DOCKER_FILE="Dockerfile.rocm"
+fi
 
 docker build \
   -t "${DOCKER_IMAGE}" \
@@ -53,6 +58,7 @@ ${DOCKER_BINARY} run \
   -e "TF_NEED_HDFS=0" \
   -e "TF_NEED_CUDA=${TF_NEED_CUDA}" \
   -e "TF_NEED_TENSORRT=${TF_NEED_CUDA}" \
+  -e "TF_NEED_ROCM=${TF_NEED_ROCM}" \
   -e "TF_NEED_OPENCL_SYCL=0" \
   "${DOCKER_IMAGE}" \
   "/workspace/tensorflow/tools/ci_build/linux/libtensorflow.sh"
diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_rocm.sh b/tensorflow/tools/ci_build/linux/libtensorflow_rocm.sh
new file mode 100755
index 0000000000..c1ebbe3630
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/libtensorflow_rocm.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Script to build a binary releases of libtensorflow with GPU support.
+
+set -ex
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+export TF_NEED_ROCM=1
+"${SCRIPT_DIR}/libtensorflow_docker.sh"
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh
new file mode 100755
index 0000000000..200089f90e
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+
+echo ""
+echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo ""
+
+# Run configure.
+export PYTHON_BIN_PATH=`which python3`
+export CC_OPT_FLAGS='-mavx'
+
+export TF_NEED_ROCM=1
+
+yes "" | $PYTHON_BIN_PATH configure.py
+
+# Run bazel test command. Double test timeouts to avoid flakes.
+bazel test --config=rocm --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
+    --test_lang_filters=cc --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
+    --build_tests_only --test_output=errors --local_test_jobs=1 --config=opt \
+    //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
new file mode 100755
index 0000000000..1d0b838c1b
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+
+echo ""
+echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo ""
+
+# Run configure.
+export PYTHON_BIN_PATH=`which python3`
+export CC_OPT_FLAGS='-mavx'
+
+export TF_NEED_ROCM=1
+
+yes "" | $PYTHON_BIN_PATH configure.py
+
+# Run bazel test command. Double test timeouts to avoid flakes.
+bazel test --config=rocm --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
+    --test_lang_filters=py --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
+    --build_tests_only --test_output=errors --local_test_jobs=1 --config=opt \
+    //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
index c7cc16e669..adee0d3171 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -27,6 +27,7 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export TF_NEED_ROCM=0
 export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
index 9ae5fc6bea..06798adc03 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
@@ -26,6 +26,7 @@ source "${SCRIPT_DIR}/../builds/libtensorflow.sh"
 export PYTHON_BIN_PATH="/usr/bin/python"
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
+export TF_NEED_ROCM=0
 export TF_NEED_OPENCL_SYCL=0
 export TF_NEED_MKL=0
 export COMPUTECPP_PATH="/usr/local"
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
index d95fcdeb85..95f1992d7d 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
@@ -27,6 +27,7 @@ export TF_NEED_CUDA=1
 export LD_LIBRARY_PATH="/usr/local/cuda/lib:/usr/local/cuda/extras/CUPTI/lib:${LD_LIBRARY_PATH}"
 export PYTHON_BIN_PATH="/usr/bin/python"
 export TF_NEED_HDFS=0
+export TF_NEED_ROCM=0
 export TF_NEED_OPENCL_SYCL=0
 export TF_NEED_MKL=0
 export COMPUTECPP_PATH="/usr/local"
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_rocm.sh b/tensorflow/tools/ci_build/osx/libtensorflow_rocm.sh
new file mode 100755
index 0000000000..aeabc0e39e
--- /dev/null
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_rocm.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Script to produce binary release of libtensorflow (C API, Java jars etc.).
+
+set -ex
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# See comments at the top of this file for details.
+source "${SCRIPT_DIR}/../builds/libtensorflow.sh"
+
+# Configure script
+export TF_NEED_ROCM=1
+export PYTHON_BIN_PATH="/usr/bin/python"
+export TF_NEED_GCP=0
+export TF_NEED_HDFS=0
+export TF_NEED_CUDA=0
+export TF_NEED_OPENCL_SYCL=0
+export TF_NEED_MKL=0
+export COMPUTECPP_PATH="/usr/local"
+
+export PATH="/usr/local/cuda/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
+build_libtensorflow_tarball "-gpu-darwin-$(uname -m)"
diff --git a/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh
new file mode 100755
index 0000000000..a0de128020
--- /dev/null
+++ b/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+
+echo ""
+echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo ""
+
+# Run configure.
+export PYTHON_BIN_PATH=`which python3`
+
+export TF_NEED_ROCM=1
+
+yes "" | $PYTHON_BIN_PATH configure.py
+echo "build --distinct_host_configuration=false" >> .tf_configure.bazelrc
+
+bazel clean
+# Run bazel test command. Double test timeouts to avoid flakes.
+bazel test --config=rocm --test_tag_filters=-no_gpu,-benchmark-test,-no_oss -k \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
+    --build_tests_only --test_output=errors --local_test_jobs=1 \
+    --config=xla -- \
+    //tensorflow/compiler/...
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 1e7c5d6790..87d1243563 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -1,6 +1,7 @@
 # TensorFlow external dependencies that can be loaded in WORKSPACE files.
 
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
+load("//third_party/gpus:rocm_configure.bzl", "rocm_configure")
 load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure")
 load("//third_party:nccl/nccl_configure.bzl", "nccl_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
@@ -43,6 +44,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     sycl_configure(name = "local_config_sycl")
     syslibs_configure(name = "local_config_syslibs")
     python_configure(name = "local_config_python")
+    rocm_configure(name="local_config_rocm")
 
     initialize_third_party()
 
diff --git a/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl b/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
new file mode 100644
index 0000000000..0e175b3ef6
--- /dev/null
+++ b/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
@@ -0,0 +1,158 @@
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+default_toolchain {
+  cpu: "k8"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "piii"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "arm"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "ppc"
+  toolchain_identifier: "local_linux"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  builtin_sysroot: ""
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  supports_gold_linker: false
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: false
+  supports_thin_archives: false
+  target_libc: "local"
+  target_cpu: "local"
+  target_system_name: "local"
+  toolchain_identifier: "local_linux"
+
+  tool_path { name: "ar" path: "/usr/bin/ar" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  # As part of the TensorFlow release, we place some ROCm-related compilation
+  # files in @local_config_rocm//crosstool/clang/bin, and this relative
+  # path, combined with the rest of our Bazel configuration causes our
+  # compilation to use those files.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_rocm" }
+  # Use "-std=c++11" for hipcc. For consistency, force both the host compiler
+  # and the device compiler to use "-std=c++11".
+  cxx_flag: "-std=c++11"
+  linker_flag: "-Wl,-no-as-needed"
+  linker_flag: "-lstdc++"
+  #linker_flag: "-B/usr/bin/"
+  linker_flag: "-B/opt/rocm/hcc/compiler/bin"
+
+%{host_compiler_includes}
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+
+  # C(++) compiles invoke the compiler (as that is the one knowing where
+  # to find libraries), but we provide LD so other rules can invoke the linker.
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Anticipated future default.
+  unfiltered_cxx_flag: "-no-canonical-prefixes"
+
+  # Make C++ compilation deterministic. Use linkstamping instead of these
+  # compiler symbols.
+  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+  unfiltered_cxx_flag: "-D__HIP_PLATFORM_HCC__"
+  # The macro EIGEN_USE_HIP is used to tell Eigen to use the HIP platform headers
+  # It needs to be always set when compiling Eigen headers
+  # (irrespective of whether the source file is being compiled via HIPCC)
+  # so adding -DEIGEN_USE_HIP as a default CXX flag here
+  unfiltered_cxx_flag: "-DEIGEN_USE_HIP"
+
+    
+  # Security hardening on by default.
+  # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+  # We need to undef it before redefining it as some distributions now have
+  # it enabled by default.
+  #compiler_flag: "-U_FORTIFY_SOURCE"
+  #compiler_flag: "-D_FORTIFY_SOURCE=1"
+  #compiler_flag: "-fstack-protector"
+  #compiler_flag: "-fPIE"
+  #linker_flag: "-pie"
+  #linker_flag: "-Wl,-z,relro,-z,now"
+
+  # Enable coloring even if there's no attached terminal. Bazel removes the
+  # escape sequences if --nocolor is specified. This isn't supported by gcc
+  # on Ubuntu 14.04.
+  # compiler_flag: "-fcolor-diagnostics"
+
+  # All warnings are enabled. Maybe enable -Werror as well?
+  compiler_flag: "-Wall"
+  # Enable a few more warnings that aren't part of -Wall.
+  compiler_flag: "-Wunused-but-set-parameter"
+  # But disable some that are problematic.
+  compiler_flag: "-Wno-free-nonheap-object" # has false positives
+
+  # Keep stack frames for debugging, even in opt mode.
+  compiler_flag: "-fno-omit-frame-pointer"
+
+  # Anticipated future default.
+  linker_flag: "-no-canonical-prefixes"
+  unfiltered_cxx_flag: "-fno-canonical-system-headers"
+  # Have gcc return the exit code from ld.
+  linker_flag: "-pass-exit-codes"
+  # Stamp the binary with a unique identifier.
+  linker_flag: "-Wl,--build-id=md5"
+  linker_flag: "-Wl,--hash-style=gnu"
+  # Gold linker only? Can we enable this by default?
+  # linker_flag: "-Wl,--warn-execstack"
+  # linker_flag: "-Wl,--detect-odr-violations"
+
+  # Include directory for ROCm headers.
+%{rocm_include_path}
+
+  compilation_mode_flags {
+    mode: DBG
+    # Enable debug symbols.
+    compiler_flag: "-g"
+  }
+  compilation_mode_flags {
+    mode: OPT
+
+    # No debug symbols.
+    # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or
+    # even generally? However, that can't happen here, as it requires special
+    # handling in Bazel.
+    compiler_flag: "-g0"
+
+    # Conservative choice for -O
+    # -O3 can increase binary size and even slow down the resulting binaries.
+    # Profile first and / or use FDO if you need better performance than this.
+    compiler_flag: "-O2"
+
+    # Disable assertions
+    compiler_flag: "-DNDEBUG"
+
+    # Removal of unused code and data at link time (can this increase binary size in some cases?).
+    compiler_flag: "-ffunction-sections"
+    compiler_flag: "-fdata-sections"
+    linker_flag: "-Wl,--gc-sections"
+  }
+  linking_mode_flags { mode: DYNAMIC }
+}
diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
new file mode 100755
index 0000000000..824238022b
--- /dev/null
+++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
@@ -0,0 +1,241 @@
+#!/usr/bin/env python
+"""Crosstool wrapper for compiling ROCm programs.
+
+SYNOPSIS:
+  crosstool_wrapper_driver_rocm [options passed in by cc_library()
+                                or cc_binary() rule]
+
+DESCRIPTION:
+  This script is expected to be called by the cc_library() or cc_binary() bazel
+  rules. When the option "-x rocm" is present in the list of arguments passed
+  to this script, it invokes the hipcc compiler. Most arguments are passed
+  as is as a string to --compiler-options of hipcc. When "-x rocm" is not
+  present, this wrapper invokes gcc with the input arguments as is.
+"""
+
+from __future__ import print_function
+
+__author__ = 'whchung@gmail.com (Wen-Heng (Jack) Chung)'
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by rocm_configure.bzl.
+CPU_COMPILER = ('%{cpu_compiler}')
+GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}')
+
+HIPCC_PATH = '%{hipcc_path}'
+PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from the argv list.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    option: The option whose value to extract, without the leading '-'.
+
+  Returns:
+    A list of values, either directly following the option,
+    (eg., -opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., -opt val1 -opt val2).
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-' + option, nargs='*', action='append')
+  args, _ = parser.parse_known_args(argv)
+  if not args or not vars(args)[option]:
+    return []
+  else:
+    return sum(vars(args)[option], [])
+
+
+def GetHostCompilerOptions(argv):
+  """Collect the -isystem, -iquote, and --sysroot option values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be used as the --compiler-options to hipcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-isystem', nargs='*', action='append')
+  parser.add_argument('-iquote', nargs='*', action='append')
+  parser.add_argument('--sysroot', nargs=1)
+  parser.add_argument('-g', nargs='*', action='append')
+  parser.add_argument('-fno-canonical-system-headers', action='store_true')
+
+  args, _ = parser.parse_known_args(argv)
+
+  opts = ''
+
+  if args.isystem:
+    opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, []))
+  if args.iquote:
+    opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))
+  if args.g:
+    opts += ' -g' + ' -g'.join(sum(args.g, []))
+  #if args.fno_canonical_system_headers:
+  #  opts += ' -fno-canonical-system-headers'
+  if args.sysroot:
+    opts += ' --sysroot ' + args.sysroot[0]
+
+  return opts
+
+def GetHipccOptions(argv):
+  """Collect the -hipcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be passed directly to hipcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-hipcc_options', nargs='*', action='append')
+
+  args, _ = parser.parse_known_args(argv)
+
+  if args.hipcc_options:
+    options = _update_options(sum(args.hipcc_options, []))
+    return ' '.join(['--'+a for a in options])
+  return ''
+
+
+def InvokeHipcc(argv, log=False):
+  """Call hipcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('hipcc ' + args)
+  """
+
+  host_compiler_options = GetHostCompilerOptions(argv)
+  hipcc_compiler_options = GetHipccOptions(argv)
+  opt_option = GetOptionValue(argv, 'O')
+  m_options = GetOptionValue(argv, 'm')
+  m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']])
+  include_options = GetOptionValue(argv, 'I')
+  out_file = GetOptionValue(argv, 'o')
+  depfiles = GetOptionValue(argv, 'MF')
+  defines = GetOptionValue(argv, 'D')
+  defines = ''.join([' -D' + define for define in defines])
+  undefines = GetOptionValue(argv, 'U')
+  undefines = ''.join([' -U' + define for define in undefines])
+  std_options = GetOptionValue(argv, 'std')
+  hipcc_allowed_std_options = ["c++11"]
+  std_options = ''.join([' -std=' + define
+      for define in std_options if define in hipcc_allowed_std_options])
+
+  # The list of source files get passed after the -c option. I don't know of
+  # any other reliable way to just get the list of source files to be compiled.
+  src_files = GetOptionValue(argv, 'c')
+
+  if len(src_files) == 0:
+    return 1
+  if len(out_file) != 1:
+    return 1
+
+  opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0)
+         else ' -g')
+
+  includes = (' -I ' + ' -I '.join(include_options)
+              if len(include_options) > 0
+              else '')
+
+  # Unfortunately, there are other options that have -c prefix too.
+  # So allowing only those look like C/C++ files.
+  src_files = [f for f in src_files if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  srcs = ' '.join(src_files)
+  out = ' -o ' + out_file[0]
+
+  hipccopts = ' '
+  hipccopts += ' ' + hipcc_compiler_options
+  hipccopts += undefines
+  hipccopts += defines
+  hipccopts += std_options
+  hipccopts += m_options
+
+  if depfiles:
+    # Generate the dependency file
+    depfile = depfiles[0]
+    cmd = (HIPCC_PATH + ' ' + hipccopts +
+           host_compiler_options +
+           ' ' + GCC_HOST_COMPILER_PATH +
+           ' -I .' + includes + ' ' + srcs + ' -M -o ' + depfile)
+    if log: Log(cmd)
+    exit_status = os.system(cmd)
+    if exit_status != 0:
+      return exit_status
+
+  cmd = (HIPCC_PATH + ' ' + hipccopts +
+         host_compiler_options + ' -fPIC' +
+         ' ' + GCC_HOST_COMPILER_PATH +
+         ' -I .' + opt + includes + ' -c ' + srcs + out)
+
+  # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
+  # Need to investigate and fix.
+  cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd
+  if log: Log(cmd)
+  return os.system(cmd)
+
+
+def main():
+  # ignore PWD env var
+  os.environ['PWD']=''
+
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--rocm_log', action='store_true')
+  parser.add_argument('-pass-exit-codes', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'rocm':
+    if args.rocm_log: Log('-x rocm')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.rocm_log: Log('using hipcc')
+    return InvokeHipcc(leftover, log=args.rocm_log)
+
+  # XXX use hipcc to link
+  if args.pass_exit_codes:
+    gpu_compiler_flags = [flag for flag in sys.argv[1:]
+                               if not flag.startswith(('-pass-exit-codes'))]
+
+    # special handling for $ORIGIN
+    # - guard every argument with ''
+    modified_gpu_compiler_flags = []
+    for flag in gpu_compiler_flags:
+      modified_gpu_compiler_flags.append("'" + flag + "'")
+
+    if args.rocm_log: Log('Link with hipcc: %s' % (' '.join([HIPCC_PATH] + modified_gpu_compiler_flags)))
+    return subprocess.call([HIPCC_PATH] + modified_gpu_compiler_flags)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x rocm. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--rocm_log'))]
+
+  # XXX: SE codes need to be built with gcc, but need this macro defined
+  cpu_compiler_flags.append("-D__HIP_PLATFORM_HCC__")
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/gpus/rocm/BUILD b/third_party/gpus/rocm/BUILD
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/third_party/gpus/rocm/BUILD.tpl b/third_party/gpus/rocm/BUILD.tpl
new file mode 100644
index 0000000000..8258bb3589
--- /dev/null
+++ b/third_party/gpus/rocm/BUILD.tpl
@@ -0,0 +1,99 @@
+licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
+
+package(default_visibility = ["//visibility:public"])
+
+config_setting(
+    name = "using_hipcc",
+    values = {
+        "define": "using_rocm_hipcc=true",
+    },
+)
+
+cc_library(
+    name = "rocm_headers",
+    hdrs = [
+        "rocm/rocm_config.h",
+        %{rocm_headers}
+    ],
+    includes = [
+        ".",
+        "rocm/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "hip",
+    srcs = ["rocm/lib/%{hip_lib}"],
+    data = ["rocm/lib/%{hip_lib}"],
+    includes = [
+        ".",
+        "rocm/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "rocblas",
+    srcs = ["rocm/lib/%{rocblas_lib}"],
+    data = ["rocm/lib/%{rocblas_lib}"],
+    includes = [
+        ".",
+        "rocm/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "rocfft",
+    srcs = ["rocm/lib/%{rocfft_lib}"],
+    data = ["rocm/lib/%{rocfft_lib}"],
+    includes = [
+        ".",
+        "rocm/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "hiprand",
+    srcs = ["rocm/lib/%{hiprand_lib}"],
+    data = ["rocm/lib/%{hiprand_lib}"],
+    includes = [
+        ".",
+        "rocm/include",
+        "rocm/include/rocrand",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "miopen",
+    srcs = ["rocm/lib/%{miopen_lib}"],
+    data = ["rocm/lib/%{miopen_lib}"],
+    includes = [
+        ".",
+        "rocm/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "rocm",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":rocm_headers",
+        ":hip",
+        ":rocblas",
+        ":rocfft",
+        ":hiprand",
+        ":miopen",
+    ],
+)
+
+%{rocm_include_genrules}
diff --git a/third_party/gpus/rocm/build_defs.bzl.tpl b/third_party/gpus/rocm/build_defs.bzl.tpl
new file mode 100644
index 0000000000..306f57551f
--- /dev/null
+++ b/third_party/gpus/rocm/build_defs.bzl.tpl
@@ -0,0 +1,32 @@
+# Macros for building ROCm code.
+def if_rocm(if_true, if_false = []):
+    """Shorthand for select()'ing on whether we're building with ROCm.
+
+    Returns a select statement which evaluates to if_true if we're building
+    with ROCm enabled.  Otherwise, the select statement evaluates to if_false.
+
+    """
+    return select({
+        "@local_config_rocm//rocm:using_hipcc": if_true,
+        "//conditions:default": if_false
+    })
+
+
+def rocm_default_copts():
+    """Default options for all ROCm compilations."""
+    return if_rocm(["-x", "rocm"] + %{rocm_extra_copts})
+
+
+def rocm_is_configured():
+    """Returns true if ROCm was enabled during the configure process."""
+    return %{rocm_is_configured}
+
+def if_rocm_is_configured(x):
+    """Tests if the ROCm was enabled during the configure process.
+
+    Unlike if_rocm(), this does not require that we are building with
+    --config=rocm. Used to allow non-ROCm code to depend on ROCm libraries.
+    """
+    if rocm_is_configured():
+      return x
+    return []
diff --git a/third_party/gpus/rocm/rocm_config.h.tpl b/third_party/gpus/rocm/rocm_config.h.tpl
new file mode 100644
index 0000000000..c5f25a845c
--- /dev/null
+++ b/third_party/gpus/rocm/rocm_config.h.tpl
@@ -0,0 +1,21 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef ROCM_ROCM_CONFIG_H_
+#define ROCM_ROCM_CONFIG_H_
+
+#define TF_ROCM_TOOLKIT_PATH "/opt/rocm"
+
+#endif  // ROCM_ROCM_CONFIG_H_
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
new file mode 100644
index 0000000000..9371e33f97
--- /dev/null
+++ b/third_party/gpus/rocm_configure.bzl
@@ -0,0 +1,663 @@
+# -*- Python -*-
+"""Repository rule for ROCm autoconfiguration.
+
+`rocm_configure` depends on the following environment variables:
+
+  * `TF_NEED_ROCM`: Whether to enable building with ROCm.
+  * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path
+  * `ROCM_TOOLKIT_PATH`: The path to the ROCm toolkit. Default is
+    `/opt/rocm`.
+  * `TF_ROCM_VERSION`: The version of the ROCm toolkit. If this is blank, then
+    use the system default.
+  * `TF_MIOPEN_VERSION`: The version of the MIOpen library.
+  * `TF_ROCM_AMDGPU_TARGETS`: The AMDGPU targets. Default is
+    `gfx803,gfx900`.
+"""
+
+_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
+_ROCM_TOOLKIT_PATH = "ROCM_TOOLKIT_PATH"
+_TF_ROCM_VERSION = "TF_ROCM_VERSION"
+_TF_MIOPEN_VERSION = "TF_MIOPEN_VERSION"
+_TF_ROCM_AMDGPU_TARGETS = "TF_ROCM_AMDGPU_TARGETS"
+_TF_ROCM_CONFIG_REPO = "TF_ROCM_CONFIG_REPO"
+
+_DEFAULT_ROCM_VERSION = ""
+_DEFAULT_MIOPEN_VERSION = ""
+_DEFAULT_ROCM_TOOLKIT_PATH = "/opt/rocm"
+_DEFAULT_ROCM_AMDGPU_TARGETS = ["gfx803", "gfx900"]
+
+def find_cc(repository_ctx):
+  """Find the C++ compiler."""
+  # Return a dummy value for GCC detection here to avoid error
+  target_cc_name = "gcc"
+  cc_path_envvar = _GCC_HOST_COMPILER_PATH
+  cc_name = target_cc_name
+
+  if cc_path_envvar in repository_ctx.os.environ:
+    cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip()
+    if cc_name_from_env:
+      cc_name = cc_name_from_env
+  if cc_name.startswith("/"):
+    # Absolute path, maybe we should make this supported by our which function.
+    return cc_name
+  cc = repository_ctx.which(cc_name)
+  if cc == None:
+    fail(("Cannot find {}, either correct your path or set the {}" +
+          " environment variable").format(target_cc_name, cc_path_envvar))
+  return cc
+
+_INC_DIR_MARKER_BEGIN = "#include <...>"
+
+def _cxx_inc_convert(path):
+  """Convert path returned by cc -E xc++ in a complete path."""
+  path = path.strip()
+  return path
+
+def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp):
+  """Compute the list of default C or C++ include directories."""
+  if lang_is_cpp:
+    lang = "c++"
+  else:
+    lang = "c"
+  # TODO: We pass -no-canonical-prefixes here to match the compiler flags,
+  #       but in rocm_clang CROSSTOOL file that is a `feature` and we should
+  #       handle the case when it's disabled and no flag is passed
+  result = repository_ctx.execute([cc, "-no-canonical-prefixes",
+                                   "-E", "-x" + lang, "-", "-v"])
+  index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN)
+  if index1 == -1:
+    return []
+  index1 = result.stderr.find("\n", index1)
+  if index1 == -1:
+    return []
+  index2 = result.stderr.rfind("\n ")
+  if index2 == -1 or index2 < index1:
+    return []
+  index2 = result.stderr.find("\n", index2 + 1)
+  if index2 == -1:
+    inc_dirs = result.stderr[index1 + 1:]
+  else:
+    inc_dirs = result.stderr[index1 + 1:index2].strip()
+
+  return [str(repository_ctx.path(_cxx_inc_convert(p)))
+          for p in inc_dirs.split("\n")]
+
+def get_cxx_inc_directories(repository_ctx, cc):
+  """Compute the list of default C and C++ include directories."""
+  # For some reason `clang -xc` sometimes returns include paths that are
+  # different from the ones from `clang -xc++`. (Symlink and a dir)
+  # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists
+  includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True)
+  includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False)
+
+  includes_cpp_set = depset(includes_cpp)
+  return includes_cpp + [inc for inc in includes_c
+                         if inc not in includes_cpp_set]
+
+def auto_configure_fail(msg):
+  """Output failure message when rocm configuration fails."""
+  red = "\033[0;31m"
+  no_color = "\033[0m"
+  fail("\n%sROCm Configuration Error:%s %s\n" % (red, no_color, msg))
+# END cc_configure common functions (see TODO above).
+
+def _host_compiler_includes(repository_ctx, cc):
+  """Generates the cxx_builtin_include_directory entries for gcc inc dirs.
+
+  Args:
+    repository_ctx: The repository context.
+    cc: The path to the gcc host compiler.
+
+  Returns:
+    A string containing the cxx_builtin_include_directory for each of the gcc
+    host compiler include directories, which can be added to the CROSSTOOL
+    file.
+  """
+  inc_dirs = get_cxx_inc_directories(repository_ctx, cc)
+
+  # Add numpy headers
+  inc_dirs.append("/usr/lib/python2.7/dist-packages/numpy/core/include")
+
+  entries = []
+  for inc_dir in inc_dirs:
+    entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
+
+  # define TENSORFLOW_USE_ROCM
+  entries.append("  unfiltered_cxx_flag: \"-DTENSORFLOW_USE_ROCM\"")
+
+  return "\n".join(entries)
+
+def _rocm_include_path(repository_ctx, rocm_config):
+  """Generates the cxx_builtin_include_directory entries for rocm inc dirs.
+
+  Args:
+    repository_ctx: The repository context.
+    cc: The path to the gcc host compiler.
+
+  Returns:
+    A string containing the cxx_builtin_include_directory for each of the gcc
+    host compiler include directories, which can be added to the CROSSTOOL
+    file.
+  """
+  inc_dirs = []
+
+  # general ROCm include path
+  inc_dirs.append(rocm_config.rocm_toolkit_path + '/include')
+
+  # Add HSA headers
+  inc_dirs.append("/opt/rocm/hsa/include")
+
+  # Add HIP headers
+  inc_dirs.append("/opt/rocm/include/hip")
+  inc_dirs.append("/opt/rocm/include/hip/hcc_detail")
+
+  # Add rocrand and hiprand headers
+  inc_dirs.append("/opt/rocm/rocrand/include")
+  inc_dirs.append("/opt/rocm/hiprand/include")
+
+  # Add rocfft headers
+  inc_dirs.append("/opt/rocm/rocfft/include")
+
+  # Add rocBLAS headers
+  inc_dirs.append("/opt/rocm/rocblas/include")
+
+  # Add MIOpen headers
+  inc_dirs.append("/opt/rocm/miopen/include")
+
+  # Add hcc headers
+  inc_dirs.append("/opt/rocm/hcc/include")
+  inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/7.0.0/include/")
+  inc_dirs.append("/opt/rocm/hcc/lib/clang/7.0.0/include")
+  # Newer hcc builds use/are based off of clang 8.0.0.
+  inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/8.0.0/include/")
+  inc_dirs.append("/opt/rocm/hcc/lib/clang/8.0.0/include")
+
+  inc_entries = []
+  for inc_dir in inc_dirs:
+    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
+  return "\n".join(inc_entries)
+
+def _enable_rocm(repository_ctx):
+  if "TF_NEED_ROCM" in repository_ctx.os.environ:
+    enable_rocm = repository_ctx.os.environ["TF_NEED_ROCM"].strip()
+    return enable_rocm == "1"
+  return False
+
+def _rocm_toolkit_path(repository_ctx):
+  """Finds the rocm toolkit directory.
+
+  Args:
+    repository_ctx: The repository context.
+
+  Returns:
+    A speculative real path of the rocm toolkit install directory.
+  """
+  rocm_toolkit_path = _DEFAULT_ROCM_TOOLKIT_PATH
+  if _ROCM_TOOLKIT_PATH in repository_ctx.os.environ:
+    rocm_toolkit_path = repository_ctx.os.environ[_ROCM_TOOLKIT_PATH].strip()
+  if not repository_ctx.path(rocm_toolkit_path).exists:
+    auto_configure_fail("Cannot find rocm toolkit path.")
+  return str(repository_ctx.path(rocm_toolkit_path).realpath)
+
+def _amdgpu_targets(repository_ctx):
+  """Returns a list of strings representing AMDGPU targets."""
+  if _TF_ROCM_AMDGPU_TARGETS not in repository_ctx.os.environ:
+    return _DEFAULT_ROCM_AMDGPU_TARGETS
+  amdgpu_targets_str = repository_ctx.os.environ[_TF_ROCM_AMDGPU_TARGETS]
+  amdgpu_targets = amdgpu_targets_str.split(",")
+  for amdgpu_target in amdgpu_targets:
+    if amdgpu_target[:3] != "gfx" or not amdgpu_target[3:].isdigit():
+      auto_configure_fail("Invalid AMDGPU target: %s" % amdgpu_target)
+  return amdgpu_targets
+
+def _cpu_value(repository_ctx):
+  """Returns the name of the host operating system.
+
+  Args:
+    repository_ctx: The repository context.
+
+  Returns:
+    A string containing the name of the host operating system.
+  """
+  os_name = repository_ctx.os.name.lower()
+  if os_name.startswith("mac os"):
+    return "Darwin"
+  if os_name.find("windows") != -1:
+    return "Windows"
+  result = repository_ctx.execute(["uname", "-s"])
+  return result.stdout.strip()
+
+def _lib_name(lib, cpu_value, version="", static=False):
+  """Constructs the platform-specific name of a library.
+
+  Args:
+    lib: The name of the library, such as "hip"
+    cpu_value: The name of the host operating system.
+    version: The version of the library.
+    static: True the library is static or False if it is a shared object.
+
+  Returns:
+    The platform-specific name of the library.
+  """
+  if cpu_value in ("Linux"):
+    if static:
+      return "lib%s.a" % lib
+    else:
+      if version:
+        version = ".%s" % version
+      return "lib%s.so%s" % (lib, version)
+  elif cpu_value == "Windows":
+      return "%s.lib" % lib
+  elif cpu_value == "Darwin":
+      if static:
+          return "lib%s.a" % lib
+      elif version:
+          version = ".%s" % version
+      return "lib%s%s.dylib" % (lib, version)
+  else:
+    auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
+
+def _find_rocm_lib(lib, repository_ctx, cpu_value, basedir, version="",
+                   static=False):
+  """Finds the given ROCm libraries on the system.
+
+  Args:
+    lib: The name of the library, such as "hip"
+    repository_ctx: The repository context.
+    cpu_value: The name of the host operating system.
+    basedir: The install directory of ROCm.
+    version: The version of the library.
+    static: True if static library, False if shared object.
+
+  Returns:
+    Returns a struct with the following fields:
+      file_name: The basename of the library found on the system.
+      path: The full path to the library.
+  """
+  file_name = _lib_name(lib, cpu_value, version, static)
+  if cpu_value == "Linux":
+    path = repository_ctx.path("%s/lib64/%s" % (basedir, file_name))
+    if path.exists:
+      return struct(file_name=file_name, path=str(path.realpath))
+    path = repository_ctx.path("%s/lib64/stubs/%s" % (basedir, file_name))
+    if path.exists:
+      return struct(file_name=file_name, path=str(path.realpath))
+    path = repository_ctx.path(
+        "%s/lib/x86_64-linux-gnu/%s" % (basedir, file_name))
+    if path.exists:
+      return struct(file_name=file_name, path=str(path.realpath))
+
+  path = repository_ctx.path("%s/lib/%s" % (basedir, file_name))
+  if path.exists:
+    return struct(file_name=file_name, path=str(path.realpath))
+  path = repository_ctx.path("%s/%s" % (basedir, file_name))
+  if path.exists:
+    return struct(file_name=file_name, path=str(path.realpath))
+
+  auto_configure_fail("Cannot find rocm library %s" % file_name)
+
+def _find_libs(repository_ctx, rocm_config):
+  """Returns the ROCm libraries on the system.
+
+  Args:
+    repository_ctx: The repository context.
+    rocm_config: The ROCm config as returned by _get_rocm_config
+
+  Returns:
+    Map of library names to structs of filename and path as returned by
+    _find_rocm_lib.
+  """
+  cpu_value = rocm_config.cpu_value
+  return {
+      "hip": _find_rocm_lib(
+          "hip_hcc", repository_ctx, cpu_value, rocm_config.rocm_toolkit_path),
+      "rocblas": _find_rocm_lib(
+          "rocblas", repository_ctx, cpu_value, rocm_config.rocm_toolkit_path + "/rocblas"),
+      "rocfft": _find_rocm_lib(
+          "rocfft", repository_ctx, cpu_value, rocm_config.rocm_toolkit_path + "/rocfft"),
+      "hiprand": _find_rocm_lib(
+          "hiprand", repository_ctx, cpu_value, rocm_config.rocm_toolkit_path + "/hiprand"),
+      "miopen": _find_rocm_lib(
+          "MIOpen", repository_ctx, cpu_value, rocm_config.rocm_toolkit_path + "/miopen"),
+  }
+
+def _get_rocm_config(repository_ctx):
+  """Detects and returns information about the ROCm installation on the system.
+
+  Args:
+    repository_ctx: The repository context.
+
+  Returns:
+    A struct containing the following fields:
+      rocm_toolkit_path: The ROCm toolkit installation directory.
+      amdgpu_targets: A list of the system's AMDGPU targets.
+      cpu_value: The name of the host operating system.
+  """
+  cpu_value = _cpu_value(repository_ctx)
+  rocm_toolkit_path = _rocm_toolkit_path(repository_ctx)
+  return struct(
+      rocm_toolkit_path = rocm_toolkit_path,
+      amdgpu_targets = _amdgpu_targets(repository_ctx),
+      cpu_value = cpu_value)
+
+def _tpl(repository_ctx, tpl, substitutions={}, out=None):
+  if not out:
+    out = tpl.replace(":", "/")
+  repository_ctx.template(
+      out,
+      Label("//third_party/gpus/%s.tpl" % tpl),
+      substitutions)
+
+
+def _file(repository_ctx, label):
+  repository_ctx.template(
+      label.replace(":", "/"),
+      Label("//third_party/gpus/%s.tpl" % label),
+      {})
+
+
+_DUMMY_CROSSTOOL_BZL_FILE = """
+def error_gpu_disabled():
+  fail("ERROR: Building with --config=rocm but TensorFlow is not configured " +
+       "to build with GPU support. Please re-run ./configure and enter 'Y' " +
+       "at the prompt to build with GPU support.")
+
+  native.genrule(
+      name = "error_gen_crosstool",
+      outs = ["CROSSTOOL"],
+      cmd = "echo 'Should not be run.' && exit 1",
+  )
+
+  native.filegroup(
+      name = "crosstool",
+      srcs = [":CROSSTOOL"],
+      output_licenses = ["unencumbered"],
+  )
+"""
+
+
+_DUMMY_CROSSTOOL_BUILD_FILE = """
+load("//crosstool:error_gpu_disabled.bzl", "error_gpu_disabled")
+
+error_gpu_disabled()
+"""
+
+def _create_dummy_repository(repository_ctx):
+  cpu_value = _cpu_value(repository_ctx)
+
+  # Set up BUILD file for rocm/.
+  _tpl(repository_ctx, "rocm:build_defs.bzl",
+       {
+           "%{rocm_is_configured}": "False",
+           "%{rocm_extra_copts}": "[]"
+       })
+  _tpl(repository_ctx, "rocm:BUILD",
+       {
+           "%{hip_lib}": _lib_name("hip", cpu_value),
+           "%{rocblas_lib}": _lib_name("rocblas", cpu_value),
+           "%{miopen_lib}": _lib_name("miopen", cpu_value),
+           "%{rocfft_lib}": _lib_name("rocfft", cpu_value),
+           "%{hiprand_lib}": _lib_name("hiprand", cpu_value),
+           "%{rocm_include_genrules}": '',
+           "%{rocm_headers}": '',
+       })
+
+  # Create dummy files for the ROCm toolkit since they are still required by
+  # tensorflow/core/platform/default/build_config:rocm.
+  repository_ctx.file("rocm/hip/include/hip/hip_runtime.h", "")
+
+  # Set up rocm_config.h, which is used by
+  # tensorflow/stream_executor/dso_loader.cc.
+  _tpl(repository_ctx, "rocm:rocm_config.h",
+       {
+           "%{rocm_toolkit_path}": _DEFAULT_ROCM_TOOLKIT_PATH,
+       }, "rocm/rocm/rocm_config.h")
+
+  # If rocm_configure is not configured to build with GPU support, and the user
+  # attempts to build with --config=rocm, add a dummy build rule to intercept
+  # this and fail with an actionable error message.
+  repository_ctx.file("crosstool/error_gpu_disabled.bzl",
+                      _DUMMY_CROSSTOOL_BZL_FILE)
+  repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
+
+def _execute(repository_ctx, cmdline, error_msg=None, error_details=None,
+             empty_stdout_fine=False):
+  """Executes an arbitrary shell command.
+
+  Args:
+    repository_ctx: the repository_ctx object
+    cmdline: list of strings, the command to execute
+    error_msg: string, a summary of the error if the command fails
+    error_details: string, details about the error or steps to fix it
+    empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise
+      it's an error
+  Return:
+    the result of repository_ctx.execute(cmdline)
+  """
+  result = repository_ctx.execute(cmdline)
+  if result.stderr or not (empty_stdout_fine or result.stdout):
+    auto_configure_fail(
+        "\n".join([
+            error_msg.strip() if error_msg else "Repository command failed",
+            result.stderr.strip(),
+            error_details if error_details else ""]))
+  return result
+
+def _norm_path(path):
+  """Returns a path with '/' and remove the trailing slash."""
+  path = path.replace("\\", "/")
+  if path[-1] == "/":
+    path = path[:-1]
+  return path
+
+def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
+    src_files = [], dest_files = []):
+  """Returns a genrule to symlink(or copy if on Windows) a set of files.
+
+  If src_dir is passed, files will be read from the given directory; otherwise
+  we assume files are in src_files and dest_files
+  """
+  if src_dir != None:
+    src_dir = _norm_path(src_dir)
+    dest_dir = _norm_path(dest_dir)
+    files = _read_dir(repository_ctx, src_dir)
+    # Create a list with the src_dir stripped to use for outputs.
+    dest_files = files.replace(src_dir, '').splitlines()
+    src_files = files.splitlines()
+  command = []
+  # We clear folders that might have been generated previously to avoid
+  # undesired inclusions
+  command.append('if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi')
+  command.append('if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi')
+  outs = []
+  for i in range(len(dest_files)):
+    if dest_files[i] != "":
+      # If we have only one file to link we do not want to use the dest_dir, as
+      # $(@D) will include the full path to the file.
+      dest = '$(@D)/' + dest_dir + dest_files[i] if len(dest_files) != 1 else '$(@D)/' + dest_files[i]
+      # On Windows, symlink is not supported, so we just copy all the files.
+      cmd = 'ln -s'
+      command.append(cmd + ' "%s" "%s"' % (src_files[i] , dest))
+      outs.append('        "' + dest_dir + dest_files[i] + '",')
+  genrule = _genrule(src_dir, genrule_name, " && ".join(command),
+                     "\n".join(outs))
+  return genrule
+
+def _genrule(src_dir, genrule_name, command, outs):
+  """Returns a string with a genrule.
+
+  Genrule executes the given command and produces the given outputs.
+  """
+  return (
+      'genrule(\n' +
+      '    name = "' +
+      genrule_name + '",\n' +
+      '    outs = [\n' +
+      outs +
+      '\n    ],\n' +
+      '    cmd = """\n' +
+      command +
+      '\n   """,\n' +
+      ')\n'
+  )
+
+def _read_dir(repository_ctx, src_dir):
+  """Returns a string with all files in a directory.
+
+  Finds all files inside a directory, traversing subfolders and following
+  symlinks. The returned string contains the full path of all files
+  separated by line breaks.
+  """
+  find_result = _execute(
+      repository_ctx, ["find", src_dir, "-follow", "-type", "f"],
+      empty_stdout_fine=True)
+  result = find_result.stdout
+  return result
+
+def _compute_rocm_extra_copts(repository_ctx, amdgpu_targets):
+  if False:
+    amdgpu_target_flags = ["--amdgpu-target=" +
+        amdgpu_target for amdgpu_target in amdgpu_targets]
+  else:
+    # AMDGPU targets are handled in the "crosstool_wrapper_driver_is_not_gcc"
+    amdgpu_target_flags = []
+  return str(amdgpu_target_flags)
+
+def _create_local_rocm_repository(repository_ctx):
+  """Creates the repository containing files set up to build with ROCm."""
+  rocm_config = _get_rocm_config(repository_ctx)
+
+  # Set up symbolic links for the rocm toolkit by creating genrules to do
+  # symlinking. We create one genrule for each directory we want to track under
+  # rocm_toolkit_path
+  rocm_toolkit_path = rocm_config.rocm_toolkit_path
+  rocm_include_path = rocm_toolkit_path + "/include"
+  genrules = [_symlink_genrule_for_dir(repository_ctx,
+      rocm_include_path, "rocm/include", "rocm-include")]
+  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+      rocm_toolkit_path + "/rocfft/include", "rocm/include/rocfft", "rocfft-include"))
+  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+      rocm_toolkit_path + "/rocblas/include", "rocm/include/rocblas", "rocblas-include"))
+  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+      rocm_toolkit_path + "/miopen/include", "rocm/include/miopen", "miopen-include"))
+
+  rocm_libs = _find_libs(repository_ctx, rocm_config)
+  rocm_lib_src = []
+  rocm_lib_dest = []
+  for lib in rocm_libs.values():
+    rocm_lib_src.append(lib.path)
+    rocm_lib_dest.append("rocm/lib/" + lib.file_name)
+  genrules.append(_symlink_genrule_for_dir(repository_ctx, None, "", "rocm-lib",
+                                       rocm_lib_src, rocm_lib_dest))
+
+  included_files = _read_dir(repository_ctx, rocm_include_path).replace(
+      rocm_include_path, '').splitlines()
+
+  # Set up BUILD file for rocm/
+  _tpl(repository_ctx, "rocm:build_defs.bzl",
+       {
+           "%{rocm_is_configured}": "True",
+           "%{rocm_extra_copts}": _compute_rocm_extra_copts(
+               repository_ctx, rocm_config.amdgpu_targets),
+
+       })
+  _tpl(repository_ctx, "rocm:BUILD",
+       {
+           "%{hip_lib}": rocm_libs["hip"].file_name,
+           "%{rocblas_lib}": rocm_libs["rocblas"].file_name,
+           "%{rocfft_lib}": rocm_libs["rocfft"].file_name,
+           "%{hiprand_lib}": rocm_libs["hiprand"].file_name,
+           "%{miopen_lib}": rocm_libs["miopen"].file_name,
+           "%{rocm_include_genrules}": "\n".join(genrules),
+           "%{rocm_headers}": ('":rocm-include",\n' +
+                               '":rocfft-include",\n' +
+                               '":rocblas-include",\n' +
+                               '":miopen-include",'),
+       })
+  # Set up crosstool/
+  _tpl(repository_ctx, "crosstool:BUILD", {"%{linker_files}": ":empty", "%{win_linker_files}": ":empty"})
+  cc = find_cc(repository_ctx)
+  host_compiler_includes = _host_compiler_includes(repository_ctx, cc)
+  rocm_defines = {
+           "%{rocm_include_path}": _rocm_include_path(repository_ctx,
+                                                      rocm_config),
+           "%{host_compiler_includes}": host_compiler_includes,
+           "%{clang_path}": str(cc),
+       }
+
+  _tpl(repository_ctx, "crosstool:CROSSTOOL_hipcc", rocm_defines, out="crosstool/CROSSTOOL")
+
+  _tpl(repository_ctx,
+       "crosstool:clang/bin/crosstool_wrapper_driver_rocm",
+       {
+           "%{cpu_compiler}": str(cc),
+           "%{hipcc_path}": "/opt/rocm/bin/hipcc",
+           "%{gcc_host_compiler_path}": str(cc),
+           "%{rocm_amdgpu_targets}": ",".join(
+               ["\"%s\"" % c for c in rocm_config.amdgpu_targets]),
+       })
+
+  # Set up rocm_config.h, which is used by
+  # tensorflow/stream_executor/dso_loader.cc.
+  _tpl(repository_ctx, "rocm:rocm_config.h",
+       {
+           "%{rocm_amdgpu_targets}": ",".join(
+               ["\"%s\"" % c for c in rocm_config.amdgpu_targets]),
+           "%{rocm_toolkit_path}": rocm_config.rocm_toolkit_path,
+       }, "rocm/rocm/rocm_config.h")
+
+
+def _create_remote_rocm_repository(repository_ctx, remote_config_repo):
+  """Creates pointers to a remotely configured repo set up to build with ROCm."""
+  _tpl(repository_ctx, "rocm:build_defs.bzl",
+       {
+           "%{rocm_is_configured}": "True",
+           "%{rocm_extra_copts}": _compute_rocm_extra_copts(
+               repository_ctx, #_compute_capabilities(repository_ctx)
+            ),
+
+       })
+  _tpl(repository_ctx, "rocm:remote.BUILD",
+       {
+           "%{remote_rocm_repo}": remote_config_repo,
+       }, "rocm/BUILD")
+  _tpl(repository_ctx, "crosstool:remote.BUILD", {
+           "%{remote_rocm_repo}": remote_config_repo,
+       }, "crosstool/BUILD")
+
+def _rocm_autoconf_impl(repository_ctx):
+  """Implementation of the rocm_autoconf repository rule."""
+  if not _enable_rocm(repository_ctx):
+    _create_dummy_repository(repository_ctx)
+  else:
+    if _TF_ROCM_CONFIG_REPO in repository_ctx.os.environ:
+      _create_remote_rocm_repository(repository_ctx,
+          repository_ctx.os.environ[_TF_ROCM_CONFIG_REPO])
+    else:
+      _create_local_rocm_repository(repository_ctx)
+
+
+rocm_configure = repository_rule(
+    implementation = _rocm_autoconf_impl,
+    environ = [
+        _GCC_HOST_COMPILER_PATH,
+        "TF_NEED_ROCM",
+        _ROCM_TOOLKIT_PATH,
+        _TF_ROCM_VERSION,
+        _TF_MIOPEN_VERSION,
+        _TF_ROCM_AMDGPU_TARGETS,
+        _TF_ROCM_CONFIG_REPO,
+    ],
+)
+
+"""Detects and configures the local ROCm toolchain.
+
+Add the following to your WORKSPACE FILE:
+
+```python
+rocm_configure(name = "local_config_rocm")
+```
+
+Args:
+  name: A unique name for this workspace rule.
+"""
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 601e07ffdd..afc5cf56ab 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -42,6 +42,9 @@ build:download_clang_use_lld --linkopt='-fuse-ld=lld'
 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
 build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
 
+build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain
+build:rocm --define=using_rocm=true --define=using_rocm_hipcc=true
+
 build:cuda_clang --crosstool_top=@local_config_cuda//crosstool:toolchain
 build:cuda_clang --define=using_cuda=true --define=using_cuda_clang=true --define=using_clang=true
 
-- 
GitLab


From d0574f6b25ab01052e093ab92612520a7e4ada8d Mon Sep 17 00:00:00 2001
From: Matt Conley <mconley@nvidia.com>
Date: Thu, 6 Sep 2018 08:22:37 -0700
Subject: [PATCH 019/570] Fixed clang formatting

---
 .../stream_executor/cuda/cuda_gpu_executor.cc   | 17 +++++++++--------
 .../stream_executor/cuda/cuda_gpu_executor.h    | 12 ++++++------
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index ce2f1ce3ae..ef84d01a94 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -493,10 +493,10 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
 // Compute and return maximum blocks per core (occupancy) based on the
 // device description, some kernel characteristics and the number of threads per
 // block.  If unable to compute occupancy, zero is returned.
-int CUDAExecutor::CalculateOccupancy(const DeviceDescription& device_description,
-                       uint64 registers_per_thread,
-                       uint64 shared_memory_per_block,
-                       const ThreadDim& thread_dims, CUfunction func) {
+int CUDAExecutor::CalculateOccupancy(
+    const DeviceDescription& device_description, uint64 registers_per_thread,
+    uint64 shared_memory_per_block, const ThreadDim& thread_dims,
+    CUfunction func) {
   int suggested_blocks = 0;
   int suggested_threads = 0;
   CUresult err =
@@ -509,10 +509,11 @@ int CUDAExecutor::CalculateOccupancy(const DeviceDescription& device_description
 // Compute and return the suggested thread count to acheive ideal occupancy.
 // If the provided thread dimensions match this number, zero is returned.
 int CUDAExecutor::CompareOccupancy(int* initial_blocks,
-                     const DeviceDescription& device_description,
-                     uint64 registers_per_thread,
-                     uint64 shared_memory_per_block,
-                     const ThreadDim& thread_dims, CUfunction func) {
+                                   const DeviceDescription& device_description,
+                                   uint64 registers_per_thread,
+                                   uint64 shared_memory_per_block,
+                                   const ThreadDim& thread_dims,
+                                   CUfunction func) {
   int suggested_blocks = 0;
   int suggested_threads = 0;
   CUresult err =
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index e8ebbc3220..1481dcc19a 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -71,16 +71,16 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
               const KernelArgsArrayBase &args) override;
 
   int CalculateOccupancy(const DeviceDescription& device_description,
+                         uint64 registers_per_thread,
+                         uint64 shared_memory_per_block,
+                         const ThreadDim& thread_dims, CUfunction func);
+
+  int CompareOccupancy(int* initial_blocks,
+                       const DeviceDescription& device_description,
                        uint64 registers_per_thread,
                        uint64 shared_memory_per_block,
                        const ThreadDim& thread_dims, CUfunction func);
 
-  int CompareOccupancy(int* initial_blocks,
-                     const DeviceDescription& device_description,
-                     uint64 registers_per_thread,
-                     uint64 shared_memory_per_block,
-                     const ThreadDim& thread_dims, CUfunction func);
-
   void *Allocate(uint64 size) override;
 
   void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
-- 
GitLab


From e3654a3cb4e26c26409aeeb9e127e3addcb14cee Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 6 Sep 2018 19:20:11 +0000
Subject: [PATCH 020/570] Add float16 support on GPU for
 tf.contrib.image.transform

This fix tries to address the issue raised in 22115 where
there were no float16 support on GPU for tf.contrib.image.transform.

This fix fixes 22115.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/image/kernels/image_ops.cc        | 2 ++
 tensorflow/contrib/image/kernels/image_ops_gpu.cu.cc | 1 +
 2 files changed, 3 insertions(+)

diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index 370a8caf6a..788bf04b28 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -156,6 +156,7 @@ namespace functor {
 TF_CALL_uint8(DECLARE_FUNCTOR);
 TF_CALL_int32(DECLARE_FUNCTOR);
 TF_CALL_int64(DECLARE_FUNCTOR);
+TF_CALL_half(DECLARE_FUNCTOR);
 TF_CALL_float(DECLARE_FUNCTOR);
 TF_CALL_double(DECLARE_FUNCTOR);
 
@@ -175,6 +176,7 @@ TF_CALL_double(DECLARE_FUNCTOR);
 TF_CALL_uint8(REGISTER);
 TF_CALL_int32(REGISTER);
 TF_CALL_int64(REGISTER);
+TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
 
diff --git a/tensorflow/contrib/image/kernels/image_ops_gpu.cu.cc b/tensorflow/contrib/image/kernels/image_ops_gpu.cu.cc
index 8743a5ff72..36b9a236a6 100644
--- a/tensorflow/contrib/image/kernels/image_ops_gpu.cu.cc
+++ b/tensorflow/contrib/image/kernels/image_ops_gpu.cu.cc
@@ -32,6 +32,7 @@ typedef Eigen::GpuDevice GPUDevice;
 template class FillProjectiveTransform<GPUDevice, uint8>;
 template class FillProjectiveTransform<GPUDevice, int32>;
 template class FillProjectiveTransform<GPUDevice, int64>;
+template class FillProjectiveTransform<GPUDevice, Eigen::half>;
 template class FillProjectiveTransform<GPUDevice, float>;
 template class FillProjectiveTransform<GPUDevice, double>;
 
-- 
GitLab


From 7d7e8a725aeede4b724f7376d22df2c7f2ebdcf9 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 6 Sep 2018 19:22:39 +0000
Subject: [PATCH 021/570] Add test case for float16 support on GPU for
 tf.contrib.image.transform

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../contrib/image/python/kernel_tests/image_ops_test.py    | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index 376c0751ee..ef1f79bb94 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -272,6 +272,13 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
     with self.cached_session():
       self.assertAllEqual([[[[1], [0]], [[0], [1]]]], result.eval())
 
+  def test_transform_data_types(self):
+    for dtype in _DTYPES:
+      image = constant_op.constant([[1, 2], [3, 4]], dtype=dtype)
+      value = image_ops.transform(image, [1] * 8)
+      with self.test_session(use_gpu=True):
+        self.assertAllEqual(value.eval(), np.array([[4, 4], [4, 4]]).astype(dtype.as_numpy_dtype()))
+
 
 class BipartiteMatchTest(test_util.TensorFlowTestCase):
 
-- 
GitLab


From 04e20965487c36f43ba5c773b547b23e39478a5c Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 6 Sep 2018 19:25:22 +0000
Subject: [PATCH 022/570] Pylint fix

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../contrib/image/python/kernel_tests/image_ops_test.py       | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index ef1f79bb94..4997c31a7f 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -277,7 +277,9 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
       image = constant_op.constant([[1, 2], [3, 4]], dtype=dtype)
       value = image_ops.transform(image, [1] * 8)
       with self.test_session(use_gpu=True):
-        self.assertAllEqual(value.eval(), np.array([[4, 4], [4, 4]]).astype(dtype.as_numpy_dtype()))
+        self.assertAllEqual(
+            value.eval(),
+            np.array([[4, 4], [4, 4]]).astype(dtype.as_numpy_dtype()))
 
 
 class BipartiteMatchTest(test_util.TensorFlowTestCase):
-- 
GitLab


From 6a5090b086bc9d665eb9e65f05eb94cdb58baaa2 Mon Sep 17 00:00:00 2001
From: Matt Conley <mconley@nvidia.com>
Date: Thu, 6 Sep 2018 13:09:12 -0700
Subject: [PATCH 023/570] Fully fixed clang errors

---
 tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | 12 ++++++------
 tensorflow/stream_executor/cuda/cuda_gpu_executor.h  | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index ef84d01a94..9d5bcc7f77 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -472,7 +472,7 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
   const DeviceDescription &device_description =
       kernel.parent()->GetDeviceDescription();
 
-  const CUDAKernel* cuda_kernel = AsCUDAKernel(&kernel);
+  const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
   CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
 
   int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
@@ -494,8 +494,8 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
 // device description, some kernel characteristics and the number of threads per
 // block.  If unable to compute occupancy, zero is returned.
 int CUDAExecutor::CalculateOccupancy(
-    const DeviceDescription& device_description, uint64 registers_per_thread,
-    uint64 shared_memory_per_block, const ThreadDim& thread_dims,
+    const DeviceDescription &device_description, uint64 registers_per_thread,
+    uint64 shared_memory_per_block, const ThreadDim &thread_dims,
     CUfunction func) {
   int suggested_blocks = 0;
   int suggested_threads = 0;
@@ -508,11 +508,11 @@ int CUDAExecutor::CalculateOccupancy(
 
 // Compute and return the suggested thread count to acheive ideal occupancy.
 // If the provided thread dimensions match this number, zero is returned.
-int CUDAExecutor::CompareOccupancy(int* initial_blocks,
-                                   const DeviceDescription& device_description,
+int CUDAExecutor::CompareOccupancy(int *initial_blocks,
+                                   const DeviceDescription &device_description,
                                    uint64 registers_per_thread,
                                    uint64 shared_memory_per_block,
-                                   const ThreadDim& thread_dims,
+                                   const ThreadDim &thread_dims,
                                    CUfunction func) {
   int suggested_blocks = 0;
   int suggested_threads = 0;
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 1481dcc19a..53b2a29ae7 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -70,16 +70,16 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
               const BlockDim &block_dims, const KernelBase &k,
               const KernelArgsArrayBase &args) override;
 
-  int CalculateOccupancy(const DeviceDescription& device_description,
+  int CalculateOccupancy(const DeviceDescription &device_description,
                          uint64 registers_per_thread,
                          uint64 shared_memory_per_block,
-                         const ThreadDim& thread_dims, CUfunction func);
+                         const ThreadDim &thread_dims, CUfunction func);
 
-  int CompareOccupancy(int* initial_blocks,
-                       const DeviceDescription& device_description,
+  int CompareOccupancy(int *initial_blocks,
+                       const DeviceDescription &device_description,
                        uint64 registers_per_thread,
                        uint64 shared_memory_per_block,
-                       const ThreadDim& thread_dims, CUfunction func);
+                       const ThreadDim &thread_dims, CUfunction func);
 
   void *Allocate(uint64 size) override;
 
-- 
GitLab


From e25cf78285fef5234380ee26fef9090a939e91f5 Mon Sep 17 00:00:00 2001
From: Richard Yu <yohan.richard.yu@gmail.com>
Date: Thu, 6 Sep 2018 17:05:08 -0700
Subject: [PATCH 024/570] Ensure all ValueErrors are raised

---
 tensorflow/contrib/quantize/python/fold_batch_norms.py | 2 +-
 tensorflow/python/keras/layers/embeddings.py           | 8 ++++----
 tensorflow/python/ops/nn_ops.py                        | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index d9f179bee4..d882b79892 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -628,7 +628,7 @@ def _GetBatchNormParams(graph, context, has_scaling):
   bn_decay_var_tensor = _FindMatchingTensor(graph, op_suffix_bn_decay_var,
                                             context)
   if batch_mean_tensor is None and moving_mean_tensor is None:
-    ValueError('Error folding unfused batch norms')
+    raise ValueError('Error folding unfused batch norms')
   if has_scaling:
     gamma_tensor = _FindMatchingTensor(graph, op_suffix_gamma, context)
 
diff --git a/tensorflow/python/keras/layers/embeddings.py b/tensorflow/python/keras/layers/embeddings.py
index 629a9ec9a1..a0b9393812 100644
--- a/tensorflow/python/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/layers/embeddings.py
@@ -142,13 +142,13 @@ class Embedding(Layer):
       else:
         in_lens = [self.input_length]
       if len(in_lens) != len(input_shape) - 1:
-        ValueError('"input_length" is %s, but received input has shape %s' %
-                   (str(self.input_length), str(input_shape)))
+        raise ValueError('"input_length" is %s, but received input has shape %s' %
+                         (str(self.input_length), str(input_shape)))
       else:
         for i, (s1, s2) in enumerate(zip(in_lens, input_shape[1:])):
           if s1 is not None and s2 is not None and s1 != s2:
-            ValueError('"input_length" is %s, but received input has shape %s' %
-                       (str(self.input_length), str(input_shape)))
+            raise ValueError('"input_length" is %s, but received input has shape %s' %
+                             (str(self.input_length), str(input_shape)))
           elif s1 is None:
             in_lens[i] = s2
       return (input_shape[0],) + tuple(in_lens) + (self.output_dim,)
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index ef9afd9e8e..17e10995f2 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -427,8 +427,8 @@ class _WithSpaceToBatch(object):
     try:
       input_shape.with_rank_at_least(expected_input_rank)
     except ValueError:
-      ValueError("input tensor must have rank %d at least" %
-                 (expected_input_rank))
+      raise ValueError("input tensor must have rank %d at least" %
+                       (expected_input_rank))
 
     const_rate = tensor_util.constant_value(dilation_rate)
     rate_or_const_rate = dilation_rate
@@ -818,12 +818,12 @@ class Convolution(object):
     try:
       input_shape.with_rank(num_spatial_dims + 2)
     except ValueError:
-      ValueError("input tensor must have rank %d" % (num_spatial_dims + 2))
+      raise ValueError("input tensor must have rank %d" % (num_spatial_dims + 2))
 
     try:
       filter_shape.with_rank(num_spatial_dims + 2)
     except ValueError:
-      ValueError("filter tensor must have rank %d" % (num_spatial_dims + 2))
+      raise ValueError("filter tensor must have rank %d" % (num_spatial_dims + 2))
 
     if data_format is None or not data_format.startswith("NC"):
       input_channels_dim = input_shape[num_spatial_dims + 1]
-- 
GitLab


From 864e290d1776895d7877777b8368ca8bc6fc22a3 Mon Sep 17 00:00:00 2001
From: Edvard Fagerholm <edvard.fagerholm@gmail.com>
Date: Wed, 29 Aug 2018 11:56:35 +0300
Subject: [PATCH 025/570] Make tf.transpose emit simpler graph when possible

If not given an explicit 'perm' parameter, tf.transpose currently
emits a graph that dynamically calculates it from the rank of the
input tensor. This is completely unnecessary when the rank of the
input can be statically determined at graph construction time.

Modify tf.transpose to emit 'perm' as a single Const node whenever
possible.
---
 tensorflow/python/ops/array_ops.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 7bf3869ddf..9597839301 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1409,8 +1409,13 @@ def transpose(a, perm=None, name="transpose", conjugate=False):
         gen_array_ops.conjugate_transpose
         if (conjugate and a.dtype.is_complex) else gen_array_ops.transpose)
     if perm is None:
-      rank = gen_array_ops.rank(a)
-      perm = (rank - 1) - gen_math_ops._range(0, rank, 1)
+      a = ops.convert_to_tensor(a, name="a")
+      if not a.get_shape().ndims:
+        rank = gen_array_ops.rank(a)
+        perm = (rank - 1) - gen_math_ops._range(0, rank, 1)
+      else:
+        rank = a.get_shape().ndims
+        perm = (rank - 1) - np.arange(rank)
       ret = transpose_fn(a, perm, name=name)
       # NOTE(mrry): Setting the shape explicitly because
       #   reverse is not handled by the shape function.
-- 
GitLab


From 90cf7fb7786c8a9c135ef73482856b082e80f61a Mon Sep 17 00:00:00 2001
From: Cao Zongyan <zongyan.cao@alibaba-inc.com>
Date: Tue, 11 Sep 2018 12:48:30 +0800
Subject: [PATCH 026/570] Fix lint errors and typos.

---
 tensorflow/compiler/tests/binary_ops_test.py  |  9 +++++----
 tensorflow/compiler/tf2xla/kernels/relu_op.cc | 14 +++++++-------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 8941dd4e27..069e83d083 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -179,11 +179,12 @@ class BinaryOpsTest(xla_test.XLATestCase):
           expected=np.array([0, 0, 0, 0, 0, 6, 7, 8, 9, 10, 0, 0], dtype=dtype))
 
       self._testBinary(
-          gen_nn_ops._leaky_relu_grad,
+          gen_nn_ops.leaky_relu_grad,
           np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dtype),
-          np.array(
-              [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9], dtype=dtype),
-          expected=np.array([0.2, 0.4, 0.6, 0.8, 1, 6, 7, 8, 9, 10], dtype=dtype))
+          np.array([-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+                   dtype=dtype),
+          expected=np.array([0.2, 0.4, 0.6, 0.8, 1, 6, 7, 8, 9, 10],
+                            dtype=dtype))
 
       self._testBinary(
           gen_nn_ops.softmax_cross_entropy_with_logits,
diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
index ec14735884..8d65e0339c 100644
--- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
@@ -50,7 +50,6 @@ class Relu6Op : public XlaOpKernel {
   }
 };
 
-
 class LeakyReluOp : public XlaOpKernel {
  public:
   explicit LeakyReluOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
@@ -61,9 +60,9 @@ class LeakyReluOp : public XlaOpKernel {
     xla::XlaBuilder* builder = ctx->builder();
     auto alpha = XlaHelpers::FloatLiteral(builder, input_type(0),
                                           static_cast<double>(alpha_));
-    ctx->SetOutput(0,
-        xla::Max(xla::Mul(alpha, ctx->Input(0)), ctx->Input(0)));
+    ctx->SetOutput(0, xla::Max(xla::Mul(alpha, ctx->Input(0)), ctx->Input(0)));
   }
+
  private:
   float alpha_;
 };
@@ -115,11 +114,12 @@ class LeakyReluGradOp : public XlaOpKernel {
     const auto zero =
         xla::Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes());
     const auto pred = xla::Gt(ctx->Input(1), zero);
-    auto alpha = XlaHelpers::FloatLiteral(b, input_type(0),
-                                          static_cast<double>(alpha_));
-    ctx->SetOutput(0,
-        xla::Select(pred, ctx->Input(0), xla::Mul(alpha, ctx->Input(0))));
+    auto alpha =
+        XlaHelpers::FloatLiteral(b, input_type(0), static_cast<double>(alpha_));
+    ctx->SetOutput(
+        0, xla::Select(pred, ctx->Input(0), xla::Mul(alpha, ctx->Input(0))));
   }
+
  private:
   float alpha_;
 };
-- 
GitLab


From 8530167f68673fa756565c0394bbe2dcdc39db05 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Fri, 24 Aug 2018 16:52:07 +0300
Subject: [PATCH 027/570] Add IgniteDataset that allows to work with Apache
 Ignite.

---
 configure.py                                  |   2 +
 tensorflow/BUILD                              |   6 +
 tensorflow/contrib/BUILD                      |  15 +
 tensorflow/contrib/cmake/python_modules.txt   |   2 +
 tensorflow/contrib/ignite/BUILD               | 136 ++++
 tensorflow/contrib/ignite/README.md           | 167 ++++
 tensorflow/contrib/ignite/__init__.py         |  42 +
 .../kernels/ignite_binary_object_parser.cc    | 304 +++++++
 .../kernels/ignite_binary_object_parser.h     |  54 ++
 .../contrib/ignite/kernels/ignite_client.cc   |  55 ++
 .../contrib/ignite/kernels/ignite_client.h    |  40 +
 .../contrib/ignite/kernels/ignite_dataset.cc  | 123 +++
 .../contrib/ignite/kernels/ignite_dataset.h   |  65 ++
 .../ignite/kernels/ignite_dataset_iterator.cc | 447 ++++++++++
 .../ignite/kernels/ignite_dataset_iterator.h  |  87 ++
 .../ignite/kernels/ignite_dataset_ops.cc      | 145 ++++
 .../ignite/kernels/ignite_plain_client.h      |  43 +
 .../kernels/ignite_plain_client_unix.cc       | 132 +++
 .../kernels/ignite_plain_client_windows.cc    | 143 ++++
 .../ignite/kernels/ignite_ssl_wrapper.cc      | 149 ++++
 .../ignite/kernels/ignite_ssl_wrapper.h       |  49 ++
 tensorflow/contrib/ignite/ops/dataset_ops.cc  |  64 ++
 .../ignite/python/ops/ignite_dataset_ops.py   | 763 ++++++++++++++++++
 .../ignite/python/ops/ignite_op_loader.py     |  25 +
 .../ignite/python/tests/bin/start-plain.sh    |  24 +
 .../ignite/python/tests/bin/start-ssl-auth.sh |  28 +
 .../ignite/python/tests/bin/start-ssl.sh      |  26 +
 .../tests/config/ignite-config-plain.xml      |  39 +
 .../tests/config/ignite-config-ssl-auth.xml   |  59 ++
 .../python/tests/config/ignite-config-ssl.xml |  59 ++
 .../python/tests/ignite_dataset_test.py       |  77 ++
 .../ignite/python/tests/keystore/client.jks   | Bin 0 -> 3232 bytes
 .../ignite/python/tests/keystore/client.pem   |  69 ++
 .../ignite/python/tests/keystore/server.jks   | Bin 0 -> 3230 bytes
 .../ignite/python/tests/keystore/trust.jks    | Bin 0 -> 2432 bytes
 .../contrib/ignite/python/tests/sql/init.sql  |  20 +
 .../ignite/python/tests/start_ignite.sh       |  30 +
 .../ignite/python/tests/stop_ignite.sh        |  19 +
 38 files changed, 3508 insertions(+)
 create mode 100644 tensorflow/contrib/ignite/BUILD
 create mode 100644 tensorflow/contrib/ignite/README.md
 create mode 100644 tensorflow/contrib/ignite/__init__.py
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_client.cc
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_client.h
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_dataset.cc
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_dataset.h
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_plain_client.h
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
 create mode 100644 tensorflow/contrib/ignite/ops/dataset_ops.cc
 create mode 100644 tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
 create mode 100644 tensorflow/contrib/ignite/python/ops/ignite_op_loader.py
 create mode 100755 tensorflow/contrib/ignite/python/tests/bin/start-plain.sh
 create mode 100755 tensorflow/contrib/ignite/python/tests/bin/start-ssl-auth.sh
 create mode 100755 tensorflow/contrib/ignite/python/tests/bin/start-ssl.sh
 create mode 100644 tensorflow/contrib/ignite/python/tests/config/ignite-config-plain.xml
 create mode 100644 tensorflow/contrib/ignite/python/tests/config/ignite-config-ssl-auth.xml
 create mode 100644 tensorflow/contrib/ignite/python/tests/config/ignite-config-ssl.xml
 create mode 100644 tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
 create mode 100644 tensorflow/contrib/ignite/python/tests/keystore/client.jks
 create mode 100644 tensorflow/contrib/ignite/python/tests/keystore/client.pem
 create mode 100644 tensorflow/contrib/ignite/python/tests/keystore/server.jks
 create mode 100644 tensorflow/contrib/ignite/python/tests/keystore/trust.jks
 create mode 100644 tensorflow/contrib/ignite/python/tests/sql/init.sql
 create mode 100755 tensorflow/contrib/ignite/python/tests/start_ignite.sh
 create mode 100755 tensorflow/contrib/ignite/python/tests/stop_ignite.sh

diff --git a/configure.py b/configure.py
index 361bd4764d..8f1957e870 100644
--- a/configure.py
+++ b/configure.py
@@ -1502,6 +1502,8 @@ def main():
                 'with_aws_support', True, 'aws')
   set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform',
                 'with_kafka_support', True, 'kafka')
+  set_build_var(environ_cp, 'TF_NEED_IGNITE', 'Apache Ignite',
+                'with_ignite_support', True, 'ignite')
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
                 False, 'xla')
   set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support',
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 386e0096ff..6c29c78793 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -248,6 +248,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_ignite_support",
+    define_values = {"with_ignite_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
 # Crosses between platforms and file system libraries not supported on those
 # platforms due to limitations in nested select() statements.
 config_setting(
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 798f499870..f055e643d0 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -118,6 +118,11 @@ py_library(
             "//tensorflow/contrib/kafka",
         ],
         "//conditions:default": [],
+    }) + select({
+        "//tensorflow:with_ignite_support": [
+            "//tensorflow/contrib/ignite",
+        ],
+        "//conditions:default": [],
     }) + select({
         "//tensorflow:with_aws_support_windows_override": [],
         "//tensorflow:with_aws_support": [
@@ -160,6 +165,11 @@ cc_library(
             "//tensorflow/contrib/kafka:dataset_kernels",
         ],
         "//conditions:default": [],
+    }) + select({
+        "//tensorflow:with_ignite_support": [
+            "//tensorflow/contrib/ignite:dataset_kernels",
+        ],
+        "//conditions:default": [],
     }) + select({
         "//tensorflow:with_aws_support_windows_override": [],
         "//tensorflow:with_aws_support": [
@@ -197,6 +207,11 @@ cc_library(
             "//tensorflow/contrib/kafka:dataset_ops_op_lib",
         ],
         "//conditions:default": [],
+    }) + select({
+        "//tensorflow:with_ignite_support": [
+            "//tensorflow/contrib/ignite:dataset_ops_op_lib",
+        ],
+        "//conditions:default": [],
     }) + select({
         "//tensorflow:with_aws_support_windows_override": [],
         "//tensorflow:with_aws_support": [
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index fb871acae9..56755e817a 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -207,6 +207,8 @@ tensorflow/contrib/integrate/python
 tensorflow/contrib/integrate/python/ops
 tensorflow/contrib/kafka/python
 tensorflow/contrib/kafka/python/ops
+tensorflow/contrib/ignite/python
+tensorflow/contrib/ignite/python/ops
 tensorflow/contrib/keras
 tensorflow/contrib/keras/api
 tensorflow/contrib/keras/api/keras
diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD
new file mode 100644
index 0000000000..9f6c666893
--- /dev/null
+++ b/tensorflow/contrib/ignite/BUILD
@@ -0,0 +1,136 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
+    "tf_custom_op_library",
+    "tf_custom_op_py_library",
+    "tf_gen_op_libs",
+    "tf_py_test",
+    "if_not_windows",
+    "if_windows",
+)
+
+py_library(
+    name = "ignite",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dataset_ops",
+    ],
+)
+
+tf_custom_op_library(
+    name = "_dataset_ops.so",
+    srcs = ["ops/dataset_ops.cc"],
+    deps = [":dataset_kernels"],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["dataset_ops"],
+)
+
+cc_library(
+    name = "dataset_kernels",
+    srcs = [
+        "kernels/ignite_dataset_ops.cc",
+        "kernels/ignite_client.h",
+        "kernels/ignite_client.cc",
+        "kernels/ignite_plain_client.h",
+        "kernels/ignite_ssl_wrapper.h",
+        "kernels/ignite_ssl_wrapper.cc",
+        "kernels/ignite_binary_object_parser.h",
+        "kernels/ignite_binary_object_parser.cc",
+        "kernels/ignite_dataset.h",
+        "kernels/ignite_dataset.cc",
+        "kernels/ignite_dataset_iterator.h",
+        "kernels/ignite_dataset_iterator.cc",
+    ] + if_not_windows([
+        "kernels/ignite_plain_client_unix.cc",
+    ]) + if_windows([
+        "kernels/ignite_plain_client_windows.cc",
+    ]),
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@boringssl//:ssl",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+    alwayslink = 1,
+)
+
+py_library(
+    name = "dataset_ops",
+    srcs = [
+        "python/ops/ignite_dataset_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ignite_op_loader",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_dataset_ops",
+    out = "python/ops/gen_dataset_ops.py",
+    deps = ["//tensorflow/contrib/ignite:dataset_ops_op_lib"],
+)
+
+tf_kernel_library(
+    name = "dataset_ops_kernels",
+    deps = [
+        ":dataset_kernels",
+        "//tensorflow/core:framework",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_py_library(
+    name = "ignite_op_loader",
+    srcs = ["python/ops/ignite_op_loader.py"],
+    dso = ["//tensorflow/contrib/ignite:_dataset_ops.so"],
+    kernels = [
+        ":dataset_ops_kernels",
+        "//tensorflow/contrib/ignite:dataset_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_dataset_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+    ],
+)
+
+# The Apache Ignite servers have to setup before the test and tear down
+# after the test manually. The docker engine has to be installed.
+#
+# To setup Apache Ignite servers:
+# $ bash ./python/tests/start_ignite.sh
+#
+# To tear down Apache Ignite servers:
+# $ bash ./python/tests/stop_ignite.sh
+tf_py_test(
+    name = "ignite_dataset_test",
+    srcs = ["python/tests/ignite_dataset_test.py"],
+    additional_deps = [
+        ":ignite",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+    tags = [
+        "manual",
+        "no_windows",
+        "notap",
+    ],
+)
diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md
new file mode 100644
index 0000000000..9054344e94
--- /dev/null
+++ b/tensorflow/contrib/ignite/README.md
@@ -0,0 +1,167 @@
+### Ignite Dataset
+# Ignite Dataset
+
+- [Overview](#overview)
+- [Features](#features)
+  * [Distributed In-Memory Datasource](#distributed-in-memory-datasource)
+  * [Structured Objects](#structured-objects)
+  * [Distributed Training](#distributed-training)
+  * [SSL Connection](#ssl-connection)
+  * [Windows Support](#windows-support)
+- [Try it out](#try-it-out)
+- [Limitations](#limitations)
+
+## Overview
+
+[Apache Ignite](https://ignite.apache.org/) is a memory-centric distributed database, caching, and processing platform for
+transactional, analytical, and streaming workloads, delivering in-memory speeds at petabyte scale. This contrib package contains an integration between Apache Ignite and TensorFlow. The integration is based on [tf.data](https://www.tensorflow.org/api_docs/python/tf/data) from TensorFlow side and [Binary Client Protocol](https://apacheignite.readme.io/v2.6/docs/binary-client-protocol) from Apache Ignite side. It allows to use Apache Ignite as a datasource for neural network training, inference and all other computations supported by TensorFlow. 
+
+## Features
+
+Ignite Dataset provides a set of features that makes it possible to use it in a wide range of cases. The most important and interesting features are described below.
+
+### Distributed In-Memory Datasource
+[Apache Ignite](https://ignite.apache.org/) is a distributed in-memory database, caching, and processing platform that allows to avoid limitations of hard drive and provide high reading speed and ability to store and operate with as much data as you need in distributed cluster. Using of Ignite Dataset makes it possible to utilize all these advantages. 
+- If you have a **gigabyte** of data you can keep it on a single machine on a hard drive, but you will face with hard drive speed limitations. At the same time, you can store your data in Apache Ignite on the same machine and use it as a datasource for TensorFlow and thus avoid these limitations.
+- If you have a **terabyte** of data you probably still can keep it on a single machine on a hard drive, but you will face with hard drive speed limitations again. At the same time, you can store your data in Apache Ignite distributed in-memory cluster and use it as a datasource for TensorFlow and thus avoid these limitations.
+- If you have a **petabyte** of data you can't keep it on a single machine. At the same time, you can store your data in Apache Ignite distributed in-memory cluster and use it as a datasource for TensorFlow.
+
+It's  important that Apache Ignite is not just a step of ETL pipeline between database or data warehouse and TensorFlow. Apache Ignite is a high-grade database itself. Choosing Apache Ignite and TensorFlow you are getting everything you need to work with operational or historical data and, in the same time, an ability to use this data for neural network training and inference.
+
+```bash
+$ apache-ignite-fabric/bin/ignite.sh
+$ apache-ignite-fabric/bin/sqlline.sh -u "jdbc:ignite:thin://localhost:10800/"
+
+jdbc:ignite:thin://localhost/> CREATE TABLE KITTEN_CACHE (ID LONG PRIMARY KEY, NAME VARCHAR);
+jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (1, 'WARM KITTY');
+jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (2, 'SOFT KITTY');
+jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL OF FUR');
+```
+
+```python
+>>> import tensorflow as tf
+>>> from tensorflow.contrib.ignite import IgniteDataset
+>>> 
+>>> dataset = IgniteDataset(cache_name="SQL_PUBLIC_KITTEN_CACHE")
+>>> iterator = dataset.make_one_shot_iterator()
+>>> next_obj = iterator.get_next()
+>>>
+>>> with tf.Session() as sess:
+>>>   for _ in range(3):
+>>>     print(sess.run(next_obj))
+
+{'key': 1, 'val': {'NAME': b'WARM KITTY'}}
+{'key': 2, 'val': {'NAME': b'SOFT KITTY'}}
+{'key': 3, 'val': {'NAME': b'LITTLE BALL OF FUR'}}
+```
+
+### Structured Objects
+[Apache Ignite](https://ignite.apache.org/) allows to store any objects you would like to store. These objects can have any hierarchy. Ignite Dataset provides an ability to work with such objects.
+
+```python
+>>> import tensorflow as tf
+>>> from tensorflow.contrib.ignite import IgniteDataset
+>>> 
+>>> dataset = IgniteDataset(cache_name="IMAGES")
+>>> iterator = dataset.make_one_shot_iterator()
+>>> next_obj = iterator.get_next()
+>>>
+>>> with tf.Session() as sess:
+>>>   print(sess.run(next_obj))
+
+{
+    'key': 'kitten.png', 
+    'val': {
+        'metadata': {
+            'file_name': b'kitten.png',
+            'label': b'little ball of fur',
+            width: 800, 
+            height: 600
+        }, 
+        'pixels': [0, 0, 0, 0, ..., 0]
+    }
+}
+```
+ Neural network training and other computations require transformations that can be done as part of  [tf.data](https://www.tensorflow.org/api_docs/python/tf/data) pipeline if you use Ignite Dataset.
+
+```python
+>>> import tensorflow as tf
+>>> from tensorflow.contrib.ignite import IgniteDataset
+>>> 
+>>> dataset = IgniteDataset(cache_name="IMAGES").map(lambda obj: obj['val']['pixels'])
+>>> iterator = dataset.make_one_shot_iterator()
+>>> next_obj = iterator.get_next()
+>>>
+>>> with tf.Session() as sess:
+>>>   print(sess.run(next_obj))
+
+[0, 0, 0, 0, ..., 0]
+```
+
+### Distributed Training
+
+TensorFlow is a machine learning framework that [natively supports](https://www.tensorflow.org/deploy/distributed) distributed neural network training, inference and other computations. The main idea behind the distributed neural network training is an ability to calculate gradients of loss functions (squares of the errors) on every partition of data (in terms of horizontal partitioning) and then sum them to get loss function gradient of the whole dataset. 
+
+<a href="https://www.codecogs.com/eqnedit.php?latex=\nabla[\sum_1^n(y&space;-&space;\hat{y})^2]&space;=&space;\nabla[\sum_1^{n_1}(y&space;-&space;\hat{y})^2]&space;&plus;&space;\nabla[\sum_{n_1}^{n_2}(y&space;-&space;\hat{y})^2]&space;&plus;&space;...&space;&plus;&space;\nabla[\sum_{n_{k-1}}^n(y&space;-&space;\hat{y})^2]" target="_blank"><img src="https://latex.codecogs.com/gif.latex?\nabla[\sum_1^n(y&space;-&space;\hat{y})^2]&space;=&space;\nabla[\sum_1^{n_1}(y&space;-&space;\hat{y})^2]&space;&plus;&space;\nabla[\sum_{n_1}^{n_2}(y&space;-&space;\hat{y})^2]&space;&plus;&space;...&space;&plus;&space;\nabla[\sum_{n_{k-1}}^n(y&space;-&space;\hat{y})^2]" title="\nabla[\sum_1^n(y - \hat{y})^2] = \nabla[\sum_1^{n_1}(y - \hat{y})^2] + \nabla[\sum_{n_1}^{n_2}(y - \hat{y})^2] + ... + \nabla[\sum_{n_{k-1}}^n(y - \hat{y})^2]" /></a>
+
+Utilizing this ability we can calculate gradients on the nodes the data is stored on, reduce them and then finally update model parameters. It allows to avoid data transfers between nodes and thus to avoid network bottleneck.
+
+Apache Ignite uses horizontal partitioning to store data in distributed cluster. When we create Apache Ignite cache (or table in terms of SQL) we can specify the number of partitions the data will be partitioned on. If, for example, Apache Ignite cluster consists of 10 machines and we creates cache with 10 partitions then every machine will maintain approximately one data partition.
+
+Ignite Dataset allows to utilize these two aspects of distributed neural network training (using TensorFlow) and Apache Ignite partitioning. Ignite Dataset is a computation graph operation that might be performed on a remote worker. The remote worker can override Ignite Dataset parameters (such as `host`, `port` or `part`) by setting correstondent environment variables for worker process (such as `IGNITE_DATASET_HOST`, `IGNITE_DATASET_PORT` or `IGNITE_DATASET_PART`). Using this overriding approach we are able to assign specific partition to every worker so that one worker handles one partition and, at the same time, transparently work with single dataset.
+
+```python
+>>> import tensorflow as tf
+>>> from tensorflow.contrib.ignite import IgniteDataset
+>>> 
+>>> dataset = IgniteDataset("IMAGES")
+>>>
+>>> # Compute gradients locally on every worker node.
+>>> gradients = []    
+>>> for i in range(5):
+>>>     with tf.device("/job:WORKER/task:%d" % i):
+>>>         device_iterator = dataset.make_one_shot_iterator()
+>>>         device_next_obj = device_iterator.get_next()
+>>>         gradient = compute_gradient(device_next_obj)
+>>>         gradients.append(gradient)        
+>>>        
+>>> # Aggregate them on master node.
+>>> result_gradient = tf.reduce_sum(gradients)
+>>>
+>>> with tf.Session("grpc://localhost:10000") as sess:
+>>>     print(sess.run(result_gradient))
+```
+
+High-level TensorFlow API for [distributed training](https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy) is supported as well. 
+
+### SSL Connection
+
+Your data should not be accessible without any control. Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikipedia.org/wiki/Transport_Layer_Security) and authentification. Ignite Dataset supports both SSL connection with and without authntication. For more information please see [Apache Ignite SSL/TLS](https://apacheignite.readme.io/docs/ssltls) documentation.
+
+```python
+>>> import tensorflow as tf
+>>> from tensorflow.contrib.ignite import IgniteDataset
+>>> 
+>>> dataset = IgniteDataset(cache_name="IMAGES", certfile="client.pem", cert_password="password", username="ignite", password="ignite")
+>>> ...
+```
+
+### Windows Support
+
+Ignite Dataset is fully compatible with Windows, so you can use it as part of TensorFlow on your Windows workstation as well as on Linux/MacOS systems.
+
+## Try it out
+
+The simplest way to try Ignite Dataset out is to run [Docker](https://www.docker.com/) container with Apache Ignite and loaded [MNIST](http://yann.lecun.com/exdb/mnist/) data and then interruct with it using Ignite Dataset. Such container is available on Docker Hub: [dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/). You need to start this container on your machine:
+
+```
+docker run -it -p 10800:10800 dmitrievanthony/ignite-with-mnist
+```
+
+After that you will be able to work with it following way:
+
+![ignite-dataset-mnist](https://s3.amazonaws.com/helloworld23423423ew23/ignite-dataset-mnist.png "Ignite Dataset Mnist")
+
+## Limitations
+
+Presently Ignite Dataset works with assumption that all objects in the cache have the same structure (homogeneous objects) and the cache contains at least one object. Another limitation concerns structured objects, Ignite Dataset does not support UUID, Maps and Object arrays that might be parts of object structures.
\ No newline at end of file
diff --git a/tensorflow/contrib/ignite/__init__.py b/tensorflow/contrib/ignite/__init__.py
new file mode 100644
index 0000000000..468920a557
--- /dev/null
+++ b/tensorflow/contrib/ignite/__init__.py
@@ -0,0 +1,42 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Apache Ignite is a memory-centric distributed database, caching, and
+   processing platform for transactional, analytical, and streaming workloads,
+   delivering in-memory speeds at petabyte scale. This contrib package
+   contains an integration between Apache Ignite and TensorFlow. The
+   integration is based on tf.data from TensorFlow side and Binary Client
+   Protocol from Apache Ignite side. It allows to use Apache Ignite as a
+   datasource for neural network training, inference and all other
+   computations supported by TensorFlow. Ignite Dataset is based on Apache
+   Ignite Binary Client Protocol:
+   https://apacheignite.readme.io/v2.6/docs/binary-client-protocol.
+
+@@IgniteDataset
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.ignite.python.ops.ignite_dataset_ops \
+import IgniteDataset
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "IgniteDataset",
+]
+
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc b/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
new file mode 100644
index 0000000000..bf0ef8766e
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
@@ -0,0 +1,304 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "ignite_binary_object_parser.h"
+
+namespace ignite {
+
+tensorflow::Status BinaryObjectParser::Parse(
+    uint8_t*& ptr, std::vector<tensorflow::Tensor>& out_tensors,
+    std::vector<int32_t>& types) {
+  uint8_t object_type_id = *ptr;
+  ptr += 1;
+
+  switch (object_type_id) {
+    case BYTE: {
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_UINT8, {});
+      tensor.scalar<tensorflow::uint8>()() = *((uint8_t*)ptr);
+      ptr += 1;
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case SHORT: {
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_INT16, {});
+      tensor.scalar<tensorflow::int16>()() = *((int16_t*)ptr);
+      ptr += 2;
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case INT: {
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_INT32, {});
+      tensor.scalar<tensorflow::int32>()() = *((int32_t*)ptr);
+      ptr += 4;
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case LONG: {
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_INT64, {});
+      tensor.scalar<tensorflow::int64>()() = *((int64_t*)ptr);
+      ptr += 8;
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case FLOAT: {
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_FLOAT, {});
+      tensor.scalar<float>()() = *((float*)ptr);
+      ptr += 4;
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case DOUBLE: {
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_DOUBLE, {});
+      tensor.scalar<double>()() = *((double*)ptr);
+      ptr += 8;
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case UCHAR: {
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_UINT16, {});
+      tensor.scalar<tensorflow::uint16>()() = *((uint16_t*)ptr);
+      ptr += 2;
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case BOOL: {
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_BOOL, {});
+      tensor.scalar<bool>()() = *((bool*)ptr);
+      ptr += 1;
+      out_tensors.emplace_back(std::move(tensor));
+
+      break;
+    }
+    case STRING: {
+      int32_t length = *((int32_t*)ptr);
+      ptr += 4;
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_STRING, {});
+      tensor.scalar<std::string>()() = std::string((char*)ptr, length);
+      ptr += length;
+      out_tensors.emplace_back(std::move(tensor));
+
+      break;
+    }
+    case DATE: {
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_INT64, {});
+      tensor.scalar<tensorflow::int64>()() = *((int64_t*)ptr);
+      ptr += 8;
+      out_tensors.emplace_back(std::move(tensor));
+
+      break;
+    }
+    case BYTE_ARR: {
+      int32_t length = *((int32_t*)ptr);
+      ptr += 4;
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_UINT8,
+                                tensorflow::TensorShape({length}));
+
+      uint8_t* arr = (uint8_t*)ptr;
+      ptr += length;
+
+      std::copy_n(arr, length, tensor.flat<tensorflow::uint8>().data());
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case SHORT_ARR: {
+      int32_t length = *((int32_t*)ptr);
+      ptr += 4;
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_INT16,
+                                tensorflow::TensorShape({length}));
+
+      int16_t* arr = (int16_t*)ptr;
+      ptr += length * 2;
+
+      std::copy_n(arr, length, tensor.flat<tensorflow::int16>().data());
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case INT_ARR: {
+      int32_t length = *((int32_t*)ptr);
+      ptr += 4;
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_INT32,
+                                tensorflow::TensorShape({length}));
+
+      int32_t* arr = (int32_t*)ptr;
+      ptr += length * 4;
+
+      std::copy_n(arr, length, tensor.flat<tensorflow::int32>().data());
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case LONG_ARR: {
+      int32_t length = *((int32_t*)ptr);
+      ptr += 4;
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_INT64,
+                                tensorflow::TensorShape({length}));
+
+      int64_t* arr = (int64_t*)ptr;
+      ptr += length * 8;
+
+      std::copy_n(arr, length, tensor.flat<tensorflow::int64>().data());
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case FLOAT_ARR: {
+      int32_t length = *((int32_t*)ptr);
+      ptr += 4;
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_FLOAT,
+                                tensorflow::TensorShape({length}));
+
+      float* arr = (float*)ptr;
+      ptr += 4 * length;
+
+      std::copy_n(arr, length, tensor.flat<float>().data());
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case DOUBLE_ARR: {
+      int32_t length = *((int32_t*)ptr);
+      ptr += 4;
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_DOUBLE,
+                                tensorflow::TensorShape({length}));
+
+      double* arr = (double*)ptr;
+      ptr += 8 * length;
+
+      std::copy_n(arr, length, tensor.flat<double>().data());
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case UCHAR_ARR: {
+      int32_t length = *((int32_t*)ptr);
+      ptr += 4;
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_UINT16,
+                                tensorflow::TensorShape({length}));
+
+      uint16_t* arr = (uint16_t*)ptr;
+      ptr += length * 2;
+
+      std::copy_n(arr, length, tensor.flat<tensorflow::uint16>().data());
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case BOOL_ARR: {
+      int32_t length = *((int32_t*)ptr);
+      ptr += 4;
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_BOOL,
+                                tensorflow::TensorShape({length}));
+
+      bool* arr = (bool*)ptr;
+      ptr += length;
+
+      std::copy_n(arr, length, tensor.flat<bool>().data());
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case STRING_ARR: {
+      int32_t length = *((int32_t*)ptr);
+      ptr += 4;
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_STRING,
+                                tensorflow::TensorShape({length}));
+
+      for (int32_t i = 0; i < length; i++) {
+        int32_t str_length = *((int32_t*)ptr);
+        ptr += 4;
+        const int8_t* str = (const int8_t*)ptr;
+        ptr += str_length;
+        tensor.vec<std::string>()(i) = std::string((char*)str, str_length);
+      }
+
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case DATE_ARR: {
+      int32_t length = *((int32_t*)ptr);
+      ptr += 4;
+      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
+                                tensorflow::DT_INT64,
+                                tensorflow::TensorShape({length}));
+      int64_t* arr = (int64_t*)ptr;
+      ptr += length * 8;
+
+      std::copy_n(arr, length, tensor.flat<tensorflow::int64>().data());
+      out_tensors.emplace_back(std::move(tensor));
+      break;
+    }
+    case WRAPPED_OBJ: {
+      int32_t byte_arr_size = *((int32_t*)ptr);
+      ptr += 4;
+
+      tensorflow::Status status = Parse(ptr, out_tensors, types);
+      if (!status.ok()) return status;
+
+      int32_t offset = *((int32_t*)ptr);
+      ptr += 4;
+
+      break;
+    }
+    case COMPLEX_OBJ: {
+      uint8_t version = *ptr;
+      ptr += 1;
+      int16_t flags = *((int16_t*)ptr);  // USER_TYPE = 1, HAS_SCHEMA = 2
+      ptr += 2;
+      int32_t type_id = *((int32_t*)ptr);
+      ptr += 4;
+      int32_t hash_code = *((int32_t*)ptr);
+      ptr += 4;
+      int32_t length = *((int32_t*)ptr);
+      ptr += 4;
+      int32_t schema_id = *((int32_t*)ptr);
+      ptr += 4;
+      int32_t schema_offset = *((int32_t*)ptr);
+      ptr += 4;
+
+      uint8_t* end = ptr + schema_offset - 24;
+      int32_t i = 0;
+      while (ptr < end) {
+        i++;
+        tensorflow::Status status = Parse(ptr, out_tensors, types);
+        if (!status.ok()) return status;
+      }
+
+      ptr += (length - schema_offset);
+
+      break;
+    }
+    default: {
+      return tensorflow::errors::Internal("Unknowd binary type (type id ",
+                                          (int)object_type_id, ")");
+    }
+  }
+
+  return tensorflow::Status::OK();
+}
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h b/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
new file mode 100644
index 0000000000..1e845cbc56
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace ignite {
+
+class BinaryObjectParser {
+ public:
+  tensorflow::Status Parse(uint8_t*& ptr,
+                           std::vector<tensorflow::Tensor>& out_tensors,
+                           std::vector<int32_t>& types);
+};
+
+enum ObjectType {
+  BYTE = 1,
+  SHORT = 2,
+  INT = 3,
+  LONG = 4,
+  FLOAT = 5,
+  DOUBLE = 6,
+  UCHAR = 7,
+  BOOL = 8,
+  STRING = 9,
+  DATE = 11,
+  BYTE_ARR = 12,
+  SHORT_ARR = 13,
+  INT_ARR = 14,
+  LONG_ARR = 15,
+  FLOAT_ARR = 16,
+  DOUBLE_ARR = 17,
+  UCHAR_ARR = 18,
+  BOOL_ARR = 19,
+  STRING_ARR = 20,
+  DATE_ARR = 22,
+  WRAPPED_OBJ = 27,
+  COMPLEX_OBJ = 103
+};
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_client.cc b/tensorflow/contrib/ignite/kernels/ignite_client.cc
new file mode 100644
index 0000000000..5a8eddb944
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_client.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef IGNITE_CLIENT_H
+#define IGNITE_CLIENT_H
+#include "ignite_client.h"
+#endif
+
+namespace ignite {
+
+tensorflow::Status Client::ReadByte(uint8_t& data) {
+  return ReadData((uint8_t*)&data, 1);
+}
+
+tensorflow::Status Client::ReadShort(int16_t& data) {
+  return ReadData((uint8_t*)&data, 2);
+}
+
+tensorflow::Status Client::ReadInt(int32_t& data) {
+  return ReadData((uint8_t*)&data, 4);
+}
+
+tensorflow::Status Client::ReadLong(int64_t& data) {
+  return ReadData((uint8_t*)&data, 8);
+}
+
+tensorflow::Status Client::WriteByte(uint8_t data) {
+  return WriteData((uint8_t*)&data, 1);
+}
+
+tensorflow::Status Client::WriteShort(int16_t data) {
+  return WriteData((uint8_t*)&data, 2);
+}
+
+tensorflow::Status Client::WriteInt(int32_t data) {
+  return WriteData((uint8_t*)&data, 4);
+}
+
+tensorflow::Status Client::WriteLong(int64_t data) {
+  return WriteData((uint8_t*)&data, 8);
+}
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_client.h b/tensorflow/contrib/ignite/kernels/ignite_client.h
new file mode 100644
index 0000000000..64e28d75f0
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_client.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace ignite {
+
+class Client {
+ public:
+  virtual tensorflow::Status Connect() = 0;
+  virtual tensorflow::Status Disconnect() = 0;
+  virtual bool IsConnected() = 0;
+  virtual int GetSocketDescriptor() = 0;
+
+  virtual tensorflow::Status ReadByte(uint8_t& data);
+  virtual tensorflow::Status ReadShort(int16_t& data);
+  virtual tensorflow::Status ReadInt(int32_t& data);
+  virtual tensorflow::Status ReadLong(int64_t& data);
+  virtual tensorflow::Status ReadData(uint8_t* buf, int32_t length) = 0;
+
+  virtual tensorflow::Status WriteByte(uint8_t data);
+  virtual tensorflow::Status WriteShort(int16_t data);
+  virtual tensorflow::Status WriteInt(int32_t data);
+  virtual tensorflow::Status WriteLong(int64_t data);
+  virtual tensorflow::Status WriteData(uint8_t* buf, int32_t length) = 0;
+};
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc b/tensorflow/contrib/ignite/kernels/ignite_dataset.cc
new file mode 100644
index 0000000000..a9bf26955b
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset.cc
@@ -0,0 +1,123 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "ignite_dataset_iterator.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace ignite {
+
+IgniteDataset::IgniteDataset(tensorflow::OpKernelContext* ctx,
+                             std::string cache_name, std::string host,
+                             tensorflow::int32 port, bool local,
+                             tensorflow::int32 part,
+                             tensorflow::int32 page_size, std::string username,
+                             std::string password, std::string certfile,
+                             std::string keyfile, std::string cert_password,
+                             std::vector<tensorflow::int32> schema,
+                             std::vector<tensorflow::int32> permutation)
+    : DatasetBase(tensorflow::DatasetContext(ctx)),
+      cache_name(cache_name),
+      host(host),
+      port(port),
+      local(local),
+      part(part),
+      page_size(page_size),
+      username(username),
+      password(password),
+      certfile(certfile),
+      keyfile(keyfile),
+      cert_password(cert_password),
+      schema(schema),
+      permutation(permutation) {
+  SchemaToTypes();
+  SchemaToShapes();
+
+  LOG(INFO) << "Ignite Dataset created [cache_name='" << cache_name
+            << "', host='" << host << "', port=" << port << ", local=" << local
+            << ", part=" << part << ", page_size=" << page_size
+            << ", username='" << username << "', certfile='" << certfile
+            << "', keyfile='" << keyfile + "']";
+}
+
+IgniteDataset::~IgniteDataset() { LOG(INFO) << "Ignite Dataset destroyed"; }
+
+std::unique_ptr<tensorflow::IteratorBase> IgniteDataset::MakeIteratorInternal(
+    const tensorflow::string& prefix) const {
+  return std::unique_ptr<tensorflow::IteratorBase>(new IgniteDatasetIterator(
+      {this, tensorflow::strings::StrCat(prefix, "::Ignite")}, this->host,
+      this->port, this->cache_name, this->local, this->part, this->page_size,
+      this->username, this->password, this->certfile, this->keyfile,
+      this->cert_password, this->schema, this->permutation));
+}
+
+const tensorflow::DataTypeVector& IgniteDataset::output_dtypes() const {
+  return dtypes;
+}
+
+const std::vector<tensorflow::PartialTensorShape>&
+IgniteDataset::output_shapes() const {
+  return shapes;
+}
+
+tensorflow::string IgniteDataset::DebugString() const {
+  return "IgniteDatasetOp::Dataset";
+}
+
+tensorflow::Status IgniteDataset::AsGraphDefInternal(
+    tensorflow::SerializationContext* ctx, DatasetGraphDefBuilder* b,
+    tensorflow::Node** output) const {
+  return tensorflow::errors::Unimplemented(
+      "IgniteDataset does not support 'AsGraphDefInternal'");
+}
+
+void IgniteDataset::SchemaToTypes() {
+  for (auto e : schema) {
+    if (e == BYTE || e == BYTE_ARR) {
+      dtypes.push_back(tensorflow::DT_UINT8);
+    } else if (e == SHORT || e == SHORT_ARR) {
+      dtypes.push_back(tensorflow::DT_INT16);
+    } else if (e == INT || e == INT_ARR) {
+      dtypes.push_back(tensorflow::DT_INT32);
+    } else if (e == LONG || e == LONG_ARR) {
+      dtypes.push_back(tensorflow::DT_INT64);
+    } else if (e == FLOAT || e == FLOAT_ARR) {
+      dtypes.push_back(tensorflow::DT_FLOAT);
+    } else if (e == DOUBLE || e == DOUBLE_ARR) {
+      dtypes.push_back(tensorflow::DT_DOUBLE);
+    } else if (e == UCHAR || e == UCHAR_ARR) {
+      dtypes.push_back(tensorflow::DT_UINT8);
+    } else if (e == BOOL || e == BOOL_ARR) {
+      dtypes.push_back(tensorflow::DT_BOOL);
+    } else if (e == STRING || e == STRING_ARR) {
+      dtypes.push_back(tensorflow::DT_STRING);
+    } else {
+      LOG(ERROR) << "Unexpected type in schema [type_id=" << e << "]";
+    }
+  }
+}
+
+void IgniteDataset::SchemaToShapes() {
+  for (auto e : schema) {
+    if (e >= 1 && e < 10) {
+      shapes.push_back(tensorflow::PartialTensorShape({}));
+    } else if (e >= 12 && e < 21) {
+      shapes.push_back(tensorflow::PartialTensorShape({-1}));
+    } else {
+      LOG(ERROR) << "Unexpected type in schema [type_id=" << e << "]";
+    }
+  }
+}
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.h b/tensorflow/contrib/ignite/kernels/ignite_dataset.h
new file mode 100644
index 0000000000..2120dfd342
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset.h
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+
+namespace ignite {
+
+class IgniteDataset : public tensorflow::DatasetBase {
+ public:
+  IgniteDataset(tensorflow::OpKernelContext* ctx, std::string cache_name,
+                std::string host, tensorflow::int32 port, bool local,
+                tensorflow::int32 part, tensorflow::int32 page_size,
+                std::string username, std::string password,
+                std::string certfile, std::string keyfile,
+                std::string cert_password,
+                std::vector<tensorflow::int32> schema,
+                std::vector<tensorflow::int32> permutation);
+  ~IgniteDataset();
+  std::unique_ptr<tensorflow::IteratorBase> MakeIteratorInternal(
+      const tensorflow::string& prefix) const override;
+  const tensorflow::DataTypeVector& output_dtypes() const override;
+  const std::vector<tensorflow::PartialTensorShape>& output_shapes()
+      const override;
+  tensorflow::string DebugString() const override;
+
+ protected:
+  tensorflow::Status AsGraphDefInternal(
+      tensorflow::SerializationContext* ctx, DatasetGraphDefBuilder* b,
+      tensorflow::Node** output) const override;
+
+ private:
+  const std::string cache_name;
+  const std::string host;
+  const tensorflow::int32 port;
+  const bool local;
+  const tensorflow::int32 part;
+  const tensorflow::int32 page_size;
+  const std::string username;
+  const std::string password;
+  const std::string certfile;
+  const std::string keyfile;
+  const std::string cert_password;
+  const std::vector<tensorflow::int32> schema;
+  const std::vector<tensorflow::int32> permutation;
+
+  tensorflow::DataTypeVector dtypes;
+  std::vector<tensorflow::PartialTensorShape> shapes;
+
+  void SchemaToTypes();
+  void SchemaToShapes();
+};
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc b/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
new file mode 100644
index 0000000000..03cc3c1291
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
@@ -0,0 +1,447 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "ignite_dataset_iterator.h"
+
+#include "ignite_plain_client.h"
+#include "ignite_ssl_wrapper.h"
+#include "tensorflow/core/platform/logging.h"
+
+#include <time.h>
+#include <chrono>
+
+namespace ignite {
+
+#define CHECK_STATUS(status) \
+  if (!status.ok()) return status;
+
+IgniteDatasetIterator::IgniteDatasetIterator(
+    const Params& params, std::string host, tensorflow::int32 port,
+    std::string cache_name, bool local, tensorflow::int32 part,
+    tensorflow::int32 page_size, std::string username, std::string password,
+    std::string certfile, std::string keyfile, std::string cert_password,
+    std::vector<tensorflow::int32> schema,
+    std::vector<tensorflow::int32> permutation)
+    : tensorflow::DatasetIterator<IgniteDataset>(params),
+      cache_name(cache_name),
+      local(local),
+      part(part),
+      page_size(page_size),
+      username(username),
+      password(password),
+      schema(schema),
+      permutation(permutation),
+      remainder(-1),
+      cursor_id(-1),
+      last_page(false) {
+  Client* p_client = new PlainClient(host, port);
+
+  if (certfile.empty())
+    client = std::unique_ptr<Client>(p_client);
+  else
+    client = std::unique_ptr<Client>(new SslWrapper(
+        std::unique_ptr<Client>(p_client), certfile, keyfile, cert_password));
+
+  LOG(INFO) << "Ignite Dataset Iterator created";
+}
+
+IgniteDatasetIterator::~IgniteDatasetIterator() {
+  tensorflow::Status status = CloseConnection();
+  if (!status.ok()) LOG(ERROR) << status.ToString();
+
+  LOG(INFO) << "Ignite Dataset Iterator destroyed";
+}
+
+tensorflow::Status IgniteDatasetIterator::EstablishConnection() {
+  if (!client->IsConnected()) {
+    tensorflow::Status status = client->Connect();
+    if (!status.ok()) return status;
+
+    status = Handshake();
+    if (!status.ok()) {
+      tensorflow::Status disconnect_status = client->Disconnect();
+      if (!disconnect_status.ok()) LOG(ERROR) << disconnect_status.ToString();
+
+      return status;
+    }
+  }
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status IgniteDatasetIterator::CloseConnection() {
+  if (cursor_id != -1 && !last_page) {
+    tensorflow::Status conn_status = EstablishConnection();
+    if (!conn_status.ok()) return conn_status;
+
+    CHECK_STATUS(client->WriteInt(18));  // Message length
+    CHECK_STATUS(
+        client->WriteShort(close_connection_opcode));  // Operation code
+    CHECK_STATUS(client->WriteLong(0));                // Request ID
+    CHECK_STATUS(client->WriteLong(cursor_id));        // Resource ID
+
+    int32_t res_len;
+    CHECK_STATUS(client->ReadInt(res_len));
+    if (res_len < 12)
+      return tensorflow::errors::Internal(
+          "Close Resource Response is corrupted");
+
+    int64_t req_id;
+    CHECK_STATUS(client->ReadLong(req_id));
+    int32_t status;
+    CHECK_STATUS(client->ReadInt(status));
+    if (status != 0) {
+      uint8_t err_msg_header;
+      CHECK_STATUS(client->ReadByte(err_msg_header));
+      if (err_msg_header == string_val) {
+        int32_t err_msg_length;
+        CHECK_STATUS(client->ReadInt(err_msg_length));
+        uint8_t* err_msg_c = new uint8_t[err_msg_length];
+        CHECK_STATUS(client->ReadData(err_msg_c, err_msg_length));
+        std::string err_msg((char*)err_msg_c, err_msg_length);
+        delete[] err_msg_c;
+
+        return tensorflow::errors::Internal("Close Resource Error [status=",
+                                            status, ", message=", err_msg, "]");
+      }
+      return tensorflow::errors::Internal("Close Resource Error [status=",
+                                          status, "]");
+    }
+
+    LOG(INFO) << "Query Cursor " << cursor_id << " is closed";
+
+    cursor_id = -1;
+
+    return client->Disconnect();
+  } else {
+    LOG(INFO) << "Query Cursor " << cursor_id << " is already closed";
+  }
+
+  return client->IsConnected() ? client->Disconnect()
+                               : tensorflow::Status::OK();
+}
+
+tensorflow::Status IgniteDatasetIterator::GetNextInternal(
+    tensorflow::IteratorContext* ctx,
+    std::vector<tensorflow::Tensor>* out_tensors, bool* end_of_sequence) {
+  if (remainder == 0 && last_page) {
+    LOG(INFO) << "Query Cursor " << cursor_id << " is closed";
+
+    cursor_id = -1;
+    *end_of_sequence = true;
+    return tensorflow::Status::OK();
+  } else {
+    tensorflow::Status status = EstablishConnection();
+    if (!status.ok()) return status;
+
+    if (remainder == -1 || remainder == 0) {
+      tensorflow::Status status =
+          remainder == -1 ? ScanQuery() : LoadNextPage();
+      if (!status.ok()) return status;
+    }
+
+    uint8_t* initial_ptr = ptr;
+    std::vector<int32_t> types;
+    std::vector<tensorflow::Tensor> tensors;
+
+    status = parser.Parse(ptr, tensors, types);  // Parse key
+    if (!status.ok()) return status;
+
+    status = parser.Parse(ptr, tensors, types);  // Parse val
+    if (!status.ok()) return status;
+
+    remainder -= (ptr - initial_ptr);
+
+    out_tensors->resize(tensors.size());
+    for (int32_t i = 0; i < tensors.size(); i++)
+      (*out_tensors)[permutation[i]] = std::move(tensors[i]);
+
+    *end_of_sequence = false;
+    return tensorflow::Status::OK();
+  }
+
+  *end_of_sequence = true;
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status IgniteDatasetIterator::SaveInternal(
+    tensorflow::IteratorStateWriter* writer) {
+  return tensorflow::errors::Unimplemented(
+      "Iterator for IgniteDataset does not support 'SaveInternal'");
+}
+
+tensorflow::Status IgniteDatasetIterator::RestoreInternal(
+    tensorflow::IteratorContext* ctx, tensorflow::IteratorStateReader* reader) {
+  return tensorflow::errors::Unimplemented(
+      "Iterator for IgniteDataset does not support 'RestoreInternal')");
+}
+
+tensorflow::Status IgniteDatasetIterator::Handshake() {
+  int32_t msg_len = 8;
+
+  if (username.empty())
+    msg_len += 1;
+  else
+    msg_len += 5 + username.length();
+
+  if (password.empty())
+    msg_len += 1;
+  else
+    msg_len += 5 + password.length();
+
+  CHECK_STATUS(client->WriteInt(msg_len));
+  CHECK_STATUS(client->WriteByte(1));
+  CHECK_STATUS(client->WriteShort(protocol_major_version));
+  CHECK_STATUS(client->WriteShort(protocol_minor_version));
+  CHECK_STATUS(client->WriteShort(protocol_patch_version));
+  CHECK_STATUS(client->WriteByte(2));
+  if (username.empty()) {
+    CHECK_STATUS(client->WriteByte(null_val));
+  } else {
+    CHECK_STATUS(client->WriteByte(string_val));
+    CHECK_STATUS(client->WriteInt(username.length()));
+    CHECK_STATUS(
+        client->WriteData((uint8_t*)username.c_str(), username.length()));
+  }
+
+  if (password.empty()) {
+    CHECK_STATUS(client->WriteByte(null_val));
+  } else {
+    CHECK_STATUS(client->WriteByte(string_val));
+    CHECK_STATUS(client->WriteInt(password.length()));
+    CHECK_STATUS(
+        client->WriteData((uint8_t*)password.c_str(), password.length()));
+  }
+
+  int32_t handshake_res_len;
+  CHECK_STATUS(client->ReadInt(handshake_res_len));
+  uint8_t handshake_res;
+  CHECK_STATUS(client->ReadByte(handshake_res));
+
+  LOG(INFO) << "Handshake length " << handshake_res_len << ", res "
+            << (int16_t)handshake_res;
+
+  if (handshake_res != 1) {
+    int16_t serv_ver_major;
+    CHECK_STATUS(client->ReadShort(serv_ver_major));
+    int16_t serv_ver_minor;
+    CHECK_STATUS(client->ReadShort(serv_ver_minor));
+    int16_t serv_ver_patch;
+    CHECK_STATUS(client->ReadShort(serv_ver_patch));
+    uint8_t header;
+    CHECK_STATUS(client->ReadByte(header));
+
+    if (header == string_val) {
+      int32_t length;
+      CHECK_STATUS(client->ReadInt(length));
+      uint8_t* err_msg_c = new uint8_t[length];
+      CHECK_STATUS(client->ReadData(err_msg_c, length));
+      std::string err_msg((char*)err_msg_c, length);
+      delete[] err_msg_c;
+
+      return tensorflow::errors::Internal(
+          "Handshake Error [result=", handshake_res, ", version=",
+          serv_ver_major, ".", serv_ver_minor, ".", serv_ver_patch,
+          ", message='", err_msg, "']");
+    } else if (header == null_val) {
+      return tensorflow::errors::Internal(
+          "Handshake Error [result=", handshake_res, ", version=",
+          serv_ver_major, ".", serv_ver_minor, ".", serv_ver_patch, "]");
+    } else {
+      return tensorflow::errors::Internal(
+          "Handshake Error [result=", handshake_res, ", version=",
+          serv_ver_major, ".", serv_ver_minor, ".", serv_ver_patch, "]");
+    }
+  }
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status IgniteDatasetIterator::ScanQuery() {
+  CHECK_STATUS(client->WriteInt(25));                        // Message length
+  CHECK_STATUS(client->WriteShort(scan_query_opcode));       // Operation code
+  CHECK_STATUS(client->WriteLong(0));                        // Request ID
+  CHECK_STATUS(client->WriteInt(JavaHashCode(cache_name)));  // Cache name
+  CHECK_STATUS(client->WriteByte(0));                        // Flags
+  CHECK_STATUS(client->WriteByte(null_val));                 // Filter object
+  CHECK_STATUS(client->WriteInt(page_size));                 // Cursor page size
+  CHECK_STATUS(client->WriteInt(part));    // Partition to query
+  CHECK_STATUS(client->WriteByte(local));  // Local flag
+
+  int64_t wait_start = std::chrono::duration_cast<std::chrono::milliseconds>(
+                           std::chrono::system_clock::now().time_since_epoch())
+                           .count();
+
+  int32_t res_len;
+  CHECK_STATUS(client->ReadInt(res_len));
+
+  int64_t wait_stop = std::chrono::duration_cast<std::chrono::milliseconds>(
+                          std::chrono::system_clock::now().time_since_epoch())
+                          .count();
+
+  LOG(INFO) << "Scan Query waited " << (wait_stop - wait_start) << " ms";
+
+  if (res_len < 12)
+    return tensorflow::errors::Internal("Scan Query Response is corrupted");
+
+  int64_t req_id;
+  CHECK_STATUS(client->ReadLong(req_id));
+
+  int32_t status;
+  CHECK_STATUS(client->ReadInt(status));
+
+  if (status != 0) {
+    uint8_t err_msg_header;
+    CHECK_STATUS(client->ReadByte(err_msg_header));
+
+    if (err_msg_header == string_val) {
+      int32_t err_msg_length;
+      CHECK_STATUS(client->ReadInt(err_msg_length));
+
+      uint8_t* err_msg_c = new uint8_t[err_msg_length];
+      CHECK_STATUS(client->ReadData(err_msg_c, err_msg_length));
+      std::string err_msg((char*)err_msg_c, err_msg_length);
+      delete[] err_msg_c;
+
+      return tensorflow::errors::Internal("Scan Query Error [status=", status,
+                                          ", message=", err_msg, "]");
+    }
+    return tensorflow::errors::Internal("Scan Query Error [status=", status,
+                                        "]");
+  }
+
+  CHECK_STATUS(client->ReadLong(cursor_id));
+
+  LOG(INFO) << "Query Cursor " << cursor_id << " is opened";
+
+  int32_t row_cnt;
+  CHECK_STATUS(client->ReadInt(row_cnt));
+
+  remainder = res_len - 25;
+  page = std::unique_ptr<uint8_t>(new uint8_t[remainder]);
+  ptr = page.get();
+
+  int64_t start = std::chrono::duration_cast<std::chrono::milliseconds>(
+                      std::chrono::system_clock::now().time_since_epoch())
+                      .count();
+
+  CHECK_STATUS(client->ReadData(ptr, remainder));
+
+  int64_t stop = std::chrono::duration_cast<std::chrono::milliseconds>(
+                     std::chrono::system_clock::now().time_since_epoch())
+                     .count();
+  ;
+
+  double size_in_mb = 1.0 * remainder / 1024 / 1024;
+  double time_in_s = 1.0 * (stop - start) / 1000;
+  LOG(INFO) << "Page size " << size_in_mb << " Mb, time " << time_in_s * 1000
+            << " ms download speed " << size_in_mb / time_in_s << " Mb/sec";
+
+  uint8_t last_page_b;
+  CHECK_STATUS(client->ReadByte(last_page_b));
+
+  last_page = !last_page_b;
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status IgniteDatasetIterator::LoadNextPage() {
+  CHECK_STATUS(client->WriteInt(18));                       // Message length
+  CHECK_STATUS(client->WriteShort(load_next_page_opcode));  // Operation code
+  CHECK_STATUS(client->WriteLong(0));                       // Request ID
+  CHECK_STATUS(client->WriteLong(cursor_id));               // Cursor ID
+
+  int64_t wait_start = std::chrono::duration_cast<std::chrono::milliseconds>(
+                           std::chrono::system_clock::now().time_since_epoch())
+                           .count();
+
+  int32_t res_len;
+  CHECK_STATUS(client->ReadInt(res_len));
+
+  int64_t wait_stop = std::chrono::duration_cast<std::chrono::milliseconds>(
+                          std::chrono::system_clock::now().time_since_epoch())
+                          .count();
+
+  LOG(INFO) << "Load Next Page waited " << (wait_stop - wait_start) << " ms";
+
+  if (res_len < 12)
+    return tensorflow::errors::Internal("Load Next Page Response is corrupted");
+
+  int64_t req_id;
+  CHECK_STATUS(client->ReadLong(req_id));
+
+  int32_t status;
+  CHECK_STATUS(client->ReadInt(status));
+
+  if (status != 0) {
+    uint8_t err_msg_header;
+    CHECK_STATUS(client->ReadByte(err_msg_header));
+
+    if (err_msg_header == string_val) {
+      int32_t err_msg_length;
+      CHECK_STATUS(client->ReadInt(err_msg_length));
+
+      uint8_t* err_msg_c = new uint8_t[err_msg_length];
+      CHECK_STATUS(client->ReadData(err_msg_c, err_msg_length));
+      std::string err_msg((char*)err_msg_c, err_msg_length);
+      delete[] err_msg_c;
+
+      return tensorflow::errors::Internal("Load Next Page Error [status=",
+                                          status, ", message=", err_msg, "]");
+    }
+    return tensorflow::errors::Internal("Load Next Page Error [status=", status,
+                                        "]");
+  }
+
+  int32_t row_cnt;
+  CHECK_STATUS(client->ReadInt(row_cnt));
+
+  remainder = res_len - 17;
+  page = std::unique_ptr<uint8_t>(new uint8_t[remainder]);
+  ptr = page.get();
+
+  int64_t start = std::chrono::duration_cast<std::chrono::milliseconds>(
+                      std::chrono::system_clock::now().time_since_epoch())
+                      .count();
+
+  CHECK_STATUS(client->ReadData(ptr, remainder));
+
+  int64_t stop = std::chrono::duration_cast<std::chrono::milliseconds>(
+                     std::chrono::system_clock::now().time_since_epoch())
+                     .count();
+  ;
+
+  double size_in_mb = 1.0 * remainder / 1024 / 1024;
+  double time_in_s = 1.0 * (stop - start) / 1000;
+  LOG(INFO) << "Page size " << size_in_mb << " Mb, time " << time_in_s * 1000
+            << " ms download speed " << size_in_mb / time_in_s << " Mb/sec";
+
+  uint8_t last_page_b;
+  CHECK_STATUS(client->ReadByte(last_page_b));
+
+  last_page = !last_page_b;
+
+  return tensorflow::Status::OK();
+}
+
+int32_t IgniteDatasetIterator::JavaHashCode(std::string str) {
+  int32_t h = 0;
+  for (char& c : str) {
+    h = 31 * h + c;
+  }
+  return h;
+}
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h b/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
new file mode 100644
index 0000000000..d1df4527f9
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
@@ -0,0 +1,87 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "ignite_binary_object_parser.h"
+#include "ignite_dataset.h"
+
+#ifndef IGNITE_CLIENT_H
+#define IGNITE_CLIENT_H
+#include "ignite_client.h"
+#endif
+
+namespace ignite {
+
+class IgniteDatasetIterator
+    : public tensorflow::DatasetIterator<IgniteDataset> {
+ public:
+  IgniteDatasetIterator(const Params& params, std::string host,
+                        tensorflow::int32 port, std::string cache_name,
+                        bool local, tensorflow::int32 part,
+                        tensorflow::int32 page_size, std::string username,
+                        std::string password, std::string certfile,
+                        std::string keyfile, std::string cert_password,
+                        std::vector<tensorflow::int32> schema,
+                        std::vector<tensorflow::int32> permutation);
+  ~IgniteDatasetIterator();
+  tensorflow::Status GetNextInternal(
+      tensorflow::IteratorContext* ctx,
+      std::vector<tensorflow::Tensor>* out_tensors,
+      bool* end_of_sequence) override;
+
+ protected:
+  tensorflow::Status SaveInternal(
+      tensorflow::IteratorStateWriter* writer) override;
+  tensorflow::Status RestoreInternal(
+      tensorflow::IteratorContext* ctx,
+      tensorflow::IteratorStateReader* reader) override;
+
+ private:
+  std::unique_ptr<Client> client;
+  BinaryObjectParser parser;
+
+  const std::string cache_name;
+  const bool local;
+  const tensorflow::int32 part;
+  const tensorflow::int32 page_size;
+  const std::string username;
+  const std::string password;
+  const std::vector<tensorflow::int32> schema;
+  const std::vector<tensorflow::int32> permutation;
+
+  int32_t remainder;
+  int64_t cursor_id;
+  bool last_page;
+
+  std::unique_ptr<uint8_t> page;
+  uint8_t* ptr;
+
+  tensorflow::Status EstablishConnection();
+  tensorflow::Status CloseConnection();
+  tensorflow::Status Handshake();
+  tensorflow::Status ScanQuery();
+  tensorflow::Status LoadNextPage();
+  int32_t JavaHashCode(std::string str);
+};
+
+constexpr uint8_t null_val = 101;
+constexpr uint8_t string_val = 9;
+constexpr uint8_t protocol_major_version = 1;
+constexpr uint8_t protocol_minor_version = 1;
+constexpr uint8_t protocol_patch_version = 0;
+constexpr int16_t scan_query_opcode = 2000;
+constexpr int16_t load_next_page_opcode = 2001;
+constexpr int16_t close_connection_opcode = 0;
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc b/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
new file mode 100644
index 0000000000..543b5e4afc
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
@@ -0,0 +1,145 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "ignite_dataset.h"
+#include <stdlib.h>
+#include "tensorflow/core/framework/dataset.h"
+
+namespace tensorflow {
+
+class IgniteDatasetOp : public DatasetOpKernel {
+ public:
+  using DatasetOpKernel::DatasetOpKernel;
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    std::string cache_name = "";
+    std::string host = "";
+    int32 port = -1;
+    bool local = false;
+    int32 part = -1;
+    int32 page_size = -1;
+    std::string username = "";
+    std::string password = "";
+    std::string certfile = "";
+    std::string keyfile = "";
+    std::string cert_password = "";
+
+    const char* env_cache_name = std::getenv("IGNITE_DATASET_CACHE_NAME");
+    const char* env_host = std::getenv("IGNITE_DATASET_HOST");
+    const char* env_port = std::getenv("IGNITE_DATASET_PORT");
+    const char* env_local = std::getenv("IGNITE_DATASET_LOCAL");
+    const char* env_part = std::getenv("IGNITE_DATASET_PART");
+    const char* env_page_size = std::getenv("IGNITE_DATASET_PAGE_SIZE");
+    const char* env_username = std::getenv("IGNITE_DATASET_USERNAME");
+    const char* env_password = std::getenv("IGNITE_DATASET_PASSWORD");
+    const char* env_certfile = std::getenv("IGNITE_DATASET_CERTFILE");
+    const char* env_keyfile = std::getenv("IGNITE_DATASET_KEYFILE");
+    const char* env_cert_password = std::getenv("IGNITE_DATASET_CERT_PASSWORD");
+
+    if (env_cache_name)
+      cache_name = std::string(env_cache_name);
+    else
+      OP_REQUIRES_OK(ctx, ParseScalarArgument<std::string>(ctx, "cache_name",
+                                                           &cache_name));
+
+    if (env_host)
+      host = std::string(env_host);
+    else
+      OP_REQUIRES_OK(ctx, ParseScalarArgument<std::string>(ctx, "host", &host));
+
+    if (env_port)
+      port = atoi(env_port);
+    else
+      OP_REQUIRES_OK(ctx, ParseScalarArgument<int32>(ctx, "port", &port));
+
+    if (env_local)
+      local = true;
+    else
+      OP_REQUIRES_OK(ctx, ParseScalarArgument<bool>(ctx, "local", &local));
+
+    if (env_part)
+      part = atoi(env_part);
+    else
+      OP_REQUIRES_OK(ctx, ParseScalarArgument<int32>(ctx, "part", &part));
+
+    if (env_page_size)
+      page_size = atoi(env_page_size);
+    else
+      OP_REQUIRES_OK(ctx,
+                     ParseScalarArgument<int32>(ctx, "page_size", &page_size));
+
+    if (env_username)
+      username = std::string(env_username);
+    else
+      OP_REQUIRES_OK(
+          ctx, ParseScalarArgument<std::string>(ctx, "username", &username));
+
+    if (env_password)
+      password = std::string(env_password);
+    else
+      OP_REQUIRES_OK(
+          ctx, ParseScalarArgument<std::string>(ctx, "password", &password));
+
+    if (env_certfile)
+      certfile = std::string(env_certfile);
+    else
+      OP_REQUIRES_OK(
+          ctx, ParseScalarArgument<std::string>(ctx, "certfile", &certfile));
+
+    if (env_keyfile)
+      keyfile = std::string(env_keyfile);
+    else
+      OP_REQUIRES_OK(
+          ctx, ParseScalarArgument<std::string>(ctx, "keyfile", &keyfile));
+
+    if (env_cert_password)
+      cert_password = std::string(env_cert_password);
+    else
+      OP_REQUIRES_OK(ctx, ParseScalarArgument<std::string>(ctx, "cert_password",
+                                                           &cert_password));
+
+    const Tensor* schema_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("schema", &schema_tensor));
+    OP_REQUIRES(ctx, schema_tensor->dims() == 1,
+                errors::InvalidArgument("`schema` must be a vector."));
+
+    std::vector<int32> schema;
+    schema.reserve(schema_tensor->NumElements());
+    for (int i = 0; i < schema_tensor->NumElements(); i++) {
+      schema.push_back(schema_tensor->flat<int32>()(i));
+    }
+
+    const Tensor* permutation_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("permutation", &permutation_tensor));
+    OP_REQUIRES(ctx, schema_tensor->dims() == 1,
+                errors::InvalidArgument("`permutation` must be a vector."));
+
+    std::vector<int32> permutation;
+    permutation.reserve(permutation_tensor->NumElements());
+    for (int i = 0; i < permutation_tensor->NumElements(); i++) {
+      permutation.push_back(permutation_tensor->flat<int32>()(i));
+    }
+
+    *output = new ignite::IgniteDataset(
+        ctx, cache_name, host, port, local, part, page_size, username, password,
+        certfile, keyfile, cert_password, std::move(schema),
+        std::move(permutation));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("IgniteDataset").Device(DEVICE_CPU),
+                        IgniteDatasetOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h b/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
new file mode 100644
index 0000000000..5491af68d6
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef IGNITE_CLIENT_H
+#define IGNITE_CLIENT_H
+#include "ignite_client.h"
+#endif
+
+#include <string>
+
+namespace ignite {
+
+class PlainClient : public Client {
+ public:
+  PlainClient(std::string host, int port);
+  ~PlainClient();
+
+  virtual tensorflow::Status Connect();
+  virtual tensorflow::Status Disconnect();
+  virtual bool IsConnected();
+  virtual int GetSocketDescriptor();
+  virtual tensorflow::Status ReadData(uint8_t* buf, int32_t length);
+  virtual tensorflow::Status WriteData(uint8_t* buf, int32_t length);
+
+ private:
+  std::string host;
+  int port;
+  int sock;
+};
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc b/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
new file mode 100644
index 0000000000..dbfa4f8786
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
@@ -0,0 +1,132 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "ignite_plain_client.h"
+
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <sys/socket.h>
+
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <map>
+
+#include <iostream>
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace ignite {
+
+PlainClient::PlainClient(std::string host, int port)
+    : host(host), port(port), sock(-1) {}
+
+PlainClient::~PlainClient() {
+  if (IsConnected()) {
+    tensorflow::Status status = Disconnect();
+    if (!status.ok()) LOG(WARNING) << status.ToString();
+  }
+}
+
+tensorflow::Status PlainClient::Connect() {
+  if (sock == -1) {
+    sock = socket(AF_INET, SOCK_STREAM, 0);
+    if (sock == -1)
+      return tensorflow::errors::Internal("Failed to create socket");
+  }
+
+  sockaddr_in server;
+
+  server.sin_addr.s_addr = inet_addr(host.c_str());
+  if (server.sin_addr.s_addr == -1) {
+    hostent* he;
+    in_addr** addr_list;
+
+    if ((he = gethostbyname(host.c_str())) == NULL)
+      return tensorflow::errors::Internal("Failed to resolve hostname \"", host,
+                                          "\"");
+
+    addr_list = (in_addr**)he->h_addr_list;
+    if (addr_list[0] != NULL) server.sin_addr = *addr_list[0];
+  }
+
+  server.sin_family = AF_INET;
+  server.sin_port = htons(port);
+
+  if (connect(sock, (sockaddr*)&server, sizeof(server)) < 0)
+    return tensorflow::errors::Internal("Failed to connect to \"", host, ":",
+                                        port, "\"");
+
+  LOG(INFO) << "Connection to \"" << host << ":" << port << "\" established";
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status PlainClient::Disconnect() {
+  int close_res = close(sock);
+  sock = -1;
+
+  LOG(INFO) << "Connection to \"" << host << ":" << port << "\" is closed";
+
+  return close_res == 0 ? tensorflow::Status::OK()
+                        : tensorflow::errors::Internal(
+                              "Failed to correctly close connection");
+}
+
+bool PlainClient::IsConnected() { return sock != -1; }
+
+int PlainClient::GetSocketDescriptor() { return sock; }
+
+tensorflow::Status PlainClient::ReadData(uint8_t* buf, int32_t length) {
+  int recieved = 0;
+
+  while (recieved < length) {
+    int res = recv(sock, buf, length - recieved, 0);
+
+    if (res < 0)
+      return tensorflow::errors::Internal(
+          "Error occured while reading from socket: ", res, ", ",
+          std::string(strerror(errno)));
+
+    if (res == 0)
+      return tensorflow::errors::Internal("Server closed connection");
+
+    recieved += res;
+    buf += res;
+  }
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status PlainClient::WriteData(uint8_t* buf, int32_t length) {
+  int sent = 0;
+
+  while (sent < length) {
+    int res = send(sock, buf, length - sent, 0);
+
+    if (res < 0)
+      return tensorflow::errors::Internal(
+          "Error occured while writing into socket: ", res, ", ",
+          std::string(strerror(errno)));
+
+    sent += res;
+    buf += res;
+  }
+
+  return tensorflow::Status::OK();
+}
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
new file mode 100644
index 0000000000..f78c9b3627
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
@@ -0,0 +1,143 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "ignite_plain_client.h"
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <winsock2.h>
+#include <ws2tcpip.h>
+
+#pragma comment(lib, "Ws2_32.lib")
+#pragma comment(lib, "Mswsock.lib")
+#pragma comment(lib, "AdvApi32.lib")
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace ignite {
+
+PlainClient::PlainClient(std::string host, int port)
+    : host(host), port(port), sock(INVALID_SOCKET) {}
+
+PlainClient::~PlainClient() {
+  if (IsConnected()) {
+    tensorflow::Status status = Disconnect();
+    if (!status.ok()) LOG(WARNING) << status.ToString();
+  }
+}
+
+tensorflow::Status PlainClient::Connect() {
+  WSADATA wsaData;
+  addrinfo *result = NULL, *ptr = NULL, hints;
+
+  int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
+  if (res != 0)
+    return tensorflow::errors::Internal("WSAStartup failed with error: ", res);
+
+  ZeroMemory(&hints, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;
+  hints.ai_socktype = SOCK_STREAM;
+  hints.ai_protocol = IPPROTO_TCP;
+
+  res =
+      getaddrinfo(host.c_str(), std::to_string(port).c_str(), &hints, &result);
+  if (res != 0)
+    return tensorflow::errors::Internal("Getaddrinfo failed with error: ", res);
+
+  for (ptr = result; ptr != NULL; ptr = ptr->ai_next) {
+    sock = socket(ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol);
+    if (sock == INVALID_SOCKET) {
+      WSACleanup();
+      return tensorflow::errors::Internal("Socket failed with error: ",
+                                          WSAGetLastError());
+    }
+
+    res = connect(sock, ptr->ai_addr, (int)ptr->ai_addrlen);
+    if (res == SOCKET_ERROR) {
+      closesocket(sock);
+      sock = INVALID_SOCKET;
+      continue;
+    }
+
+    break;
+  }
+
+  freeaddrinfo(result);
+
+  if (sock == INVALID_SOCKET) {
+    WSACleanup();
+    return tensorflow::errors::Internal("Unable to connect to server");
+  }
+
+  LOG(INFO) << "Connection to \"" << host << ":" << port << "\" established";
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status PlainClient::Disconnect() {
+  int res = shutdown(sock, SD_SEND);
+  closesocket(sock);
+  WSACleanup();
+
+  if (res == SOCKET_ERROR)
+    return tensorflow::errors::Internal("Shutdown failed with error: ",
+                                        WSAGetLastError());
+  else
+    return tensorflow::Status::OK();
+}
+
+bool PlainClient::IsConnected() { return sock != INVALID_SOCKET; }
+
+int PlainClient::GetSocketDescriptor() { return sock; }
+
+tensorflow::Status PlainClient::ReadData(uint8_t *buf, int32_t length) {
+  int recieved = 0;
+
+  while (recieved < length) {
+    int res = recv(sock, buf, length - recieved, 0);
+
+    if (res < 0)
+      return tensorflow::errors::Internal(
+          "Error occured while reading from socket: ", res);
+
+    if (res == 0)
+      return tensorflow::errors::Internal("Server closed connection");
+
+    recieved += res;
+    buf += res;
+  }
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status PlainClient::WriteData(uint8_t *buf, int32_t length) {
+  int sent = 0;
+
+  while (sent < length) {
+    int res = send(sock, buf, length - sent, 0);
+
+    if (res < 0)
+      return tensorflow::errors::Internal(
+          "Error occured while writing into socket: ", res);
+
+    sent += res;
+    buf += res;
+  }
+
+  return tensorflow::Status::OK();
+}
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
new file mode 100644
index 0000000000..a1101b91f3
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
@@ -0,0 +1,149 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "ignite_ssl_wrapper.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+
+#include <openssl/err.h>
+#include <openssl/ssl.h>
+
+namespace ignite {
+
+static int PasswordCb(char *buf, int size, int rwflag, void *password) {
+  strncpy(buf, (char *)(password), size);
+  buf[size - 1] = '\0';
+  return (strlen(buf));
+}
+
+SslWrapper::SslWrapper(std::shared_ptr<Client> client, std::string certfile,
+                       std::string keyfile, std::string cert_password)
+    : client(client),
+      certfile(certfile),
+      keyfile(keyfile),
+      cert_password(cert_password),
+      ctx(NULL) {}
+
+SslWrapper::~SslWrapper() {
+  if (IsConnected()) {
+    tensorflow::Status status = Disconnect();
+    if (!status.ok()) LOG(WARNING) << status.ToString();
+  }
+
+  if (ctx != NULL) {
+    SSL_CTX_free(ctx);
+    ctx = NULL;
+  }
+}
+
+tensorflow::Status SslWrapper::InitSslContext() {
+  OpenSSL_add_all_algorithms();
+  SSL_load_error_strings();
+
+  ctx = SSL_CTX_new(SSLv23_method());
+  if (ctx == NULL)
+    return tensorflow::errors::Internal("Couldn't create SSL context");
+
+  SSL_CTX_set_default_passwd_cb(ctx, PasswordCb);
+  SSL_CTX_set_default_passwd_cb_userdata(ctx, (void *)cert_password.c_str());
+
+  if (SSL_CTX_use_certificate_chain_file(ctx, certfile.c_str()) != 1)
+    return tensorflow::errors::Internal(
+        "Couldn't load cetificate chain (file '", certfile, "')");
+
+  std::string private_key_file = keyfile.empty() ? certfile : keyfile;
+  if (SSL_CTX_use_PrivateKey_file(ctx, private_key_file.c_str(),
+                                  SSL_FILETYPE_PEM) != 1)
+    return tensorflow::errors::Internal("Couldn't load private key (file '",
+                                        private_key_file, "')");
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status SslWrapper::Connect() {
+  tensorflow::Status status;
+
+  if (ctx == NULL) {
+    status = InitSslContext();
+    if (!status.ok()) return status;
+  }
+
+  ssl = SSL_new(ctx);
+  if (ssl == NULL)
+    return tensorflow::errors::Internal("Failed to establish SSL connection");
+
+  status = client->Connect();
+  if (!status.ok()) return status;
+
+  SSL_set_fd(ssl, client->GetSocketDescriptor());
+  if (SSL_connect(ssl) != 1)
+    return tensorflow::errors::Internal("Failed to establish SSL connection");
+
+  LOG(INFO) << "SSL connection established";
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status SslWrapper::Disconnect() {
+  SSL_free(ssl);
+
+  LOG(INFO) << "SSL connection closed";
+
+  return client->Disconnect();
+}
+
+bool SslWrapper::IsConnected() { return client->IsConnected(); }
+
+int SslWrapper::GetSocketDescriptor() { return client->GetSocketDescriptor(); }
+
+tensorflow::Status SslWrapper::ReadData(uint8_t *buf, int32_t length) {
+  int recieved = 0;
+
+  while (recieved < length) {
+    int res = SSL_read(ssl, buf, length - recieved);
+
+    if (res < 0)
+      return tensorflow::errors::Internal(
+          "Error occured while reading from SSL socket: ", res);
+
+    if (res == 0)
+      return tensorflow::errors::Internal("Server closed SSL connection");
+
+    recieved += res;
+    buf += res;
+  }
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status SslWrapper::WriteData(uint8_t *buf, int32_t length) {
+  int sent = 0;
+
+  while (sent < length) {
+    int res = SSL_write(ssl, buf, length - sent);
+
+    if (res < 0)
+      return tensorflow::errors::Internal(
+          "Error occured while writing into socket: ", res);
+
+    sent += res;
+    buf += res;
+  }
+
+  return tensorflow::Status::OK();
+}
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
new file mode 100644
index 0000000000..e0c2a242dc
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef IGNITE_CLIENT_H
+#define IGNITE_CLIENT_H
+#include "ignite_client.h"
+#endif
+
+#include <openssl/ssl.h>
+#include <string>
+
+namespace ignite {
+
+class SslWrapper : public Client {
+ public:
+  SslWrapper(std::shared_ptr<Client> client, std::string certfile,
+             std::string keyfile, std::string cert_password);
+  ~SslWrapper();
+
+  virtual tensorflow::Status Connect();
+  virtual tensorflow::Status Disconnect();
+  virtual bool IsConnected();
+  virtual int GetSocketDescriptor();
+  virtual tensorflow::Status ReadData(uint8_t* buf, int32_t length);
+  virtual tensorflow::Status WriteData(uint8_t* buf, int32_t length);
+
+ private:
+  std::shared_ptr<Client> client;
+  std::string certfile;
+  std::string keyfile;
+  std::string cert_password;
+  SSL_CTX* ctx;
+  SSL* ssl;
+  tensorflow::Status InitSslContext();
+};
+
+}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/ops/dataset_ops.cc b/tensorflow/contrib/ignite/ops/dataset_ops.cc
new file mode 100644
index 0000000000..17494d1cfd
--- /dev/null
+++ b/tensorflow/contrib/ignite/ops/dataset_ops.cc
@@ -0,0 +1,64 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("IgniteDataset")
+  .Input("cache_name: string")
+  .Input("host: string")
+  .Input("port: int32")
+  .Input("local: bool")
+  .Input("part: int32")
+  .Input("page_size: int32")
+  .Input("username: string")
+  .Input("password: string")
+  .Input("certfile: string")
+  .Input("keyfile: string")
+  .Input("cert_password: string")
+  .Input("schema: int32")
+  .Input("permutation: int32")
+  .Output("handle: variant")
+  .SetIsStateful()
+  .SetShapeFn(shape_inference::ScalarShape)
+  .Doc(R"doc(
+Apache Ignite is a memory-centric distributed database, caching, and processing
+platform for transactional, analytical, and streaming workloads, delivering 
+in-memory speeds at petabyte scale. This contrib package contains an 
+integration between Apache Ignite and TensorFlow. The integration is based on 
+tf.data from TensorFlow side and Binary Client Protocol from Apache Ignite side. 
+It allows to use Apache Ignite as a datasource for neural network training, 
+inference and all other computations supported by TensorFlow. Ignite Dataset
+is based on Apache Ignite Binary Client Protocol.
+
+cache_name: Ignite Cache Name.
+host: Ignite Thin Client Host.
+port: Ignite Thin Client Port.
+local: Local flag that defines that data should be fetched from local host only.
+part: Partition data should be fetched from.
+page_size: Page size for Ignite Thin Client.
+username: Username to authenticate via Ignite Thin Client.
+password: Password to authenticate via Ignite Thin Client.
+certfile: SSL certificate to establish SSL connection.
+keyfile: Private key file to establish SSL connection.
+cert_password: SSL certificate password to establish SSL connection.
+schema: Internal structure that defines schema of cache objects.
+permutation: Internal structure that defines permutation of cache objects.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
new file mode 100644
index 0000000000..6fa073957a
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
@@ -0,0 +1,763 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Ignite Dataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import socket
+import struct
+import ssl
+import abc
+
+from tensorflow.contrib.ignite.python.ops import ignite_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.ignite.python.ops import gen_dataset_ops
+from tensorflow.python.data.ops.dataset_ops import Dataset
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+
+class Readable():
+  """Readable abstract class that exposes methods to do reading-related
+     operations.
+  """
+
+  @abc.abstractmethod
+  def __init__(self):
+    pass
+
+  def read_byte(self):
+    """Reads and returnes byte."""
+    return self.__read("b", 1)
+
+  def read_short(self):
+    """Reads and returns short (2 bytes, little-endian)."""
+    return self.__read("h", 2)
+
+  def read_int(self):
+    """Reads and returns int (4 bytes, little-endian)."""
+    return self.__read("i", 4)
+
+  def read_long(self):
+    """Reads and returns long (8 bytes, little-endian)."""
+    return self.__read("q", 8)
+
+  def skip(self, length):
+    """Skips the specified number of bytes."""
+    self.read_data(length)
+
+  @abc.abstractmethod
+  def read_data(self, length):
+    """Reads the specified number of bytes and returns them as a buffer."""
+    return None
+
+  def __read(self, data_type, length):
+    """Reads, unpacks and returns specified type (little-endian)."""
+    buffer = self.read_data(length)
+    return struct.unpack("<" + data_type, buffer)[0]
+
+class DataBuffer(Readable):
+  """DataBuffer class that exposes methods to read data from a byte buffer."""
+
+  def __init__(self, buffer):
+    """Constructs a new instance of DataBuffer based on the specified byte
+       buffer.
+
+    Args:
+      buffer: Buffer to be read.
+    """
+    Readable.__init__(self)
+    self.buffer = buffer
+    self.ptr = 0
+
+  def read_data(self, length):
+    """Reads the specified number of bytes and returns them as a buffer."""
+    data_buffer = self.buffer[self.ptr:][:length]
+    self.ptr += length
+    return data_buffer
+
+class TcpClient(Readable):
+  """TcpClient class that exposes methods to read data from a socket."""
+
+  def __init__(self, host, port, certfile=None, keyfile=None, password=None):
+    """Constructs a new instance of TcpClient based on the specified host
+       and port.
+
+    Args:
+      host: Host to be connected.
+      port: Port to be connected.
+      certfile: File in PEM format containing the certificate as well as any
+        number of CA certificates needed to establish the certificate’s
+        authenticity.
+      keyfile: File containing the private key (otherwise the private key
+        will be taken from certfile as well).
+      password: Password to be used if the private key is encrypted and a
+        password is necessary.
+    """
+    Readable.__init__(self)
+    self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+
+    if certfile is not None:
+      context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
+      context.load_cert_chain(certfile, keyfile, password)
+      self.sock = context.wrap_socket(self.sock)
+    else:
+      if keyfile is not None:
+        raise Exception("SSL is disabled, keyfile must not be specified \
+          (to enable SSL specify certfile)")
+      if password is not None:
+        raise Exception("SSL is disabled, password must not be specified \
+          (to enable SSL specify certfile)")
+
+    self.host = host
+    self.port = port
+
+  def __enter__(self):
+    """Connects to host and port specified in the constructor."""
+    self.sock.connect((self.host, self.port))
+    return self
+
+  def __exit__(self, t, v, traceback):
+    """Disconnects the socket."""
+    self.sock.close()
+
+  def write_byte(self, v):
+    """Writes the specified byte."""
+    self.__write(v, "b")
+
+  def write_short(self, v):
+    """Writes the specified short (2 bytes, little-endian)."""
+    self.__write(v, "h")
+
+  def write_int(self, v):
+    """Writes the specified short (4 bytes, little-endian)."""
+    self.__write(v, "i")
+
+  def write_long(self, v):
+    """Writes the specified int (8 bytes, little-endian)."""
+    self.__write(v, "q")
+
+  def write_string(self, v):
+    """Writes the specified string."""
+    self.sock.sendall(v.encode("UTF-8"))
+
+  def read_data(self, length):
+    """Reads the specified number of bytes and returns them as a buffer."""
+    data_buffer = None
+    rem = length
+    while rem > 0:
+      buf = self.sock.recv(rem)
+      rem = rem - len(buf)
+      if data_buffer is None:
+        data_buffer = buf
+      else:
+        data_buffer += buf
+    return data_buffer
+
+  def __write(self, value, data_type):
+    """Packs and writes data using the specified type (little-endian)."""
+    data_buffer = struct.pack("<" + data_type, value)
+    self.sock.sendall(data_buffer)
+
+class BinaryType():
+  """BinaryType class that encapsulated type id, type name and fields."""
+
+  def __init__(self, type_id, type_name, fields):
+    """Constructs a new instance of BinaryType."""
+    self.type_id = type_id
+    self.type_name = type_name
+    self.fields = fields
+
+class BinaryField():
+  """BinaryField class that encapsulated field name, type id and field id."""
+
+  def __init__(self, field_name, type_id, field_id):
+    """Constructs a new instance of BinaryField."""
+    self.field_name = field_name
+    self.type_id = type_id
+    self.field_id = field_id
+
+# Binary types defined in Apache Ignite Thin client and supported by
+# TensorFlow on Apache Ignite, see
+# https://apacheignite.readme.io/v2.6/docs/binary-client-protocol.
+types = {
+    1: (dtypes.uint8, False),
+    2: (dtypes.int16, False),
+    3: (dtypes.int32, False),
+    4: (dtypes.int64, False),
+    5: (dtypes.float32, False),
+    6: (dtypes.float64, False),
+    7: (dtypes.uint16, False),
+    8: (dtypes.bool, False),
+    9: (dtypes.string, False),
+    12: (dtypes.uint8, True),
+    13: (dtypes.int16, True),
+    14: (dtypes.int32, True),
+    15: (dtypes.int64, True),
+    16: (dtypes.float32, True),
+    17: (dtypes.float64, True),
+    18: (dtypes.uint16, True),
+    19: (dtypes.bool, True),
+    20: (dtypes.string, True)
+}
+
+class TypeTreeNode():
+  """TypeTreeNode class exposes methods to format object tree structure
+     data.
+  """
+  def __init__(self, name, type_id, fields=None, permutation=None):
+    """Constructs a new instance of TypeTreeNode.
+
+    Args:
+      name: Name of the object tree node.
+      type_id: Type id of the object tree node.
+      fields: List of fields (children of the object tree node).
+      permutation: Permutation that should be applied to order object children.
+    """
+    self.name = name
+    self.type_id = type_id
+    self.fields = fields
+    self.permutation = permutation
+
+  def to_output_classes(self):
+    """Formats the tree object the way required in 'output_classes' property of
+       dataset.
+    """
+    if self.fields is None:
+      return ops.Tensor
+    output_classes = {}
+    for field in self.fields:
+      output_classes[field.name] = field.to_output_classes()
+    return output_classes
+
+  def to_output_shapes(self):
+    """Formats the tree object the way required in 'output_shapes' property of
+       dataset.
+    """
+    if self.fields is None:
+      object_type = types[self.type_id]
+      if object_type is not None:
+        is_array = object_type[1]
+        if is_array:
+          return tensor_shape.TensorShape([None])
+        return tensor_shape.TensorShape([])
+      raise Exception("Unsupported type [type_id=%d]" % self.type_id)
+    output_shapes = {}
+    for field in self.fields:
+      output_shapes[field.name] = field.to_output_shapes()
+    return output_shapes
+
+  def to_output_types(self):
+    """Formats the tree object the way required in 'output_types' property of
+       dataset.
+    """
+    if self.fields is None:
+      object_type = types[self.type_id]
+      if object_type is not None:
+        return object_type[0]
+      raise Exception("Unsupported type [type_id=%d]" % self.type_id)
+    else:
+      output_types = {}
+      for field in self.fields:
+        output_types[field.name] = field.to_output_types()
+      return output_types
+
+  def to_flat(self):
+    """Returns a list of leaf node types."""
+    return self.to_flat_rec([])
+
+  def to_permutation(self):
+    """Returns a permutation that should be applied to order object leafs."""
+    correct_order_dict = {}
+    self.traversal_rec(correct_order_dict, 0)
+    object_order = []
+    self.traversal_permutation_rec(object_order)
+    return [correct_order_dict[o] for o in object_order]
+
+  def to_flat_rec(self, flat):
+    """Formats a list of leaf node types."""
+    flat.append(self.type_id)
+    if self.fields is not None:
+      for field in self.fields:
+        field.to_flat_rec(flat)
+    return flat
+
+  def traversal_permutation_rec(self, permutation):
+    """Collects nodes in accordance with permutation."""
+    if self.fields is None:
+      permutation.append(self)
+    else:
+      for idx in self.permutation:
+        field = self.fields[idx]
+        field.traversal_permutation_rec(permutation)
+
+  def traversal_rec(self, d, i):
+    """Collects nodes in pre-order traversal."""
+    if self.fields is None:
+      d[self] = i
+      i += 1
+    else:
+      for field in self.fields:
+        i = field.traversal_rec(d, i)
+    return i
+
+class IgniteClient(TcpClient):
+  """IgniteClient class exposes methods to work with Apache Ignite using Thin
+     client. This client works with assumption that all object in the cache
+     have the same structure (homogeneous objects) and the cache contains at
+     least one object.
+  """
+  def __init__(self, host, port, username=None, password=None, certfile=None,\
+    keyfile=None, cert_password=None):
+    """Constructs a new instance of IgniteClient.
+
+    Args:
+      host: Apache Ignite Thin client host to be connected.
+      port: Apache Ignite Thin client port to be connected.
+      username: Apache Ignite Thin Client authentication username.
+      password: Apache Ignite Thin Client authentication password.
+      certfile: File in PEM format containing the certificate as well as
+        any number of CA certificates needed to establish the certificate’s
+        authenticity.
+      keyfile: File containing the private key (otherwise the private key
+        will be taken from certfile as well).
+      cert_password: Password to be used if the private key is encrypted and a
+        password is necessary.
+    """
+    TcpClient.__init__(self, host, port, certfile, keyfile, cert_password)
+    self.username = username
+    self.password = password
+
+  def handshake(self):
+    """Makes a handshake required to be made after connect before any other
+       calls.
+    """
+    msg_len = 8
+
+    if self.username is None:
+      msg_len += 1
+    else:
+      msg_len += 5 + len(self.username)
+
+    if self.password is None:
+      msg_len += 1
+    else:
+      msg_len += 5 + len(self.password)
+
+    self.write_int(msg_len)   # Message length
+    self.write_byte(1)        # Handshake operation
+    self.write_short(1)       # Version (1.1.0)
+    self.write_short(1)
+    self.write_short(0)
+    self.write_byte(2)        # Thin client
+
+    if self.username is None: # Username
+      self.write_byte(101)
+    else:
+      self.write_byte(9)
+      self.write_int(len(self.username))
+      self.write_string(self.username)
+
+    if self.password is None: # Password
+      self.write_byte(101)
+    else:
+      self.write_byte(9)
+      self.write_int(len(self.password))
+      self.write_string(self.password)
+
+    self.read_int()           # Result length
+    res = self.read_byte()
+
+    if res != 1:
+      serv_ver_major = self.read_short()
+      serv_ver_minor = self.read_short()
+      serv_ver_patch = self.read_short()
+      err_msg = self.__parse_string()
+      if err_msg is None:
+        raise Exception("Handshake Error [result=%d, version=%d.%d.%d]" \
+            % (res, serv_ver_major, serv_ver_minor, serv_ver_patch))
+      else:
+        raise Exception("Handshake Error [result=%d, version=%d.%d.%d, \
+            message='%s']" % (
+                res,
+                serv_ver_major,
+                serv_ver_minor,
+                serv_ver_patch,
+                err_msg
+            ))
+
+  def get_cache_type(self, cache_name):
+    """Collects type information about objects stored in the specified
+       cache.
+    """
+    cache_name_hash = self.__java_hash_code(cache_name)
+    self.write_int(25)        # Message length
+    self.write_short(2000)      # Operation code
+    self.write_long(0)        # Request ID
+    self.write_int(cache_name_hash) # Cache name
+    self.write_byte(0)        # Flags
+    self.write_byte(101)      # Filter (NULL)
+    self.write_int(1)         # Cursor page size
+    self.write_int(-1)        # Partition to query
+    self.write_byte(0)        # Local flag
+
+    result_length = self.read_int()
+    self.read_long()          # Request id
+    status = self.read_int()
+
+    if status != 0:
+      err_msg = self.__parse_string()
+      if err_msg is None:
+        raise Exception("Scan Query Error [status=%s]" % status)
+      else:
+        raise Exception("Scan Query Error [status=%s, message='%s']" \
+            % (status, err_msg))
+
+    self.read_long()          # Cursor id
+    row_count = self.read_int()
+
+    if row_count == 0:
+      raise Exception("Scan Query returned empty result, so it's \
+        impossible to derive the cache type")
+
+    payload = DataBuffer(self.read_data(result_length - 25))
+
+    self.read_byte()          # Next page
+
+    res = TypeTreeNode("root", 0, [
+        self.__collect_types("key", payload),
+        self.__collect_types("val", payload)
+    ], [0, 1])
+
+    return res
+
+  def __java_hash_code(self, s):
+    """Computes hash code of the specified string using Java code."""
+    h = 0
+    for c in s:
+      h = (31 * h + ord(c)) & 0xFFFFFFFF
+    return ((h + 0x80000000) & 0xFFFFFFFF) - 0x80000000
+
+  def __collect_types(self, field_name, data):
+    """Extracts type information from the specified object."""
+    type_id = data.read_byte()
+
+    # Byte scalar.
+    if type_id == 1:
+      data.skip(1)
+      return TypeTreeNode(field_name, type_id)
+
+    # Short scalar.
+    if type_id == 2:
+      data.skip(2)
+      return TypeTreeNode(field_name, type_id)
+
+    # Integer scalar.
+    if type_id == 3:
+      data.skip(4)
+      return TypeTreeNode(field_name, type_id)
+
+    # Long scalar.
+    if type_id == 4:
+      data.skip(8)
+      return TypeTreeNode(field_name, type_id)
+
+    # Float scalar.
+    if type_id == 5:
+      data.skip(4)
+      return TypeTreeNode(field_name, type_id)
+
+    # Double scalar.
+    if type_id == 6:
+      data.skip(8)
+      return TypeTreeNode(field_name, type_id)
+
+    # Char scalar.
+    if type_id == 7:
+      data.skip(2)
+      return TypeTreeNode(field_name, type_id)
+
+    # Bool scalar.
+    if type_id == 8:
+      data.skip(1)
+      return TypeTreeNode(field_name, type_id)
+
+    # String scalar.
+    if type_id == 9:
+      length = data.read_int()
+      data.skip(length)
+      return TypeTreeNode(field_name, type_id)
+
+    # UUID scalar.
+    if type_id == 10:
+      data.skip(16)
+      return TypeTreeNode(field_name, type_id)
+
+    # Date scalar.
+    if type_id == 11:
+      data.skip(8)
+      return TypeTreeNode(field_name, type_id)
+
+    # Byte array.
+    if type_id == 12:
+      length = data.read_int()
+      data.skip(length)
+      return TypeTreeNode(field_name, type_id)
+
+    # Short array.
+    if type_id == 13:
+      length = data.read_int()
+      data.skip(length * 2)
+      return TypeTreeNode(field_name, type_id)
+
+    # Integer array.
+    if type_id == 14:
+      length = data.read_int()
+      data.skip(length * 4)
+      return TypeTreeNode(field_name, type_id)
+
+    # Long array.
+    if type_id == 15:
+      length = data.read_int()
+      data.skip(length * 8)
+      return TypeTreeNode(field_name, type_id)
+
+    # Float array.
+    if type_id == 16:
+      length = data.read_int()
+      data.skip(length * 4)
+      return TypeTreeNode(field_name, type_id)
+
+    # Double array.
+    if type_id == 17:
+      length = data.read_int()
+      data.skip(length * 8)
+      return TypeTreeNode(field_name, type_id)
+
+    # Char array.
+    if type_id == 18:
+      length = data.read_int()
+      data.skip(length * 2)
+      return TypeTreeNode(field_name, type_id)
+
+    # Bool array.
+    if type_id == 19:
+      length = data.read_int()
+      data.skip(length)
+      return TypeTreeNode(field_name, type_id)
+
+    # String array.
+    if type_id == 20:
+      length = data.read_int()
+      for _ in range(length):
+        header = data.read_byte()
+        if header == 9:
+          str_length = data.read_int()
+          data.skip(str_length)
+        elif header == 101:
+          pass
+        else:
+          raise Exception("Unknown binary type when expected string \
+            [type_id=%d]" % header)
+      return TypeTreeNode(field_name, type_id)
+
+    # UUID array.
+    if type_id == 21:
+      length = data.read_int()
+      data.skip(length * 16) # TODO: support NULL values.
+      return TypeTreeNode(field_name, type_id)
+
+    # Date array.
+    if type_id == 22:
+      length = data.read_int()
+      data.skip(length * 8)
+      return TypeTreeNode(field_name, type_id)
+
+    # Wrapped Binary Object.
+    if type_id == 27:
+      length = data.read_int()
+      inner_data = data.read_data(length)
+      data.read_int()   # Offset
+      return self.__collect_types(field_name, DataBuffer(inner_data))
+
+    # Complex Object.
+    if type_id == 103:
+      data.read_byte()  # Object version
+      data.read_short() # Object flags
+      obj_type_id = data.read_int()
+      data.read_int()   # Object hash code
+      obj_length = data.read_int()
+      data.read_int()   # Object schema id
+      obj_schema_offset = data.read_int()
+
+      obj_type = self.__get_type(obj_type_id)
+      children = []
+
+      for obj_field in obj_type.fields:
+        child = self.__collect_types(obj_field.field_name, data)
+        children.append(child)
+
+      children_sorted = sorted(children, key=lambda child: child.name)
+      permutation = [children_sorted.index(child) for child in children]
+      children = children_sorted
+
+      data.skip(obj_length - obj_schema_offset)
+
+      return TypeTreeNode(field_name, type_id, children, permutation)
+
+    raise Exception("Unknown binary type [type_id=%d]" % type_id)
+
+  def __get_type(self, type_id):
+    """Queries Apache Ignite information about type by type id."""
+    self.write_int(14)      # Message length
+    self.write_short(3002)  # Operation code
+    self.write_long(0)      # Request ID
+    self.write_int(type_id) # Type ID
+
+    self.read_int()         # Result length
+    self.read_long()        # Request id
+    status = self.read_int()
+
+    if status != 0:
+      err_msg = self.__parse_string()
+      if err_msg is None:
+        raise Exception("Get Binary Type Error [status=%d, message='%s']" \
+            % (status, err_msg))
+      else:
+        raise Exception("Get Binary Type Error [status=%d]" % status)
+
+    binary_type_exists = self.read_byte()
+
+    if binary_type_exists == 0:
+      raise Exception("Binary type not found [type_id=%d] " % type_id)
+
+    binary_type_id = self.read_int()
+    binary_type_name = self.__parse_string()
+    self.__parse_string()   # Affinity field name
+
+    fields = []
+    for _ in range(self.read_int()):
+      field_name = self.__parse_string()
+      field_type_id = self.read_int()
+      field_id = self.read_int()
+
+      field = BinaryField(field_name, field_type_id, field_id)
+      fields.append(field)
+
+    is_enum = self.read_byte()
+    if is_enum == 1:
+      raise Exception("Enum fields are not supported yet")
+
+    schema_cnt = self.read_int()
+    for _ in range(schema_cnt):
+      self.read_int()       # Schema id
+      field_cnt = self.read_int()
+      self.skip(field_cnt * 4)
+
+    return BinaryType(binary_type_id, binary_type_name, fields)
+
+  def __parse_string(self):
+    """Parses string."""
+    header = self.read_byte()
+    if header == 9:
+      length = self.read_int()
+      return self.read_data(length).decode("utf-8")
+    if header == 101:
+      return None
+    raise Exception("Unknown binary type when expected string [type_id=%d]" \
+        % header)
+
+class IgniteDataset(Dataset):
+  """Apache Ignite is a memory-centric distributed database, caching, and
+     processing platform for transactional, analytical, and streaming workloads,
+     delivering in-memory speeds at petabyte scale. This contrib package
+     contains an integration between Apache Ignite and TensorFlow. The
+     integration is based on tf.data from TensorFlow side and Binary Client
+     Protocol from Apache Ignite side. It allows to use Apache Ignite as a
+     datasource for neural network training, inference and all other
+     computations supported by TensorFlow. Ignite Dataset is based on Apache
+     Ignite Binary Client Protocol.
+  """
+
+  def __init__(self, cache_name, host="localhost", port=10800, local=False,\
+    part=-1, page_size=100, username=None, password=None, certfile=None,\
+    keyfile=None, cert_password=None):
+    """Create a IgniteDataset.
+
+    Args:
+      cache_name: Cache name to be used as datasource.
+      host: Apache Ignite Thin Client host to be connected.
+      port: Apache Ignite Thin Client port to be connected.
+      local: Local flag that defines to query only local data.
+      part: Number of partitions to be queried.
+      page_size: Apache Ignite Thin Client page size.
+      username: Apache Ignite Thin Client authentication username.
+      password: Apache Ignite Thin Client authentication password.
+      certfile: File in PEM format containing the certificate as well as
+        any number of CA certificates needed to establish the certificate’s
+        authenticity.
+      keyfile: File containing the private key (otherwise the private key
+        will be taken from certfile as well).
+      cert_password: Password to be used if the private key is encrypted and a
+        password is necessary.
+    """
+    super(IgniteDataset, self).__init__()
+
+    with IgniteClient(host, port, username, password, certfile, keyfile,\
+        cert_password) as client:
+      client.handshake()
+      self.cache_type = client.get_cache_type(cache_name)
+
+    self.cache_name = ops.convert_to_tensor(cache_name, dtype=dtypes.string,\
+        name="cache_name")
+    self.host = ops.convert_to_tensor(host, dtype=dtypes.string, name="host")
+    self.port = ops.convert_to_tensor(port, dtype=dtypes.int32, name="port")
+    self.local = ops.convert_to_tensor(local, dtype=dtypes.bool, name="local")
+    self.part = ops.convert_to_tensor(part, dtype=dtypes.int32, name="part")
+    self.page_size = ops.convert_to_tensor(page_size, dtype=dtypes.int32,\
+        name="page_size")
+    self.username = ops.convert_to_tensor("" if username is None else username,\
+        dtype=dtypes.string, name="username")
+    self.password = ops.convert_to_tensor("" if password is None else password,\
+        dtype=dtypes.string, name="password")
+    self.certfile = ops.convert_to_tensor("" if certfile is None else certfile,\
+        dtype=dtypes.string, name="certfile")
+    self.keyfile = ops.convert_to_tensor("" if keyfile is None else keyfile,\
+        dtype=dtypes.string, name="keyfile")
+    self.cert_password = ops.convert_to_tensor("" if cert_password is None\
+        else cert_password, dtype=dtypes.string, name="cert_password")
+    self.schema = ops.convert_to_tensor(self.cache_type.to_flat(),\
+        dtype=dtypes.int32, name="schema")
+    self.permutation = ops.convert_to_tensor(self.cache_type.to_permutation(),\
+        dtype=dtypes.int32, name="permutation")
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.ignite_dataset(self.cache_name, self.host,\
+        self.port, self.local, self.part, self.page_size, self.username,\
+        self.password, self.certfile, self.keyfile, self.cert_password,\
+        self.schema, self.permutation)
+
+  @property
+  def output_classes(self):
+    return self.cache_type.to_output_classes()
+
+  @property
+  def output_shapes(self):
+    return self.cache_type.to_output_shapes()
+
+  @property
+  def output_types(self):
+    return self.cache_type.to_output_types()
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py b/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py
new file mode 100644
index 0000000000..8115bda85b
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py
@@ -0,0 +1,25 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Python helper for loading Ignite ops and kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+_dataset_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("../../_dataset_ops.so"))
diff --git a/tensorflow/contrib/ignite/python/tests/bin/start-plain.sh b/tensorflow/contrib/ignite/python/tests/bin/start-plain.sh
new file mode 100755
index 0000000000..f4607ce8ad
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/bin/start-plain.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+nohup apache-ignite-fabric/bin/ignite.sh /data/config/ignite-config-plain.xml & 
+sleep 5 # Wait Apache Ignite to be started
+
+./apache-ignite-fabric/bin/sqlline.sh \
+-u "jdbc:ignite:thin://127.0.0.1/" \
+--run=/data/sql/init.sql
+
+tail -f nohup.out
diff --git a/tensorflow/contrib/ignite/python/tests/bin/start-ssl-auth.sh b/tensorflow/contrib/ignite/python/tests/bin/start-ssl-auth.sh
new file mode 100755
index 0000000000..dde1162816
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/bin/start-ssl-auth.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+nohup apache-ignite-fabric/bin/ignite.sh /data/config/ignite-config-ssl-auth.xml & 
+sleep 5 # Wait Apache Ignite to be started
+
+./apache-ignite-fabric/bin/sqlline.sh -u "jdbc:ignite:thin://127.0.0.1/?\
+sslMode=require&\
+sslClientCertificateKeyStoreUrl=/data/keystore/client.jks&\
+sslClientCertificateKeyStorePassword=123456&\
+sslTrustAll=true&\
+username=ignite&\
+password=ignite" --run=/data/sql/init.sql
+
+tail -f nohup.out
diff --git a/tensorflow/contrib/ignite/python/tests/bin/start-ssl.sh b/tensorflow/contrib/ignite/python/tests/bin/start-ssl.sh
new file mode 100755
index 0000000000..58b40b2738
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/bin/start-ssl.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+nohup apache-ignite-fabric/bin/ignite.sh /data/config/ignite-config-ssl.xml & 
+sleep 5 # Wait Apache Ignite to be started
+
+./apache-ignite-fabric/bin/sqlline.sh -u "jdbc:ignite:thin://127.0.0.1/?\
+sslMode=require&\
+sslClientCertificateKeyStoreUrl=/data/keystore/client.jks&\
+sslClientCertificateKeyStorePassword=123456&\
+sslTrustAll=true" --run=/data/sql/init.sql --verbose=true
+
+tail -f nohup.out
diff --git a/tensorflow/contrib/ignite/python/tests/config/ignite-config-plain.xml b/tensorflow/contrib/ignite/python/tests/config/ignite-config-plain.xml
new file mode 100644
index 0000000000..d900174a8a
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/config/ignite-config-plain.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<beans xmlns="http://www.springframework.org/schema/beans"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+       xmlns:util="http://www.springframework.org/schema/util"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans
+       http://www.springframework.org/schema/beans/spring-beans.xsd
+       http://www.springframework.org/schema/util
+       http://www.springframework.org/schema/util/spring-util.xsd">  
+
+  <bean class="org.apache.ignite.configuration.IgniteConfiguration">
+    <property name="discoverySpi">
+      <bean class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi">
+        <property name="ipFinder">
+          <bean class="org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder">
+            <property name="addresses">
+              <list>
+                <value>127.0.0.1</value>
+              </list>
+            </property>
+          </bean>
+        </property>
+      </bean>
+    </property>
+  </bean>
+
+</beans>
diff --git a/tensorflow/contrib/ignite/python/tests/config/ignite-config-ssl-auth.xml b/tensorflow/contrib/ignite/python/tests/config/ignite-config-ssl-auth.xml
new file mode 100644
index 0000000000..8e001b28ab
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/config/ignite-config-ssl-auth.xml
@@ -0,0 +1,59 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+<beans xmlns="http://www.springframework.org/schema/beans"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+       xmlns:util="http://www.springframework.org/schema/util"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans
+       http://www.springframework.org/schema/beans/spring-beans.xsd
+       http://www.springframework.org/schema/util
+       http://www.springframework.org/schema/util/spring-util.xsd">  
+
+  <bean id="client-connector-configuration" 
+        class="org.apache.ignite.configuration.ClientConnectorConfiguration">
+    <property name="sslClientAuth" value="true" />
+    <property name="sslEnabled" value="true" />
+    <property name="useIgniteSslContextFactory" value="true" />
+  </bean>
+
+  <bean id="ssl-context-factory" 
+        class="org.apache.ignite.ssl.SslContextFactory">
+    <property name="keyStoreFilePath" value="/data/keystore/server.jks"/>
+    <property name="keyStorePassword" value="123456"/>
+    <property name="trustStoreFilePath" value="/data/keystore/trust.jks"/>
+    <property name="trustStorePassword" value="123456"/>
+  </bean>
+
+  <bean id="ignite-configuration" 
+        class="org.apache.ignite.configuration.IgniteConfiguration">
+    <property name="clientConnectorConfiguration" 
+              ref="client-connector-configuration" />
+    <property name="sslContextFactory" ref="ssl-context-factory" />
+    <property name="discoverySpi">
+      <bean class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi">
+        <property name="ipFinder">
+          <bean class="org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder">
+            <property name="addresses">
+              <list>
+                <value>127.0.0.1</value>
+              </list>
+            </property>
+          </bean>
+        </property>
+      </bean>
+    </property>
+  </bean>
+
+</beans>
diff --git a/tensorflow/contrib/ignite/python/tests/config/ignite-config-ssl.xml b/tensorflow/contrib/ignite/python/tests/config/ignite-config-ssl.xml
new file mode 100644
index 0000000000..42d480c114
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/config/ignite-config-ssl.xml
@@ -0,0 +1,59 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+<beans xmlns="http://www.springframework.org/schema/beans"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+       xmlns:util="http://www.springframework.org/schema/util"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans
+       http://www.springframework.org/schema/beans/spring-beans.xsd
+       http://www.springframework.org/schema/util
+       http://www.springframework.org/schema/util/spring-util.xsd">  
+
+  <bean id="client-connector-configuration" 
+        class="org.apache.ignite.configuration.ClientConnectorConfiguration">
+    <property name="sslClientAuth" value="false" />
+    <property name="sslEnabled" value="true" />
+    <property name="useIgniteSslContextFactory" value="true" />
+  </bean>
+
+  <bean id="ssl-context-factory" 
+        class="org.apache.ignite.ssl.SslContextFactory">
+    <property name="keyStoreFilePath" value="/data/keystore/server.jks"/>
+    <property name="keyStorePassword" value="123456"/>
+    <property name="trustStoreFilePath" value="/data/keystore/trust.jks"/>
+    <property name="trustStorePassword" value="123456"/>
+  </bean>
+
+  <bean id="ignite-configuration" 
+        class="org.apache.ignite.configuration.IgniteConfiguration">
+    <property name="clientConnectorConfiguration" 
+              ref="client-connector-configuration" />
+    <property name="sslContextFactory" ref="ssl-context-factory" />
+    <property name="discoverySpi">
+      <bean class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi">
+        <property name="ipFinder">
+          <bean class="org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder">
+            <property name="addresses">
+              <list>
+                <value>127.0.0.1</value>
+              </list>
+            </property>
+          </bean>
+        </property>
+      </bean>
+    </property>
+  </bean>
+
+</beans>
diff --git a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
new file mode 100644
index 0000000000..933e62b804
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
@@ -0,0 +1,77 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for IgniteDataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tensorflow as tf
+from tensorflow.contrib.ignite import IgniteDataset
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+class IgniteDatasetTest(test.TestCase):
+  """The Apache Ignite servers have to setup before the test and tear down
+     after the test manually. The docker engine has to be installed.
+
+     To setup Apache Ignite servers:
+     $ bash start_ignite.sh
+
+     To tear down Apache Ignite servers:
+     $ bash stop_ignite.sh
+  """
+
+  def test_ignite_dataset_with_plain_client(self):
+    ds = IgniteDataset(cache_name="SQL_PUBLIC_TEST_CACHE", port=42300)
+    self.__check_dataset(ds)
+
+  def test_ignite_dataset_with_ssl_client(self):
+    ds = IgniteDataset(cache_name="SQL_PUBLIC_TEST_CACHE", port=42301,\
+      certfile=os.path.dirname(os.path.realpath(__file__)) +\
+      "/keystore/client.pem", cert_password="123456")
+    self.__check_dataset(ds)
+
+  def test_ignite_dataset_with_ssl_client_and_auth(self):
+    ds = IgniteDataset(cache_name="SQL_PUBLIC_TEST_CACHE", port=42302,\
+      certfile=os.path.dirname(os.path.realpath(__file__)) +\
+      "/keystore/client.pem", cert_password="123456",\
+      username="ignite", password="ignite")
+    self.__check_dataset(ds)
+
+  def __check_dataset(self, dataset):
+    """Checks that dataset provids correct data.
+    """
+    self.assertEquals(tf.int64, dataset.output_types['key'])
+    self.assertEquals(tf.string, dataset.output_types['val']['NAME'])
+    self.assertEquals(tf.int64, dataset.output_types['val']['VAL'])
+
+    it = dataset.make_one_shot_iterator()
+    ne = it.get_next()
+
+    with tf.Session() as sess:
+      rows = [sess.run(ne), sess.run(ne), sess.run(ne)]
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(ne)
+
+    self.assertEquals({'key': 1, 'val': {'NAME': b'TEST1', 'VAL': 42}},\
+      rows[0])
+    self.assertEquals({'key': 2, 'val': {'NAME': b'TEST2', 'VAL': 43}},\
+      rows[1])
+    self.assertEquals({'key': 3, 'val': {'NAME': b'TEST3', 'VAL': 44}},\
+      rows[2])
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/ignite/python/tests/keystore/client.jks b/tensorflow/contrib/ignite/python/tests/keystore/client.jks
new file mode 100644
index 0000000000000000000000000000000000000000..1875c71b605253603eb63e446da8f07cd84f64a0
GIT binary patch
literal 3232
zcmezO_TO6u1_mZ5W@KPX&dE&8D`8+@G{5kwMTdcbX^%k@(+&eZHZE;8MixdbCP79<
zRtA<PrX_ZfJ6RuW=xI&a{Juu@TNk6n?t`_bmfw-w$HcMcfVITRQcFv&@Iarl*<JD%
zPl<oY&|+<wSv`NR+56Xv&-#Qm88yu?ROmRuI&VG0@+&M4a;zqusMk3raOz(P!;3eo
zjH?daoM820?wU85i#gkRrpXoO2P+juJhy9}c<_JjPF|^4j=Os5H<N^OHP<^b+<TrO
zwRUkf_l|#jkAE#)_;yFzeCeZp%l2N`X`66{`N+%rn`75l1h4%R^ZE7*{$BfMZQ%v8
zyNaet-|9Gb!SpMGnb?B`Q$l@r#{Otm{<9)K{_6KG1H%B{XQq$VDA*iN%sN~8z^6b{
zAiw6ZiC<!HLc;k~>Yq8p|C_4X`71ir_3D+J-^_h0{`v;Jq^I7?=cj)By2Gv~;-!V;
zrk^*tG^Q9QMw#R_-+6sBwsZISYYa0FeehS&sNA)_h$$$<F3ePXLu>xItuMbc#jm^U
z)EsGM7v48t=yRQpg+|uW%p2uNTMy;l<&ooQ-L|Fm+-LC}Ql7ti^n{IXo#DN-aQ4pg
z#|pK-#~2r{+<*SvmVdL<Pdc=QA8HWRh;?&4;!`}&bP0RxPZxE@ACiydU3=CV{}b+F
zQSjFCsonj(D*sag+pkG%Q^St4+~Ys~_r&F`%kEbgNZe0)x_tBTZ<Tkd=BGW+<xx`D
zQ+4RMy8B(7^Y1o17kFZ}ynr!GU15&qw~IeF?-aXh8@0$~(kqjdZ_fnWd~;!zI3HK~
zJhj5@EP@WVop(1z{{Jee!dP%g{giCLCclr5A9;Q8iT-TzCeW~~?a`^zZFL+c4kWzq
zU;8-hcCPNpzJnh=uKwCK)1#2*?3%?z&1ZuJkEiy^#N2M-@L}($nk@S&;YnWm8MeEN
zm#1t$Gc#bqrK9GT(=@hJ*0;~N{f7DL(oXj^A-|uT&RRC@VTAngkk*Ud;B?Izp=WAf
z$-uyL*Pw~%ra=>v(*kBDMkXdk6$4&2POUbNw(q=*jI68-2IYp_2Apinp)72|OhKWB
zJO*4K4u>#va6l4_#}45M8t{YUxP&>}i!xK(6EpK*3b=)tT~f<nCNT>$B_|rn8OXqG
z;$#%#Pfm15FUm|wPt45IOU}<VkQ3)MG%_$VurxF@FflZb66ZBW<{A{jjKk(If==N#
z;03vrSs0qT4V4TOkewq8$@HjBYMhT8I*hCg%#FPa292Fejg1VK*kz9?F+b=R5I$u(
zscP2kvk&h_^!FZo{-b#Fms5A&i7O?&F%rEc6sEhjMEtM9skgI#p1tvl$*JARsg~8A
z^Y9*hOP>1uEr#pj7x=Nuzv6iE!us=ylaaxU{}zcI(6gN!b!PABw|X+mmI_84U%E|>
z{mQW=_S;Qr`2HyF@K#y!w@;mknUR5UakW7uIJRVkS(pqM40PBy6WTl&+kQAP^0LT_
z$@&+h<^>1)D7dHQr4}WYq^2l1rxulDre!84mZTcUvN4Cs^0A1qh&cZ>+QiDUB3!Cv
z*QCz@@``+0T3;B*gQS&NBn-qFM7I9ms7T4H+0xmqwdiX68%NFUt%1m4!wL$UMuzTN
zKavj~N?{J}lzRGR$J_m%x8=zk+ue9hbM_;R=>eq;wo5iLG6bg=|6T5UzizMm4#|@0
zC3nU6D?6K?{L1|Lkcsc;orbHcP1{x#s@#Ya5UGy8w0-x%FE!^&Uhc18xbL`FJu7bh
zEc2IVEfii)u)od6#&v3~%l?{W-yQ1i>pxA|_=W+N&Y67;nwUKenwU00(m5x?%60vB
zcixWkLv!^YNbd$_rly8Qu=H*M<qn*~@TK~#cdK}mXP-E?-DF{X(U%zE_cH6lWA2`N
z=2ddUd2LwT;*&SI);KU!RlloSeL3^0TIZ>S!M-c39-n=bx7Ci%v0sW&eC`$Zt&A6J
zx2%mi{^N~-V|3b??)9@L{4(U4*Vx*;m6w(6+2y~S{15JDb~|jS)~V&%s_R*>%7H6j
zac`Q_$Ni90-+0HM@g`F8#hN-BS3=Tn<3fYRdDv6%!s;cJ0~XFrNNJJ><Wyl6Rs&{6
z#{Xz3l)15yp>5CIRj);LinkxSv+h!V3q#BbzJF1B6oSQ$a90?dH&#Df;JoYLpG4o4
z_vH`lZk%qiR`UI=vL~V2U#z=$!t$%x!4s7Yn#sG}#XXJ0Jr*}aeBxPNc>0L`(;ur`
z7niRTx7FTo)X_L*nZ@%!-U~AR;=(>@))7-*?Qk|v-SSgPPEf}ZT!%0vCqmjp&4Hu&
pj0zRrXg*^OWubk;VkqUaBW_g*^E|kHw{Q7<!r*Cx;ya&dW&r&#dOrXF

literal 0
HcmV?d00001

diff --git a/tensorflow/contrib/ignite/python/tests/keystore/client.pem b/tensorflow/contrib/ignite/python/tests/keystore/client.pem
new file mode 100644
index 0000000000..a71a87e0bb
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/keystore/client.pem
@@ -0,0 +1,69 @@
+Bag Attributes
+    friendlyName: client
+    localKeyID: 54 69 6D 65 20 31 33 33 39 32 33 39 38 35 39 34 34 36 
+Key Attributes: <No Attributes>
+-----BEGIN RSA PRIVATE KEY-----
+Proc-Type: 4,ENCRYPTED
+DEK-Info: DES-EDE3-CBC,CE61EDD98349D0C7
+
+Kzl16sj8R7YUXPCEZCqCrY4LSAjiKCRFNOagEehvN9Jpswcz4JbatoFmvVvOCgBF
+7kkeCaALhfM5a+46uynZ1sOOFUOn8fUFgguN3lLInWfm6vTuXDPslg0/tRNI0YqW
+ujfxyzrm1/k4RX0oLzRE1jZr69VZsBmZndkz9nkz3anWKLE7X/VIFV6U/N6YNPch
+BG1Fxpt/HtM9p3B5wNDSjCVaeNP1ROKe3APLRY6k+SppTuntHV5q9Ni82r1l3ahU
+zf2QvocSy9MLh+bGusJGHyJJAGuwPHm6ytPwbXGHn5xe4HPIno28j9kN7EL1ZoUs
+q0PhipAkFrGIM4zg6nAwVdzY5iGySDQ3fWpz2MkrKMDRftBwA3o/M321NBUW9/2X
+l+XmjXcJd0dEOslGxveb6UXLL2YvYszjQXRR4dCV/40bMJL3umRhVSay0NteoXfY
+82rQchm2NHKOiDfB4RpD8JJtVQeDSMXc9TH5y2Ua7FZND60JXtFpdnfCVfVZuBJm
+yBafyIsXR7EQzLG4z28Dvp4fs42A3JkF+e9Aq6Y6MmYA1wsvIKKT9HKEifqKmbgG
+4E9WOZn5IWi4ZJ44VAwN/uBGrLm//3OjByeB9y8vszNbyY8dQ8x5XqnF/IzIvgqc
+uKA8xuLAkTFmgRGQ/lmMDR+iMhet5dCtg9Orb9tYVL55JAb/OfsCX0LTJ3Y2RmIx
+CaFpkUP7KKYD+69ajnFCxvfGnGxyBkf+JeuDYIZVFklVT9SUtL9RJh26jUdvHt2A
+LQerBl8UCkVbPxsxYjdawvxuBNTD6tSRykM8zwtWcvIubp+gxE7png==
+-----END RSA PRIVATE KEY-----
+Bag Attributes
+    friendlyName: 1.2.840.113549.1.9.1=#1613636c69656e7440677269646761696e2e636f6d,CN=client,OU=Dev,O=GridGain,ST=SPb,C=RU
+    localKeyID: 54 69 6D 65 20 31 33 33 39 32 33 39 38 35 39 34 34 36 
+subject=/C=RU/ST=SPb/O=GridGain/OU=Dev/CN=client/emailAddress=client@gridgain.com
+issuer=/C=RU/ST=SPb/L=SPb/O=GridGain/OU=Dev/CN=ca/emailAddress=ca@gridgain.com
+-----BEGIN CERTIFICATE-----
+MIIC2TCCAkKgAwIBAgIBJDANBgkqhkiG9w0BAQUFADB3MQswCQYDVQQGEwJSVTEM
+MAoGA1UECBMDU1BiMQwwCgYDVQQHEwNTUGIxETAPBgNVBAoTCEdyaWRHYWluMQww
+CgYDVQQLEwNEZXYxCzAJBgNVBAMTAmNhMR4wHAYJKoZIhvcNAQkBFg9jYUBncmlk
+Z2Fpbi5jb20wHhcNMTIwNjA5MTEwNDE3WhcNMzIwNjA5MTEwNDE3WjBxMQswCQYD
+VQQGEwJSVTEMMAoGA1UECBMDU1BiMREwDwYDVQQKEwhHcmlkR2FpbjEMMAoGA1UE
+CxMDRGV2MQ8wDQYDVQQDEwZjbGllbnQxIjAgBgkqhkiG9w0BCQEWE2NsaWVudEBn
+cmlkZ2Fpbi5jb20wgZ8wDQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBANIHHcYiA+CP
+EBPKNZJ6mtvN4d9Yj43B5/hzs/TK3e4XImLsMhXaElYtrXQX/SDK7Zv5zdj6AkKH
+QkJ9BT8Jw7wvOQx/v4Qxrl+gTgcf6gjk6DvzqMlZUwH+ohbALj2TWsy9y+0uHKal
+EVrHpbYeB9TGpD+3NHwO/CG4SySk/Y4nAgMBAAGjezB5MAkGA1UdEwQCMAAwLAYJ
+YIZIAYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1Ud
+DgQWBBRD/TKyBQyoVxqEupLzUB8hDrSF6DAfBgNVHSMEGDAWgBS1+Ah4ZG58tImL
+KqLVX+xBKbeFUTANBgkqhkiG9w0BAQUFAAOBgQCL2vhjwcJkA1OJGuXsuO2/87Zu
+HMa7gc4pm+Iol1B1gD2ksQEAU2dz/adD3369H7gZdHuk3RYPeYmD5Ppp9eECDsXc
+gNWrNYaqcSTYWRAUe1/St7vB9HzPdOm/eADfQaMnal6fmjfpzTgg65A/2w4GCsqt
+RL98pvdAft8v5WSx7A==
+-----END CERTIFICATE-----
+Bag Attributes
+    friendlyName: 1.2.840.113549.1.9.1=#160f636140677269646761696e2e636f6d,CN=ca,OU=Dev,O=GridGain,L=SPb,ST=SPb,C=RU
+subject=/C=RU/ST=SPb/L=SPb/O=GridGain/OU=Dev/CN=ca/emailAddress=ca@gridgain.com
+issuer=/C=RU/ST=SPb/L=SPb/O=GridGain/OU=Dev/CN=ca/emailAddress=ca@gridgain.com
+-----BEGIN CERTIFICATE-----
+MIIDSTCCArKgAwIBAgIJAKmuj925215OMA0GCSqGSIb3DQEBBQUAMHcxCzAJBgNV
+BAYTAlJVMQwwCgYDVQQIEwNTUGIxDDAKBgNVBAcTA1NQYjERMA8GA1UEChMIR3Jp
+ZEdhaW4xDDAKBgNVBAsTA0RldjELMAkGA1UEAxMCY2ExHjAcBgkqhkiG9w0BCQEW
+D2NhQGdyaWRnYWluLmNvbTAeFw0xMjA2MDkwNjU1MTJaFw0zMjA2MDQwNjU1MTJa
+MHcxCzAJBgNVBAYTAlJVMQwwCgYDVQQIEwNTUGIxDDAKBgNVBAcTA1NQYjERMA8G
+A1UEChMIR3JpZEdhaW4xDDAKBgNVBAsTA0RldjELMAkGA1UEAxMCY2ExHjAcBgkq
+hkiG9w0BCQEWD2NhQGdyaWRnYWluLmNvbTCBnzANBgkqhkiG9w0BAQEFAAOBjQAw
+gYkCgYEAtd16DCObyM63NKF/cvRcE+8cr1dc3c7mSnTEQ61WfqPJ2QqsQAB6e+5+
+q9Np1SaJyqFTTag6483ibrU+DkGPGgEXndRHtQHQPbStWsf47DBBW2bMi6+bkPox
+Cp6BhYO1DQUG5tP9CQ/g32mLQLB7LH0KtS1JcKpAClCjjWZC8b8CAwEAAaOB3DCB
+2TAdBgNVHQ4EFgQUtfgIeGRufLSJiyqi1V/sQSm3hVEwgakGA1UdIwSBoTCBnoAU
+tfgIeGRufLSJiyqi1V/sQSm3hVGhe6R5MHcxCzAJBgNVBAYTAlJVMQwwCgYDVQQI
+EwNTUGIxDDAKBgNVBAcTA1NQYjERMA8GA1UEChMIR3JpZEdhaW4xDDAKBgNVBAsT
+A0RldjELMAkGA1UEAxMCY2ExHjAcBgkqhkiG9w0BCQEWD2NhQGdyaWRnYWluLmNv
+bYIJAKmuj925215OMAwGA1UdEwQFMAMBAf8wDQYJKoZIhvcNAQEFBQADgYEAhrzd
+qusVLHO3wtyu0o+EAFyoDv5avCBTFsQLeDDPMyfDcEO6wfxhTanfH8C7gZc0rRnv
+2nbkVbfortHIOfU2wch5gClju0cXSTIXSKOAWPIMp3HLxC/l+KpFo3epFz0rsMVB
+M1ymOOdRDdAcTxcTTGY7WJXquEM3ZbT5Gh4RLDk=
+-----END CERTIFICATE-----
diff --git a/tensorflow/contrib/ignite/python/tests/keystore/server.jks b/tensorflow/contrib/ignite/python/tests/keystore/server.jks
new file mode 100644
index 0000000000000000000000000000000000000000..006ececc31118aa18ddb6e4ec27d002e5e11646c
GIT binary patch
literal 3230
zcmezO_TO6u1_mZLW=c+EU|=-A@M=~y18anysevT}1GBF|6SJp56Vs*z%uI|-Oq>iW
z*Y)4sc{|R}fR~L^tIebBJ1-+6D=ULRxgoa!CmVAp3!5-gP^ck~0T+nFA<P^ckObqg
zLwJG){2)0lVGj4A%oO*;%siL^ZeeDZ)H0Y!%)(5`iH33pGH{zX8O8XM6CKivGE>qM
zGxPM4^K%X4#CZ*k49pBH4a`hU4UMA2d5u9_6DW7!9M(7=IgA)t8JHV;84MabnHn1z
zw%)DcQJ#I`+;)?N^+jJ|gx|}o50ANf?wMD~5$CmGb&F5l<XYpvP*we|ZuRBNt7@I6
z76$vSuzGy<QQlTNKF5A3M)A2<+_y4bu-&pY>iCa029D8bXS&zVp76_%YhGh(^HyF~
zwr7|Ba`Hd8pV{rOp<1VwYpbqj!72x?fW^IOP9OI(F*7nSE^fSI(0J28mW?@7mXAe@
zMP%y_j*67Lnk}8(T8pm6zj4&u-Wq7oxDq6<%+k2fpmAOURy7N&msAc|I5#1sNgj|>
zg;`h)m>C)WBZodKDD)c{+V<RC^;%S?c>AF{>n`=TFvP6j`xmuGAz17PcZI=uWA(!Y
z&bto&N%UQLU;e=E#_1+&CEwpFdlI_+#kz|pEWer^JW<)8nY`Ow+|x+hV{t>oC!Xbn
zr;q4A{jth*arsJdTkQ=;9gSm_Sv(Kqy&&T+F6@(L9WnLQ4rlY!EkC8?1a&MK7#J8C
z*osq&%2JCUd2Mzj0|V18gC?eJ27GK>+H8z0j9N^BjEt-dEKN*{JR;+brdQtCd04*1
z$YblrpVOXB<ym^M`G-LLJTCnxao6PMarN9H70d42Tl!<QJCo;uPp413*vKWxv@}ug
zogeFuTkqM-(ghhc6e`~BpT5^PYKigDv`*d2hEaDO^v=vQ%@*VoD?4J!;h4Io;ZSk#
z`P6w6S$vxB%J+)t=jxQac%*&o!GpcM-!tDgl<AfmaL6cRa+m+|&6~Q>Dssj0qL-e!
z_3tE^3-va%w|`$L;}Nj<>_c{`U$cx$gv{s4>?kxoKIdBIo;jx!9jbh5SLQifJJrzs
zKg>i-)9GBrhpt-!C&V3HuCAQCd+ibRlH+zhVzLkahd3JDTd_wa_k`B9sMrWek?13{
z*dBacz4IgAeb)@<DTjZ#B{%+0+ITE!Q-Ppme*`C2&?~-`MzL)V0<4=(raczZSdsln
zVe`sX^Gzq*^Y&Z6c3UUDGrQ~lJ4sH%6H8t%)lPXj^-QN<z#3!cz5fo}bt$%=wIrqP
ze~swA+9M3_Ss%@r>M~3AX3(WI_H1kwhYD0)`sg3qynpK@nTnN(CM+8|HvZ!6J^p;z
z70HDklrAs3a-g*4a!r-uh2Qu8U1U*M%*r<}VcnA4fROmk&r_Y<Ot0}u+~R6_-0f;I
zQT@#Vt>szlR%axuOEMR(yjB<ff1Pkedcd4l7Gibz?_1wxE_$2N!|6L=s_4G3D;H%r
zeYhV^m$!Q!H8&}A-lhwm4)Sjf?o(RDp0@s1<Z*)~f!CPMJvuqH#BR&WkDiOYH*Z^O
z`m6X<%&SKA!c2!%f&2?Z-p~J%(AH8Z@#mJ@mcFWAXTELN^S3`$>1N0!)447OT5hin
z`e@h^I(x}V?cc3Y48HPiKl(P^C`fr6_w|pjbFf;;&fi9lD<_^(-g^I5=a%BLPyRf3
zdU0>w&2)9mhk?Jp?O3_5|F@rk_~K+{aP0xBEtu{aG%?*YXkv1L)E11YXnAnJ3JF9l
zVQ6SzVg#=xETLS3LRh7Pt@<KZVeuRAf@(BoVQAhqR5DONuC#<9Ssu0G!dE|B`@3^R
z#`A<V;j`wxcVfPOP*qy#_}k^?ua`X%*DCzZp0eu3ybtwSW|sR@z1*D&Ueyb_v>$#x
zU$5)brMsWC-|mr^9a-qdEyI5}#;Jbot6!hlq^dr@w9ngL^i9Q-xAje)zPeoZQTZ3^
zI{SX>N-Unl-@TrBs?fRG_WGF4H4fa*%CAD|hiZdLaGfqI%)(^AV4%asnb79J*!IJT
zk(WhYOxC|3H7_{WN5MTcFSRJKBsE3BIkl)HGc7YYu_P6#T4`B)<&VueNfCvrDJrI6
z;+q!D*D*4X2Nl)IED{D{*y|Lul*8QE$guv)rbS0jOy~Khd5i0YgcQr&WH!FHLBEbS
zt>f>NJd)IQh2=oPhQ#@NHyz)uGMdNwXx?Y07a!}Fce*DBynQM9JI6`FqT^^1yQE9p
z&0|vfQ<(D9Zo~)i9e#GAtb#Qsng8dHFBKsVC+4kv*LYIX@L62#V&#`f_RBX&U0c1a
z>(PbZn?_qjRA{c@OZB6zBIZyQ>NJaoQmbfAcG9xQoh!B}_ieqmB!8{QbKdug03J1V
AGXMYp

literal 0
HcmV?d00001

diff --git a/tensorflow/contrib/ignite/python/tests/keystore/trust.jks b/tensorflow/contrib/ignite/python/tests/keystore/trust.jks
new file mode 100644
index 0000000000000000000000000000000000000000..a00f1251af72982ddcd42c0274fc7b16e35dbc4c
GIT binary patch
literal 2432
zcmezO_TO6u1_mYu1_ov@&6J$Tz`$sJ;nlo346G4)rUsS_49va;P0XGKO-!2>Ff%bS
zF>x}iT-Seh=j}K@170>xtu~Lg@4SqRtgH+M<%Zk_oNUaYENsF|L7|2`23#NxhcI(+
zKoX3{4&ezJ@Pp*IggM-cGE>|WGxJ~yxP_TrQp;c_F$*&#CmPBb$iQvlWEA61PIO2w
z%1lX5%*@kE&d)WF6X!KFGB7i+G%zzYH8hG6=QRd#O`zO?b6Dei<S=4nWngaXWiV*$
zWNK_=*m}2$M|t*%bK6Z8))#$=5q>YTK0M~`xo2J_N1WG&)h#}GlWUCwLsj*=y49C6
zuc~#PS{UrR!s_waM|oTA_#FGC7{%vaao@^#!FJ2qsN+B07&u0!o#|dbd%`b6u6d2E
z&0Bd{*`8hg%gO)XerC7BhH9N!uC2PB1*;sm0v7kCIepyE#LURRxVZ6-LE}vWSvKZS
zSw0pq7Llz#I4V-|YPNKCYc0AO|He^sduyOU<4TacGE3t^gT{FcSk)}7UQ#(=;oO9j
zCV4<k6=q>IU}j|ej~x1}pwMq*Xxnpl)oW3m;_Zj-th?0T!Vt59?_bm&g<!EG+!Y4r
zjnxkqIPW_6C((E1efa~s8>gGBm3)7z>`CbM7waybu>5Lv@I+;UX7X-#aZe+0kHrlU
zpLmuRo<5@g^v5dK#pNr-ZM8QXbu^AyX7N0b_kxVSxUf%}b;Q(HJDkl^xBQfn6V$N;
z=QFnCoXpg`5=dT~Ujff+Om_{Mm~I+0F*!l<8lwuD4+boV5qZqe(7?pd9G1r{p<IJP
zSeC_>`3Yusegj@mdB7|T4H-iv0|n$PFANDV)NGG0cU@waJ*LF`pkF}vl<B0ZS+~zV
zydTlud+_;>;>}-9-F+vnl=Q|(^p;SV?%ERZzY3?`&i;A!#xEwPb|<G=R(sCFd-N@N
z>i4%8u8Uvb$1eYh<H-x_&nr$w1~dL!Bz8d0c5>92y{F&m$t+tc7<GK<HaYey$ClV{
zH>u(KqqxIcWy#+@bx7{2HmC%bh_b>gOa=@FI&7Q?Z61tmKb#nOS>(lJ{R>j_f`fe&
z+*9*XixNvxQxu$2i%K%nGLsWaQjs#H^IxM)tUN2irCN4P`Wzsy$hW2Sg@HUMp)0dU
z7>Hrdk!UH0xv`O<`__--gNIU>gFB_3zS;41|L1LaGRJl|p3|KDNMm|HX@l*Ojf@Pz
z>BWDSJKwL{E5Adsq<YC+G5*TV<|n^0zdmH*J9?+#>T1)rRfQ@yA_YXM<1cOBeeg@o
z`I49WD;VxOE>_Qqn?K9^<yi}b*Awh-^RaQATI;gEX4!X#y8HT1Q#QVVr2XR5qO#N?
zNZMcEg*)x5qWNHuq<tfJ{Q^$=Mn+LoOZ$+J!AScMVdB&NwZA)8WIRt;6FzJ1dne}m
z2UVq&j=x=Q{(9LXajnAd>?x~m%==KUWoEfg)yv(f;8ne#OZ(yH^Yyw;UAp^O`|Tcy
z*^z~Q+%o)!W1Q;OzWVi<O{(hiOZ&Y2Mc-6Rd0XG)>8s0iAC-TxuCwpAuEgR={N3xB
zrwW~`ZLg2%T;ss~to$lf(|*h1D}QX(Ns1^`O;Iro6W_FGzK#(&X@C8fO^c46n9lQ0
z^A^_)2`QGl$!vUYgMJ-vTF2ijc_gXr3d@0n4T<ylZaThQWi*fV(Y()0FFw{U?{rTN
zc>7ZHcaD>UMaR)3c1f4Go5!T|r!eKI-G~q3JN)cKSp{oOGXKvXUn)W#PRv{TuJNR%
v;j_5f#mX;}?3Ztly0&^-*P{!+H%&8h)e~82@&Dp^^P<ERetXZ{Q569I9;OV&

literal 0
HcmV?d00001

diff --git a/tensorflow/contrib/ignite/python/tests/sql/init.sql b/tensorflow/contrib/ignite/python/tests/sql/init.sql
new file mode 100644
index 0000000000..5a192aef17
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/sql/init.sql
@@ -0,0 +1,20 @@
+-- Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS, 
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+-- ==============================================================================
+
+CREATE TABLE TEST_CACHE (ID LONG PRIMARY KEY, NAME VARCHAR, VAL LONG);
+
+INSERT INTO TEST_CACHE VALUES (1, 'TEST1', 42);
+INSERT INTO TEST_CACHE VALUES (2, 'TEST2', 43);
+INSERT INTO TEST_CACHE VALUES (3, 'TEST3', 44);
diff --git a/tensorflow/contrib/ignite/python/tests/start_ignite.sh b/tensorflow/contrib/ignite/python/tests/start_ignite.sh
new file mode 100755
index 0000000000..fbcf656afd
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/start_ignite.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+IGNITE_VERSION=2.6.0
+SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )"
+
+# Start Apache Ignite with plain client listener.
+docker run -itd --name ignite-plain -p 42300:10800 \
+-v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-plain.sh
+
+# Start Apache Ignite with SSL client listener.
+docker run -itd --name ignite-ssl -p 42301:10800 \
+-v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-ssl.sh
+
+# Start Apache Ignite with SSL client listener with auth.
+docker run -itd --name ignite-ssl-auth -p 42302:10800 \
+-v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-ssl-auth.sh
diff --git a/tensorflow/contrib/ignite/python/tests/stop_ignite.sh b/tensorflow/contrib/ignite/python/tests/stop_ignite.sh
new file mode 100755
index 0000000000..8f03dbd1ed
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/stop_ignite.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+docker rm -f ignite-plain
+docker rm -f ignite-ssl
+docker rm -f ignite-ssl-auth
-- 
GitLab


From 28b0608a8536c287b4084449e36fd42b6f4aed5b Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Fri, 24 Aug 2018 18:15:57 +0300
Subject: [PATCH 028/570] Remove duplicated header from README.md.

---
 tensorflow/contrib/ignite/README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md
index 9054344e94..f2596fc572 100644
--- a/tensorflow/contrib/ignite/README.md
+++ b/tensorflow/contrib/ignite/README.md
@@ -1,4 +1,3 @@
-### Ignite Dataset
 # Ignite Dataset
 
 - [Overview](#overview)
@@ -164,4 +163,4 @@ After that you will be able to work with it following way:
 
 ## Limitations
 
-Presently Ignite Dataset works with assumption that all objects in the cache have the same structure (homogeneous objects) and the cache contains at least one object. Another limitation concerns structured objects, Ignite Dataset does not support UUID, Maps and Object arrays that might be parts of object structures.
\ No newline at end of file
+Presently Ignite Dataset works with assumption that all objects in the cache have the same structure (homogeneous objects) and the cache contains at least one object. Another limitation concerns structured objects, Ignite Dataset does not support UUID, Maps and Object arrays that might be parts of object structures.
-- 
GitLab


From 241c1740ee26b57b7a5fe8f72b9d34f4515af760 Mon Sep 17 00:00:00 2001
From: dmitrievanthony <dmitrievanthony@gmail.com>
Date: Sun, 26 Aug 2018 16:03:04 +0000
Subject: [PATCH 029/570] Update after review: change 'ignite' namespace to
 'tensorflow', rename variables to satisty code style, use pointers instead of
 references.

---
 tensorflow/contrib/ignite/BUILD               |   1 -
 tensorflow/contrib/ignite/__init__.py         |   4 +-
 .../kernels/ignite_binary_object_parser.cc    | 322 +++++++---------
 .../kernels/ignite_binary_object_parser.h     |   9 +-
 .../contrib/ignite/kernels/ignite_client.cc   |  55 ---
 .../contrib/ignite/kernels/ignite_client.h    |  45 ++-
 .../contrib/ignite/kernels/ignite_dataset.cc  | 105 +++--
 .../contrib/ignite/kernels/ignite_dataset.h   |  65 ++--
 .../ignite/kernels/ignite_dataset_iterator.cc | 358 +++++++++---------
 .../ignite/kernels/ignite_dataset_iterator.h  |  80 ++--
 .../ignite/kernels/ignite_dataset_ops.cc      |  10 +-
 .../ignite/kernels/ignite_plain_client.h      |  21 +-
 .../kernels/ignite_plain_client_unix.cc       |  78 ++--
 .../kernels/ignite_plain_client_windows.cc    |  77 ++--
 .../ignite/kernels/ignite_ssl_wrapper.cc      | 107 +++---
 .../ignite/kernels/ignite_ssl_wrapper.h       |  30 +-
 16 files changed, 619 insertions(+), 748 deletions(-)
 delete mode 100644 tensorflow/contrib/ignite/kernels/ignite_client.cc

diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD
index 9f6c666893..b7d40a99f7 100644
--- a/tensorflow/contrib/ignite/BUILD
+++ b/tensorflow/contrib/ignite/BUILD
@@ -40,7 +40,6 @@ cc_library(
     srcs = [
         "kernels/ignite_dataset_ops.cc",
         "kernels/ignite_client.h",
-        "kernels/ignite_client.cc",
         "kernels/ignite_plain_client.h",
         "kernels/ignite_ssl_wrapper.h",
         "kernels/ignite_ssl_wrapper.cc",
diff --git a/tensorflow/contrib/ignite/__init__.py b/tensorflow/contrib/ignite/__init__.py
index 468920a557..b78829d0f4 100644
--- a/tensorflow/contrib/ignite/__init__.py
+++ b/tensorflow/contrib/ignite/__init__.py
@@ -30,9 +30,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.ignite.python.ops.ignite_dataset_ops \
-import IgniteDataset
-
+from tensorflow.contrib.ignite.python.ops.ignite_dataset_ops import IgniteDataset
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc b/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
index bf0ef8766e..9bf4480d2d 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
@@ -15,290 +15,258 @@ limitations under the License.
 
 #include "ignite_binary_object_parser.h"
 
-namespace ignite {
+namespace tensorflow {
 
-tensorflow::Status BinaryObjectParser::Parse(
-    uint8_t*& ptr, std::vector<tensorflow::Tensor>& out_tensors,
-    std::vector<int32_t>& types) {
-  uint8_t object_type_id = *ptr;
-  ptr += 1;
+Status BinaryObjectParser::Parse(uint8_t** ptr,
+                                 std::vector<Tensor>* out_tensors,
+                                 std::vector<int32_t>* types) {
+  uint8_t object_type_id = **ptr;
+  *ptr += 1;
 
   switch (object_type_id) {
     case BYTE: {
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_UINT8, {});
-      tensor.scalar<tensorflow::uint8>()() = *((uint8_t*)ptr);
-      ptr += 1;
-      out_tensors.emplace_back(std::move(tensor));
+      Tensor tensor(cpu_allocator(), DT_UINT8, {});
+      tensor.scalar<uint8>()() = *((uint8_t*)*ptr);
+      *ptr += 1;
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case SHORT: {
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_INT16, {});
-      tensor.scalar<tensorflow::int16>()() = *((int16_t*)ptr);
-      ptr += 2;
-      out_tensors.emplace_back(std::move(tensor));
+      Tensor tensor(cpu_allocator(), DT_INT16, {});
+      tensor.scalar<int16>()() = *((int16_t*)*ptr);
+      *ptr += 2;
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case INT: {
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_INT32, {});
-      tensor.scalar<tensorflow::int32>()() = *((int32_t*)ptr);
-      ptr += 4;
-      out_tensors.emplace_back(std::move(tensor));
+      Tensor tensor(cpu_allocator(), DT_INT32, {});
+      tensor.scalar<int32>()() = *((int32_t*)*ptr);
+      *ptr += 4;
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case LONG: {
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_INT64, {});
-      tensor.scalar<tensorflow::int64>()() = *((int64_t*)ptr);
-      ptr += 8;
-      out_tensors.emplace_back(std::move(tensor));
+      Tensor tensor(cpu_allocator(), DT_INT64, {});
+      tensor.scalar<int64>()() = *((int64_t*)*ptr);
+      *ptr += 8;
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case FLOAT: {
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_FLOAT, {});
-      tensor.scalar<float>()() = *((float*)ptr);
-      ptr += 4;
-      out_tensors.emplace_back(std::move(tensor));
+      Tensor tensor(cpu_allocator(), DT_FLOAT, {});
+      tensor.scalar<float>()() = *((float*)*ptr);
+      *ptr += 4;
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case DOUBLE: {
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_DOUBLE, {});
-      tensor.scalar<double>()() = *((double*)ptr);
-      ptr += 8;
-      out_tensors.emplace_back(std::move(tensor));
+      Tensor tensor(cpu_allocator(), DT_DOUBLE, {});
+      tensor.scalar<double>()() = *((double*)*ptr);
+      *ptr += 8;
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case UCHAR: {
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_UINT16, {});
-      tensor.scalar<tensorflow::uint16>()() = *((uint16_t*)ptr);
-      ptr += 2;
-      out_tensors.emplace_back(std::move(tensor));
+      Tensor tensor(cpu_allocator(), DT_UINT16, {});
+      tensor.scalar<uint16>()() = *((uint16_t*)*ptr);
+      *ptr += 2;
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case BOOL: {
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_BOOL, {});
-      tensor.scalar<bool>()() = *((bool*)ptr);
-      ptr += 1;
-      out_tensors.emplace_back(std::move(tensor));
+      Tensor tensor(cpu_allocator(), DT_BOOL, {});
+      tensor.scalar<bool>()() = *((bool*)*ptr);
+      *ptr += 1;
+      out_tensors->push_back(std::move(tensor));
 
       break;
     }
     case STRING: {
-      int32_t length = *((int32_t*)ptr);
-      ptr += 4;
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_STRING, {});
-      tensor.scalar<std::string>()() = std::string((char*)ptr, length);
-      ptr += length;
-      out_tensors.emplace_back(std::move(tensor));
+      int32_t length = *((int32_t*)*ptr);
+      *ptr += 4;
+      Tensor tensor(cpu_allocator(), DT_STRING, {});
+      tensor.scalar<std::string>()() = std::string((char*)*ptr, length);
+      *ptr += length;
+      out_tensors->push_back(std::move(tensor));
 
       break;
     }
     case DATE: {
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_INT64, {});
-      tensor.scalar<tensorflow::int64>()() = *((int64_t*)ptr);
-      ptr += 8;
-      out_tensors.emplace_back(std::move(tensor));
+      Tensor tensor(cpu_allocator(), DT_INT64, {});
+      tensor.scalar<int64>()() = *((int64_t*)*ptr);
+      *ptr += 8;
+      out_tensors->push_back(std::move(tensor));
 
       break;
     }
     case BYTE_ARR: {
-      int32_t length = *((int32_t*)ptr);
-      ptr += 4;
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_UINT8,
-                                tensorflow::TensorShape({length}));
+      int32_t length = *((int32_t*)*ptr);
+      *ptr += 4;
+      Tensor tensor(cpu_allocator(), DT_UINT8, TensorShape({length}));
 
-      uint8_t* arr = (uint8_t*)ptr;
-      ptr += length;
+      uint8_t* arr = (uint8_t*)*ptr;
+      *ptr += length;
 
-      std::copy_n(arr, length, tensor.flat<tensorflow::uint8>().data());
-      out_tensors.emplace_back(std::move(tensor));
+      std::copy_n(arr, length, tensor.flat<uint8>().data());
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case SHORT_ARR: {
-      int32_t length = *((int32_t*)ptr);
-      ptr += 4;
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_INT16,
-                                tensorflow::TensorShape({length}));
+      int32_t length = *((int32_t*)*ptr);
+      *ptr += 4;
+      Tensor tensor(cpu_allocator(), DT_INT16, TensorShape({length}));
 
-      int16_t* arr = (int16_t*)ptr;
-      ptr += length * 2;
+      int16_t* arr = (int16_t*)*ptr;
+      *ptr += length * 2;
 
-      std::copy_n(arr, length, tensor.flat<tensorflow::int16>().data());
-      out_tensors.emplace_back(std::move(tensor));
+      std::copy_n(arr, length, tensor.flat<int16>().data());
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case INT_ARR: {
-      int32_t length = *((int32_t*)ptr);
-      ptr += 4;
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_INT32,
-                                tensorflow::TensorShape({length}));
+      int32_t length = *((int32_t*)*ptr);
+      *ptr += 4;
+      Tensor tensor(cpu_allocator(), DT_INT32, TensorShape({length}));
 
-      int32_t* arr = (int32_t*)ptr;
-      ptr += length * 4;
+      int32_t* arr = (int32_t*)*ptr;
+      *ptr += length * 4;
 
-      std::copy_n(arr, length, tensor.flat<tensorflow::int32>().data());
-      out_tensors.emplace_back(std::move(tensor));
+      std::copy_n(arr, length, tensor.flat<int32>().data());
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case LONG_ARR: {
-      int32_t length = *((int32_t*)ptr);
-      ptr += 4;
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_INT64,
-                                tensorflow::TensorShape({length}));
+      int32_t length = *((int32_t*)*ptr);
+      *ptr += 4;
+      Tensor tensor(cpu_allocator(), DT_INT64, TensorShape({length}));
 
-      int64_t* arr = (int64_t*)ptr;
-      ptr += length * 8;
+      int64_t* arr = (int64_t*)*ptr;
+      *ptr += length * 8;
 
-      std::copy_n(arr, length, tensor.flat<tensorflow::int64>().data());
-      out_tensors.emplace_back(std::move(tensor));
+      std::copy_n(arr, length, tensor.flat<int64>().data());
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case FLOAT_ARR: {
-      int32_t length = *((int32_t*)ptr);
-      ptr += 4;
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_FLOAT,
-                                tensorflow::TensorShape({length}));
+      int32_t length = *((int32_t*)*ptr);
+      *ptr += 4;
+      Tensor tensor(cpu_allocator(), DT_FLOAT, TensorShape({length}));
 
-      float* arr = (float*)ptr;
-      ptr += 4 * length;
+      float* arr = (float*)*ptr;
+      *ptr += 4 * length;
 
       std::copy_n(arr, length, tensor.flat<float>().data());
-      out_tensors.emplace_back(std::move(tensor));
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case DOUBLE_ARR: {
-      int32_t length = *((int32_t*)ptr);
-      ptr += 4;
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_DOUBLE,
-                                tensorflow::TensorShape({length}));
+      int32_t length = *((int32_t*)*ptr);
+      *ptr += 4;
+      Tensor tensor(cpu_allocator(), DT_DOUBLE, TensorShape({length}));
 
-      double* arr = (double*)ptr;
-      ptr += 8 * length;
+      double* arr = (double*)*ptr;
+      *ptr += 8 * length;
 
       std::copy_n(arr, length, tensor.flat<double>().data());
-      out_tensors.emplace_back(std::move(tensor));
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case UCHAR_ARR: {
-      int32_t length = *((int32_t*)ptr);
-      ptr += 4;
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_UINT16,
-                                tensorflow::TensorShape({length}));
+      int32_t length = *((int32_t*)*ptr);
+      *ptr += 4;
+      Tensor tensor(cpu_allocator(), DT_UINT16, TensorShape({length}));
 
-      uint16_t* arr = (uint16_t*)ptr;
-      ptr += length * 2;
+      uint16_t* arr = (uint16_t*)*ptr;
+      *ptr += length * 2;
 
-      std::copy_n(arr, length, tensor.flat<tensorflow::uint16>().data());
-      out_tensors.emplace_back(std::move(tensor));
+      std::copy_n(arr, length, tensor.flat<uint16>().data());
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case BOOL_ARR: {
-      int32_t length = *((int32_t*)ptr);
-      ptr += 4;
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_BOOL,
-                                tensorflow::TensorShape({length}));
+      int32_t length = *((int32_t*)*ptr);
+      *ptr += 4;
+      Tensor tensor(cpu_allocator(), DT_BOOL, TensorShape({length}));
 
-      bool* arr = (bool*)ptr;
-      ptr += length;
+      bool* arr = (bool*)*ptr;
+      *ptr += length;
 
       std::copy_n(arr, length, tensor.flat<bool>().data());
-      out_tensors.emplace_back(std::move(tensor));
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case STRING_ARR: {
-      int32_t length = *((int32_t*)ptr);
-      ptr += 4;
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_STRING,
-                                tensorflow::TensorShape({length}));
+      int32_t length = *((int32_t*)*ptr);
+      *ptr += 4;
+      Tensor tensor(cpu_allocator(), DT_STRING, TensorShape({length}));
 
       for (int32_t i = 0; i < length; i++) {
-        int32_t str_length = *((int32_t*)ptr);
-        ptr += 4;
-        const int8_t* str = (const int8_t*)ptr;
-        ptr += str_length;
+        int32_t str_length = *((int32_t*)*ptr);
+        *ptr += 4;
+        const int8_t* str = (const int8_t*)*ptr;
+        *ptr += str_length;
         tensor.vec<std::string>()(i) = std::string((char*)str, str_length);
       }
 
-      out_tensors.emplace_back(std::move(tensor));
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case DATE_ARR: {
-      int32_t length = *((int32_t*)ptr);
-      ptr += 4;
-      tensorflow::Tensor tensor(tensorflow::cpu_allocator(),
-                                tensorflow::DT_INT64,
-                                tensorflow::TensorShape({length}));
-      int64_t* arr = (int64_t*)ptr;
-      ptr += length * 8;
-
-      std::copy_n(arr, length, tensor.flat<tensorflow::int64>().data());
-      out_tensors.emplace_back(std::move(tensor));
+      int32_t length = *((int32_t*)*ptr);
+      *ptr += 4;
+      Tensor tensor(cpu_allocator(), DT_INT64, TensorShape({length}));
+      int64_t* arr = (int64_t*)*ptr;
+      *ptr += length * 8;
+
+      std::copy_n(arr, length, tensor.flat<int64>().data());
+      out_tensors->push_back(std::move(tensor));
       break;
     }
     case WRAPPED_OBJ: {
-      int32_t byte_arr_size = *((int32_t*)ptr);
-      ptr += 4;
+      int32_t byte_arr_size = *((int32_t*)*ptr);
+      *ptr += 4;
 
-      tensorflow::Status status = Parse(ptr, out_tensors, types);
-      if (!status.ok()) return status;
+      TF_RETURN_IF_ERROR(Parse(ptr, out_tensors, types));
 
-      int32_t offset = *((int32_t*)ptr);
-      ptr += 4;
+      int32_t offset = *((int32_t*)*ptr);
+      *ptr += 4;
 
       break;
     }
     case COMPLEX_OBJ: {
-      uint8_t version = *ptr;
-      ptr += 1;
-      int16_t flags = *((int16_t*)ptr);  // USER_TYPE = 1, HAS_SCHEMA = 2
-      ptr += 2;
-      int32_t type_id = *((int32_t*)ptr);
-      ptr += 4;
-      int32_t hash_code = *((int32_t*)ptr);
-      ptr += 4;
-      int32_t length = *((int32_t*)ptr);
-      ptr += 4;
-      int32_t schema_id = *((int32_t*)ptr);
-      ptr += 4;
-      int32_t schema_offset = *((int32_t*)ptr);
-      ptr += 4;
-
-      uint8_t* end = ptr + schema_offset - 24;
+      uint8_t version = **ptr;
+      *ptr += 1;
+      int16_t flags = *((int16_t*)*ptr);  // USER_TYPE = 1, HAS_SCHEMA = 2
+      *ptr += 2;
+      int32_t type_id = *((int32_t*)*ptr);
+      *ptr += 4;
+      int32_t hash_code = *((int32_t*)*ptr);
+      *ptr += 4;
+      int32_t length = *((int32_t*)*ptr);
+      *ptr += 4;
+      int32_t schema_id = *((int32_t*)*ptr);
+      *ptr += 4;
+      int32_t schema_offset = *((int32_t*)*ptr);
+      *ptr += 4;
+
+      uint8_t* end = *ptr + schema_offset - 24;
       int32_t i = 0;
-      while (ptr < end) {
+      while (*ptr < end) {
         i++;
-        tensorflow::Status status = Parse(ptr, out_tensors, types);
-        if (!status.ok()) return status;
+        TF_RETURN_IF_ERROR(Parse(ptr, out_tensors, types));
       }
 
-      ptr += (length - schema_offset);
+      *ptr += (length - schema_offset);
 
       break;
     }
     default: {
-      return tensorflow::errors::Internal("Unknowd binary type (type id ",
-                                          (int)object_type_id, ")");
+      return errors::Internal("Unknowd binary type (type id ",
+                              (int)object_type_id, ")");
     }
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-}  // namespace ignite
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h b/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
index 1e845cbc56..9accbd796f 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
@@ -17,13 +17,12 @@ limitations under the License.
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/lib/core/status.h"
 
-namespace ignite {
+namespace tensorflow {
 
 class BinaryObjectParser {
  public:
-  tensorflow::Status Parse(uint8_t*& ptr,
-                           std::vector<tensorflow::Tensor>& out_tensors,
-                           std::vector<int32_t>& types);
+  Status Parse(uint8_t** ptr, std::vector<Tensor>* out_tensors,
+               std::vector<int32_t>* types);
 };
 
 enum ObjectType {
@@ -51,4 +50,4 @@ enum ObjectType {
   COMPLEX_OBJ = 103
 };
 
-}  // namespace ignite
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_client.cc b/tensorflow/contrib/ignite/kernels/ignite_client.cc
deleted file mode 100644
index 5a8eddb944..0000000000
--- a/tensorflow/contrib/ignite/kernels/ignite_client.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef IGNITE_CLIENT_H
-#define IGNITE_CLIENT_H
-#include "ignite_client.h"
-#endif
-
-namespace ignite {
-
-tensorflow::Status Client::ReadByte(uint8_t& data) {
-  return ReadData((uint8_t*)&data, 1);
-}
-
-tensorflow::Status Client::ReadShort(int16_t& data) {
-  return ReadData((uint8_t*)&data, 2);
-}
-
-tensorflow::Status Client::ReadInt(int32_t& data) {
-  return ReadData((uint8_t*)&data, 4);
-}
-
-tensorflow::Status Client::ReadLong(int64_t& data) {
-  return ReadData((uint8_t*)&data, 8);
-}
-
-tensorflow::Status Client::WriteByte(uint8_t data) {
-  return WriteData((uint8_t*)&data, 1);
-}
-
-tensorflow::Status Client::WriteShort(int16_t data) {
-  return WriteData((uint8_t*)&data, 2);
-}
-
-tensorflow::Status Client::WriteInt(int32_t data) {
-  return WriteData((uint8_t*)&data, 4);
-}
-
-tensorflow::Status Client::WriteLong(int64_t data) {
-  return WriteData((uint8_t*)&data, 8);
-}
-
-}  // namespace ignite
diff --git a/tensorflow/contrib/ignite/kernels/ignite_client.h b/tensorflow/contrib/ignite/kernels/ignite_client.h
index 64e28d75f0..944b3fe184 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_client.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_client.h
@@ -13,28 +13,43 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
+
 #include "tensorflow/core/lib/core/status.h"
 
-namespace ignite {
+namespace tensorflow {
 
 class Client {
  public:
-  virtual tensorflow::Status Connect() = 0;
-  virtual tensorflow::Status Disconnect() = 0;
+  virtual Status Connect() = 0;
+  virtual Status Disconnect() = 0;
   virtual bool IsConnected() = 0;
   virtual int GetSocketDescriptor() = 0;
+  virtual Status ReadData(uint8_t* buf, int32_t length) = 0;
+  virtual Status WriteData(uint8_t* buf, int32_t length) = 0;
+
+  inline Status ReadByte(uint8_t* data) { return ReadData(data, 1); }
+
+  inline Status ReadShort(int16_t* data) { return ReadData((uint8_t*)data, 2); }
+
+  inline Status ReadInt(int32_t* data) { return ReadData((uint8_t*)data, 4); }
+
+  inline Status ReadLong(int64_t* data) { return ReadData((uint8_t*)data, 8); }
 
-  virtual tensorflow::Status ReadByte(uint8_t& data);
-  virtual tensorflow::Status ReadShort(int16_t& data);
-  virtual tensorflow::Status ReadInt(int32_t& data);
-  virtual tensorflow::Status ReadLong(int64_t& data);
-  virtual tensorflow::Status ReadData(uint8_t* buf, int32_t length) = 0;
-
-  virtual tensorflow::Status WriteByte(uint8_t data);
-  virtual tensorflow::Status WriteShort(int16_t data);
-  virtual tensorflow::Status WriteInt(int32_t data);
-  virtual tensorflow::Status WriteLong(int64_t data);
-  virtual tensorflow::Status WriteData(uint8_t* buf, int32_t length) = 0;
+  inline Status WriteByte(uint8_t data) { return WriteData(&data, 1); }
+
+  inline Status WriteShort(int16_t data) {
+    return WriteData((uint8_t*)&data, 2);
+  }
+
+  inline Status WriteInt(int32_t data) { return WriteData((uint8_t*)&data, 4); }
+
+  inline Status WriteLong(int64_t data) {
+    return WriteData((uint8_t*)&data, 8);
+  }
 };
 
-}  // namespace ignite
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc b/tensorflow/contrib/ignite/kernels/ignite_dataset.cc
index a9bf26955b..f25f8a5b18 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset.cc
@@ -16,31 +16,29 @@ limitations under the License.
 #include "ignite_dataset_iterator.h"
 #include "tensorflow/core/platform/logging.h"
 
-namespace ignite {
+namespace tensorflow {
 
-IgniteDataset::IgniteDataset(tensorflow::OpKernelContext* ctx,
-                             std::string cache_name, std::string host,
-                             tensorflow::int32 port, bool local,
-                             tensorflow::int32 part,
-                             tensorflow::int32 page_size, std::string username,
+IgniteDataset::IgniteDataset(OpKernelContext* ctx, std::string cache_name,
+                             std::string host, int32 port, bool local,
+                             int32 part, int32 page_size, std::string username,
                              std::string password, std::string certfile,
                              std::string keyfile, std::string cert_password,
-                             std::vector<tensorflow::int32> schema,
-                             std::vector<tensorflow::int32> permutation)
-    : DatasetBase(tensorflow::DatasetContext(ctx)),
-      cache_name(cache_name),
-      host(host),
-      port(port),
-      local(local),
-      part(part),
-      page_size(page_size),
-      username(username),
-      password(password),
-      certfile(certfile),
-      keyfile(keyfile),
-      cert_password(cert_password),
-      schema(schema),
-      permutation(permutation) {
+                             std::vector<int32> schema,
+                             std::vector<int32> permutation)
+    : DatasetBase(DatasetContext(ctx)),
+      cache_name_(cache_name),
+      host_(host),
+      port_(port),
+      local_(local),
+      part_(part),
+      page_size_(page_size),
+      username_(username),
+      password_(password),
+      certfile_(certfile),
+      keyfile_(keyfile),
+      cert_password_(cert_password),
+      schema_(schema),
+      permutation_(permutation) {
   SchemaToTypes();
   SchemaToShapes();
 
@@ -53,55 +51,50 @@ IgniteDataset::IgniteDataset(tensorflow::OpKernelContext* ctx,
 
 IgniteDataset::~IgniteDataset() { LOG(INFO) << "Ignite Dataset destroyed"; }
 
-std::unique_ptr<tensorflow::IteratorBase> IgniteDataset::MakeIteratorInternal(
-    const tensorflow::string& prefix) const {
-  return std::unique_ptr<tensorflow::IteratorBase>(new IgniteDatasetIterator(
-      {this, tensorflow::strings::StrCat(prefix, "::Ignite")}, this->host,
-      this->port, this->cache_name, this->local, this->part, this->page_size,
-      this->username, this->password, this->certfile, this->keyfile,
-      this->cert_password, this->schema, this->permutation));
+std::unique_ptr<IteratorBase> IgniteDataset::MakeIteratorInternal(
+    const string& prefix) const {
+  return std::unique_ptr<IteratorBase>(new IgniteDatasetIterator(
+      {this, strings::StrCat(prefix, "::Ignite")}, this->host_, this->port_,
+      this->cache_name_, this->local_, this->part_, this->page_size_,
+      this->username_, this->password_, this->certfile_, this->keyfile_,
+      this->cert_password_, this->schema_, this->permutation_));
 }
 
-const tensorflow::DataTypeVector& IgniteDataset::output_dtypes() const {
-  return dtypes;
-}
+const DataTypeVector& IgniteDataset::output_dtypes() const { return dtypes_; }
 
-const std::vector<tensorflow::PartialTensorShape>&
-IgniteDataset::output_shapes() const {
-  return shapes;
+const std::vector<PartialTensorShape>& IgniteDataset::output_shapes() const {
+  return shapes_;
 }
 
-tensorflow::string IgniteDataset::DebugString() const {
-  return "IgniteDatasetOp::Dataset";
-}
+string IgniteDataset::DebugString() const { return "IgniteDatasetOp::Dataset"; }
 
-tensorflow::Status IgniteDataset::AsGraphDefInternal(
-    tensorflow::SerializationContext* ctx, DatasetGraphDefBuilder* b,
-    tensorflow::Node** output) const {
-  return tensorflow::errors::Unimplemented(
+Status IgniteDataset::AsGraphDefInternal(SerializationContext* ctx,
+                                         DatasetGraphDefBuilder* b,
+                                         Node** output) const {
+  return errors::Unimplemented(
       "IgniteDataset does not support 'AsGraphDefInternal'");
 }
 
 void IgniteDataset::SchemaToTypes() {
-  for (auto e : schema) {
+  for (auto e : schema_) {
     if (e == BYTE || e == BYTE_ARR) {
-      dtypes.push_back(tensorflow::DT_UINT8);
+      dtypes_.push_back(DT_UINT8);
     } else if (e == SHORT || e == SHORT_ARR) {
-      dtypes.push_back(tensorflow::DT_INT16);
+      dtypes_.push_back(DT_INT16);
     } else if (e == INT || e == INT_ARR) {
-      dtypes.push_back(tensorflow::DT_INT32);
+      dtypes_.push_back(DT_INT32);
     } else if (e == LONG || e == LONG_ARR) {
-      dtypes.push_back(tensorflow::DT_INT64);
+      dtypes_.push_back(DT_INT64);
     } else if (e == FLOAT || e == FLOAT_ARR) {
-      dtypes.push_back(tensorflow::DT_FLOAT);
+      dtypes_.push_back(DT_FLOAT);
     } else if (e == DOUBLE || e == DOUBLE_ARR) {
-      dtypes.push_back(tensorflow::DT_DOUBLE);
+      dtypes_.push_back(DT_DOUBLE);
     } else if (e == UCHAR || e == UCHAR_ARR) {
-      dtypes.push_back(tensorflow::DT_UINT8);
+      dtypes_.push_back(DT_UINT8);
     } else if (e == BOOL || e == BOOL_ARR) {
-      dtypes.push_back(tensorflow::DT_BOOL);
+      dtypes_.push_back(DT_BOOL);
     } else if (e == STRING || e == STRING_ARR) {
-      dtypes.push_back(tensorflow::DT_STRING);
+      dtypes_.push_back(DT_STRING);
     } else {
       LOG(ERROR) << "Unexpected type in schema [type_id=" << e << "]";
     }
@@ -109,15 +102,15 @@ void IgniteDataset::SchemaToTypes() {
 }
 
 void IgniteDataset::SchemaToShapes() {
-  for (auto e : schema) {
+  for (auto e : schema_) {
     if (e >= 1 && e < 10) {
-      shapes.push_back(tensorflow::PartialTensorShape({}));
+      shapes_.push_back(PartialTensorShape({}));
     } else if (e >= 12 && e < 21) {
-      shapes.push_back(tensorflow::PartialTensorShape({-1}));
+      shapes_.push_back(PartialTensorShape({-1}));
     } else {
       LOG(ERROR) << "Unexpected type in schema [type_id=" << e << "]";
     }
   }
 }
 
-}  // namespace ignite
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.h b/tensorflow/contrib/ignite/kernels/ignite_dataset.h
index 2120dfd342..d3fec5910b 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset.h
@@ -15,51 +15,48 @@ limitations under the License.
 
 #include "tensorflow/core/framework/dataset.h"
 
-namespace ignite {
+namespace tensorflow {
 
-class IgniteDataset : public tensorflow::DatasetBase {
+class IgniteDataset : public DatasetBase {
  public:
-  IgniteDataset(tensorflow::OpKernelContext* ctx, std::string cache_name,
-                std::string host, tensorflow::int32 port, bool local,
-                tensorflow::int32 part, tensorflow::int32 page_size,
+  IgniteDataset(OpKernelContext* ctx, std::string cache_name, std::string host,
+                int32 port, bool local, int32 part, int32 page_size,
                 std::string username, std::string password,
                 std::string certfile, std::string keyfile,
-                std::string cert_password,
-                std::vector<tensorflow::int32> schema,
-                std::vector<tensorflow::int32> permutation);
+                std::string cert_password, std::vector<int32> schema,
+                std::vector<int32> permutation);
   ~IgniteDataset();
-  std::unique_ptr<tensorflow::IteratorBase> MakeIteratorInternal(
-      const tensorflow::string& prefix) const override;
-  const tensorflow::DataTypeVector& output_dtypes() const override;
-  const std::vector<tensorflow::PartialTensorShape>& output_shapes()
-      const override;
-  tensorflow::string DebugString() const override;
+  std::unique_ptr<IteratorBase> MakeIteratorInternal(
+      const string& prefix) const override;
+  const DataTypeVector& output_dtypes() const override;
+  const std::vector<PartialTensorShape>& output_shapes() const override;
+  string DebugString() const override;
 
  protected:
-  tensorflow::Status AsGraphDefInternal(
-      tensorflow::SerializationContext* ctx, DatasetGraphDefBuilder* b,
-      tensorflow::Node** output) const override;
+  Status AsGraphDefInternal(SerializationContext* ctx,
+                            DatasetGraphDefBuilder* b,
+                            Node** output) const override;
 
  private:
-  const std::string cache_name;
-  const std::string host;
-  const tensorflow::int32 port;
-  const bool local;
-  const tensorflow::int32 part;
-  const tensorflow::int32 page_size;
-  const std::string username;
-  const std::string password;
-  const std::string certfile;
-  const std::string keyfile;
-  const std::string cert_password;
-  const std::vector<tensorflow::int32> schema;
-  const std::vector<tensorflow::int32> permutation;
-
-  tensorflow::DataTypeVector dtypes;
-  std::vector<tensorflow::PartialTensorShape> shapes;
+  const std::string cache_name_;
+  const std::string host_;
+  const int32 port_;
+  const bool local_;
+  const int32 part_;
+  const int32 page_size_;
+  const std::string username_;
+  const std::string password_;
+  const std::string certfile_;
+  const std::string keyfile_;
+  const std::string cert_password_;
+  const std::vector<int32> schema_;
+  const std::vector<int32> permutation_;
+
+  DataTypeVector dtypes_;
+  std::vector<PartialTensorShape> shapes_;
 
   void SchemaToTypes();
   void SchemaToShapes();
 };
 
-}  // namespace ignite
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc b/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
index 03cc3c1291..1774585ecd 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
@@ -22,270 +22,262 @@ limitations under the License.
 #include <time.h>
 #include <chrono>
 
-namespace ignite {
-
-#define CHECK_STATUS(status) \
-  if (!status.ok()) return status;
+namespace tensorflow {
 
 IgniteDatasetIterator::IgniteDatasetIterator(
-    const Params& params, std::string host, tensorflow::int32 port,
-    std::string cache_name, bool local, tensorflow::int32 part,
-    tensorflow::int32 page_size, std::string username, std::string password,
-    std::string certfile, std::string keyfile, std::string cert_password,
-    std::vector<tensorflow::int32> schema,
-    std::vector<tensorflow::int32> permutation)
-    : tensorflow::DatasetIterator<IgniteDataset>(params),
-      cache_name(cache_name),
-      local(local),
-      part(part),
-      page_size(page_size),
-      username(username),
-      password(password),
-      schema(schema),
-      permutation(permutation),
-      remainder(-1),
-      cursor_id(-1),
-      last_page(false) {
+    const Params& params, std::string host, int32 port, std::string cache_name,
+    bool local, int32 part, int32 page_size, std::string username,
+    std::string password, std::string certfile, std::string keyfile,
+    std::string cert_password, std::vector<int32> schema,
+    std::vector<int32> permutation)
+    : DatasetIterator<IgniteDataset>(params),
+      cache_name_(cache_name),
+      local_(local),
+      part_(part),
+      page_size_(page_size),
+      username_(username),
+      password_(password),
+      schema_(schema),
+      permutation_(permutation),
+      remainder_(-1),
+      cursor_id_(-1),
+      last_page_(false) {
   Client* p_client = new PlainClient(host, port);
 
   if (certfile.empty())
-    client = std::unique_ptr<Client>(p_client);
+    client_ = std::unique_ptr<Client>(p_client);
   else
-    client = std::unique_ptr<Client>(new SslWrapper(
+    client_ = std::unique_ptr<Client>(new SslWrapper(
         std::unique_ptr<Client>(p_client), certfile, keyfile, cert_password));
 
   LOG(INFO) << "Ignite Dataset Iterator created";
 }
 
 IgniteDatasetIterator::~IgniteDatasetIterator() {
-  tensorflow::Status status = CloseConnection();
+  Status status = CloseConnection();
   if (!status.ok()) LOG(ERROR) << status.ToString();
 
   LOG(INFO) << "Ignite Dataset Iterator destroyed";
 }
 
-tensorflow::Status IgniteDatasetIterator::EstablishConnection() {
-  if (!client->IsConnected()) {
-    tensorflow::Status status = client->Connect();
+Status IgniteDatasetIterator::EstablishConnection() {
+  if (!client_->IsConnected()) {
+    Status status = client_->Connect();
     if (!status.ok()) return status;
 
     status = Handshake();
     if (!status.ok()) {
-      tensorflow::Status disconnect_status = client->Disconnect();
+      Status disconnect_status = client_->Disconnect();
       if (!disconnect_status.ok()) LOG(ERROR) << disconnect_status.ToString();
 
       return status;
     }
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status IgniteDatasetIterator::CloseConnection() {
-  if (cursor_id != -1 && !last_page) {
-    tensorflow::Status conn_status = EstablishConnection();
+Status IgniteDatasetIterator::CloseConnection() {
+  if (cursor_id_ != -1 && !last_page_) {
+    Status conn_status = EstablishConnection();
     if (!conn_status.ok()) return conn_status;
 
-    CHECK_STATUS(client->WriteInt(18));  // Message length
-    CHECK_STATUS(
-        client->WriteShort(close_connection_opcode));  // Operation code
-    CHECK_STATUS(client->WriteLong(0));                // Request ID
-    CHECK_STATUS(client->WriteLong(cursor_id));        // Resource ID
+    TF_RETURN_IF_ERROR(client_->WriteInt(18));  // Message length
+    TF_RETURN_IF_ERROR(
+        client_->WriteShort(close_connection_opcode));   // Operation code
+    TF_RETURN_IF_ERROR(client_->WriteLong(0));           // Request ID
+    TF_RETURN_IF_ERROR(client_->WriteLong(cursor_id_));  // Resource ID
 
     int32_t res_len;
-    CHECK_STATUS(client->ReadInt(res_len));
+    TF_RETURN_IF_ERROR(client_->ReadInt(&res_len));
     if (res_len < 12)
-      return tensorflow::errors::Internal(
-          "Close Resource Response is corrupted");
+      return errors::Internal("Close Resource Response is corrupted");
 
     int64_t req_id;
-    CHECK_STATUS(client->ReadLong(req_id));
+    TF_RETURN_IF_ERROR(client_->ReadLong(&req_id));
     int32_t status;
-    CHECK_STATUS(client->ReadInt(status));
+    TF_RETURN_IF_ERROR(client_->ReadInt(&status));
     if (status != 0) {
       uint8_t err_msg_header;
-      CHECK_STATUS(client->ReadByte(err_msg_header));
+      TF_RETURN_IF_ERROR(client_->ReadByte(&err_msg_header));
       if (err_msg_header == string_val) {
         int32_t err_msg_length;
-        CHECK_STATUS(client->ReadInt(err_msg_length));
+        TF_RETURN_IF_ERROR(client_->ReadInt(&err_msg_length));
         uint8_t* err_msg_c = new uint8_t[err_msg_length];
-        CHECK_STATUS(client->ReadData(err_msg_c, err_msg_length));
+        TF_RETURN_IF_ERROR(client_->ReadData(err_msg_c, err_msg_length));
         std::string err_msg((char*)err_msg_c, err_msg_length);
         delete[] err_msg_c;
 
-        return tensorflow::errors::Internal("Close Resource Error [status=",
-                                            status, ", message=", err_msg, "]");
+        return errors::Internal("Close Resource Error [status=", status,
+                                ", message=", err_msg, "]");
       }
-      return tensorflow::errors::Internal("Close Resource Error [status=",
-                                          status, "]");
+      return errors::Internal("Close Resource Error [status=", status, "]");
     }
 
-    LOG(INFO) << "Query Cursor " << cursor_id << " is closed";
+    LOG(INFO) << "Query Cursor " << cursor_id_ << " is closed";
 
-    cursor_id = -1;
+    cursor_id_ = -1;
 
-    return client->Disconnect();
+    return client_->Disconnect();
   } else {
-    LOG(INFO) << "Query Cursor " << cursor_id << " is already closed";
+    LOG(INFO) << "Query Cursor " << cursor_id_ << " is already closed";
   }
 
-  return client->IsConnected() ? client->Disconnect()
-                               : tensorflow::Status::OK();
+  return client_->IsConnected() ? client_->Disconnect() : Status::OK();
 }
 
-tensorflow::Status IgniteDatasetIterator::GetNextInternal(
-    tensorflow::IteratorContext* ctx,
-    std::vector<tensorflow::Tensor>* out_tensors, bool* end_of_sequence) {
-  if (remainder == 0 && last_page) {
-    LOG(INFO) << "Query Cursor " << cursor_id << " is closed";
+Status IgniteDatasetIterator::GetNextInternal(IteratorContext* ctx,
+                                              std::vector<Tensor>* out_tensors,
+                                              bool* end_of_sequence) {
+  if (remainder_ == 0 && last_page_) {
+    LOG(INFO) << "Query Cursor " << cursor_id_ << " is closed";
 
-    cursor_id = -1;
+    cursor_id_ = -1;
     *end_of_sequence = true;
-    return tensorflow::Status::OK();
+    return Status::OK();
   } else {
-    tensorflow::Status status = EstablishConnection();
+    Status status = EstablishConnection();
     if (!status.ok()) return status;
 
-    if (remainder == -1 || remainder == 0) {
-      tensorflow::Status status =
-          remainder == -1 ? ScanQuery() : LoadNextPage();
+    if (remainder_ == -1 || remainder_ == 0) {
+      Status status = remainder_ == -1 ? ScanQuery() : LoadNextPage();
       if (!status.ok()) return status;
     }
 
-    uint8_t* initial_ptr = ptr;
+    uint8_t* initial_ptr = ptr_;
     std::vector<int32_t> types;
-    std::vector<tensorflow::Tensor> tensors;
+    std::vector<Tensor> tensors;
 
-    status = parser.Parse(ptr, tensors, types);  // Parse key
+    status = parser_.Parse(&ptr_, &tensors, &types);  // Parse key
     if (!status.ok()) return status;
 
-    status = parser.Parse(ptr, tensors, types);  // Parse val
+    status = parser_.Parse(&ptr_, &tensors, &types);  // Parse val
     if (!status.ok()) return status;
 
-    remainder -= (ptr - initial_ptr);
+    remainder_ -= (ptr_ - initial_ptr);
 
     out_tensors->resize(tensors.size());
     for (int32_t i = 0; i < tensors.size(); i++)
-      (*out_tensors)[permutation[i]] = std::move(tensors[i]);
+      (*out_tensors)[permutation_[i]] = std::move(tensors[i]);
 
     *end_of_sequence = false;
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
 
   *end_of_sequence = true;
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status IgniteDatasetIterator::SaveInternal(
-    tensorflow::IteratorStateWriter* writer) {
-  return tensorflow::errors::Unimplemented(
+Status IgniteDatasetIterator::SaveInternal(IteratorStateWriter* writer) {
+  return errors::Unimplemented(
       "Iterator for IgniteDataset does not support 'SaveInternal'");
 }
 
-tensorflow::Status IgniteDatasetIterator::RestoreInternal(
-    tensorflow::IteratorContext* ctx, tensorflow::IteratorStateReader* reader) {
-  return tensorflow::errors::Unimplemented(
+Status IgniteDatasetIterator::RestoreInternal(IteratorContext* ctx,
+                                              IteratorStateReader* reader) {
+  return errors::Unimplemented(
       "Iterator for IgniteDataset does not support 'RestoreInternal')");
 }
 
-tensorflow::Status IgniteDatasetIterator::Handshake() {
+Status IgniteDatasetIterator::Handshake() {
   int32_t msg_len = 8;
 
-  if (username.empty())
+  if (username_.empty())
     msg_len += 1;
   else
-    msg_len += 5 + username.length();
+    msg_len += 5 + username_.length();
 
-  if (password.empty())
+  if (password_.empty())
     msg_len += 1;
   else
-    msg_len += 5 + password.length();
-
-  CHECK_STATUS(client->WriteInt(msg_len));
-  CHECK_STATUS(client->WriteByte(1));
-  CHECK_STATUS(client->WriteShort(protocol_major_version));
-  CHECK_STATUS(client->WriteShort(protocol_minor_version));
-  CHECK_STATUS(client->WriteShort(protocol_patch_version));
-  CHECK_STATUS(client->WriteByte(2));
-  if (username.empty()) {
-    CHECK_STATUS(client->WriteByte(null_val));
+    msg_len += 5 + password_.length();
+
+  TF_RETURN_IF_ERROR(client_->WriteInt(msg_len));
+  TF_RETURN_IF_ERROR(client_->WriteByte(1));
+  TF_RETURN_IF_ERROR(client_->WriteShort(protocol_major_version));
+  TF_RETURN_IF_ERROR(client_->WriteShort(protocol_minor_version));
+  TF_RETURN_IF_ERROR(client_->WriteShort(protocol_patch_version));
+  TF_RETURN_IF_ERROR(client_->WriteByte(2));
+  if (username_.empty()) {
+    TF_RETURN_IF_ERROR(client_->WriteByte(null_val));
   } else {
-    CHECK_STATUS(client->WriteByte(string_val));
-    CHECK_STATUS(client->WriteInt(username.length()));
-    CHECK_STATUS(
-        client->WriteData((uint8_t*)username.c_str(), username.length()));
+    TF_RETURN_IF_ERROR(client_->WriteByte(string_val));
+    TF_RETURN_IF_ERROR(client_->WriteInt(username_.length()));
+    TF_RETURN_IF_ERROR(
+        client_->WriteData((uint8_t*)username_.c_str(), username_.length()));
   }
 
-  if (password.empty()) {
-    CHECK_STATUS(client->WriteByte(null_val));
+  if (password_.empty()) {
+    TF_RETURN_IF_ERROR(client_->WriteByte(null_val));
   } else {
-    CHECK_STATUS(client->WriteByte(string_val));
-    CHECK_STATUS(client->WriteInt(password.length()));
-    CHECK_STATUS(
-        client->WriteData((uint8_t*)password.c_str(), password.length()));
+    TF_RETURN_IF_ERROR(client_->WriteByte(string_val));
+    TF_RETURN_IF_ERROR(client_->WriteInt(password_.length()));
+    TF_RETURN_IF_ERROR(
+        client_->WriteData((uint8_t*)password_.c_str(), password_.length()));
   }
 
   int32_t handshake_res_len;
-  CHECK_STATUS(client->ReadInt(handshake_res_len));
+  TF_RETURN_IF_ERROR(client_->ReadInt(&handshake_res_len));
   uint8_t handshake_res;
-  CHECK_STATUS(client->ReadByte(handshake_res));
+  TF_RETURN_IF_ERROR(client_->ReadByte(&handshake_res));
 
   LOG(INFO) << "Handshake length " << handshake_res_len << ", res "
             << (int16_t)handshake_res;
 
   if (handshake_res != 1) {
     int16_t serv_ver_major;
-    CHECK_STATUS(client->ReadShort(serv_ver_major));
+    TF_RETURN_IF_ERROR(client_->ReadShort(&serv_ver_major));
     int16_t serv_ver_minor;
-    CHECK_STATUS(client->ReadShort(serv_ver_minor));
+    TF_RETURN_IF_ERROR(client_->ReadShort(&serv_ver_minor));
     int16_t serv_ver_patch;
-    CHECK_STATUS(client->ReadShort(serv_ver_patch));
+    TF_RETURN_IF_ERROR(client_->ReadShort(&serv_ver_patch));
     uint8_t header;
-    CHECK_STATUS(client->ReadByte(header));
+    TF_RETURN_IF_ERROR(client_->ReadByte(&header));
 
     if (header == string_val) {
       int32_t length;
-      CHECK_STATUS(client->ReadInt(length));
+      TF_RETURN_IF_ERROR(client_->ReadInt(&length));
       uint8_t* err_msg_c = new uint8_t[length];
-      CHECK_STATUS(client->ReadData(err_msg_c, length));
+      TF_RETURN_IF_ERROR(client_->ReadData(err_msg_c, length));
       std::string err_msg((char*)err_msg_c, length);
       delete[] err_msg_c;
 
-      return tensorflow::errors::Internal(
-          "Handshake Error [result=", handshake_res, ", version=",
-          serv_ver_major, ".", serv_ver_minor, ".", serv_ver_patch,
-          ", message='", err_msg, "']");
+      return errors::Internal("Handshake Error [result=", handshake_res,
+                              ", version=", serv_ver_major, ".", serv_ver_minor,
+                              ".", serv_ver_patch, ", message='", err_msg,
+                              "']");
     } else if (header == null_val) {
-      return tensorflow::errors::Internal(
-          "Handshake Error [result=", handshake_res, ", version=",
-          serv_ver_major, ".", serv_ver_minor, ".", serv_ver_patch, "]");
+      return errors::Internal("Handshake Error [result=", handshake_res,
+                              ", version=", serv_ver_major, ".", serv_ver_minor,
+                              ".", serv_ver_patch, "]");
     } else {
-      return tensorflow::errors::Internal(
-          "Handshake Error [result=", handshake_res, ", version=",
-          serv_ver_major, ".", serv_ver_minor, ".", serv_ver_patch, "]");
+      return errors::Internal("Handshake Error [result=", handshake_res,
+                              ", version=", serv_ver_major, ".", serv_ver_minor,
+                              ".", serv_ver_patch, "]");
     }
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status IgniteDatasetIterator::ScanQuery() {
-  CHECK_STATUS(client->WriteInt(25));                        // Message length
-  CHECK_STATUS(client->WriteShort(scan_query_opcode));       // Operation code
-  CHECK_STATUS(client->WriteLong(0));                        // Request ID
-  CHECK_STATUS(client->WriteInt(JavaHashCode(cache_name)));  // Cache name
-  CHECK_STATUS(client->WriteByte(0));                        // Flags
-  CHECK_STATUS(client->WriteByte(null_val));                 // Filter object
-  CHECK_STATUS(client->WriteInt(page_size));                 // Cursor page size
-  CHECK_STATUS(client->WriteInt(part));    // Partition to query
-  CHECK_STATUS(client->WriteByte(local));  // Local flag
+Status IgniteDatasetIterator::ScanQuery() {
+  TF_RETURN_IF_ERROR(client_->WriteInt(25));                   // Message length
+  TF_RETURN_IF_ERROR(client_->WriteShort(scan_query_opcode));  // Operation code
+  TF_RETURN_IF_ERROR(client_->WriteLong(0));                   // Request ID
+  TF_RETURN_IF_ERROR(
+      client_->WriteInt(JavaHashCode(cache_name_)));  // Cache name
+  TF_RETURN_IF_ERROR(client_->WriteByte(0));          // Flags
+  TF_RETURN_IF_ERROR(client_->WriteByte(null_val));   // Filter object
+  TF_RETURN_IF_ERROR(client_->WriteInt(page_size_));  // Cursor page size
+  TF_RETURN_IF_ERROR(client_->WriteInt(part_));       // part_ition to query
+  TF_RETURN_IF_ERROR(client_->WriteByte(local_));     // local_ flag
 
   int64_t wait_start = std::chrono::duration_cast<std::chrono::milliseconds>(
                            std::chrono::system_clock::now().time_since_epoch())
                            .count();
 
   int32_t res_len;
-  CHECK_STATUS(client->ReadInt(res_len));
+  TF_RETURN_IF_ERROR(client_->ReadInt(&res_len));
 
   int64_t wait_stop = std::chrono::duration_cast<std::chrono::milliseconds>(
                           std::chrono::system_clock::now().time_since_epoch())
@@ -293,82 +285,81 @@ tensorflow::Status IgniteDatasetIterator::ScanQuery() {
 
   LOG(INFO) << "Scan Query waited " << (wait_stop - wait_start) << " ms";
 
-  if (res_len < 12)
-    return tensorflow::errors::Internal("Scan Query Response is corrupted");
+  if (res_len < 12) return errors::Internal("Scan Query Response is corrupted");
 
   int64_t req_id;
-  CHECK_STATUS(client->ReadLong(req_id));
+  TF_RETURN_IF_ERROR(client_->ReadLong(&req_id));
 
   int32_t status;
-  CHECK_STATUS(client->ReadInt(status));
+  TF_RETURN_IF_ERROR(client_->ReadInt(&status));
 
   if (status != 0) {
     uint8_t err_msg_header;
-    CHECK_STATUS(client->ReadByte(err_msg_header));
+    TF_RETURN_IF_ERROR(client_->ReadByte(&err_msg_header));
 
     if (err_msg_header == string_val) {
       int32_t err_msg_length;
-      CHECK_STATUS(client->ReadInt(err_msg_length));
+      TF_RETURN_IF_ERROR(client_->ReadInt(&err_msg_length));
 
       uint8_t* err_msg_c = new uint8_t[err_msg_length];
-      CHECK_STATUS(client->ReadData(err_msg_c, err_msg_length));
+      TF_RETURN_IF_ERROR(client_->ReadData(err_msg_c, err_msg_length));
       std::string err_msg((char*)err_msg_c, err_msg_length);
       delete[] err_msg_c;
 
-      return tensorflow::errors::Internal("Scan Query Error [status=", status,
-                                          ", message=", err_msg, "]");
+      return errors::Internal("Scan Query Error [status=", status, ", message=",
+                              err_msg, "]");
     }
-    return tensorflow::errors::Internal("Scan Query Error [status=", status,
-                                        "]");
+    return errors::Internal("Scan Query Error [status=", status, "]");
   }
 
-  CHECK_STATUS(client->ReadLong(cursor_id));
+  TF_RETURN_IF_ERROR(client_->ReadLong(&cursor_id_));
 
-  LOG(INFO) << "Query Cursor " << cursor_id << " is opened";
+  LOG(INFO) << "Query Cursor " << cursor_id_ << " is opened";
 
   int32_t row_cnt;
-  CHECK_STATUS(client->ReadInt(row_cnt));
+  TF_RETURN_IF_ERROR(client_->ReadInt(&row_cnt));
 
-  remainder = res_len - 25;
-  page = std::unique_ptr<uint8_t>(new uint8_t[remainder]);
-  ptr = page.get();
+  remainder_ = res_len - 25;
+  page_ = std::unique_ptr<uint8_t>(new uint8_t[remainder_]);
+  ptr_ = page_.get();
 
   int64_t start = std::chrono::duration_cast<std::chrono::milliseconds>(
                       std::chrono::system_clock::now().time_since_epoch())
                       .count();
 
-  CHECK_STATUS(client->ReadData(ptr, remainder));
+  TF_RETURN_IF_ERROR(client_->ReadData(ptr_, remainder_));
 
   int64_t stop = std::chrono::duration_cast<std::chrono::milliseconds>(
                      std::chrono::system_clock::now().time_since_epoch())
                      .count();
   ;
 
-  double size_in_mb = 1.0 * remainder / 1024 / 1024;
+  double size_in_mb = 1.0 * remainder_ / 1024 / 1024;
   double time_in_s = 1.0 * (stop - start) / 1000;
   LOG(INFO) << "Page size " << size_in_mb << " Mb, time " << time_in_s * 1000
             << " ms download speed " << size_in_mb / time_in_s << " Mb/sec";
 
   uint8_t last_page_b;
-  CHECK_STATUS(client->ReadByte(last_page_b));
+  TF_RETURN_IF_ERROR(client_->ReadByte(&last_page_b));
 
-  last_page = !last_page_b;
+  last_page_ = !last_page_b;
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status IgniteDatasetIterator::LoadNextPage() {
-  CHECK_STATUS(client->WriteInt(18));                       // Message length
-  CHECK_STATUS(client->WriteShort(load_next_page_opcode));  // Operation code
-  CHECK_STATUS(client->WriteLong(0));                       // Request ID
-  CHECK_STATUS(client->WriteLong(cursor_id));               // Cursor ID
+Status IgniteDatasetIterator::LoadNextPage() {
+  TF_RETURN_IF_ERROR(client_->WriteInt(18));  // Message length
+  TF_RETURN_IF_ERROR(
+      client_->WriteShort(load_next_page_opcode));     // Operation code
+  TF_RETURN_IF_ERROR(client_->WriteLong(0));           // Request ID
+  TF_RETURN_IF_ERROR(client_->WriteLong(cursor_id_));  // Cursor ID
 
   int64_t wait_start = std::chrono::duration_cast<std::chrono::milliseconds>(
                            std::chrono::system_clock::now().time_since_epoch())
                            .count();
 
   int32_t res_len;
-  CHECK_STATUS(client->ReadInt(res_len));
+  TF_RETURN_IF_ERROR(client_->ReadInt(&res_len));
 
   int64_t wait_stop = std::chrono::duration_cast<std::chrono::milliseconds>(
                           std::chrono::system_clock::now().time_since_epoch())
@@ -377,66 +368,65 @@ tensorflow::Status IgniteDatasetIterator::LoadNextPage() {
   LOG(INFO) << "Load Next Page waited " << (wait_stop - wait_start) << " ms";
 
   if (res_len < 12)
-    return tensorflow::errors::Internal("Load Next Page Response is corrupted");
+    return errors::Internal("Load Next Page Response is corrupted");
 
   int64_t req_id;
-  CHECK_STATUS(client->ReadLong(req_id));
+  TF_RETURN_IF_ERROR(client_->ReadLong(&req_id));
 
   int32_t status;
-  CHECK_STATUS(client->ReadInt(status));
+  TF_RETURN_IF_ERROR(client_->ReadInt(&status));
 
   if (status != 0) {
     uint8_t err_msg_header;
-    CHECK_STATUS(client->ReadByte(err_msg_header));
+    TF_RETURN_IF_ERROR(client_->ReadByte(&err_msg_header));
 
     if (err_msg_header == string_val) {
       int32_t err_msg_length;
-      CHECK_STATUS(client->ReadInt(err_msg_length));
+      TF_RETURN_IF_ERROR(client_->ReadInt(&err_msg_length));
 
       uint8_t* err_msg_c = new uint8_t[err_msg_length];
-      CHECK_STATUS(client->ReadData(err_msg_c, err_msg_length));
+      TF_RETURN_IF_ERROR(client_->ReadData(err_msg_c, err_msg_length));
       std::string err_msg((char*)err_msg_c, err_msg_length);
       delete[] err_msg_c;
 
-      return tensorflow::errors::Internal("Load Next Page Error [status=",
-                                          status, ", message=", err_msg, "]");
+      return errors::Internal("Load Next Page Error [status=", status,
+                              ", message=", err_msg, "]");
     }
-    return tensorflow::errors::Internal("Load Next Page Error [status=", status,
-                                        "]");
+    return errors::Internal("Load Next Page Error [status=", status, "]");
   }
 
   int32_t row_cnt;
-  CHECK_STATUS(client->ReadInt(row_cnt));
+  TF_RETURN_IF_ERROR(client_->ReadInt(&row_cnt));
 
-  remainder = res_len - 17;
-  page = std::unique_ptr<uint8_t>(new uint8_t[remainder]);
-  ptr = page.get();
+  remainder_ = res_len - 17;
+  page_ = std::unique_ptr<uint8_t>(new uint8_t[remainder_]);
+  ptr_ = page_.get();
 
   int64_t start = std::chrono::duration_cast<std::chrono::milliseconds>(
                       std::chrono::system_clock::now().time_since_epoch())
                       .count();
 
-  CHECK_STATUS(client->ReadData(ptr, remainder));
+  TF_RETURN_IF_ERROR(client_->ReadData(ptr_, remainder_));
 
   int64_t stop = std::chrono::duration_cast<std::chrono::milliseconds>(
                      std::chrono::system_clock::now().time_since_epoch())
                      .count();
   ;
 
-  double size_in_mb = 1.0 * remainder / 1024 / 1024;
+  double size_in_mb = 1.0 * remainder_ / 1024 / 1024;
   double time_in_s = 1.0 * (stop - start) / 1000;
   LOG(INFO) << "Page size " << size_in_mb << " Mb, time " << time_in_s * 1000
             << " ms download speed " << size_in_mb / time_in_s << " Mb/sec";
 
   uint8_t last_page_b;
-  CHECK_STATUS(client->ReadByte(last_page_b));
+  TF_RETURN_IF_ERROR(client_->ReadByte(&last_page_b));
 
-  last_page = !last_page_b;
+  last_page_ = !last_page_b;
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-int32_t IgniteDatasetIterator::JavaHashCode(std::string str) {
+int32_t IgniteDatasetIterator::JavaHashCode(std::string str) const {
   int32_t h = 0;
   for (char& c : str) {
     h = 31 * h + c;
@@ -444,4 +434,4 @@ int32_t IgniteDatasetIterator::JavaHashCode(std::string str) {
   return h;
 }
 
-}  // namespace ignite
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h b/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
index d1df4527f9..5858dbfcb9 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
@@ -14,65 +14,55 @@ limitations under the License.
 ==============================================================================*/
 
 #include "ignite_binary_object_parser.h"
-#include "ignite_dataset.h"
-
-#ifndef IGNITE_CLIENT_H
-#define IGNITE_CLIENT_H
 #include "ignite_client.h"
-#endif
+#include "ignite_dataset.h"
 
-namespace ignite {
+namespace tensorflow {
 
-class IgniteDatasetIterator
-    : public tensorflow::DatasetIterator<IgniteDataset> {
+class IgniteDatasetIterator : public DatasetIterator<IgniteDataset> {
  public:
-  IgniteDatasetIterator(const Params& params, std::string host,
-                        tensorflow::int32 port, std::string cache_name,
-                        bool local, tensorflow::int32 part,
-                        tensorflow::int32 page_size, std::string username,
+  IgniteDatasetIterator(const Params& params, std::string host, int32 port,
+                        std::string cache_name, bool local, int32 part,
+                        int32 page_size, std::string username,
                         std::string password, std::string certfile,
                         std::string keyfile, std::string cert_password,
-                        std::vector<tensorflow::int32> schema,
-                        std::vector<tensorflow::int32> permutation);
+                        std::vector<int32> schema,
+                        std::vector<int32> permutation);
   ~IgniteDatasetIterator();
-  tensorflow::Status GetNextInternal(
-      tensorflow::IteratorContext* ctx,
-      std::vector<tensorflow::Tensor>* out_tensors,
-      bool* end_of_sequence) override;
+  Status GetNextInternal(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                         bool* end_of_sequence) override;
 
  protected:
-  tensorflow::Status SaveInternal(
-      tensorflow::IteratorStateWriter* writer) override;
-  tensorflow::Status RestoreInternal(
-      tensorflow::IteratorContext* ctx,
-      tensorflow::IteratorStateReader* reader) override;
+  Status SaveInternal(IteratorStateWriter* writer) override;
+  Status RestoreInternal(IteratorContext* ctx,
+                         IteratorStateReader* reader) override;
 
  private:
-  std::unique_ptr<Client> client;
-  BinaryObjectParser parser;
+  std::unique_ptr<Client> client_;
+  BinaryObjectParser parser_;
 
-  const std::string cache_name;
-  const bool local;
-  const tensorflow::int32 part;
-  const tensorflow::int32 page_size;
-  const std::string username;
-  const std::string password;
-  const std::vector<tensorflow::int32> schema;
-  const std::vector<tensorflow::int32> permutation;
+  const std::string cache_name_;
+  const bool local_;
+  const int32 part_;
+  const int32 page_size_;
+  const std::string username_;
+  const std::string password_;
+  const std::vector<int32> schema_;
+  const std::vector<int32> permutation_;
 
-  int32_t remainder;
-  int64_t cursor_id;
-  bool last_page;
+  int32_t remainder_;
+  int64_t cursor_id_;
+  bool last_page_;
 
-  std::unique_ptr<uint8_t> page;
-  uint8_t* ptr;
+  std::unique_ptr<uint8_t> page_;
+  uint8_t* ptr_;
 
-  tensorflow::Status EstablishConnection();
-  tensorflow::Status CloseConnection();
-  tensorflow::Status Handshake();
-  tensorflow::Status ScanQuery();
-  tensorflow::Status LoadNextPage();
-  int32_t JavaHashCode(std::string str);
+  Status EstablishConnection();
+  Status CloseConnection();
+  Status Handshake();
+  Status ScanQuery();
+  Status LoadNextPage();
+  int32_t JavaHashCode(std::string str) const;
 };
 
 constexpr uint8_t null_val = 101;
@@ -84,4 +74,4 @@ constexpr int16_t scan_query_opcode = 2000;
 constexpr int16_t load_next_page_opcode = 2001;
 constexpr int16_t close_connection_opcode = 0;
 
-}  // namespace ignite
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc b/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
index 543b5e4afc..89eecf9c14 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/dataset.h"
 
 namespace tensorflow {
+namespace {
 
 class IgniteDatasetOp : public DatasetOpKernel {
  public:
@@ -132,14 +133,15 @@ class IgniteDatasetOp : public DatasetOpKernel {
       permutation.push_back(permutation_tensor->flat<int32>()(i));
     }
 
-    *output = new ignite::IgniteDataset(
-        ctx, cache_name, host, port, local, part, page_size, username, password,
-        certfile, keyfile, cert_password, std::move(schema),
-        std::move(permutation));
+    *output =
+        new IgniteDataset(ctx, cache_name, host, port, local, part, page_size,
+                          username, password, certfile, keyfile, cert_password,
+                          std::move(schema), std::move(permutation));
   }
 };
 
 REGISTER_KERNEL_BUILDER(Name("IgniteDataset").Device(DEVICE_CPU),
                         IgniteDatasetOp);
 
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h b/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
index 5491af68d6..6f417a3cb5 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
@@ -13,31 +13,28 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef IGNITE_CLIENT_H
-#define IGNITE_CLIENT_H
 #include "ignite_client.h"
-#endif
 
 #include <string>
 
-namespace ignite {
+namespace tensorflow {
 
 class PlainClient : public Client {
  public:
   PlainClient(std::string host, int port);
   ~PlainClient();
 
-  virtual tensorflow::Status Connect();
-  virtual tensorflow::Status Disconnect();
+  virtual Status Connect();
+  virtual Status Disconnect();
   virtual bool IsConnected();
   virtual int GetSocketDescriptor();
-  virtual tensorflow::Status ReadData(uint8_t* buf, int32_t length);
-  virtual tensorflow::Status WriteData(uint8_t* buf, int32_t length);
+  virtual Status ReadData(uint8_t* buf, int32_t length);
+  virtual Status WriteData(uint8_t* buf, int32_t length);
 
  private:
-  std::string host;
-  int port;
-  int sock;
+  const std::string host_;
+  const int port_;
+  int sock_;
 };
 
-}  // namespace ignite
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc b/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
index dbfa4f8786..a4c58a9563 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
@@ -29,104 +29,98 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 
-namespace ignite {
+namespace tensorflow {
 
 PlainClient::PlainClient(std::string host, int port)
-    : host(host), port(port), sock(-1) {}
+    : host_(host), port_(port), sock_(-1) {}
 
 PlainClient::~PlainClient() {
   if (IsConnected()) {
-    tensorflow::Status status = Disconnect();
+    Status status = Disconnect();
     if (!status.ok()) LOG(WARNING) << status.ToString();
   }
 }
 
-tensorflow::Status PlainClient::Connect() {
-  if (sock == -1) {
-    sock = socket(AF_INET, SOCK_STREAM, 0);
-    if (sock == -1)
-      return tensorflow::errors::Internal("Failed to create socket");
+Status PlainClient::Connect() {
+  if (sock_ == -1) {
+    sock_ = socket(AF_INET, SOCK_STREAM, 0);
+    if (sock_ == -1) return errors::Internal("Failed to create socket");
   }
 
   sockaddr_in server;
 
-  server.sin_addr.s_addr = inet_addr(host.c_str());
+  server.sin_addr.s_addr = inet_addr(host_.c_str());
   if (server.sin_addr.s_addr == -1) {
     hostent* he;
     in_addr** addr_list;
 
-    if ((he = gethostbyname(host.c_str())) == NULL)
-      return tensorflow::errors::Internal("Failed to resolve hostname \"", host,
-                                          "\"");
+    if ((he = gethostbyname(host_.c_str())) == NULL)
+      return errors::Internal("Failed to resolve hostname \"", host_, "\"");
 
     addr_list = (in_addr**)he->h_addr_list;
     if (addr_list[0] != NULL) server.sin_addr = *addr_list[0];
   }
 
   server.sin_family = AF_INET;
-  server.sin_port = htons(port);
+  server.sin_port = htons(port_);
 
-  if (connect(sock, (sockaddr*)&server, sizeof(server)) < 0)
-    return tensorflow::errors::Internal("Failed to connect to \"", host, ":",
-                                        port, "\"");
+  if (connect(sock_, (sockaddr*)&server, sizeof(server)) < 0)
+    return errors::Internal("Failed to connect to \"", host_, ":", port_, "\"");
 
-  LOG(INFO) << "Connection to \"" << host << ":" << port << "\" established";
+  LOG(INFO) << "Connection to \"" << host_ << ":" << port_ << "\" established";
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status PlainClient::Disconnect() {
-  int close_res = close(sock);
-  sock = -1;
+Status PlainClient::Disconnect() {
+  int close_res = close(sock_);
+  sock_ = -1;
 
-  LOG(INFO) << "Connection to \"" << host << ":" << port << "\" is closed";
+  LOG(INFO) << "Connection to \"" << host_ << ":" << port_ << "\" is closed";
 
-  return close_res == 0 ? tensorflow::Status::OK()
-                        : tensorflow::errors::Internal(
-                              "Failed to correctly close connection");
+  return close_res == 0
+             ? Status::OK()
+             : errors::Internal("Failed to correctly close connection");
 }
 
-bool PlainClient::IsConnected() { return sock != -1; }
+bool PlainClient::IsConnected() { return sock_ != -1; }
 
-int PlainClient::GetSocketDescriptor() { return sock; }
+int PlainClient::GetSocketDescriptor() { return sock_; }
 
-tensorflow::Status PlainClient::ReadData(uint8_t* buf, int32_t length) {
+Status PlainClient::ReadData(uint8_t* buf, int32_t length) {
   int recieved = 0;
 
   while (recieved < length) {
-    int res = recv(sock, buf, length - recieved, 0);
+    int res = recv(sock_, buf, length - recieved, 0);
 
     if (res < 0)
-      return tensorflow::errors::Internal(
-          "Error occured while reading from socket: ", res, ", ",
-          std::string(strerror(errno)));
+      return errors::Internal("Error occured while reading from socket: ", res,
+                              ", ", std::string(strerror(errno)));
 
-    if (res == 0)
-      return tensorflow::errors::Internal("Server closed connection");
+    if (res == 0) return errors::Internal("Server closed connection");
 
     recieved += res;
     buf += res;
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status PlainClient::WriteData(uint8_t* buf, int32_t length) {
+Status PlainClient::WriteData(uint8_t* buf, int32_t length) {
   int sent = 0;
 
   while (sent < length) {
-    int res = send(sock, buf, length - sent, 0);
+    int res = send(sock_, buf, length - sent, 0);
 
     if (res < 0)
-      return tensorflow::errors::Internal(
-          "Error occured while writing into socket: ", res, ", ",
-          std::string(strerror(errno)));
+      return errors::Internal("Error occured while writing into socket: ", res,
+                              ", ", std::string(strerror(errno)));
 
     sent += res;
     buf += res;
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-}  // namespace ignite
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
index f78c9b3627..7ba037f2d2 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
@@ -27,48 +27,45 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 
-namespace ignite {
+namespace tensorflow {
 
 PlainClient::PlainClient(std::string host, int port)
-    : host(host), port(port), sock(INVALID_SOCKET) {}
+    : host_(host), port_(port), sock_(INVALID_SOCKET) {}
 
 PlainClient::~PlainClient() {
   if (IsConnected()) {
-    tensorflow::Status status = Disconnect();
+    Status status = Disconnect();
     if (!status.ok()) LOG(WARNING) << status.ToString();
   }
 }
 
-tensorflow::Status PlainClient::Connect() {
+Status PlainClient::Connect() {
   WSADATA wsaData;
   addrinfo *result = NULL, *ptr = NULL, hints;
 
   int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
-  if (res != 0)
-    return tensorflow::errors::Internal("WSAStartup failed with error: ", res);
+  if (res != 0) return errors::Internal("WSAStartup failed with error: ", res);
 
   ZeroMemory(&hints, sizeof(hints));
   hints.ai_family = AF_UNSPEC;
   hints.ai_socktype = SOCK_STREAM;
   hints.ai_protocol = IPPROTO_TCP;
 
-  res =
-      getaddrinfo(host.c_str(), std::to_string(port).c_str(), &hints, &result);
-  if (res != 0)
-    return tensorflow::errors::Internal("Getaddrinfo failed with error: ", res);
+  res = getaddrinfo(host_.c_str(), std::to_string(port_).c_str(), &hints,
+                    &result);
+  if (res != 0) return errors::Internal("Getaddrinfo failed with error: ", res);
 
   for (ptr = result; ptr != NULL; ptr = ptr->ai_next) {
-    sock = socket(ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol);
-    if (sock == INVALID_SOCKET) {
+    sock_ = socket(ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol);
+    if (sock_ == INVALID_SOCKET) {
       WSACleanup();
-      return tensorflow::errors::Internal("Socket failed with error: ",
-                                          WSAGetLastError());
+      return errors::Internal("Socket failed with error: ", WSAGetLastError());
     }
 
-    res = connect(sock, ptr->ai_addr, (int)ptr->ai_addrlen);
+    res = connect(sock_, ptr->ai_addr, (int)ptr->ai_addrlen);
     if (res == SOCKET_ERROR) {
-      closesocket(sock);
-      sock = INVALID_SOCKET;
+      closesocket(sock_);
+      sock_ = INVALID_SOCKET;
       continue;
     }
 
@@ -77,67 +74,63 @@ tensorflow::Status PlainClient::Connect() {
 
   freeaddrinfo(result);
 
-  if (sock == INVALID_SOCKET) {
+  if (sock_ == INVALID_SOCKET) {
     WSACleanup();
-    return tensorflow::errors::Internal("Unable to connect to server");
+    return errors::Internal("Unable to connect to server");
   }
 
-  LOG(INFO) << "Connection to \"" << host << ":" << port << "\" established";
+  LOG(INFO) << "Connection to \"" << host_ << ":" << port_ << "\" established";
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status PlainClient::Disconnect() {
-  int res = shutdown(sock, SD_SEND);
-  closesocket(sock);
+Status PlainClient::Disconnect() {
+  int res = shutdown(sock_, SD_SEND);
+  closesocket(sock_);
   WSACleanup();
 
   if (res == SOCKET_ERROR)
-    return tensorflow::errors::Internal("Shutdown failed with error: ",
-                                        WSAGetLastError());
+    return errors::Internal("Shutdown failed with error: ", WSAGetLastError());
   else
-    return tensorflow::Status::OK();
+    return Status::OK();
 }
 
-bool PlainClient::IsConnected() { return sock != INVALID_SOCKET; }
+bool PlainClient::IsConnected() { return sock_ != INVALID_SOCKET; }
 
-int PlainClient::GetSocketDescriptor() { return sock; }
+int PlainClient::GetSocketDescriptor() { return sock_; }
 
-tensorflow::Status PlainClient::ReadData(uint8_t *buf, int32_t length) {
+Status PlainClient::ReadData(uint8_t *buf, int32_t length) {
   int recieved = 0;
 
   while (recieved < length) {
-    int res = recv(sock, buf, length - recieved, 0);
+    int res = recv(sock_, buf, length - recieved, 0);
 
     if (res < 0)
-      return tensorflow::errors::Internal(
-          "Error occured while reading from socket: ", res);
+      return errors::Internal("Error occured while reading from socket: ", res);
 
-    if (res == 0)
-      return tensorflow::errors::Internal("Server closed connection");
+    if (res == 0) return errors::Internal("Server closed connection");
 
     recieved += res;
     buf += res;
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status PlainClient::WriteData(uint8_t *buf, int32_t length) {
+Status PlainClient::WriteData(uint8_t *buf, int32_t length) {
   int sent = 0;
 
   while (sent < length) {
-    int res = send(sock, buf, length - sent, 0);
+    int res = send(sock_, buf, length - sent, 0);
 
     if (res < 0)
-      return tensorflow::errors::Internal(
-          "Error occured while writing into socket: ", res);
+      return errors::Internal("Error occured while writing into socket: ", res);
 
     sent += res;
     buf += res;
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-}  // namespace ignite
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
index a1101b91f3..a2bc6b9609 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include <openssl/err.h>
 #include <openssl/ssl.h>
 
-namespace ignite {
+namespace tensorflow {
 
 static int PasswordCb(char *buf, int size, int rwflag, void *password) {
   strncpy(buf, (char *)(password), size);
@@ -31,119 +31,112 @@ static int PasswordCb(char *buf, int size, int rwflag, void *password) {
 
 SslWrapper::SslWrapper(std::shared_ptr<Client> client, std::string certfile,
                        std::string keyfile, std::string cert_password)
-    : client(client),
-      certfile(certfile),
-      keyfile(keyfile),
-      cert_password(cert_password),
-      ctx(NULL) {}
+    : client_(client),
+      certfile_(certfile),
+      keyfile_(keyfile),
+      cert_password_(cert_password),
+      ctx_(NULL) {}
 
 SslWrapper::~SslWrapper() {
   if (IsConnected()) {
-    tensorflow::Status status = Disconnect();
+    Status status = Disconnect();
     if (!status.ok()) LOG(WARNING) << status.ToString();
   }
 
-  if (ctx != NULL) {
-    SSL_CTX_free(ctx);
-    ctx = NULL;
+  if (ctx_ != NULL) {
+    SSL_CTX_free(ctx_);
+    ctx_ = NULL;
   }
 }
 
-tensorflow::Status SslWrapper::InitSslContext() {
+Status SslWrapper::InitSslContext() {
   OpenSSL_add_all_algorithms();
   SSL_load_error_strings();
 
-  ctx = SSL_CTX_new(SSLv23_method());
-  if (ctx == NULL)
-    return tensorflow::errors::Internal("Couldn't create SSL context");
+  ctx_ = SSL_CTX_new(SSLv23_method());
+  if (ctx_ == NULL) return errors::Internal("Couldn't create SSL context");
 
-  SSL_CTX_set_default_passwd_cb(ctx, PasswordCb);
-  SSL_CTX_set_default_passwd_cb_userdata(ctx, (void *)cert_password.c_str());
+  SSL_CTX_set_default_passwd_cb(ctx_, PasswordCb);
+  SSL_CTX_set_default_passwd_cb_userdata(ctx_, (void *)cert_password_.c_str());
 
-  if (SSL_CTX_use_certificate_chain_file(ctx, certfile.c_str()) != 1)
-    return tensorflow::errors::Internal(
-        "Couldn't load cetificate chain (file '", certfile, "')");
+  if (SSL_CTX_use_certificate_chain_file(ctx_, certfile_.c_str()) != 1)
+    return errors::Internal("Couldn't load cetificate chain (file '", certfile_,
+                            "')");
 
-  std::string private_key_file = keyfile.empty() ? certfile : keyfile;
-  if (SSL_CTX_use_PrivateKey_file(ctx, private_key_file.c_str(),
+  std::string private_key_file = keyfile_.empty() ? certfile_ : keyfile_;
+  if (SSL_CTX_use_PrivateKey_file(ctx_, private_key_file.c_str(),
                                   SSL_FILETYPE_PEM) != 1)
-    return tensorflow::errors::Internal("Couldn't load private key (file '",
-                                        private_key_file, "')");
+    return errors::Internal("Couldn't load private key (file '",
+                            private_key_file, "')");
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status SslWrapper::Connect() {
-  tensorflow::Status status;
-
-  if (ctx == NULL) {
-    status = InitSslContext();
-    if (!status.ok()) return status;
+Status SslWrapper::Connect() {
+  if (ctx_ == NULL) {
+    TF_RETURN_IF_ERROR(InitSslContext());
   }
 
-  ssl = SSL_new(ctx);
-  if (ssl == NULL)
-    return tensorflow::errors::Internal("Failed to establish SSL connection");
+  ssl_ = SSL_new(ctx_);
+  if (ssl_ == NULL)
+    return errors::Internal("Failed to establish SSL connection");
 
-  status = client->Connect();
-  if (!status.ok()) return status;
+  TF_RETURN_IF_ERROR(client_->Connect());
 
-  SSL_set_fd(ssl, client->GetSocketDescriptor());
-  if (SSL_connect(ssl) != 1)
-    return tensorflow::errors::Internal("Failed to establish SSL connection");
+  SSL_set_fd(ssl_, client_->GetSocketDescriptor());
+  if (SSL_connect(ssl_) != 1)
+    return errors::Internal("Failed to establish SSL connection");
 
   LOG(INFO) << "SSL connection established";
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status SslWrapper::Disconnect() {
-  SSL_free(ssl);
+Status SslWrapper::Disconnect() {
+  SSL_free(ssl_);
 
   LOG(INFO) << "SSL connection closed";
 
-  return client->Disconnect();
+  return client_->Disconnect();
 }
 
-bool SslWrapper::IsConnected() { return client->IsConnected(); }
+bool SslWrapper::IsConnected() { return client_->IsConnected(); }
 
-int SslWrapper::GetSocketDescriptor() { return client->GetSocketDescriptor(); }
+int SslWrapper::GetSocketDescriptor() { return client_->GetSocketDescriptor(); }
 
-tensorflow::Status SslWrapper::ReadData(uint8_t *buf, int32_t length) {
+Status SslWrapper::ReadData(uint8_t *buf, int32_t length) {
   int recieved = 0;
 
   while (recieved < length) {
-    int res = SSL_read(ssl, buf, length - recieved);
+    int res = SSL_read(ssl_, buf, length - recieved);
 
     if (res < 0)
-      return tensorflow::errors::Internal(
-          "Error occured while reading from SSL socket: ", res);
+      return errors::Internal("Error occured while reading from SSL socket: ",
+                              res);
 
-    if (res == 0)
-      return tensorflow::errors::Internal("Server closed SSL connection");
+    if (res == 0) return errors::Internal("Server closed SSL connection");
 
     recieved += res;
     buf += res;
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status SslWrapper::WriteData(uint8_t *buf, int32_t length) {
+Status SslWrapper::WriteData(uint8_t *buf, int32_t length) {
   int sent = 0;
 
   while (sent < length) {
-    int res = SSL_write(ssl, buf, length - sent);
+    int res = SSL_write(ssl_, buf, length - sent);
 
     if (res < 0)
-      return tensorflow::errors::Internal(
-          "Error occured while writing into socket: ", res);
+      return errors::Internal("Error occured while writing into socket: ", res);
 
     sent += res;
     buf += res;
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-}  // namespace ignite
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
index e0c2a242dc..bbba6cc181 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
@@ -13,15 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef IGNITE_CLIENT_H
-#define IGNITE_CLIENT_H
 #include "ignite_client.h"
-#endif
 
 #include <openssl/ssl.h>
 #include <string>
 
-namespace ignite {
+namespace tensorflow {
 
 class SslWrapper : public Client {
  public:
@@ -29,21 +26,22 @@ class SslWrapper : public Client {
              std::string keyfile, std::string cert_password);
   ~SslWrapper();
 
-  virtual tensorflow::Status Connect();
-  virtual tensorflow::Status Disconnect();
+  virtual Status Connect();
+  virtual Status Disconnect();
   virtual bool IsConnected();
   virtual int GetSocketDescriptor();
-  virtual tensorflow::Status ReadData(uint8_t* buf, int32_t length);
-  virtual tensorflow::Status WriteData(uint8_t* buf, int32_t length);
+  virtual Status ReadData(uint8_t* buf, int32_t length);
+  virtual Status WriteData(uint8_t* buf, int32_t length);
 
  private:
-  std::shared_ptr<Client> client;
-  std::string certfile;
-  std::string keyfile;
-  std::string cert_password;
-  SSL_CTX* ctx;
-  SSL* ssl;
-  tensorflow::Status InitSslContext();
+  std::shared_ptr<Client> client_;
+  std::string certfile_;
+  std::string keyfile_;
+  std::string cert_password_;
+  SSL_CTX* ctx_;
+  SSL* ssl_;
+
+  Status InitSslContext();
 };
 
-}  // namespace ignite
+}  // namespace tensorflow
-- 
GitLab


From 1408a1563e73e69f68c1eb6f34a0976c7c950ad9 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Tue, 28 Aug 2018 11:32:57 +0300
Subject: [PATCH 030/570] Update README.md.

---
 tensorflow/contrib/ignite/README.md | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md
index f2596fc572..8fec4066c4 100644
--- a/tensorflow/contrib/ignite/README.md
+++ b/tensorflow/contrib/ignite/README.md
@@ -13,19 +13,20 @@
 ## Overview
 
 [Apache Ignite](https://ignite.apache.org/) is a memory-centric distributed database, caching, and processing platform for
-transactional, analytical, and streaming workloads, delivering in-memory speeds at petabyte scale. This contrib package contains an integration between Apache Ignite and TensorFlow. The integration is based on [tf.data](https://www.tensorflow.org/api_docs/python/tf/data) from TensorFlow side and [Binary Client Protocol](https://apacheignite.readme.io/v2.6/docs/binary-client-protocol) from Apache Ignite side. It allows to use Apache Ignite as a datasource for neural network training, inference and all other computations supported by TensorFlow. 
+transactional, analytical, and streaming workloads, delivering in-memory speeds at petabyte scale. This contrib package contains an integration between Apache Ignite and TensorFlow. The integration is based on [tf.data](https://www.tensorflow.org/api_docs/python/tf/data) from TensorFlow side and [Binary Client Protocol](https://apacheignite.readme.io/v2.6/docs/binary-client-protocol) from Apache Ignite side. It allows to use Apache Ignite as a data source for neural network training, inference and all other computations supported by TensorFlow. 
 
 ## Features
 
-Ignite Dataset provides a set of features that makes it possible to use it in a wide range of cases. The most important and interesting features are described below.
+Ignite Dataset provides features that that you can use in a wide range of cases. The most important and interesting features are described below.
 
 ### Distributed In-Memory Datasource
-[Apache Ignite](https://ignite.apache.org/) is a distributed in-memory database, caching, and processing platform that allows to avoid limitations of hard drive and provide high reading speed and ability to store and operate with as much data as you need in distributed cluster. Using of Ignite Dataset makes it possible to utilize all these advantages. 
+[Apache Ignite](https://ignite.apache.org/) is a distributed in-memory database, caching, and processing platform that provides fast data access. It allows you to avoid limitations of hard drive and and store and operate with as much data as you need in distributed cluster. You can utilize
+these benefits of Apache Ignite by using Ignite Dataset. Moreover, Ignite Dataset can be used for the following use-cases:
 - If you have a **gigabyte** of data you can keep it on a single machine on a hard drive, but you will face with hard drive speed limitations. At the same time, you can store your data in Apache Ignite on the same machine and use it as a datasource for TensorFlow and thus avoid these limitations.
 - If you have a **terabyte** of data you probably still can keep it on a single machine on a hard drive, but you will face with hard drive speed limitations again. At the same time, you can store your data in Apache Ignite distributed in-memory cluster and use it as a datasource for TensorFlow and thus avoid these limitations.
 - If you have a **petabyte** of data you can't keep it on a single machine. At the same time, you can store your data in Apache Ignite distributed in-memory cluster and use it as a datasource for TensorFlow.
 
-It's  important that Apache Ignite is not just a step of ETL pipeline between database or data warehouse and TensorFlow. Apache Ignite is a high-grade database itself. Choosing Apache Ignite and TensorFlow you are getting everything you need to work with operational or historical data and, in the same time, an ability to use this data for neural network training and inference.
+Note that Apache Ignite is not just a step of ETL pipeline between a database or a data warehouse and TensorFlow. Apache Ignite is a high-grade database itself. By choosing Apache Ignite and TensorFlow you are getting everything you need to work with operational or historical data and, at the same time, an ability to use this data for neural network training and inference.
 
 ```bash
 $ apache-ignite-fabric/bin/ignite.sh
@@ -55,7 +56,7 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```
 
 ### Structured Objects
-[Apache Ignite](https://ignite.apache.org/) allows to store any objects you would like to store. These objects can have any hierarchy. Ignite Dataset provides an ability to work with such objects.
+[Apache Ignite](https://ignite.apache.org/) allows to store any type of objects. These objects can have any hierarchy. Ignite Dataset provides an ability to work with such objects.
 
 ```python
 >>> import tensorflow as tf
@@ -81,7 +82,7 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
     }
 }
 ```
- Neural network training and other computations require transformations that can be done as part of  [tf.data](https://www.tensorflow.org/api_docs/python/tf/data) pipeline if you use Ignite Dataset.
+ Neural network training and other computations require transformations that can be done as part of [tf.data](https://www.tensorflow.org/api_docs/python/tf/data) pipeline if you use Ignite Dataset.
 
 ```python
 >>> import tensorflow as tf
@@ -99,15 +100,15 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 
 ### Distributed Training
 
-TensorFlow is a machine learning framework that [natively supports](https://www.tensorflow.org/deploy/distributed) distributed neural network training, inference and other computations. The main idea behind the distributed neural network training is an ability to calculate gradients of loss functions (squares of the errors) on every partition of data (in terms of horizontal partitioning) and then sum them to get loss function gradient of the whole dataset. 
+TensorFlow is a machine learning framework that [natively supports](https://www.tensorflow.org/deploy/distributed) distributed neural network training, inference and other computations. The main idea behind the distributed neural network training is the ability to calculate gradients of loss functions (squares of the errors) on every partition of data (in terms of horizontal partitioning) and then sum them to get loss function gradient of the whole dataset. 
 
 <a href="https://www.codecogs.com/eqnedit.php?latex=\nabla[\sum_1^n(y&space;-&space;\hat{y})^2]&space;=&space;\nabla[\sum_1^{n_1}(y&space;-&space;\hat{y})^2]&space;&plus;&space;\nabla[\sum_{n_1}^{n_2}(y&space;-&space;\hat{y})^2]&space;&plus;&space;...&space;&plus;&space;\nabla[\sum_{n_{k-1}}^n(y&space;-&space;\hat{y})^2]" target="_blank"><img src="https://latex.codecogs.com/gif.latex?\nabla[\sum_1^n(y&space;-&space;\hat{y})^2]&space;=&space;\nabla[\sum_1^{n_1}(y&space;-&space;\hat{y})^2]&space;&plus;&space;\nabla[\sum_{n_1}^{n_2}(y&space;-&space;\hat{y})^2]&space;&plus;&space;...&space;&plus;&space;\nabla[\sum_{n_{k-1}}^n(y&space;-&space;\hat{y})^2]" title="\nabla[\sum_1^n(y - \hat{y})^2] = \nabla[\sum_1^{n_1}(y - \hat{y})^2] + \nabla[\sum_{n_1}^{n_2}(y - \hat{y})^2] + ... + \nabla[\sum_{n_{k-1}}^n(y - \hat{y})^2]" /></a>
 
-Utilizing this ability we can calculate gradients on the nodes the data is stored on, reduce them and then finally update model parameters. It allows to avoid data transfers between nodes and thus to avoid network bottleneck.
+Using this ability we can calculate gradients on the nodes the data is stored on, reduce them and then finally update model parameters. It allows to avoid data transfers between nodes and thus to avoid network bottlenecks.
 
-Apache Ignite uses horizontal partitioning to store data in distributed cluster. When we create Apache Ignite cache (or table in terms of SQL) we can specify the number of partitions the data will be partitioned on. If, for example, Apache Ignite cluster consists of 10 machines and we creates cache with 10 partitions then every machine will maintain approximately one data partition.
+Apache Ignite uses horizontal partitioning to store data in distributed cluster. When we create Apache Ignite cache (or table in terms of SQL), we can specify the number of partitions the data will be partitioned on. For example, if an Apache Ignite cluster consists of 10 machines and we create cache with 10 partitions, then every machine will maintain approximately one data partition.
 
-Ignite Dataset allows to utilize these two aspects of distributed neural network training (using TensorFlow) and Apache Ignite partitioning. Ignite Dataset is a computation graph operation that might be performed on a remote worker. The remote worker can override Ignite Dataset parameters (such as `host`, `port` or `part`) by setting correstondent environment variables for worker process (such as `IGNITE_DATASET_HOST`, `IGNITE_DATASET_PORT` or `IGNITE_DATASET_PART`). Using this overriding approach we are able to assign specific partition to every worker so that one worker handles one partition and, at the same time, transparently work with single dataset.
+Ignite Dataset allows using these two aspects of distributed neural network training (using TensorFlow) and Apache Ignite partitioning. Ignite Dataset is a computation graph operation that can be performed on a remote worker. The remote worker can override Ignite Dataset parameters (such as `host`, `port` or `part`) by setting correstondent environment variables for worker process (such as `IGNITE_DATASET_HOST`, `IGNITE_DATASET_PORT` or `IGNITE_DATASET_PART`). Using this overriding approach, we can assign a specific partition to every worker so that one worker handles one partition and, at the same time, transparently work with single dataset.
 
 ```python
 >>> import tensorflow as tf
@@ -135,7 +136,7 @@ High-level TensorFlow API for [distributed training](https://www.tensorflow.org/
 
 ### SSL Connection
 
-Your data should not be accessible without any control. Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikipedia.org/wiki/Transport_Layer_Security) and authentification. Ignite Dataset supports both SSL connection with and without authntication. For more information please see [Apache Ignite SSL/TLS](https://apacheignite.readme.io/docs/ssltls) documentation.
+Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikipedia.org/wiki/Transport_Layer_Security) and authentification. Ignite Dataset supports both SSL connection with and without authntication. For more information, please refer to the [Apache Ignite SSL/TLS](https://apacheignite.readme.io/docs/ssltls) documentation.
 
 ```python
 >>> import tensorflow as tf
@@ -147,11 +148,11 @@ Your data should not be accessible without any control. Apache Ignite allows to
 
 ### Windows Support
 
-Ignite Dataset is fully compatible with Windows, so you can use it as part of TensorFlow on your Windows workstation as well as on Linux/MacOS systems.
+Ignite Dataset is fully compatible with Windows. You can use it as part of TensorFlow on your Windows workstation as well as on Linux/MacOS systems.
 
 ## Try it out
 
-The simplest way to try Ignite Dataset out is to run [Docker](https://www.docker.com/) container with Apache Ignite and loaded [MNIST](http://yann.lecun.com/exdb/mnist/) data and then interruct with it using Ignite Dataset. Such container is available on Docker Hub: [dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/). You need to start this container on your machine:
+The simplest way to try Ignite Dataset is to run a [Docker](https://www.docker.com/) container with Apache Ignite and loaded [MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interruct with it using Ignite Dataset. Such container is available on Docker Hub: [dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/). You need to start this container on your machine:
 
 ```
 docker run -it -p 10800:10800 dmitrievanthony/ignite-with-mnist
@@ -163,4 +164,4 @@ After that you will be able to work with it following way:
 
 ## Limitations
 
-Presently Ignite Dataset works with assumption that all objects in the cache have the same structure (homogeneous objects) and the cache contains at least one object. Another limitation concerns structured objects, Ignite Dataset does not support UUID, Maps and Object arrays that might be parts of object structures.
+Presently, Ignite Dataset works with assumption that all objects in the cache have the same structure (homogeneous objects) and the cache contains at least one object. Another limitation concerns structured objects, Ignite Dataset does not support UUID, Maps and Object arrays that might be parts of an object structure.
-- 
GitLab


From 92019765d7b7db99d0235268d00f349b7a53d1a9 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Wed, 5 Sep 2018 14:47:20 +0000
Subject: [PATCH 031/570] Fix pylint checks, fix VS compilation issue.

---
 .../contrib/ignite/kernels/ignite_plain_client_windows.cc | 4 ++--
 .../contrib/ignite/python/ops/ignite_dataset_ops.py       | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
index 7ba037f2d2..e1e2ee3b20 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
@@ -103,7 +103,7 @@ Status PlainClient::ReadData(uint8_t *buf, int32_t length) {
   int recieved = 0;
 
   while (recieved < length) {
-    int res = recv(sock_, buf, length - recieved, 0);
+    int res = recv(sock_, (char*)buf, length - recieved, 0);
 
     if (res < 0)
       return errors::Internal("Error occured while reading from socket: ", res);
@@ -121,7 +121,7 @@ Status PlainClient::WriteData(uint8_t *buf, int32_t length) {
   int sent = 0;
 
   while (sent < length) {
-    int res = send(sock_, buf, length - sent, 0);
+    int res = send(sock_, (char*)buf, length - sent, 0);
 
     if (res < 0)
       return errors::Internal("Error occured while writing into socket: ", res);
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
index 6fa073957a..60003ca3b7 100644
--- a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
+++ b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
@@ -66,13 +66,13 @@ class Readable():
 
   def __read(self, data_type, length):
     """Reads, unpacks and returns specified type (little-endian)."""
-    buffer = self.read_data(length)
-    return struct.unpack("<" + data_type, buffer)[0]
+    data_buffer = self.read_data(length)
+    return struct.unpack("<" + data_type, data_buffer)[0]
 
 class DataBuffer(Readable):
   """DataBuffer class that exposes methods to read data from a byte buffer."""
 
-  def __init__(self, buffer):
+  def __init__(self, data_buffer):
     """Constructs a new instance of DataBuffer based on the specified byte
        buffer.
 
@@ -80,7 +80,7 @@ class DataBuffer(Readable):
       buffer: Buffer to be read.
     """
     Readable.__init__(self)
-    self.buffer = buffer
+    self.buffer = data_buffer
     self.ptr = 0
 
   def read_data(self, length):
-- 
GitLab


From 0b6654bc223f4f3807209043dc34ccb07b55474e Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Tue, 11 Sep 2018 09:50:47 +0000
Subject: [PATCH 032/570] Fix code style.

---
 .../ignite/kernels/ignite_dataset_ops.cc      |  2 +-
 .../kernels/ignite_plain_client_windows.cc    |  4 +--
 tensorflow/contrib/ignite/ops/dataset_ops.cc  | 34 +++++++++----------
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc b/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
index 89eecf9c14..d03404a460 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "ignite_dataset.h"
 #include <stdlib.h>
+#include "ignite_dataset.h"
 #include "tensorflow/core/framework/dataset.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
index e1e2ee3b20..8182fde6d9 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
@@ -103,7 +103,7 @@ Status PlainClient::ReadData(uint8_t *buf, int32_t length) {
   int recieved = 0;
 
   while (recieved < length) {
-    int res = recv(sock_, (char*)buf, length - recieved, 0);
+    int res = recv(sock_, (char *)buf, length - recieved, 0);
 
     if (res < 0)
       return errors::Internal("Error occured while reading from socket: ", res);
@@ -121,7 +121,7 @@ Status PlainClient::WriteData(uint8_t *buf, int32_t length) {
   int sent = 0;
 
   while (sent < length) {
-    int res = send(sock_, (char*)buf, length - sent, 0);
+    int res = send(sock_, (char *)buf, length - sent, 0);
 
     if (res < 0)
       return errors::Internal("Error occured while writing into socket: ", res);
diff --git a/tensorflow/contrib/ignite/ops/dataset_ops.cc b/tensorflow/contrib/ignite/ops/dataset_ops.cc
index 17494d1cfd..fb16b290b1 100644
--- a/tensorflow/contrib/ignite/ops/dataset_ops.cc
+++ b/tensorflow/contrib/ignite/ops/dataset_ops.cc
@@ -20,23 +20,23 @@ limitations under the License.
 namespace tensorflow {
 
 REGISTER_OP("IgniteDataset")
-  .Input("cache_name: string")
-  .Input("host: string")
-  .Input("port: int32")
-  .Input("local: bool")
-  .Input("part: int32")
-  .Input("page_size: int32")
-  .Input("username: string")
-  .Input("password: string")
-  .Input("certfile: string")
-  .Input("keyfile: string")
-  .Input("cert_password: string")
-  .Input("schema: int32")
-  .Input("permutation: int32")
-  .Output("handle: variant")
-  .SetIsStateful()
-  .SetShapeFn(shape_inference::ScalarShape)
-  .Doc(R"doc(
+    .Input("cache_name: string")
+    .Input("host: string")
+    .Input("port: int32")
+    .Input("local: bool")
+    .Input("part: int32")
+    .Input("page_size: int32")
+    .Input("username: string")
+    .Input("password: string")
+    .Input("certfile: string")
+    .Input("keyfile: string")
+    .Input("cert_password: string")
+    .Input("schema: int32")
+    .Input("permutation: int32")
+    .Output("handle: variant")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
 Apache Ignite is a memory-centric distributed database, caching, and processing
 platform for transactional, analytical, and streaming workloads, delivering 
 in-memory speeds at petabyte scale. This contrib package contains an 
-- 
GitLab


From 5e9a9547f907599f6954fc5e28b7a78acf3b54eb Mon Sep 17 00:00:00 2001
From: Cao Zongyan <zongyan.cao@alibaba-inc.com>
Date: Wed, 12 Sep 2018 11:02:12 +0800
Subject: [PATCH 033/570] Revert "Add XLA support for LeakyReluOp."

This reverts commit d2ad105d2dff3c79d8f49f5fb8ce74c38f424e74.

Since bfloat16 was not supported by LeakyRelu, but it should be
supported in XLA Ops.
---
 tensorflow/compiler/tests/binary_ops_test.py  |  8 ----
 tensorflow/compiler/tests/unary_ops_test.py   |  5 ---
 tensorflow/compiler/tf2xla/kernels/relu_op.cc | 42 -------------------
 3 files changed, 55 deletions(-)

diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index c478ff4eea..17280e445b 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -178,14 +178,6 @@ class BinaryOpsTest(xla_test.XLATestCase):
               [0, 0, 0, 0, 0, 0.1, 0.3, 0.5, 0.7, 0.9, 6.1, 10.0], dtype=dtype),
           expected=np.array([0, 0, 0, 0, 0, 6, 7, 8, 9, 10, 0, 0], dtype=dtype))
 
-      self._testBinary(
-          gen_nn_ops.leaky_relu_grad,
-          np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dtype),
-          np.array([-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-                   dtype=dtype),
-          expected=np.array([0.2, 0.4, 0.6, 0.8, 1, 6, 7, 8, 9, 10],
-                            dtype=dtype))
-
       self._testBinary(
           gen_nn_ops.softmax_cross_entropy_with_logits,
           np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=dtype),
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index dd29ef34ce..5b0e57f83f 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -361,11 +361,6 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array([[-0.05, 6.05, 5]], dtype=dtype),
           expected=np.array([[0, 6, 5]], dtype=dtype))
 
-      self._assertOpOutputMatchesExpected(
-          nn_ops.leaky_relu,
-          np.array([[-1.0, 1.0]], dtype=dtype),
-          expected=np.array([[-0.2, 1.0]], dtype=dtype))
-
       self._assertOpOutputMatchesExpected(
           nn_ops.softmax,
           np.array([1, 2, 3, 4], dtype=dtype),
diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
index 8d65e0339c..d35777ccb1 100644
--- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
@@ -50,23 +50,6 @@ class Relu6Op : public XlaOpKernel {
   }
 };
 
-class LeakyReluOp : public XlaOpKernel {
- public:
-  explicit LeakyReluOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("alpha", &alpha_));
-  }
-  // Compute the max of the input x and alpha*x.
-  void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaBuilder* builder = ctx->builder();
-    auto alpha = XlaHelpers::FloatLiteral(builder, input_type(0),
-                                          static_cast<double>(alpha_));
-    ctx->SetOutput(0, xla::Max(xla::Mul(alpha, ctx->Input(0)), ctx->Input(0)));
-  }
-
- private:
-  float alpha_;
-};
-
 class ReluGradOp : public XlaOpKernel {
  public:
   explicit ReluGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
@@ -101,35 +84,10 @@ class Relu6GradOp : public XlaOpKernel {
   }
 };
 
-class LeakyReluGradOp : public XlaOpKernel {
- public:
-  explicit LeakyReluGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("alpha", &alpha_));
-  }
-  // Return the lhs (incoming gradient) if the rhs (input feature) > 0,
-  // otherwise return the alpha * lhs.
-  void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaBuilder* b = ctx->builder();
-    const TensorShape shape = ctx->InputShape(0);
-    const auto zero =
-        xla::Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes());
-    const auto pred = xla::Gt(ctx->Input(1), zero);
-    auto alpha =
-        XlaHelpers::FloatLiteral(b, input_type(0), static_cast<double>(alpha_));
-    ctx->SetOutput(
-        0, xla::Select(pred, ctx->Input(0), xla::Mul(alpha, ctx->Input(0))));
-  }
-
- private:
-  float alpha_;
-};
-
 REGISTER_XLA_OP(Name("Relu"), ReluOp);
 REGISTER_XLA_OP(Name("Relu6"), Relu6Op);
-REGISTER_XLA_OP(Name("LeakyRelu"), LeakyReluOp);
 REGISTER_XLA_OP(Name("ReluGrad"), ReluGradOp);
 REGISTER_XLA_OP(Name("Relu6Grad"), Relu6GradOp);
-REGISTER_XLA_OP(Name("LeakyReluGrad"), LeakyReluGradOp);
 
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From 9ec9c8b24cca5f1e746fef8cd351b3cae6d5a740 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Wed, 12 Sep 2018 20:42:01 +0300
Subject: [PATCH 034/570] Fixes after second review.

---
 tensorflow/contrib/ignite/BUILD               |   1 +
 tensorflow/contrib/ignite/__init__.py         |  22 +-
 .../kernels/ignite_binary_object_parser.cc    | 404 ++++++++++--------
 .../kernels/ignite_binary_object_parser.h     |  36 +-
 .../contrib/ignite/kernels/ignite_client.h    |  55 ++-
 .../contrib/ignite/kernels/ignite_dataset.cc  |  99 ++---
 .../contrib/ignite/kernels/ignite_dataset.h   |  37 +-
 .../ignite/kernels/ignite_dataset_iterator.cc | 383 ++++++++---------
 .../ignite/kernels/ignite_dataset_iterator.h  |  74 ++--
 .../ignite/kernels/ignite_dataset_ops.cc      | 123 ++++--
 .../ignite/kernels/ignite_plain_client.h      |  15 +-
 .../kernels/ignite_plain_client_unix.cc       |  14 +-
 .../kernels/ignite_plain_client_windows.cc    |  17 +-
 .../ignite/kernels/ignite_ssl_wrapper.cc      |  34 +-
 .../ignite/kernels/ignite_ssl_wrapper.h       |  26 +-
 tensorflow/contrib/ignite/ops/dataset_ops.cc  |   2 +
 .../ignite/python/ops/ignite_dataset_ops.py   | 176 ++++----
 17 files changed, 848 insertions(+), 670 deletions(-)

diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD
index b7d40a99f7..2f598b4aed 100644
--- a/tensorflow/contrib/ignite/BUILD
+++ b/tensorflow/contrib/ignite/BUILD
@@ -40,6 +40,7 @@ cc_library(
     srcs = [
         "kernels/ignite_dataset_ops.cc",
         "kernels/ignite_client.h",
+        "kernels/ignite_byte_swapper.h",
         "kernels/ignite_plain_client.h",
         "kernels/ignite_ssl_wrapper.h",
         "kernels/ignite_ssl_wrapper.cc",
diff --git a/tensorflow/contrib/ignite/__init__.py b/tensorflow/contrib/ignite/__init__.py
index b78829d0f4..f42947696f 100644
--- a/tensorflow/contrib/ignite/__init__.py
+++ b/tensorflow/contrib/ignite/__init__.py
@@ -12,16 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Apache Ignite is a memory-centric distributed database, caching, and
-   processing platform for transactional, analytical, and streaming workloads,
-   delivering in-memory speeds at petabyte scale. This contrib package
-   contains an integration between Apache Ignite and TensorFlow. The
-   integration is based on tf.data from TensorFlow side and Binary Client
-   Protocol from Apache Ignite side. It allows to use Apache Ignite as a
-   datasource for neural network training, inference and all other
-   computations supported by TensorFlow. Ignite Dataset is based on Apache
-   Ignite Binary Client Protocol:
-   https://apacheignite.readme.io/v2.6/docs/binary-client-protocol.
+"""IgniteDataset that allows to get data from Apache Ignite.
+
+Apache Ignite is a memory-centric distributed database, caching, and
+processing platform for transactional, analytical, and streaming workloads,
+delivering in-memory speeds at petabyte scale. This contrib package
+contains an integration between Apache Ignite and TensorFlow. The
+integration is based on tf.data from TensorFlow side and Binary Client
+Protocol from Apache Ignite side. It allows to use Apache Ignite as a
+datasource for neural network training, inference and all other
+computations supported by TensorFlow. Ignite Dataset is based on Apache
+Ignite Binary Client Protocol:
+https://apacheignite.readme.io/v2.6/docs/binary-client-protocol.
 
 @@IgniteDataset
 """
diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc b/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
index 9bf4480d2d..2c8a7d44b0 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
@@ -13,242 +13,171 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "ignite_binary_object_parser.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 
+BinaryObjectParser::BinaryObjectParser() : byte_swapper_(ByteSwapper(false)) {}
+
 Status BinaryObjectParser::Parse(uint8_t** ptr,
                                  std::vector<Tensor>* out_tensors,
-                                 std::vector<int32_t>* types) {
-  uint8_t object_type_id = **ptr;
-  *ptr += 1;
+                                 std::vector<int32_t>* types) const {
+  uint8_t object_type_id = ParseByte(ptr);
+
+  // Skip non-leaf nodes.
+  if (object_type_id != WRAPPED_OBJ && object_type_id != COMPLEX_OBJ)
+    types->push_back(object_type_id);
 
   switch (object_type_id) {
     case BYTE: {
-      Tensor tensor(cpu_allocator(), DT_UINT8, {});
-      tensor.scalar<uint8>()() = *((uint8_t*)*ptr);
-      *ptr += 1;
-      out_tensors->push_back(std::move(tensor));
+      out_tensors->emplace_back(cpu_allocator(), DT_UINT8, TensorShape({}));
+      out_tensors->back().scalar<uint8>()() = ParseByte(ptr);
       break;
     }
     case SHORT: {
-      Tensor tensor(cpu_allocator(), DT_INT16, {});
-      tensor.scalar<int16>()() = *((int16_t*)*ptr);
-      *ptr += 2;
-      out_tensors->push_back(std::move(tensor));
+      out_tensors->emplace_back(cpu_allocator(), DT_INT16, TensorShape({}));
+      out_tensors->back().scalar<int16>()() = ParseShort(ptr);
+      break;
+    }
+    case USHORT: {
+      out_tensors->emplace_back(cpu_allocator(), DT_UINT16, TensorShape({}));
+      out_tensors->back().scalar<uint16>()() = ParseUnsignedShort(ptr);
       break;
     }
     case INT: {
-      Tensor tensor(cpu_allocator(), DT_INT32, {});
-      tensor.scalar<int32>()() = *((int32_t*)*ptr);
-      *ptr += 4;
-      out_tensors->push_back(std::move(tensor));
+      out_tensors->emplace_back(cpu_allocator(), DT_INT32, TensorShape({}));
+      out_tensors->back().scalar<int32>()() = ParseInt(ptr);
       break;
     }
     case LONG: {
-      Tensor tensor(cpu_allocator(), DT_INT64, {});
-      tensor.scalar<int64>()() = *((int64_t*)*ptr);
-      *ptr += 8;
-      out_tensors->push_back(std::move(tensor));
+      out_tensors->emplace_back(cpu_allocator(), DT_INT64, TensorShape({}));
+      out_tensors->back().scalar<int64>()() = ParseLong(ptr);
       break;
     }
     case FLOAT: {
-      Tensor tensor(cpu_allocator(), DT_FLOAT, {});
-      tensor.scalar<float>()() = *((float*)*ptr);
-      *ptr += 4;
-      out_tensors->push_back(std::move(tensor));
+      out_tensors->emplace_back(cpu_allocator(), DT_FLOAT, TensorShape({}));
+      out_tensors->back().scalar<float>()() = ParseFloat(ptr);
       break;
     }
     case DOUBLE: {
-      Tensor tensor(cpu_allocator(), DT_DOUBLE, {});
-      tensor.scalar<double>()() = *((double*)*ptr);
-      *ptr += 8;
-      out_tensors->push_back(std::move(tensor));
-      break;
-    }
-    case UCHAR: {
-      Tensor tensor(cpu_allocator(), DT_UINT16, {});
-      tensor.scalar<uint16>()() = *((uint16_t*)*ptr);
-      *ptr += 2;
-      out_tensors->push_back(std::move(tensor));
+      out_tensors->emplace_back(cpu_allocator(), DT_DOUBLE, TensorShape({}));
+      out_tensors->back().scalar<double>()() = ParseDouble(ptr);
       break;
     }
     case BOOL: {
-      Tensor tensor(cpu_allocator(), DT_BOOL, {});
-      tensor.scalar<bool>()() = *((bool*)*ptr);
-      *ptr += 1;
-      out_tensors->push_back(std::move(tensor));
-
+      out_tensors->emplace_back(cpu_allocator(), DT_BOOL, TensorShape({}));
+      out_tensors->back().scalar<bool>()() = ParseBool(ptr);
       break;
     }
     case STRING: {
-      int32_t length = *((int32_t*)*ptr);
-      *ptr += 4;
-      Tensor tensor(cpu_allocator(), DT_STRING, {});
-      tensor.scalar<std::string>()() = std::string((char*)*ptr, length);
-      *ptr += length;
-      out_tensors->push_back(std::move(tensor));
-
+      out_tensors->emplace_back(cpu_allocator(), DT_STRING, TensorShape({}));
+      out_tensors->back().scalar<string>()() = ParseString(ptr);
       break;
     }
     case DATE: {
-      Tensor tensor(cpu_allocator(), DT_INT64, {});
-      tensor.scalar<int64>()() = *((int64_t*)*ptr);
-      *ptr += 8;
-      out_tensors->push_back(std::move(tensor));
-
+      out_tensors->emplace_back(cpu_allocator(), DT_INT64, TensorShape({}));
+      out_tensors->back().scalar<int64>()() = ParseLong(ptr);
       break;
     }
     case BYTE_ARR: {
-      int32_t length = *((int32_t*)*ptr);
-      *ptr += 4;
-      Tensor tensor(cpu_allocator(), DT_UINT8, TensorShape({length}));
-
-      uint8_t* arr = (uint8_t*)*ptr;
-      *ptr += length;
-
-      std::copy_n(arr, length, tensor.flat<uint8>().data());
-      out_tensors->push_back(std::move(tensor));
+      int32_t length = ParseInt(ptr);
+      uint8_t* arr = ParseByteArr(ptr, length);
+      out_tensors->emplace_back(cpu_allocator(), DT_UINT8,
+                                TensorShape({length}));
+      std::copy_n(arr, length, out_tensors->back().flat<uint8>().data());
       break;
     }
     case SHORT_ARR: {
-      int32_t length = *((int32_t*)*ptr);
-      *ptr += 4;
-      Tensor tensor(cpu_allocator(), DT_INT16, TensorShape({length}));
-
-      int16_t* arr = (int16_t*)*ptr;
-      *ptr += length * 2;
-
-      std::copy_n(arr, length, tensor.flat<int16>().data());
-      out_tensors->push_back(std::move(tensor));
+      int32_t length = ParseInt(ptr);
+      int16_t* arr = ParseShortArr(ptr, length);
+      out_tensors->emplace_back(cpu_allocator(), DT_INT16,
+                                TensorShape({length}));
+      std::copy_n(arr, length, out_tensors->back().flat<int16>().data());
+      break;
+    }
+    case USHORT_ARR: {
+      int32_t length = ParseInt(ptr);
+      uint16_t* arr = ParseUnsignedShortArr(ptr, length);
+      out_tensors->emplace_back(cpu_allocator(), DT_UINT16,
+                                TensorShape({length}));
+      std::copy_n(arr, length, out_tensors->back().flat<uint16>().data());
       break;
     }
     case INT_ARR: {
-      int32_t length = *((int32_t*)*ptr);
-      *ptr += 4;
-      Tensor tensor(cpu_allocator(), DT_INT32, TensorShape({length}));
-
-      int32_t* arr = (int32_t*)*ptr;
-      *ptr += length * 4;
-
-      std::copy_n(arr, length, tensor.flat<int32>().data());
-      out_tensors->push_back(std::move(tensor));
+      int32_t length = ParseInt(ptr);
+      int32_t* arr = ParseIntArr(ptr, length);
+      out_tensors->emplace_back(cpu_allocator(), DT_INT32,
+                                TensorShape({length}));
+      std::copy_n(arr, length, out_tensors->back().flat<int32>().data());
       break;
     }
     case LONG_ARR: {
-      int32_t length = *((int32_t*)*ptr);
-      *ptr += 4;
-      Tensor tensor(cpu_allocator(), DT_INT64, TensorShape({length}));
-
-      int64_t* arr = (int64_t*)*ptr;
-      *ptr += length * 8;
-
-      std::copy_n(arr, length, tensor.flat<int64>().data());
-      out_tensors->push_back(std::move(tensor));
+      int32_t length = ParseInt(ptr);
+      int64_t* arr = ParseLongArr(ptr, length);
+      out_tensors->emplace_back(cpu_allocator(), DT_INT64,
+                                TensorShape({length}));
+      std::copy_n(arr, length, out_tensors->back().flat<int64>().data());
       break;
     }
     case FLOAT_ARR: {
-      int32_t length = *((int32_t*)*ptr);
-      *ptr += 4;
-      Tensor tensor(cpu_allocator(), DT_FLOAT, TensorShape({length}));
-
-      float* arr = (float*)*ptr;
-      *ptr += 4 * length;
-
-      std::copy_n(arr, length, tensor.flat<float>().data());
-      out_tensors->push_back(std::move(tensor));
+      int32_t length = ParseInt(ptr);
+      float* arr = ParseFloatArr(ptr, length);
+      out_tensors->emplace_back(cpu_allocator(), DT_FLOAT,
+                                TensorShape({length}));
+      std::copy_n(arr, length, out_tensors->back().flat<float>().data());
       break;
     }
     case DOUBLE_ARR: {
-      int32_t length = *((int32_t*)*ptr);
-      *ptr += 4;
-      Tensor tensor(cpu_allocator(), DT_DOUBLE, TensorShape({length}));
-
-      double* arr = (double*)*ptr;
-      *ptr += 8 * length;
-
-      std::copy_n(arr, length, tensor.flat<double>().data());
-      out_tensors->push_back(std::move(tensor));
-      break;
-    }
-    case UCHAR_ARR: {
-      int32_t length = *((int32_t*)*ptr);
-      *ptr += 4;
-      Tensor tensor(cpu_allocator(), DT_UINT16, TensorShape({length}));
-
-      uint16_t* arr = (uint16_t*)*ptr;
-      *ptr += length * 2;
-
-      std::copy_n(arr, length, tensor.flat<uint16>().data());
-      out_tensors->push_back(std::move(tensor));
+      int32_t length = ParseInt(ptr);
+      double* arr = ParseDoubleArr(ptr, length);
+      out_tensors->emplace_back(cpu_allocator(), DT_DOUBLE,
+                                TensorShape({length}));
+      std::copy_n(arr, length, out_tensors->back().flat<double>().data());
       break;
     }
     case BOOL_ARR: {
-      int32_t length = *((int32_t*)*ptr);
-      *ptr += 4;
-      Tensor tensor(cpu_allocator(), DT_BOOL, TensorShape({length}));
-
-      bool* arr = (bool*)*ptr;
-      *ptr += length;
-
-      std::copy_n(arr, length, tensor.flat<bool>().data());
-      out_tensors->push_back(std::move(tensor));
+      int32_t length = ParseInt(ptr);
+      bool* arr = ParseBoolArr(ptr, length);
+      out_tensors->emplace_back(cpu_allocator(), DT_BOOL,
+                                TensorShape({length}));
+      std::copy_n(arr, length, out_tensors->back().flat<bool>().data());
       break;
     }
     case STRING_ARR: {
-      int32_t length = *((int32_t*)*ptr);
-      *ptr += 4;
-      Tensor tensor(cpu_allocator(), DT_STRING, TensorShape({length}));
-
-      for (int32_t i = 0; i < length; i++) {
-        int32_t str_length = *((int32_t*)*ptr);
-        *ptr += 4;
-        const int8_t* str = (const int8_t*)*ptr;
-        *ptr += str_length;
-        tensor.vec<std::string>()(i) = std::string((char*)str, str_length);
-      }
-
-      out_tensors->push_back(std::move(tensor));
+      int32_t length = ParseInt(ptr);
+      out_tensors->emplace_back(cpu_allocator(), DT_STRING,
+                                TensorShape({length}));
+      for (int32_t i = 0; i < length; i++)
+        out_tensors->back().vec<string>()(i) = ParseString(ptr);
       break;
     }
     case DATE_ARR: {
-      int32_t length = *((int32_t*)*ptr);
-      *ptr += 4;
-      Tensor tensor(cpu_allocator(), DT_INT64, TensorShape({length}));
-      int64_t* arr = (int64_t*)*ptr;
-      *ptr += length * 8;
-
-      std::copy_n(arr, length, tensor.flat<int64>().data());
-      out_tensors->push_back(std::move(tensor));
+      int32_t length = ParseInt(ptr);
+      int64_t* arr = ParseLongArr(ptr, length);
+      out_tensors->emplace_back(cpu_allocator(), DT_INT64,
+                                TensorShape({length}));
+      std::copy_n(arr, length, out_tensors->back().flat<int64>().data());
       break;
     }
     case WRAPPED_OBJ: {
-      int32_t byte_arr_size = *((int32_t*)*ptr);
-      *ptr += 4;
-
+      int32_t byte_arr_size = ParseInt(ptr);
       TF_RETURN_IF_ERROR(Parse(ptr, out_tensors, types));
-
-      int32_t offset = *((int32_t*)*ptr);
-      *ptr += 4;
+      int32_t offset = ParseInt(ptr);
 
       break;
     }
     case COMPLEX_OBJ: {
-      uint8_t version = **ptr;
-      *ptr += 1;
-      int16_t flags = *((int16_t*)*ptr);  // USER_TYPE = 1, HAS_SCHEMA = 2
-      *ptr += 2;
-      int32_t type_id = *((int32_t*)*ptr);
-      *ptr += 4;
-      int32_t hash_code = *((int32_t*)*ptr);
-      *ptr += 4;
-      int32_t length = *((int32_t*)*ptr);
-      *ptr += 4;
-      int32_t schema_id = *((int32_t*)*ptr);
-      *ptr += 4;
-      int32_t schema_offset = *((int32_t*)*ptr);
-      *ptr += 4;
-
+      uint8_t version = ParseByte(ptr);
+      int16_t flags = ParseShort(ptr);
+      int32_t type_id = ParseInt(ptr);
+      int32_t hash_code = ParseInt(ptr);
+      int32_t length = ParseInt(ptr);
+      int32_t schema_id = ParseInt(ptr);
+      int32_t schema_offset = ParseInt(ptr);
+
+      // 24 is size of header just read.
       uint8_t* end = *ptr + schema_offset - 24;
       int32_t i = 0;
       while (*ptr < end) {
@@ -261,12 +190,145 @@ Status BinaryObjectParser::Parse(uint8_t** ptr,
       break;
     }
     default: {
-      return errors::Internal("Unknowd binary type (type id ",
-                              (int)object_type_id, ")");
+      return errors::Unknown("Unknowd binary type (type id ",
+                             (int)object_type_id, ")");
     }
   }
 
   return Status::OK();
 }
 
+uint8_t BinaryObjectParser::ParseByte(uint8_t** ptr) const {
+  uint8_t res = **ptr;
+  *ptr += 1;
+
+  return res;
+}
+
+int16_t BinaryObjectParser::ParseShort(uint8_t** ptr) const {
+  int16_t* res = *reinterpret_cast<int16_t**>(ptr);
+  byte_swapper_.SwapIfRequiredInt16(res);
+  *ptr += 2;
+
+  return *res;
+}
+
+uint16_t BinaryObjectParser::ParseUnsignedShort(uint8_t** ptr) const {
+  uint16_t* res = *reinterpret_cast<uint16_t**>(ptr);
+  byte_swapper_.SwapIfRequiredUnsignedInt16(res);
+  *ptr += 2;
+
+  return *res;
+}
+
+int32_t BinaryObjectParser::ParseInt(uint8_t** ptr) const {
+  int32_t* res = *reinterpret_cast<int32_t**>(ptr);
+  byte_swapper_.SwapIfRequiredInt32(res);
+  *ptr += 4;
+
+  return *res;
+}
+
+int64_t BinaryObjectParser::ParseLong(uint8_t** ptr) const {
+  int64_t* res = *reinterpret_cast<int64_t**>(ptr);
+  byte_swapper_.SwapIfRequiredInt64(res);
+  *ptr += 8;
+
+  return *res;
+}
+
+float BinaryObjectParser::ParseFloat(uint8_t** ptr) const {
+  float* res = *reinterpret_cast<float**>(ptr);
+  byte_swapper_.SwapIfRequiredFloat(res);
+  *ptr += 4;
+
+  return *res;
+}
+
+double BinaryObjectParser::ParseDouble(uint8_t** ptr) const {
+  double* res = *reinterpret_cast<double**>(ptr);
+  byte_swapper_.SwapIfRequiredDouble(res);
+  *ptr += 8;
+
+  return *res;
+}
+
+bool BinaryObjectParser::ParseBool(uint8_t** ptr) const {
+  bool res = **reinterpret_cast<bool**>(ptr);
+  *ptr += 1;
+
+  return res;
+}
+
+string BinaryObjectParser::ParseString(uint8_t** ptr) const {
+  int32_t length = ParseInt(ptr);
+  string res(*reinterpret_cast<char**>(ptr), length);
+  *ptr += length;
+
+  return res;
+}
+
+uint8_t* BinaryObjectParser::ParseByteArr(uint8_t** ptr, int length) const {
+  uint8_t* res = *reinterpret_cast<uint8_t**>(ptr);
+  *ptr += length;
+
+  return res;
+}
+
+int16_t* BinaryObjectParser::ParseShortArr(uint8_t** ptr, int length) const {
+  int16_t* res = *reinterpret_cast<int16_t**>(ptr);
+  byte_swapper_.SwapIfRequiredInt16Arr(res, length);
+  *ptr += length * 2;
+
+  return res;
+}
+
+uint16_t* BinaryObjectParser::ParseUnsignedShortArr(uint8_t** ptr,
+                                                    int length) const {
+  uint16_t* res = *reinterpret_cast<uint16_t**>(ptr);
+  byte_swapper_.SwapIfRequiredUnsignedInt16Arr(res, length);
+  *ptr += length * 2;
+
+  return res;
+}
+
+int32_t* BinaryObjectParser::ParseIntArr(uint8_t** ptr, int length) const {
+  int32_t* res = *reinterpret_cast<int32_t**>(ptr);
+  byte_swapper_.SwapIfRequiredInt32Arr(res, length);
+  *ptr += length * 4;
+
+  return res;
+}
+
+int64_t* BinaryObjectParser::ParseLongArr(uint8_t** ptr, int length) const {
+  int64_t* res = *reinterpret_cast<int64_t**>(ptr);
+  byte_swapper_.SwapIfRequiredInt64Arr(res, length);
+  *ptr += length * 8;
+
+  return res;
+}
+
+float* BinaryObjectParser::ParseFloatArr(uint8_t** ptr, int length) const {
+  float* res = *reinterpret_cast<float**>(ptr);
+  byte_swapper_.SwapIfRequiredFloatArr(res, length);
+  *ptr += length * 4;
+
+  return res;
+}
+
+double* BinaryObjectParser::ParseDoubleArr(uint8_t** ptr, int length) const {
+  double* res = *reinterpret_cast<double**>(ptr);
+  byte_swapper_.SwapIfRequiredDoubleArr(res, length);
+  *ptr += length * 8;
+
+  return res;
+}
+
+bool* BinaryObjectParser::ParseBoolArr(uint8_t** ptr, int length) const {
+  bool* res = *reinterpret_cast<bool**>(ptr);
+  *ptr += length;
+
+  return res;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h b/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
index 9accbd796f..eb1f856643 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
@@ -13,16 +13,42 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_
+
 #include <vector>
-#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h"
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 
 class BinaryObjectParser {
  public:
+  BinaryObjectParser();
   Status Parse(uint8_t** ptr, std::vector<Tensor>* out_tensors,
-               std::vector<int32_t>* types);
+               std::vector<int32_t>* types) const;
+
+ private:
+  uint8_t ParseByte(uint8_t** ptr) const;
+  int16_t ParseShort(uint8_t** ptr) const;
+  uint16_t ParseUnsignedShort(uint8_t** ptr) const;
+  int32_t ParseInt(uint8_t** ptr) const;
+  int64_t ParseLong(uint8_t** ptr) const;
+  float ParseFloat(uint8_t** ptr) const;
+  double ParseDouble(uint8_t** ptr) const;
+  bool ParseBool(uint8_t** ptr) const;
+  string ParseString(uint8_t** ptr) const;
+  uint8_t* ParseByteArr(uint8_t** ptr, int length) const;
+  int16_t* ParseShortArr(uint8_t** ptr, int length) const;
+  uint16_t* ParseUnsignedShortArr(uint8_t** ptr, int length) const;
+  int32_t* ParseIntArr(uint8_t** ptr, int length) const;
+  int64_t* ParseLongArr(uint8_t** ptr, int length) const;
+  float* ParseFloatArr(uint8_t** ptr, int length) const;
+  double* ParseDoubleArr(uint8_t** ptr, int length) const;
+  bool* ParseBoolArr(uint8_t** ptr, int length) const;
+
+  const ByteSwapper byte_swapper_;
 };
 
 enum ObjectType {
@@ -32,7 +58,7 @@ enum ObjectType {
   LONG = 4,
   FLOAT = 5,
   DOUBLE = 6,
-  UCHAR = 7,
+  USHORT = 7,
   BOOL = 8,
   STRING = 9,
   DATE = 11,
@@ -42,7 +68,7 @@ enum ObjectType {
   LONG_ARR = 15,
   FLOAT_ARR = 16,
   DOUBLE_ARR = 17,
-  UCHAR_ARR = 18,
+  USHORT_ARR = 18,
   BOOL_ARR = 19,
   STRING_ARR = 20,
   DATE_ARR = 22,
@@ -51,3 +77,5 @@ enum ObjectType {
 };
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_client.h b/tensorflow/contrib/ignite/kernels/ignite_client.h
index 944b3fe184..508b6e4a60 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_client.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_client.h
@@ -16,40 +16,69 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
 #define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
 
+#include "tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 
 class Client {
  public:
+  Client(bool big_endian) : byte_swapper_(ByteSwapper(big_endian)){};
   virtual Status Connect() = 0;
   virtual Status Disconnect() = 0;
   virtual bool IsConnected() = 0;
   virtual int GetSocketDescriptor() = 0;
-  virtual Status ReadData(uint8_t* buf, int32_t length) = 0;
-  virtual Status WriteData(uint8_t* buf, int32_t length) = 0;
+  virtual Status ReadData(uint8_t *buf, const int32_t length) = 0;
+  virtual Status WriteData(const uint8_t *buf, const int32_t length) = 0;
 
-  inline Status ReadByte(uint8_t* data) { return ReadData(data, 1); }
+  inline Status ReadByte(uint8_t *data) { return ReadData(data, 1); }
 
-  inline Status ReadShort(int16_t* data) { return ReadData((uint8_t*)data, 2); }
+  inline Status ReadShort(int16_t *data) {
+    TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 2));
+    byte_swapper_.SwapIfRequiredInt16(data);
 
-  inline Status ReadInt(int32_t* data) { return ReadData((uint8_t*)data, 4); }
+    return Status::OK();
+  }
+
+  inline Status ReadInt(int32_t *data) {
+    TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 4));
+    byte_swapper_.SwapIfRequiredInt32(data);
+
+    return Status::OK();
+  }
 
-  inline Status ReadLong(int64_t* data) { return ReadData((uint8_t*)data, 8); }
+  inline Status ReadLong(int64_t *data) {
+    TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 8));
+    byte_swapper_.SwapIfRequiredInt64(data);
 
-  inline Status WriteByte(uint8_t data) { return WriteData(&data, 1); }
+    return Status::OK();
+  }
+
+  inline Status WriteByte(const uint8_t data) { return WriteData(&data, 1); }
 
-  inline Status WriteShort(int16_t data) {
-    return WriteData((uint8_t*)&data, 2);
+  inline Status WriteShort(const int16_t data) {
+    int16_t tmp = data;
+    byte_swapper_.SwapIfRequiredInt16(&tmp);
+    return WriteData((uint8_t *)&tmp, 2);
   }
 
-  inline Status WriteInt(int32_t data) { return WriteData((uint8_t*)&data, 4); }
+  inline Status WriteInt(const int32_t data) {
+    int32_t tmp = data;
+    byte_swapper_.SwapIfRequiredInt32(&tmp);
+    return WriteData((uint8_t *)&tmp, 4);
+  }
 
-  inline Status WriteLong(int64_t data) {
-    return WriteData((uint8_t*)&data, 8);
+  inline Status WriteLong(const int64_t data) {
+    int64_t tmp = data;
+    byte_swapper_.SwapIfRequiredInt64(&tmp);
+    return WriteData((uint8_t *)&tmp, 8);
   }
+
+ private:
+  const ByteSwapper byte_swapper_;
 };
 
 }  // namespace tensorflow
 
-#endif
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc b/tensorflow/contrib/ignite/kernels/ignite_dataset.cc
index f25f8a5b18..c4a7d3c513 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset.cc
@@ -13,40 +13,41 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "ignite_dataset_iterator.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 
-IgniteDataset::IgniteDataset(OpKernelContext* ctx, std::string cache_name,
-                             std::string host, int32 port, bool local,
-                             int32 part, int32 page_size, std::string username,
-                             std::string password, std::string certfile,
-                             std::string keyfile, std::string cert_password,
-                             std::vector<int32> schema,
-                             std::vector<int32> permutation)
+IgniteDataset::IgniteDataset(OpKernelContext* ctx, string cache_name,
+                             string host, int32 port, bool local, int32 part,
+                             int32 page_size, string username, string password,
+                             string certfile, string keyfile,
+                             string cert_password, std::vector<int32> schema,
+                             std::vector<int32> permutation,
+                             DataTypeVector dtypes,
+                             std::vector<PartialTensorShape> shapes)
     : DatasetBase(DatasetContext(ctx)),
-      cache_name_(cache_name),
-      host_(host),
+      cache_name_(std::move(cache_name)),
+      host_(std::move(host)),
       port_(port),
       local_(local),
       part_(part),
       page_size_(page_size),
-      username_(username),
-      password_(password),
-      certfile_(certfile),
-      keyfile_(keyfile),
-      cert_password_(cert_password),
-      schema_(schema),
-      permutation_(permutation) {
-  SchemaToTypes();
-  SchemaToShapes();
-
-  LOG(INFO) << "Ignite Dataset created [cache_name='" << cache_name
-            << "', host='" << host << "', port=" << port << ", local=" << local
-            << ", part=" << part << ", page_size=" << page_size
-            << ", username='" << username << "', certfile='" << certfile
-            << "', keyfile='" << keyfile + "']";
+      username_(std::move(username)),
+      password_(std::move(password)),
+      certfile_(std::move(certfile)),
+      keyfile_(std::move(keyfile)),
+      cert_password_(std::move(cert_password)),
+      schema_(std::move(schema)),
+      permutation_(std::move(permutation)),
+      dtypes_(dtypes),
+      shapes_(shapes) {
+  LOG(INFO) << "Ignite Dataset created [cache_name='" << cache_name_
+            << "', host='" << host_ << "', port=" << port_
+            << ", local=" << local_ << ", part=" << part_
+            << ", page_size=" << page_size_ << ", username='" << username_
+            << "', certfile='" << certfile_ << "', keyfile='"
+            << keyfile_ + "']";
 }
 
 IgniteDataset::~IgniteDataset() { LOG(INFO) << "Ignite Dataset destroyed"; }
@@ -54,10 +55,12 @@ IgniteDataset::~IgniteDataset() { LOG(INFO) << "Ignite Dataset destroyed"; }
 std::unique_ptr<IteratorBase> IgniteDataset::MakeIteratorInternal(
     const string& prefix) const {
   return std::unique_ptr<IteratorBase>(new IgniteDatasetIterator(
-      {this, strings::StrCat(prefix, "::Ignite")}, this->host_, this->port_,
-      this->cache_name_, this->local_, this->part_, this->page_size_,
-      this->username_, this->password_, this->certfile_, this->keyfile_,
-      this->cert_password_, this->schema_, this->permutation_));
+      {this, strings::StrCat(prefix, "::Ignite")}, std::move(this->host_),
+      this->port_, std::move(this->cache_name_), this->local_, this->part_,
+      this->page_size_, std::move(this->username_), std::move(this->password_),
+      std::move(this->certfile_), std::move(this->keyfile_),
+      std::move(this->cert_password_), std::move(this->schema_),
+      std::move(this->permutation_)));
 }
 
 const DataTypeVector& IgniteDataset::output_dtypes() const { return dtypes_; }
@@ -75,42 +78,4 @@ Status IgniteDataset::AsGraphDefInternal(SerializationContext* ctx,
       "IgniteDataset does not support 'AsGraphDefInternal'");
 }
 
-void IgniteDataset::SchemaToTypes() {
-  for (auto e : schema_) {
-    if (e == BYTE || e == BYTE_ARR) {
-      dtypes_.push_back(DT_UINT8);
-    } else if (e == SHORT || e == SHORT_ARR) {
-      dtypes_.push_back(DT_INT16);
-    } else if (e == INT || e == INT_ARR) {
-      dtypes_.push_back(DT_INT32);
-    } else if (e == LONG || e == LONG_ARR) {
-      dtypes_.push_back(DT_INT64);
-    } else if (e == FLOAT || e == FLOAT_ARR) {
-      dtypes_.push_back(DT_FLOAT);
-    } else if (e == DOUBLE || e == DOUBLE_ARR) {
-      dtypes_.push_back(DT_DOUBLE);
-    } else if (e == UCHAR || e == UCHAR_ARR) {
-      dtypes_.push_back(DT_UINT8);
-    } else if (e == BOOL || e == BOOL_ARR) {
-      dtypes_.push_back(DT_BOOL);
-    } else if (e == STRING || e == STRING_ARR) {
-      dtypes_.push_back(DT_STRING);
-    } else {
-      LOG(ERROR) << "Unexpected type in schema [type_id=" << e << "]";
-    }
-  }
-}
-
-void IgniteDataset::SchemaToShapes() {
-  for (auto e : schema_) {
-    if (e >= 1 && e < 10) {
-      shapes_.push_back(PartialTensorShape({}));
-    } else if (e >= 12 && e < 21) {
-      shapes_.push_back(PartialTensorShape({-1}));
-    } else {
-      LOG(ERROR) << "Unexpected type in schema [type_id=" << e << "]";
-    }
-  }
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.h b/tensorflow/contrib/ignite/kernels/ignite_dataset.h
index d3fec5910b..66bfdf2e2a 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset.h
@@ -13,18 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_H_
+
 #include "tensorflow/core/framework/dataset.h"
 
 namespace tensorflow {
 
 class IgniteDataset : public DatasetBase {
  public:
-  IgniteDataset(OpKernelContext* ctx, std::string cache_name, std::string host,
+  IgniteDataset(OpKernelContext* ctx, string cache_name, string host,
                 int32 port, bool local, int32 part, int32 page_size,
-                std::string username, std::string password,
-                std::string certfile, std::string keyfile,
-                std::string cert_password, std::vector<int32> schema,
-                std::vector<int32> permutation);
+                string username, string password, string certfile,
+                string keyfile, string cert_password, std::vector<int32> schema,
+                std::vector<int32> permutation, DataTypeVector dtypes,
+                std::vector<PartialTensorShape> shapes);
   ~IgniteDataset();
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const override;
@@ -38,25 +41,23 @@ class IgniteDataset : public DatasetBase {
                             Node** output) const override;
 
  private:
-  const std::string cache_name_;
-  const std::string host_;
+  const string cache_name_;
+  const string host_;
   const int32 port_;
   const bool local_;
   const int32 part_;
   const int32 page_size_;
-  const std::string username_;
-  const std::string password_;
-  const std::string certfile_;
-  const std::string keyfile_;
-  const std::string cert_password_;
+  const string username_;
+  const string password_;
+  const string certfile_;
+  const string keyfile_;
+  const string cert_password_;
   const std::vector<int32> schema_;
   const std::vector<int32> permutation_;
-
-  DataTypeVector dtypes_;
-  std::vector<PartialTensorShape> shapes_;
-
-  void SchemaToTypes();
-  void SchemaToShapes();
+  const DataTypeVector dtypes_;
+  const std::vector<PartialTensorShape> shapes_;
 };
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc b/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
index 1774585ecd..f68ded5a3a 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "ignite_dataset_iterator.h"
-
-#include "ignite_plain_client.h"
-#include "ignite_ssl_wrapper.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
 
 #include <time.h>
@@ -25,30 +25,31 @@ limitations under the License.
 namespace tensorflow {
 
 IgniteDatasetIterator::IgniteDatasetIterator(
-    const Params& params, std::string host, int32 port, std::string cache_name,
-    bool local, int32 part, int32 page_size, std::string username,
-    std::string password, std::string certfile, std::string keyfile,
-    std::string cert_password, std::vector<int32> schema,
-    std::vector<int32> permutation)
+    const Params& params, string host, int32 port, string cache_name,
+    bool local, int32 part, int32 page_size, string username, string password,
+    string certfile, string keyfile, string cert_password,
+    std::vector<int32> schema, std::vector<int32> permutation)
     : DatasetIterator<IgniteDataset>(params),
-      cache_name_(cache_name),
+      cache_name_(std::move(cache_name)),
       local_(local),
       part_(part),
       page_size_(page_size),
-      username_(username),
-      password_(password),
-      schema_(schema),
-      permutation_(permutation),
+      username_(std::move(username)),
+      password_(std::move(password)),
+      schema_(std::move(schema)),
+      permutation_(std::move(permutation)),
       remainder_(-1),
       cursor_id_(-1),
-      last_page_(false) {
-  Client* p_client = new PlainClient(host, port);
+      last_page_(false),
+      valid_state_(true) {
+  Client* p_client = new PlainClient(std::move(host), port, false);
 
   if (certfile.empty())
     client_ = std::unique_ptr<Client>(p_client);
   else
-    client_ = std::unique_ptr<Client>(new SslWrapper(
-        std::unique_ptr<Client>(p_client), certfile, keyfile, cert_password));
+    client_ = std::unique_ptr<Client>(
+        new SslWrapper(std::unique_ptr<Client>(p_client), std::move(certfile),
+                       std::move(keyfile), std::move(cert_password), false));
 
   LOG(INFO) << "Ignite Dataset Iterator created";
 }
@@ -60,12 +61,80 @@ IgniteDatasetIterator::~IgniteDatasetIterator() {
   LOG(INFO) << "Ignite Dataset Iterator destroyed";
 }
 
+Status IgniteDatasetIterator::GetNextInternal(IteratorContext* ctx,
+                                              std::vector<Tensor>* out_tensors,
+                                              bool* end_of_sequence) {
+  mutex_lock l(mutex_);
+
+  if (valid_state_) {
+    Status status =
+        GetNextInternalWithValidState(ctx, out_tensors, end_of_sequence);
+
+    if (!status.ok()) valid_state_ = false;
+
+    return status;
+  }
+
+  return errors::Unknown("Iterator is invalid");
+}
+
+Status IgniteDatasetIterator::SaveInternal(IteratorStateWriter* writer) {
+  return errors::Unimplemented(
+      "Iterator for IgniteDataset does not support 'SaveInternal'");
+}
+
+Status IgniteDatasetIterator::RestoreInternal(IteratorContext* ctx,
+                                              IteratorStateReader* reader) {
+  return errors::Unimplemented(
+      "Iterator for IgniteDataset does not support 'RestoreInternal')");
+}
+
+Status IgniteDatasetIterator::GetNextInternalWithValidState(
+    IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+    bool* end_of_sequence) {
+  if (remainder_ == 0 && last_page_) {
+    cursor_id_ = -1;
+    *end_of_sequence = true;
+
+    return Status::OK();
+  } else {
+    TF_RETURN_IF_ERROR(EstablishConnection());
+
+    if (remainder_ == -1) {
+      TF_RETURN_IF_ERROR(ScanQuery());
+    } else if (remainder_ == 0) {
+      TF_RETURN_IF_ERROR(LoadNextPage());
+    }
+
+    uint8_t* initial_ptr = ptr_;
+    std::vector<Tensor> tensors;
+    std::vector<int32_t> types;
+
+    TF_RETURN_IF_ERROR(parser_.Parse(&ptr_, &tensors, &types));  // Parse key
+    TF_RETURN_IF_ERROR(parser_.Parse(&ptr_, &tensors, &types));  // Parse val
+
+    remainder_ -= (ptr_ - initial_ptr);
+
+    TF_RETURN_IF_ERROR(CheckTypes(types));
+
+    for (size_t i = 0; i < tensors.size(); i++)
+      out_tensors->push_back(tensors[permutation_[i]]);
+
+    *end_of_sequence = false;
+
+    return Status::OK();
+  }
+
+  *end_of_sequence = true;
+
+  return Status::OK();
+}
+
 Status IgniteDatasetIterator::EstablishConnection() {
   if (!client_->IsConnected()) {
-    Status status = client_->Connect();
-    if (!status.ok()) return status;
+    TF_RETURN_IF_ERROR(client_->Connect());
 
-    status = Handshake();
+    Status status = Handshake();
     if (!status.ok()) {
       Status disconnect_status = client_->Disconnect();
       if (!disconnect_status.ok()) LOG(ERROR) << disconnect_status.ToString();
@@ -79,19 +148,17 @@ Status IgniteDatasetIterator::EstablishConnection() {
 
 Status IgniteDatasetIterator::CloseConnection() {
   if (cursor_id_ != -1 && !last_page_) {
-    Status conn_status = EstablishConnection();
-    if (!conn_status.ok()) return conn_status;
+    TF_RETURN_IF_ERROR(EstablishConnection());
 
-    TF_RETURN_IF_ERROR(client_->WriteInt(18));  // Message length
-    TF_RETURN_IF_ERROR(
-        client_->WriteShort(close_connection_opcode));   // Operation code
+    TF_RETURN_IF_ERROR(client_->WriteInt(kCloseConnectionReqLength));
+    TF_RETURN_IF_ERROR(client_->WriteShort(kCloseConnectionOpcode));
     TF_RETURN_IF_ERROR(client_->WriteLong(0));           // Request ID
     TF_RETURN_IF_ERROR(client_->WriteLong(cursor_id_));  // Resource ID
 
     int32_t res_len;
     TF_RETURN_IF_ERROR(client_->ReadInt(&res_len));
-    if (res_len < 12)
-      return errors::Internal("Close Resource Response is corrupted");
+    if (res_len < kMinResLength)
+      return errors::Unknown("Close Resource Response is corrupted");
 
     int64_t req_id;
     TF_RETURN_IF_ERROR(client_->ReadLong(&req_id));
@@ -100,22 +167,21 @@ Status IgniteDatasetIterator::CloseConnection() {
     if (status != 0) {
       uint8_t err_msg_header;
       TF_RETURN_IF_ERROR(client_->ReadByte(&err_msg_header));
-      if (err_msg_header == string_val) {
+      if (err_msg_header == kStringVal) {
         int32_t err_msg_length;
         TF_RETURN_IF_ERROR(client_->ReadInt(&err_msg_length));
+
         uint8_t* err_msg_c = new uint8_t[err_msg_length];
+        auto clean = gtl::MakeCleanup([err_msg_c] { delete[] err_msg_c; });
         TF_RETURN_IF_ERROR(client_->ReadData(err_msg_c, err_msg_length));
-        std::string err_msg((char*)err_msg_c, err_msg_length);
-        delete[] err_msg_c;
+        string err_msg(reinterpret_cast<char*>(err_msg_c), err_msg_length);
 
-        return errors::Internal("Close Resource Error [status=", status,
-                                ", message=", err_msg, "]");
+        return errors::Unknown("Close Resource Error [status=", status,
+                               ", message=", err_msg, "]");
       }
-      return errors::Internal("Close Resource Error [status=", status, "]");
+      return errors::Unknown("Close Resource Error [status=", status, "]");
     }
 
-    LOG(INFO) << "Query Cursor " << cursor_id_ << " is closed";
-
     cursor_id_ = -1;
 
     return client_->Disconnect();
@@ -126,94 +192,43 @@ Status IgniteDatasetIterator::CloseConnection() {
   return client_->IsConnected() ? client_->Disconnect() : Status::OK();
 }
 
-Status IgniteDatasetIterator::GetNextInternal(IteratorContext* ctx,
-                                              std::vector<Tensor>* out_tensors,
-                                              bool* end_of_sequence) {
-  if (remainder_ == 0 && last_page_) {
-    LOG(INFO) << "Query Cursor " << cursor_id_ << " is closed";
-
-    cursor_id_ = -1;
-    *end_of_sequence = true;
-    return Status::OK();
-  } else {
-    Status status = EstablishConnection();
-    if (!status.ok()) return status;
-
-    if (remainder_ == -1 || remainder_ == 0) {
-      Status status = remainder_ == -1 ? ScanQuery() : LoadNextPage();
-      if (!status.ok()) return status;
-    }
-
-    uint8_t* initial_ptr = ptr_;
-    std::vector<int32_t> types;
-    std::vector<Tensor> tensors;
-
-    status = parser_.Parse(&ptr_, &tensors, &types);  // Parse key
-    if (!status.ok()) return status;
-
-    status = parser_.Parse(&ptr_, &tensors, &types);  // Parse val
-    if (!status.ok()) return status;
-
-    remainder_ -= (ptr_ - initial_ptr);
-
-    out_tensors->resize(tensors.size());
-    for (int32_t i = 0; i < tensors.size(); i++)
-      (*out_tensors)[permutation_[i]] = std::move(tensors[i]);
-
-    *end_of_sequence = false;
-    return Status::OK();
-  }
-
-  *end_of_sequence = true;
-  return Status::OK();
-}
-
-Status IgniteDatasetIterator::SaveInternal(IteratorStateWriter* writer) {
-  return errors::Unimplemented(
-      "Iterator for IgniteDataset does not support 'SaveInternal'");
-}
-
-Status IgniteDatasetIterator::RestoreInternal(IteratorContext* ctx,
-                                              IteratorStateReader* reader) {
-  return errors::Unimplemented(
-      "Iterator for IgniteDataset does not support 'RestoreInternal')");
-}
-
 Status IgniteDatasetIterator::Handshake() {
-  int32_t msg_len = 8;
+  int32_t msg_len = kHandshakeReqDefaultLength;
 
   if (username_.empty())
     msg_len += 1;
   else
-    msg_len += 5 + username_.length();
+    msg_len += 5 + username_.length();  // 1 byte header, 4 bytes length.
 
   if (password_.empty())
     msg_len += 1;
   else
-    msg_len += 5 + password_.length();
+    msg_len += 5 + password_.length();  // 1 byte header, 4 bytes length.
 
   TF_RETURN_IF_ERROR(client_->WriteInt(msg_len));
   TF_RETURN_IF_ERROR(client_->WriteByte(1));
-  TF_RETURN_IF_ERROR(client_->WriteShort(protocol_major_version));
-  TF_RETURN_IF_ERROR(client_->WriteShort(protocol_minor_version));
-  TF_RETURN_IF_ERROR(client_->WriteShort(protocol_patch_version));
+  TF_RETURN_IF_ERROR(client_->WriteShort(kProtocolMajorVersion));
+  TF_RETURN_IF_ERROR(client_->WriteShort(kProtocolMinorVersion));
+  TF_RETURN_IF_ERROR(client_->WriteShort(kProtocolPatchVersion));
   TF_RETURN_IF_ERROR(client_->WriteByte(2));
   if (username_.empty()) {
-    TF_RETURN_IF_ERROR(client_->WriteByte(null_val));
+    TF_RETURN_IF_ERROR(client_->WriteByte(kNullVal));
   } else {
-    TF_RETURN_IF_ERROR(client_->WriteByte(string_val));
+    TF_RETURN_IF_ERROR(client_->WriteByte(kStringVal));
     TF_RETURN_IF_ERROR(client_->WriteInt(username_.length()));
     TF_RETURN_IF_ERROR(
-        client_->WriteData((uint8_t*)username_.c_str(), username_.length()));
+        client_->WriteData(reinterpret_cast<const uint8_t*>(username_.c_str()),
+                           username_.length()));
   }
 
   if (password_.empty()) {
-    TF_RETURN_IF_ERROR(client_->WriteByte(null_val));
+    TF_RETURN_IF_ERROR(client_->WriteByte(kNullVal));
   } else {
-    TF_RETURN_IF_ERROR(client_->WriteByte(string_val));
+    TF_RETURN_IF_ERROR(client_->WriteByte(kStringVal));
     TF_RETURN_IF_ERROR(client_->WriteInt(password_.length()));
     TF_RETURN_IF_ERROR(
-        client_->WriteData((uint8_t*)password_.c_str(), password_.length()));
+        client_->WriteData(reinterpret_cast<const uint8_t*>(password_.c_str()),
+                           password_.length()));
   }
 
   int32_t handshake_res_len;
@@ -221,9 +236,6 @@ Status IgniteDatasetIterator::Handshake() {
   uint8_t handshake_res;
   TF_RETURN_IF_ERROR(client_->ReadByte(&handshake_res));
 
-  LOG(INFO) << "Handshake length " << handshake_res_len << ", res "
-            << (int16_t)handshake_res;
-
   if (handshake_res != 1) {
     int16_t serv_ver_major;
     TF_RETURN_IF_ERROR(client_->ReadShort(&serv_ver_major));
@@ -234,26 +246,26 @@ Status IgniteDatasetIterator::Handshake() {
     uint8_t header;
     TF_RETURN_IF_ERROR(client_->ReadByte(&header));
 
-    if (header == string_val) {
+    if (header == kStringVal) {
       int32_t length;
       TF_RETURN_IF_ERROR(client_->ReadInt(&length));
+
       uint8_t* err_msg_c = new uint8_t[length];
+      auto clean = gtl::MakeCleanup([err_msg_c] { delete[] err_msg_c; });
       TF_RETURN_IF_ERROR(client_->ReadData(err_msg_c, length));
-      std::string err_msg((char*)err_msg_c, length);
-      delete[] err_msg_c;
-
-      return errors::Internal("Handshake Error [result=", handshake_res,
-                              ", version=", serv_ver_major, ".", serv_ver_minor,
-                              ".", serv_ver_patch, ", message='", err_msg,
-                              "']");
-    } else if (header == null_val) {
-      return errors::Internal("Handshake Error [result=", handshake_res,
-                              ", version=", serv_ver_major, ".", serv_ver_minor,
-                              ".", serv_ver_patch, "]");
+      string err_msg(reinterpret_cast<char*>(err_msg_c), length);
+
+      return errors::Unknown("Handshake Error [result=", handshake_res,
+                             ", version=", serv_ver_major, ".", serv_ver_minor,
+                             ".", serv_ver_patch, ", message='", err_msg, "']");
+    } else if (header == kNullVal) {
+      return errors::Unknown("Handshake Error [result=", handshake_res,
+                             ", version=", serv_ver_major, ".", serv_ver_minor,
+                             ".", serv_ver_patch, "]");
     } else {
-      return errors::Internal("Handshake Error [result=", handshake_res,
-                              ", version=", serv_ver_major, ".", serv_ver_minor,
-                              ".", serv_ver_patch, "]");
+      return errors::Unknown("Handshake Error [result=", handshake_res,
+                             ", version=", serv_ver_major, ".", serv_ver_minor,
+                             ".", serv_ver_patch, "]");
     }
   }
 
@@ -261,31 +273,26 @@ Status IgniteDatasetIterator::Handshake() {
 }
 
 Status IgniteDatasetIterator::ScanQuery() {
-  TF_RETURN_IF_ERROR(client_->WriteInt(25));                   // Message length
-  TF_RETURN_IF_ERROR(client_->WriteShort(scan_query_opcode));  // Operation code
-  TF_RETURN_IF_ERROR(client_->WriteLong(0));                   // Request ID
+  TF_RETURN_IF_ERROR(client_->WriteInt(kScanQueryReqLength));
+  TF_RETURN_IF_ERROR(client_->WriteShort(kScanQueryOpcode));
+  TF_RETURN_IF_ERROR(client_->WriteLong(0));  // Request ID
   TF_RETURN_IF_ERROR(
       client_->WriteInt(JavaHashCode(cache_name_)));  // Cache name
   TF_RETURN_IF_ERROR(client_->WriteByte(0));          // Flags
-  TF_RETURN_IF_ERROR(client_->WriteByte(null_val));   // Filter object
+  TF_RETURN_IF_ERROR(client_->WriteByte(kNullVal));   // Filter object
   TF_RETURN_IF_ERROR(client_->WriteInt(page_size_));  // Cursor page size
   TF_RETURN_IF_ERROR(client_->WriteInt(part_));       // part_ition to query
   TF_RETURN_IF_ERROR(client_->WriteByte(local_));     // local_ flag
 
-  int64_t wait_start = std::chrono::duration_cast<std::chrono::milliseconds>(
-                           std::chrono::system_clock::now().time_since_epoch())
-                           .count();
-
+  uint64 wait_start = Env::Default()->NowMicros();
   int32_t res_len;
   TF_RETURN_IF_ERROR(client_->ReadInt(&res_len));
+  int64_t wait_stop = Env::Default()->NowMicros();
 
-  int64_t wait_stop = std::chrono::duration_cast<std::chrono::milliseconds>(
-                          std::chrono::system_clock::now().time_since_epoch())
-                          .count();
+  LOG(INFO) << "Scan Query waited " << (wait_stop - wait_start) / 1000 << " ms";
 
-  LOG(INFO) << "Scan Query waited " << (wait_stop - wait_start) << " ms";
-
-  if (res_len < 12) return errors::Internal("Scan Query Response is corrupted");
+  if (res_len < kMinResLength)
+    return errors::Unknown("Scan Query Response is corrupted");
 
   int64_t req_id;
   TF_RETURN_IF_ERROR(client_->ReadLong(&req_id));
@@ -297,78 +304,47 @@ Status IgniteDatasetIterator::ScanQuery() {
     uint8_t err_msg_header;
     TF_RETURN_IF_ERROR(client_->ReadByte(&err_msg_header));
 
-    if (err_msg_header == string_val) {
+    if (err_msg_header == kStringVal) {
       int32_t err_msg_length;
       TF_RETURN_IF_ERROR(client_->ReadInt(&err_msg_length));
 
       uint8_t* err_msg_c = new uint8_t[err_msg_length];
+      auto clean = gtl::MakeCleanup([err_msg_c] { delete[] err_msg_c; });
       TF_RETURN_IF_ERROR(client_->ReadData(err_msg_c, err_msg_length));
-      std::string err_msg((char*)err_msg_c, err_msg_length);
-      delete[] err_msg_c;
+      string err_msg(reinterpret_cast<char*>(err_msg_c), err_msg_length);
 
-      return errors::Internal("Scan Query Error [status=", status, ", message=",
-                              err_msg, "]");
+      return errors::Unknown("Scan Query Error [status=", status, ", message=",
+                             err_msg, "]");
     }
-    return errors::Internal("Scan Query Error [status=", status, "]");
+    return errors::Unknown("Scan Query Error [status=", status, "]");
   }
 
   TF_RETURN_IF_ERROR(client_->ReadLong(&cursor_id_));
 
-  LOG(INFO) << "Query Cursor " << cursor_id_ << " is opened";
-
   int32_t row_cnt;
   TF_RETURN_IF_ERROR(client_->ReadInt(&row_cnt));
 
-  remainder_ = res_len - 25;
-  page_ = std::unique_ptr<uint8_t>(new uint8_t[remainder_]);
-  ptr_ = page_.get();
-
-  int64_t start = std::chrono::duration_cast<std::chrono::milliseconds>(
-                      std::chrono::system_clock::now().time_since_epoch())
-                      .count();
-
-  TF_RETURN_IF_ERROR(client_->ReadData(ptr_, remainder_));
-
-  int64_t stop = std::chrono::duration_cast<std::chrono::milliseconds>(
-                     std::chrono::system_clock::now().time_since_epoch())
-                     .count();
-  ;
-
-  double size_in_mb = 1.0 * remainder_ / 1024 / 1024;
-  double time_in_s = 1.0 * (stop - start) / 1000;
-  LOG(INFO) << "Page size " << size_in_mb << " Mb, time " << time_in_s * 1000
-            << " ms download speed " << size_in_mb / time_in_s << " Mb/sec";
-
-  uint8_t last_page_b;
-  TF_RETURN_IF_ERROR(client_->ReadByte(&last_page_b));
-
-  last_page_ = !last_page_b;
+  int32_t page_size = res_len - kScanQueryResHeaderLength;
 
-  return Status::OK();
+  return ReceivePage(page_size);
 }
 
 Status IgniteDatasetIterator::LoadNextPage() {
-  TF_RETURN_IF_ERROR(client_->WriteInt(18));  // Message length
-  TF_RETURN_IF_ERROR(
-      client_->WriteShort(load_next_page_opcode));     // Operation code
+  TF_RETURN_IF_ERROR(client_->WriteInt(kLoadNextPageReqLength));
+  TF_RETURN_IF_ERROR(client_->WriteShort(kLoadNextPageOpcode));
   TF_RETURN_IF_ERROR(client_->WriteLong(0));           // Request ID
   TF_RETURN_IF_ERROR(client_->WriteLong(cursor_id_));  // Cursor ID
 
-  int64_t wait_start = std::chrono::duration_cast<std::chrono::milliseconds>(
-                           std::chrono::system_clock::now().time_since_epoch())
-                           .count();
-
+  uint64 wait_start = Env::Default()->NowMicros();
   int32_t res_len;
   TF_RETURN_IF_ERROR(client_->ReadInt(&res_len));
+  uint64 wait_stop = Env::Default()->NowMicros();
 
-  int64_t wait_stop = std::chrono::duration_cast<std::chrono::milliseconds>(
-                          std::chrono::system_clock::now().time_since_epoch())
-                          .count();
+  LOG(INFO) << "Load Next Page waited " << (wait_stop - wait_start) / 1000
+            << " ms";
 
-  LOG(INFO) << "Load Next Page waited " << (wait_stop - wait_start) << " ms";
-
-  if (res_len < 12)
-    return errors::Internal("Load Next Page Response is corrupted");
+  if (res_len < kMinResLength)
+    return errors::Unknown("Load Next Page Response is corrupted");
 
   int64_t req_id;
   TF_RETURN_IF_ERROR(client_->ReadLong(&req_id));
@@ -380,41 +356,40 @@ Status IgniteDatasetIterator::LoadNextPage() {
     uint8_t err_msg_header;
     TF_RETURN_IF_ERROR(client_->ReadByte(&err_msg_header));
 
-    if (err_msg_header == string_val) {
+    if (err_msg_header == kStringVal) {
       int32_t err_msg_length;
       TF_RETURN_IF_ERROR(client_->ReadInt(&err_msg_length));
 
       uint8_t* err_msg_c = new uint8_t[err_msg_length];
+      auto clean = gtl::MakeCleanup([err_msg_c] { delete[] err_msg_c; });
       TF_RETURN_IF_ERROR(client_->ReadData(err_msg_c, err_msg_length));
-      std::string err_msg((char*)err_msg_c, err_msg_length);
-      delete[] err_msg_c;
+      string err_msg(reinterpret_cast<char*>(err_msg_c), err_msg_length);
 
-      return errors::Internal("Load Next Page Error [status=", status,
-                              ", message=", err_msg, "]");
+      return errors::Unknown("Load Next Page Error [status=", status,
+                             ", message=", err_msg, "]");
     }
-    return errors::Internal("Load Next Page Error [status=", status, "]");
+    return errors::Unknown("Load Next Page Error [status=", status, "]");
   }
 
   int32_t row_cnt;
   TF_RETURN_IF_ERROR(client_->ReadInt(&row_cnt));
 
-  remainder_ = res_len - 17;
+  int32_t page_size = res_len - kLoadNextPageResHeaderLength;
+
+  return ReceivePage(page_size);
+}
+
+Status IgniteDatasetIterator::ReceivePage(int32_t page_size) {
+  remainder_ = page_size;
   page_ = std::unique_ptr<uint8_t>(new uint8_t[remainder_]);
   ptr_ = page_.get();
 
-  int64_t start = std::chrono::duration_cast<std::chrono::milliseconds>(
-                      std::chrono::system_clock::now().time_since_epoch())
-                      .count();
-
+  uint64 start = Env::Default()->NowMicros();
   TF_RETURN_IF_ERROR(client_->ReadData(ptr_, remainder_));
-
-  int64_t stop = std::chrono::duration_cast<std::chrono::milliseconds>(
-                     std::chrono::system_clock::now().time_since_epoch())
-                     .count();
-  ;
+  uint64 stop = Env::Default()->NowMicros();
 
   double size_in_mb = 1.0 * remainder_ / 1024 / 1024;
-  double time_in_s = 1.0 * (stop - start) / 1000;
+  double time_in_s = 1.0 * (stop - start) / 1000 / 1000;
   LOG(INFO) << "Page size " << size_in_mb << " Mb, time " << time_in_s * 1000
             << " ms download speed " << size_in_mb / time_in_s << " Mb/sec";
 
@@ -426,7 +401,19 @@ Status IgniteDatasetIterator::LoadNextPage() {
   return Status::OK();
 }
 
-int32_t IgniteDatasetIterator::JavaHashCode(std::string str) const {
+Status IgniteDatasetIterator::CheckTypes(const std::vector<int32_t>& types) {
+  if (schema_.size() != types.size())
+    return errors::Unknown("Object has unexpected schema");
+
+  for (size_t i = 0; i < schema_.size(); i++) {
+    if (schema_[i] != types[permutation_[i]])
+      return errors::Unknown("Object has unexpected schema");
+  }
+
+  return Status::OK();
+}
+
+int32_t IgniteDatasetIterator::JavaHashCode(string str) const {
   int32_t h = 0;
   for (char& c : str) {
     h = 31 * h + c;
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h b/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
index 5858dbfcb9..c499e2c9cc 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
@@ -13,19 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "ignite_binary_object_parser.h"
-#include "ignite_client.h"
-#include "ignite_dataset.h"
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_
+
+#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_client.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_dataset.h"
+#include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
 
 class IgniteDatasetIterator : public DatasetIterator<IgniteDataset> {
  public:
-  IgniteDatasetIterator(const Params& params, std::string host, int32 port,
-                        std::string cache_name, bool local, int32 part,
-                        int32 page_size, std::string username,
-                        std::string password, std::string certfile,
-                        std::string keyfile, std::string cert_password,
+  IgniteDatasetIterator(const Params& params, string host, int32 port,
+                        string cache_name, bool local, int32 part,
+                        int32 page_size, string username, string password,
+                        string certfile, string keyfile, string cert_password,
                         std::vector<int32> schema,
                         std::vector<int32> permutation);
   ~IgniteDatasetIterator();
@@ -38,15 +41,28 @@ class IgniteDatasetIterator : public DatasetIterator<IgniteDataset> {
                          IteratorStateReader* reader) override;
 
  private:
+  Status GetNextInternalWithValidState(IteratorContext* ctx,
+                                       std::vector<Tensor>* out_tensors,
+                                       bool* end_of_sequence);
+
+  Status EstablishConnection();
+  Status CloseConnection();
+  Status Handshake();
+  Status ScanQuery();
+  Status LoadNextPage();
+  Status ReceivePage(int32_t page_size);
+  Status CheckTypes(const std::vector<int32_t>& types);
+  int32_t JavaHashCode(string str) const;
+
   std::unique_ptr<Client> client_;
   BinaryObjectParser parser_;
 
-  const std::string cache_name_;
+  const string cache_name_;
   const bool local_;
   const int32 part_;
   const int32 page_size_;
-  const std::string username_;
-  const std::string password_;
+  const string username_;
+  const string password_;
   const std::vector<int32> schema_;
   const std::vector<int32> permutation_;
 
@@ -54,24 +70,30 @@ class IgniteDatasetIterator : public DatasetIterator<IgniteDataset> {
   int64_t cursor_id_;
   bool last_page_;
 
+  bool valid_state_;
+
+  mutex mutex_;
+
   std::unique_ptr<uint8_t> page_;
   uint8_t* ptr_;
-
-  Status EstablishConnection();
-  Status CloseConnection();
-  Status Handshake();
-  Status ScanQuery();
-  Status LoadNextPage();
-  int32_t JavaHashCode(std::string str) const;
 };
 
-constexpr uint8_t null_val = 101;
-constexpr uint8_t string_val = 9;
-constexpr uint8_t protocol_major_version = 1;
-constexpr uint8_t protocol_minor_version = 1;
-constexpr uint8_t protocol_patch_version = 0;
-constexpr int16_t scan_query_opcode = 2000;
-constexpr int16_t load_next_page_opcode = 2001;
-constexpr int16_t close_connection_opcode = 0;
+constexpr uint8_t kNullVal = 101;
+constexpr uint8_t kStringVal = 9;
+constexpr uint8_t kProtocolMajorVersion = 1;
+constexpr uint8_t kProtocolMinorVersion = 1;
+constexpr uint8_t kProtocolPatchVersion = 0;
+constexpr int16_t kScanQueryOpcode = 2000;
+constexpr int16_t kLoadNextPageOpcode = 2001;
+constexpr int16_t kCloseConnectionOpcode = 0;
+constexpr int32_t kScanQueryReqLength = 25;
+constexpr int32_t kScanQueryResHeaderLength = 25;
+constexpr int32_t kLoadNextPageReqLength = 18;
+constexpr int32_t kLoadNextPageResHeaderLength = 17;
+constexpr int32_t kCloseConnectionReqLength = 18;
+constexpr int32_t kHandshakeReqDefaultLength = 8;
+constexpr int32_t kMinResLength = 12;
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc b/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
index d03404a460..eeb29ef30b 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
@@ -13,29 +13,73 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/contrib/ignite/kernels/ignite_dataset.h"
 #include <stdlib.h>
-#include "ignite_dataset.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
 #include "tensorflow/core/framework/dataset.h"
 
 namespace tensorflow {
 namespace {
 
+Status SchemaToTypes(const std::vector<int32>& schema, DataTypeVector* dtypes) {
+  for (auto e : schema) {
+    if (e == BYTE || e == BYTE_ARR) {
+      dtypes->push_back(DT_UINT8);
+    } else if (e == SHORT || e == SHORT_ARR) {
+      dtypes->push_back(DT_INT16);
+    } else if (e == INT || e == INT_ARR) {
+      dtypes->push_back(DT_INT32);
+    } else if (e == LONG || e == LONG_ARR) {
+      dtypes->push_back(DT_INT64);
+    } else if (e == FLOAT || e == FLOAT_ARR) {
+      dtypes->push_back(DT_FLOAT);
+    } else if (e == DOUBLE || e == DOUBLE_ARR) {
+      dtypes->push_back(DT_DOUBLE);
+    } else if (e == USHORT || e == USHORT_ARR) {
+      dtypes->push_back(DT_UINT8);
+    } else if (e == BOOL || e == BOOL_ARR) {
+      dtypes->push_back(DT_BOOL);
+    } else if (e == STRING || e == STRING_ARR) {
+      dtypes->push_back(DT_STRING);
+    } else {
+      return errors::Unknown("Unexpected type in schema [type_id=", e, "]");
+    }
+  }
+
+  return Status::OK();
+}
+
+Status SchemaToShapes(const std::vector<int32>& schema,
+                      std::vector<PartialTensorShape>* shapes) {
+  for (auto e : schema) {
+    if (e >= 1 && e < 10) {
+      shapes->push_back(PartialTensorShape({}));
+    } else if (e >= 12 && e < 21) {
+      shapes->push_back(PartialTensorShape({-1}));
+    } else {
+      return errors::Unknown("Unexpected type in schema [type_id=", e, "]");
+    }
+  }
+
+  return Status::OK();
+}
+
 class IgniteDatasetOp : public DatasetOpKernel {
  public:
   using DatasetOpKernel::DatasetOpKernel;
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
-    std::string cache_name = "";
-    std::string host = "";
+    string cache_name = "";
+    string host = "";
     int32 port = -1;
     bool local = false;
     int32 part = -1;
     int32 page_size = -1;
-    std::string username = "";
-    std::string password = "";
-    std::string certfile = "";
-    std::string keyfile = "";
-    std::string cert_password = "";
+    string username = "";
+    string password = "";
+    string certfile = "";
+    string keyfile = "";
+    string cert_password = "";
 
     const char* env_cache_name = std::getenv("IGNITE_DATASET_CACHE_NAME");
     const char* env_host = std::getenv("IGNITE_DATASET_HOST");
@@ -50,15 +94,15 @@ class IgniteDatasetOp : public DatasetOpKernel {
     const char* env_cert_password = std::getenv("IGNITE_DATASET_CERT_PASSWORD");
 
     if (env_cache_name)
-      cache_name = std::string(env_cache_name);
+      cache_name = string(env_cache_name);
     else
-      OP_REQUIRES_OK(ctx, ParseScalarArgument<std::string>(ctx, "cache_name",
-                                                           &cache_name));
+      OP_REQUIRES_OK(
+          ctx, ParseScalarArgument<string>(ctx, "cache_name", &cache_name));
 
     if (env_host)
-      host = std::string(env_host);
+      host = string(env_host);
     else
-      OP_REQUIRES_OK(ctx, ParseScalarArgument<std::string>(ctx, "host", &host));
+      OP_REQUIRES_OK(ctx, ParseScalarArgument<string>(ctx, "host", &host));
 
     if (env_port)
       port = atoi(env_port);
@@ -82,34 +126,34 @@ class IgniteDatasetOp : public DatasetOpKernel {
                      ParseScalarArgument<int32>(ctx, "page_size", &page_size));
 
     if (env_username)
-      username = std::string(env_username);
+      username = string(env_username);
     else
-      OP_REQUIRES_OK(
-          ctx, ParseScalarArgument<std::string>(ctx, "username", &username));
+      OP_REQUIRES_OK(ctx,
+                     ParseScalarArgument<string>(ctx, "username", &username));
 
     if (env_password)
-      password = std::string(env_password);
+      password = string(env_password);
     else
-      OP_REQUIRES_OK(
-          ctx, ParseScalarArgument<std::string>(ctx, "password", &password));
+      OP_REQUIRES_OK(ctx,
+                     ParseScalarArgument<string>(ctx, "password", &password));
 
     if (env_certfile)
-      certfile = std::string(env_certfile);
+      certfile = string(env_certfile);
     else
-      OP_REQUIRES_OK(
-          ctx, ParseScalarArgument<std::string>(ctx, "certfile", &certfile));
+      OP_REQUIRES_OK(ctx,
+                     ParseScalarArgument<string>(ctx, "certfile", &certfile));
 
     if (env_keyfile)
-      keyfile = std::string(env_keyfile);
+      keyfile = string(env_keyfile);
     else
-      OP_REQUIRES_OK(
-          ctx, ParseScalarArgument<std::string>(ctx, "keyfile", &keyfile));
+      OP_REQUIRES_OK(ctx,
+                     ParseScalarArgument<string>(ctx, "keyfile", &keyfile));
 
     if (env_cert_password)
-      cert_password = std::string(env_cert_password);
+      cert_password = string(env_cert_password);
     else
-      OP_REQUIRES_OK(ctx, ParseScalarArgument<std::string>(ctx, "cert_password",
-                                                           &cert_password));
+      OP_REQUIRES_OK(ctx, ParseScalarArgument<string>(ctx, "cert_password",
+                                                      &cert_password));
 
     const Tensor* schema_tensor;
     OP_REQUIRES_OK(ctx, ctx->input("schema", &schema_tensor));
@@ -124,19 +168,28 @@ class IgniteDatasetOp : public DatasetOpKernel {
 
     const Tensor* permutation_tensor;
     OP_REQUIRES_OK(ctx, ctx->input("permutation", &permutation_tensor));
-    OP_REQUIRES(ctx, schema_tensor->dims() == 1,
+    OP_REQUIRES(ctx, permutation_tensor->dims() == 1,
                 errors::InvalidArgument("`permutation` must be a vector."));
 
     std::vector<int32> permutation;
-    permutation.reserve(permutation_tensor->NumElements());
+    permutation.resize(permutation_tensor->NumElements());
     for (int i = 0; i < permutation_tensor->NumElements(); i++) {
-      permutation.push_back(permutation_tensor->flat<int32>()(i));
+      // Inversed permutation.
+      permutation[permutation_tensor->flat<int32>()(i)] = i;
     }
 
-    *output =
-        new IgniteDataset(ctx, cache_name, host, port, local, part, page_size,
-                          username, password, certfile, keyfile, cert_password,
-                          std::move(schema), std::move(permutation));
+    DataTypeVector dtypes;
+    std::vector<PartialTensorShape> shapes;
+
+    OP_REQUIRES_OK(ctx, SchemaToTypes(schema, &dtypes));
+    OP_REQUIRES_OK(ctx, SchemaToShapes(schema, &shapes));
+
+    *output = new IgniteDataset(
+        ctx, std::move(cache_name), std::move(host), port, local, part,
+        page_size, std::move(username), std::move(password),
+        std::move(certfile), std::move(keyfile), std::move(cert_password),
+        std::move(schema), std::move(permutation), std::move(dtypes),
+        std::move(shapes));
   }
 };
 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h b/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
index 6f417a3cb5..750ebe605a 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
@@ -13,28 +13,31 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "ignite_client.h"
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_
 
-#include <string>
+#include "tensorflow/contrib/ignite/kernels/ignite_client.h"
 
 namespace tensorflow {
 
 class PlainClient : public Client {
  public:
-  PlainClient(std::string host, int port);
+  PlainClient(string host, int port, bool big_endian);
   ~PlainClient();
 
   virtual Status Connect();
   virtual Status Disconnect();
   virtual bool IsConnected();
   virtual int GetSocketDescriptor();
-  virtual Status ReadData(uint8_t* buf, int32_t length);
-  virtual Status WriteData(uint8_t* buf, int32_t length);
+  virtual Status ReadData(uint8_t* buf, const int32_t length);
+  virtual Status WriteData(const uint8_t* buf, const int32_t length);
 
  private:
-  const std::string host_;
+  const string host_;
   const int port_;
   int sock_;
 };
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc b/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
index a4c58a9563..e16c92307d 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
 
 #include <arpa/inet.h>
 #include <netdb.h>
@@ -31,8 +31,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-PlainClient::PlainClient(std::string host, int port)
-    : host_(host), port_(port), sock_(-1) {}
+PlainClient::PlainClient(string host, int port, bool big_endian)
+    : Client(big_endian), host_(std::move(host)), port_(port), sock_(-1) {}
 
 PlainClient::~PlainClient() {
   if (IsConnected()) {
@@ -87,7 +87,7 @@ bool PlainClient::IsConnected() { return sock_ != -1; }
 
 int PlainClient::GetSocketDescriptor() { return sock_; }
 
-Status PlainClient::ReadData(uint8_t* buf, int32_t length) {
+Status PlainClient::ReadData(uint8_t* buf, const int32_t length) {
   int recieved = 0;
 
   while (recieved < length) {
@@ -95,7 +95,7 @@ Status PlainClient::ReadData(uint8_t* buf, int32_t length) {
 
     if (res < 0)
       return errors::Internal("Error occured while reading from socket: ", res,
-                              ", ", std::string(strerror(errno)));
+                              ", ", string(strerror(errno)));
 
     if (res == 0) return errors::Internal("Server closed connection");
 
@@ -106,7 +106,7 @@ Status PlainClient::ReadData(uint8_t* buf, int32_t length) {
   return Status::OK();
 }
 
-Status PlainClient::WriteData(uint8_t* buf, int32_t length) {
+Status PlainClient::WriteData(const uint8_t* buf, const int32_t length) {
   int sent = 0;
 
   while (sent < length) {
@@ -114,7 +114,7 @@ Status PlainClient::WriteData(uint8_t* buf, int32_t length) {
 
     if (res < 0)
       return errors::Internal("Error occured while writing into socket: ", res,
-                              ", ", std::string(strerror(errno)));
+                              ", ", string(strerror(errno)));
 
     sent += res;
     buf += res;
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
index 8182fde6d9..9cd08a7779 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
 
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
@@ -29,8 +29,11 @@ limitations under the License.
 
 namespace tensorflow {
 
-PlainClient::PlainClient(std::string host, int port)
-    : host_(host), port_(port), sock_(INVALID_SOCKET) {}
+PlainClient::PlainClient(string host, int port, bool big_endian)
+    : Client(big_endian),
+      host_(std::move(host)),
+      port_(port),
+      sock_(INVALID_SOCKET) {}
 
 PlainClient::~PlainClient() {
   if (IsConnected()) {
@@ -55,6 +58,8 @@ Status PlainClient::Connect() {
                     &result);
   if (res != 0) return errors::Internal("Getaddrinfo failed with error: ", res);
 
+  auto clean = gtl::MakeCleanup([result] { reeaddrinfo(result); });
+
   for (ptr = result; ptr != NULL; ptr = ptr->ai_next) {
     sock_ = socket(ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol);
     if (sock_ == INVALID_SOCKET) {
@@ -72,8 +77,6 @@ Status PlainClient::Connect() {
     break;
   }
 
-  freeaddrinfo(result);
-
   if (sock_ == INVALID_SOCKET) {
     WSACleanup();
     return errors::Internal("Unable to connect to server");
@@ -99,7 +102,7 @@ bool PlainClient::IsConnected() { return sock_ != INVALID_SOCKET; }
 
 int PlainClient::GetSocketDescriptor() { return sock_; }
 
-Status PlainClient::ReadData(uint8_t *buf, int32_t length) {
+Status PlainClient::ReadData(uint8_t *buf, const int32_t length) {
   int recieved = 0;
 
   while (recieved < length) {
@@ -117,7 +120,7 @@ Status PlainClient::ReadData(uint8_t *buf, int32_t length) {
   return Status::OK();
 }
 
-Status PlainClient::WriteData(uint8_t *buf, int32_t length) {
+Status PlainClient::WriteData(const uint8_t *buf, const int32_t length) {
   int sent = 0;
 
   while (sent < length) {
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
index a2bc6b9609..28db509eaa 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "ignite_ssl_wrapper.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h"
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
@@ -29,13 +29,15 @@ static int PasswordCb(char *buf, int size, int rwflag, void *password) {
   return (strlen(buf));
 }
 
-SslWrapper::SslWrapper(std::shared_ptr<Client> client, std::string certfile,
-                       std::string keyfile, std::string cert_password)
-    : client_(client),
-      certfile_(certfile),
-      keyfile_(keyfile),
-      cert_password_(cert_password),
-      ctx_(NULL) {}
+SslWrapper::SslWrapper(std::shared_ptr<Client> client, string certfile,
+                       string keyfile, string cert_password, bool big_endian)
+    : Client(big_endian),
+      client_(client),
+      certfile_(std::move(certfile)),
+      keyfile_(std::move(keyfile)),
+      cert_password_(std::move(cert_password)),
+      ctx_(nullptr),
+      ssl_(nullptr) {}
 
 SslWrapper::~SslWrapper() {
   if (IsConnected()) {
@@ -43,9 +45,14 @@ SslWrapper::~SslWrapper() {
     if (!status.ok()) LOG(WARNING) << status.ToString();
   }
 
-  if (ctx_ != NULL) {
+  if (ctx_ != nullptr) {
     SSL_CTX_free(ctx_);
-    ctx_ = NULL;
+    ctx_ = nullptr;
+  }
+
+  if (ssl_ != nullptr) {
+    SSL_free(ssl_);
+    ssl_ = nullptr;
   }
 }
 
@@ -63,7 +70,7 @@ Status SslWrapper::InitSslContext() {
     return errors::Internal("Couldn't load cetificate chain (file '", certfile_,
                             "')");
 
-  std::string private_key_file = keyfile_.empty() ? certfile_ : keyfile_;
+  string private_key_file = keyfile_.empty() ? certfile_ : keyfile_;
   if (SSL_CTX_use_PrivateKey_file(ctx_, private_key_file.c_str(),
                                   SSL_FILETYPE_PEM) != 1)
     return errors::Internal("Couldn't load private key (file '",
@@ -94,6 +101,7 @@ Status SslWrapper::Connect() {
 
 Status SslWrapper::Disconnect() {
   SSL_free(ssl_);
+  ssl_ = nullptr;
 
   LOG(INFO) << "SSL connection closed";
 
@@ -104,7 +112,7 @@ bool SslWrapper::IsConnected() { return client_->IsConnected(); }
 
 int SslWrapper::GetSocketDescriptor() { return client_->GetSocketDescriptor(); }
 
-Status SslWrapper::ReadData(uint8_t *buf, int32_t length) {
+Status SslWrapper::ReadData(uint8_t *buf, const int32_t length) {
   int recieved = 0;
 
   while (recieved < length) {
@@ -123,7 +131,7 @@ Status SslWrapper::ReadData(uint8_t *buf, int32_t length) {
   return Status::OK();
 }
 
-Status SslWrapper::WriteData(uint8_t *buf, int32_t length) {
+Status SslWrapper::WriteData(const uint8_t *buf, const int32_t length) {
   int sent = 0;
 
   while (sent < length) {
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
index bbba6cc181..d59ce91aba 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
@@ -13,35 +13,39 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "ignite_client.h"
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_
+
+#include "tensorflow/contrib/ignite/kernels/ignite_client.h"
 
 #include <openssl/ssl.h>
-#include <string>
 
 namespace tensorflow {
 
 class SslWrapper : public Client {
  public:
-  SslWrapper(std::shared_ptr<Client> client, std::string certfile,
-             std::string keyfile, std::string cert_password);
+  SslWrapper(std::shared_ptr<Client> client, string certfile, string keyfile,
+             string cert_password, bool big_endian);
   ~SslWrapper();
 
   virtual Status Connect();
   virtual Status Disconnect();
   virtual bool IsConnected();
   virtual int GetSocketDescriptor();
-  virtual Status ReadData(uint8_t* buf, int32_t length);
-  virtual Status WriteData(uint8_t* buf, int32_t length);
+  virtual Status ReadData(uint8_t* buf, const int32_t length);
+  virtual Status WriteData(const uint8_t* buf, const int32_t length);
 
  private:
+  Status InitSslContext();
+
   std::shared_ptr<Client> client_;
-  std::string certfile_;
-  std::string keyfile_;
-  std::string cert_password_;
+  string certfile_;
+  string keyfile_;
+  string cert_password_;
   SSL_CTX* ctx_;
   SSL* ssl_;
-
-  Status InitSslContext();
 };
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_
\ No newline at end of file
diff --git a/tensorflow/contrib/ignite/ops/dataset_ops.cc b/tensorflow/contrib/ignite/ops/dataset_ops.cc
index fb16b290b1..7d18df11aa 100644
--- a/tensorflow/contrib/ignite/ops/dataset_ops.cc
+++ b/tensorflow/contrib/ignite/ops/dataset_ops.cc
@@ -37,6 +37,8 @@ REGISTER_OP("IgniteDataset")
     .SetIsStateful()
     .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
+IgniteDataset that allows to get data from Apache Ignite.
+
 Apache Ignite is a memory-centric distributed database, caching, and processing
 platform for transactional, analytical, and streaming workloads, delivering 
 in-memory speeds at petabyte scale. This contrib package contains an 
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
index 60003ca3b7..c0e24b1c69 100644
--- a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
+++ b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
@@ -41,19 +41,19 @@ class Readable():
 
   def read_byte(self):
     """Reads and returnes byte."""
-    return self.__read("b", 1)
+    return self._read("b", 1)
 
   def read_short(self):
     """Reads and returns short (2 bytes, little-endian)."""
-    return self.__read("h", 2)
+    return self._read("h", 2)
 
   def read_int(self):
     """Reads and returns int (4 bytes, little-endian)."""
-    return self.__read("i", 4)
+    return self._read("i", 4)
 
   def read_long(self):
     """Reads and returns long (8 bytes, little-endian)."""
-    return self.__read("q", 8)
+    return self._read("q", 8)
 
   def skip(self, length):
     """Skips the specified number of bytes."""
@@ -64,7 +64,7 @@ class Readable():
     """Reads the specified number of bytes and returns them as a buffer."""
     return None
 
-  def __read(self, data_type, length):
+  def _read(self, data_type, length):
     """Reads, unpacks and returns specified type (little-endian)."""
     data_buffer = self.read_data(length)
     return struct.unpack("<" + data_type, data_buffer)[0]
@@ -116,10 +116,10 @@ class TcpClient(Readable):
       self.sock = context.wrap_socket(self.sock)
     else:
       if keyfile is not None:
-        raise Exception("SSL is disabled, keyfile must not be specified \
+        raise RuntimeError("SSL is disabled, keyfile must not be specified \
           (to enable SSL specify certfile)")
       if password is not None:
-        raise Exception("SSL is disabled, password must not be specified \
+        raise RuntimeError("SSL is disabled, password must not be specified \
           (to enable SSL specify certfile)")
 
     self.host = host
@@ -136,19 +136,19 @@ class TcpClient(Readable):
 
   def write_byte(self, v):
     """Writes the specified byte."""
-    self.__write(v, "b")
+    self._write(v, "b")
 
   def write_short(self, v):
     """Writes the specified short (2 bytes, little-endian)."""
-    self.__write(v, "h")
+    self._write(v, "h")
 
   def write_int(self, v):
     """Writes the specified short (4 bytes, little-endian)."""
-    self.__write(v, "i")
+    self._write(v, "i")
 
   def write_long(self, v):
     """Writes the specified int (8 bytes, little-endian)."""
-    self.__write(v, "q")
+    self._write(v, "q")
 
   def write_string(self, v):
     """Writes the specified string."""
@@ -167,7 +167,7 @@ class TcpClient(Readable):
         data_buffer += buf
     return data_buffer
 
-  def __write(self, value, data_type):
+  def _write(self, value, data_type):
     """Packs and writes data using the specified type (little-endian)."""
     data_buffer = struct.pack("<" + data_type, value)
     self.sock.sendall(data_buffer)
@@ -193,6 +193,7 @@ class BinaryField():
 # Binary types defined in Apache Ignite Thin client and supported by
 # TensorFlow on Apache Ignite, see
 # https://apacheignite.readme.io/v2.6/docs/binary-client-protocol.
+# True means that type is a vector, False means type is scalar.
 types = {
     1: (dtypes.uint8, False),
     2: (dtypes.int16, False),
@@ -248,13 +249,13 @@ class TypeTreeNode():
        dataset.
     """
     if self.fields is None:
-      object_type = types[self.type_id]
-      if object_type is not None:
+      if self.type_id in types:
+        object_type = types[self.type_id]
         is_array = object_type[1]
         if is_array:
           return tensor_shape.TensorShape([None])
         return tensor_shape.TensorShape([])
-      raise Exception("Unsupported type [type_id=%d]" % self.type_id)
+      raise ValueError("Unsupported type [type_id=%d]" % self.type_id)
     output_shapes = {}
     for field in self.fields:
       output_shapes[field.name] = field.to_output_shapes()
@@ -265,10 +266,10 @@ class TypeTreeNode():
        dataset.
     """
     if self.fields is None:
-      object_type = types[self.type_id]
-      if object_type is not None:
+      if self.type_id in types:
+        object_type = types[self.type_id]
         return object_type[0]
-      raise Exception("Unsupported type [type_id=%d]" % self.type_id)
+      raise ValueError("Unsupported type [type_id=%d]" % self.type_id)
     else:
       output_types = {}
       for field in self.fields:
@@ -276,11 +277,11 @@ class TypeTreeNode():
       return output_types
 
   def to_flat(self):
-    """Returns a list of leaf node types."""
+    """Returns a list of node types."""
     return self.to_flat_rec([])
 
   def to_permutation(self):
-    """Returns a permutation that should be applied to order object leafs."""
+    """Returns a permutation that should be applied to order object leaves."""
     correct_order_dict = {}
     self.traversal_rec(correct_order_dict, 0)
     object_order = []
@@ -288,9 +289,10 @@ class TypeTreeNode():
     return [correct_order_dict[o] for o in object_order]
 
   def to_flat_rec(self, flat):
-    """Formats a list of leaf node types."""
-    flat.append(self.type_id)
-    if self.fields is not None:
+    """Formats a list of leaf node types in pre-order."""
+    if self.fields is None:
+      flat.append(self.type_id)
+    else:
       for field in self.fields:
         field.to_flat_rec(flat)
     return flat
@@ -320,8 +322,8 @@ class IgniteClient(TcpClient):
      have the same structure (homogeneous objects) and the cache contains at
      least one object.
   """
-  def __init__(self, host, port, username=None, password=None, certfile=None,\
-    keyfile=None, cert_password=None):
+  def __init__(self, host, port, username=None, password=None, certfile=None,
+               keyfile=None, cert_password=None):
     """Constructs a new instance of IgniteClient.
 
     Args:
@@ -385,12 +387,13 @@ class IgniteClient(TcpClient):
       serv_ver_major = self.read_short()
       serv_ver_minor = self.read_short()
       serv_ver_patch = self.read_short()
-      err_msg = self.__parse_string()
+      err_msg = self._parse_string()
       if err_msg is None:
-        raise Exception("Handshake Error [result=%d, version=%d.%d.%d]" \
-            % (res, serv_ver_major, serv_ver_minor, serv_ver_patch))
+        raise RuntimeError("Handshake Error [result=%d, version=%d.%d.%d]"
+                           % (res, serv_ver_major, serv_ver_minor,
+                              serv_ver_patch))
       else:
-        raise Exception("Handshake Error [result=%d, version=%d.%d.%d, \
+        raise RuntimeError("Handshake Error [result=%d, version=%d.%d.%d, \
             message='%s']" % (
                 res,
                 serv_ver_major,
@@ -403,7 +406,7 @@ class IgniteClient(TcpClient):
     """Collects type information about objects stored in the specified
        cache.
     """
-    cache_name_hash = self.__java_hash_code(cache_name)
+    cache_name_hash = self._java_hash_code(cache_name)
     self.write_int(25)        # Message length
     self.write_short(2000)      # Operation code
     self.write_long(0)        # Request ID
@@ -419,18 +422,18 @@ class IgniteClient(TcpClient):
     status = self.read_int()
 
     if status != 0:
-      err_msg = self.__parse_string()
+      err_msg = self._parse_string()
       if err_msg is None:
-        raise Exception("Scan Query Error [status=%s]" % status)
+        raise RuntimeError("Scan Query Error [status=%s]" % status)
       else:
-        raise Exception("Scan Query Error [status=%s, message='%s']" \
-            % (status, err_msg))
+        raise RuntimeError("Scan Query Error [status=%s, message='%s']"
+                           % (status, err_msg))
 
     self.read_long()          # Cursor id
     row_count = self.read_int()
 
     if row_count == 0:
-      raise Exception("Scan Query returned empty result, so it's \
+      raise RuntimeError("Scan Query returned empty result, so it's \
         impossible to derive the cache type")
 
     payload = DataBuffer(self.read_data(result_length - 25))
@@ -438,20 +441,20 @@ class IgniteClient(TcpClient):
     self.read_byte()          # Next page
 
     res = TypeTreeNode("root", 0, [
-        self.__collect_types("key", payload),
-        self.__collect_types("val", payload)
+        self._collect_types("key", payload),
+        self._collect_types("val", payload)
     ], [0, 1])
 
     return res
 
-  def __java_hash_code(self, s):
+  def _java_hash_code(self, s):
     """Computes hash code of the specified string using Java code."""
     h = 0
     for c in s:
       h = (31 * h + ord(c)) & 0xFFFFFFFF
     return ((h + 0x80000000) & 0xFFFFFFFF) - 0x80000000
 
-  def __collect_types(self, field_name, data):
+  def _collect_types(self, field_name, data):
     """Extracts type information from the specified object."""
     type_id = data.read_byte()
 
@@ -570,7 +573,7 @@ class IgniteClient(TcpClient):
         elif header == 101:
           pass
         else:
-          raise Exception("Unknown binary type when expected string \
+          raise RuntimeError("Unknown binary type when expected string \
             [type_id=%d]" % header)
       return TypeTreeNode(field_name, type_id)
 
@@ -591,7 +594,7 @@ class IgniteClient(TcpClient):
       length = data.read_int()
       inner_data = data.read_data(length)
       data.read_int()   # Offset
-      return self.__collect_types(field_name, DataBuffer(inner_data))
+      return self._collect_types(field_name, DataBuffer(inner_data))
 
     # Complex Object.
     if type_id == 103:
@@ -603,11 +606,11 @@ class IgniteClient(TcpClient):
       data.read_int()   # Object schema id
       obj_schema_offset = data.read_int()
 
-      obj_type = self.__get_type(obj_type_id)
+      obj_type = self._get_type(obj_type_id)
       children = []
 
       for obj_field in obj_type.fields:
-        child = self.__collect_types(obj_field.field_name, data)
+        child = self._collect_types(obj_field.field_name, data)
         children.append(child)
 
       children_sorted = sorted(children, key=lambda child: child.name)
@@ -618,9 +621,9 @@ class IgniteClient(TcpClient):
 
       return TypeTreeNode(field_name, type_id, children, permutation)
 
-    raise Exception("Unknown binary type [type_id=%d]" % type_id)
+    raise RuntimeError("Unknown binary type [type_id=%d]" % type_id)
 
-  def __get_type(self, type_id):
+  def _get_type(self, type_id):
     """Queries Apache Ignite information about type by type id."""
     self.write_int(14)      # Message length
     self.write_short(3002)  # Operation code
@@ -632,25 +635,25 @@ class IgniteClient(TcpClient):
     status = self.read_int()
 
     if status != 0:
-      err_msg = self.__parse_string()
+      err_msg = self._parse_string()
       if err_msg is None:
-        raise Exception("Get Binary Type Error [status=%d, message='%s']" \
-            % (status, err_msg))
+        raise RuntimeError("Get Binary Type Error [status=%d, message='%s']"
+                           % (status, err_msg))
       else:
-        raise Exception("Get Binary Type Error [status=%d]" % status)
+        raise RuntimeError("Get Binary Type Error [status=%d]" % status)
 
     binary_type_exists = self.read_byte()
 
     if binary_type_exists == 0:
-      raise Exception("Binary type not found [type_id=%d] " % type_id)
+      raise RuntimeError("Binary type not found [type_id=%d] " % type_id)
 
     binary_type_id = self.read_int()
-    binary_type_name = self.__parse_string()
-    self.__parse_string()   # Affinity field name
+    binary_type_name = self._parse_string()
+    self._parse_string()   # Affinity field name
 
     fields = []
     for _ in range(self.read_int()):
-      field_name = self.__parse_string()
+      field_name = self._parse_string()
       field_type_id = self.read_int()
       field_id = self.read_int()
 
@@ -659,7 +662,7 @@ class IgniteClient(TcpClient):
 
     is_enum = self.read_byte()
     if is_enum == 1:
-      raise Exception("Enum fields are not supported yet")
+      raise RuntimeError("Enum fields are not supported yet")
 
     schema_cnt = self.read_int()
     for _ in range(schema_cnt):
@@ -669,7 +672,7 @@ class IgniteClient(TcpClient):
 
     return BinaryType(binary_type_id, binary_type_name, fields)
 
-  def __parse_string(self):
+  def _parse_string(self):
     """Parses string."""
     header = self.read_byte()
     if header == 9:
@@ -677,8 +680,8 @@ class IgniteClient(TcpClient):
       return self.read_data(length).decode("utf-8")
     if header == 101:
       return None
-    raise Exception("Unknown binary type when expected string [type_id=%d]" \
-        % header)
+    raise RuntimeError("Unknown binary type when expected string [type_id=%d]"
+                       % header)
 
 class IgniteDataset(Dataset):
   """Apache Ignite is a memory-centric distributed database, caching, and
@@ -692,9 +695,9 @@ class IgniteDataset(Dataset):
      Ignite Binary Client Protocol.
   """
 
-  def __init__(self, cache_name, host="localhost", port=10800, local=False,\
-    part=-1, page_size=100, username=None, password=None, certfile=None,\
-    keyfile=None, cert_password=None):
+  def __init__(self, cache_name, host="localhost", port=10800, local=False,
+               part=-1, page_size=100, username=None, password=None,
+               certfile=None, keyfile=None, cert_password=None):
     """Create a IgniteDataset.
 
     Args:
@@ -716,39 +719,44 @@ class IgniteDataset(Dataset):
     """
     super(IgniteDataset, self).__init__()
 
-    with IgniteClient(host, port, username, password, certfile, keyfile,\
-        cert_password) as client:
+    with IgniteClient(host, port, username, password, certfile, keyfile,
+                      cert_password) as client:
       client.handshake()
       self.cache_type = client.get_cache_type(cache_name)
 
-    self.cache_name = ops.convert_to_tensor(cache_name, dtype=dtypes.string,\
-        name="cache_name")
+    self.cache_name = ops.convert_to_tensor(cache_name, dtype=dtypes.string,
+                                            name="cache_name")
     self.host = ops.convert_to_tensor(host, dtype=dtypes.string, name="host")
     self.port = ops.convert_to_tensor(port, dtype=dtypes.int32, name="port")
     self.local = ops.convert_to_tensor(local, dtype=dtypes.bool, name="local")
     self.part = ops.convert_to_tensor(part, dtype=dtypes.int32, name="part")
-    self.page_size = ops.convert_to_tensor(page_size, dtype=dtypes.int32,\
-        name="page_size")
-    self.username = ops.convert_to_tensor("" if username is None else username,\
-        dtype=dtypes.string, name="username")
-    self.password = ops.convert_to_tensor("" if password is None else password,\
-        dtype=dtypes.string, name="password")
-    self.certfile = ops.convert_to_tensor("" if certfile is None else certfile,\
-        dtype=dtypes.string, name="certfile")
-    self.keyfile = ops.convert_to_tensor("" if keyfile is None else keyfile,\
-        dtype=dtypes.string, name="keyfile")
-    self.cert_password = ops.convert_to_tensor("" if cert_password is None\
-        else cert_password, dtype=dtypes.string, name="cert_password")
-    self.schema = ops.convert_to_tensor(self.cache_type.to_flat(),\
-        dtype=dtypes.int32, name="schema")
-    self.permutation = ops.convert_to_tensor(self.cache_type.to_permutation(),\
-        dtype=dtypes.int32, name="permutation")
+    self.page_size = ops.convert_to_tensor(page_size, dtype=dtypes.int32,
+                                           name="page_size")
+    self.username = ops.convert_to_tensor("" if username is None else username,
+                                          dtype=dtypes.string, name="username")
+    self.password = ops.convert_to_tensor("" if password is None else password,
+                                          dtype=dtypes.string, name="password")
+    self.certfile = ops.convert_to_tensor("" if certfile is None else certfile,
+                                          dtype=dtypes.string, name="certfile")
+    self.keyfile = ops.convert_to_tensor("" if keyfile is None else keyfile,
+                                         dtype=dtypes.string, name="keyfile")
+    self.cert_password = ops.convert_to_tensor("" if cert_password is None
+                                               else cert_password,
+                                               dtype=dtypes.string,
+                                               name="cert_password")
+    self.schema = ops.convert_to_tensor(self.cache_type.to_flat(),
+                                        dtype=dtypes.int32, name="schema")
+    self.permutation = ops.convert_to_tensor(self.cache_type.to_permutation(),
+                                             dtype=dtypes.int32,
+                                             name="permutation")
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.ignite_dataset(self.cache_name, self.host,\
-        self.port, self.local, self.part, self.page_size, self.username,\
-        self.password, self.certfile, self.keyfile, self.cert_password,\
-        self.schema, self.permutation)
+    return gen_dataset_ops.ignite_dataset(self.cache_name, self.host,
+                                          self.port, self.local, self.part,
+                                          self.page_size, self.username,
+                                          self.password, self.certfile,
+                                          self.keyfile, self.cert_password,
+                                          self.schema, self.permutation)
 
   @property
   def output_classes(self):
-- 
GitLab


From ce9b23070638094022036656e5d1fbf3e23b74c6 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Thu, 13 Sep 2018 11:24:37 +0300
Subject: [PATCH 035/570] Add forgotten ignite_byte_swapper.h

---
 .../ignite/kernels/ignite_byte_swapper.h      | 129 ++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h

diff --git a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h b/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
new file mode 100644
index 0000000000..986bedcf69
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_
+
+#include <stdint.h>
+
+namespace tensorflow {
+
+class ByteSwapper {
+ public:
+  ByteSwapper(bool big_endian) {
+    int x = 1;
+    bool is_little_endian = (*(char *)&x == 1);
+    swap_ = big_endian == is_little_endian;
+  }
+
+  inline void SwapIfRequiredInt16(int16_t *x) const {
+    if (swap_) {
+      Swap16(x);
+    }
+  }
+
+  inline void SwapIfRequiredUnsignedInt16(uint16_t *x) const {
+    if (swap_) {
+      Swap16(reinterpret_cast<int16_t*>(x));
+    }
+  }
+
+  inline void SwapIfRequiredInt32(int32_t *x) const {
+    if (swap_) {
+      Swap32(x);
+    }
+  }
+
+  inline void SwapIfRequiredFloat(float *x) const {
+    if (swap_) {
+      Swap32(reinterpret_cast<int32_t*>(x));
+    }
+  }
+
+  inline void SwapIfRequiredInt64(int64_t *x) const {
+    if (swap_) {
+      Swap64(x);
+    }
+  }
+
+  inline void SwapIfRequiredDouble(double *x) const {
+    if (swap_) {
+      Swap64(reinterpret_cast<int64_t*>(x));
+    }
+  }
+
+  inline void SwapIfRequiredInt16Arr(int16_t *x, int32_t length) const {
+    if (swap_) {
+      for (int32_t i = 0; i < length; i++) Swap16(&x[i]);
+    }
+  }
+
+  inline void SwapIfRequiredUnsignedInt16Arr(uint16_t *x,
+                                             int32_t length) const {
+    if (swap_) {
+      for (int32_t i = 0; i < length; i++) 
+        Swap16(reinterpret_cast<int16_t*>(&x[i]));
+    }
+  }
+
+  inline void SwapIfRequiredInt32Arr(int32_t *x, int32_t length) const {
+    if (swap_) {
+      for (int32_t i = 0; i < length; i++) Swap32(&x[i]);
+    }
+  }
+
+  inline void SwapIfRequiredFloatArr(float *x, int32_t length) const {
+    if (swap_) {
+      for (int32_t i = 0; i < length; i++) 
+        Swap32(reinterpret_cast<int32_t*>(&x[i]));
+    }
+  }
+
+  inline void SwapIfRequiredInt64Arr(int64_t *x, int32_t length) const {
+    if (swap_) {
+      for (int32_t i = 0; i < length; i++) Swap64(&x[i]);
+    }
+  }
+
+  inline void SwapIfRequiredDoubleArr(double *x, int32_t length) const {
+    if (swap_) {
+      for (int32_t i = 0; i < length; i++) 
+        Swap64(reinterpret_cast<int64_t*>(&x[i]));
+    }
+  }
+
+ private:
+  inline void Swap16(int16_t *x) const {
+    *x = ((*x & 0xFF) << 8) | ((*x >> 8) & 0xFF);
+  }
+
+  inline void Swap32(int32_t *x) const {
+    *x = ((*x & 0xFF) << 24) | (((*x >> 8) & 0xFF) << 16) |
+         (((*x >> 16) & 0xFF) << 8) | ((*x >> 24) & 0xFF);
+  }
+
+  inline void Swap64(int64_t *x) const {
+    *x = ((*x & 0xFF) << 56) | (((*x >> 8) & 0xFF) << 48) |
+         (((*x >> 16) & 0xFF) << 40) | (((*x >> 24) & 0xFF) << 32) |
+         (((*x >> 32) & 0xFF) << 24) | (((*x >> 40) & 0xFF) << 16) |
+         (((*x >> 48) & 0xFF) << 8) | ((*x >> 56) & 0xFF);
+  }
+
+  bool swap_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_
-- 
GitLab


From d797e99a043e01609583a37c04e1e509d126e1a0 Mon Sep 17 00:00:00 2001
From: dmitrievanthony <dmitrievanthony@gmail.com>
Date: Thu, 13 Sep 2018 09:42:16 +0000
Subject: [PATCH 036/570] Fix windows build.

---
 .../contrib/ignite/kernels/ignite_plain_client_windows.cc      | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
index 9cd08a7779..17f2bf45d1 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #pragma comment(lib, "Mswsock.lib")
 #pragma comment(lib, "AdvApi32.lib")
 
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -58,7 +59,7 @@ Status PlainClient::Connect() {
                     &result);
   if (res != 0) return errors::Internal("Getaddrinfo failed with error: ", res);
 
-  auto clean = gtl::MakeCleanup([result] { reeaddrinfo(result); });
+  auto clean = gtl::MakeCleanup([result] { freeaddrinfo(result); });
 
   for (ptr = result; ptr != NULL; ptr = ptr->ai_next) {
     sock_ = socket(ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol);
-- 
GitLab


From c8b60b894b91cfdb4176176d7dcf328d2b40b41f Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Thu, 13 Sep 2018 16:34:59 +0300
Subject: [PATCH 037/570] Fix code style.

---
 .../ignite/kernels/ignite_byte_swapper.h       | 18 +++++++++---------
 .../ignite/kernels/ignite_dataset_ops.cc       |  2 +-
 .../kernels/ignite_plain_client_windows.cc     |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h b/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
index 986bedcf69..5b42de4c5a 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
@@ -36,7 +36,7 @@ class ByteSwapper {
 
   inline void SwapIfRequiredUnsignedInt16(uint16_t *x) const {
     if (swap_) {
-      Swap16(reinterpret_cast<int16_t*>(x));
+      Swap16(reinterpret_cast<int16_t *>(x));
     }
   }
 
@@ -48,7 +48,7 @@ class ByteSwapper {
 
   inline void SwapIfRequiredFloat(float *x) const {
     if (swap_) {
-      Swap32(reinterpret_cast<int32_t*>(x));
+      Swap32(reinterpret_cast<int32_t *>(x));
     }
   }
 
@@ -60,7 +60,7 @@ class ByteSwapper {
 
   inline void SwapIfRequiredDouble(double *x) const {
     if (swap_) {
-      Swap64(reinterpret_cast<int64_t*>(x));
+      Swap64(reinterpret_cast<int64_t *>(x));
     }
   }
 
@@ -73,8 +73,8 @@ class ByteSwapper {
   inline void SwapIfRequiredUnsignedInt16Arr(uint16_t *x,
                                              int32_t length) const {
     if (swap_) {
-      for (int32_t i = 0; i < length; i++) 
-        Swap16(reinterpret_cast<int16_t*>(&x[i]));
+      for (int32_t i = 0; i < length; i++)
+        Swap16(reinterpret_cast<int16_t *>(&x[i]));
     }
   }
 
@@ -86,8 +86,8 @@ class ByteSwapper {
 
   inline void SwapIfRequiredFloatArr(float *x, int32_t length) const {
     if (swap_) {
-      for (int32_t i = 0; i < length; i++) 
-        Swap32(reinterpret_cast<int32_t*>(&x[i]));
+      for (int32_t i = 0; i < length; i++)
+        Swap32(reinterpret_cast<int32_t *>(&x[i]));
     }
   }
 
@@ -99,8 +99,8 @@ class ByteSwapper {
 
   inline void SwapIfRequiredDoubleArr(double *x, int32_t length) const {
     if (swap_) {
-      for (int32_t i = 0; i < length; i++) 
-        Swap64(reinterpret_cast<int64_t*>(&x[i]));
+      for (int32_t i = 0; i < length; i++)
+        Swap64(reinterpret_cast<int64_t *>(&x[i]));
     }
   }
 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc b/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
index eeb29ef30b..e48fce4ed2 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset.h"
 #include <stdlib.h>
 #include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
+#include "tensorflow/contrib/ignite/kernels/ignite_dataset.h"
 #include "tensorflow/core/framework/dataset.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
index 17f2bf45d1..43d6108c34 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
@@ -24,8 +24,8 @@ limitations under the License.
 #pragma comment(lib, "Mswsock.lib")
 #pragma comment(lib, "AdvApi32.lib")
 
-#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
-- 
GitLab


From c513c04aed8790c78c46b78f90ec848555498ce4 Mon Sep 17 00:00:00 2001
From: dmitrievanthony <dmitrievanthony@gmail.com>
Date: Thu, 13 Sep 2018 15:13:54 +0000
Subject: [PATCH 038/570] Add -DWIN32_LEAN_AND_MEAN option into BUILD.

---
 tensorflow/contrib/ignite/BUILD | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD
index 2f598b4aed..1adc6c6ccc 100644
--- a/tensorflow/contrib/ignite/BUILD
+++ b/tensorflow/contrib/ignite/BUILD
@@ -61,6 +61,9 @@ cc_library(
         "@boringssl//:ssl",
         "@protobuf_archive//:protobuf_headers",
     ],
+    copts = if_windows([
+        "-DWIN32_LEAN_AND_MEAN",
+    ]),
     alwayslink = 1,
 )
 
-- 
GitLab


From f54856b1448bed24534189e4aa2ebb9d0b4f5b9a Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Thu, 13 Sep 2018 18:13:47 +0000
Subject: [PATCH 039/570] Apply buildifier changes.

---
 tensorflow/contrib/ignite/BUILD | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD
index 1adc6c6ccc..9393b702d1 100644
--- a/tensorflow/contrib/ignite/BUILD
+++ b/tensorflow/contrib/ignite/BUILD
@@ -6,14 +6,14 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_gen_op_wrapper_py",
-    "tf_kernel_library",
+    "if_not_windows",
+    "if_windows",
     "tf_custom_op_library",
     "tf_custom_op_py_library",
     "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
     "tf_py_test",
-    "if_not_windows",
-    "if_windows",
 )
 
 py_library(
@@ -55,15 +55,15 @@ cc_library(
     ]) + if_windows([
         "kernels/ignite_plain_client_windows.cc",
     ]),
+    copts = if_windows([
+        "-DWIN32_LEAN_AND_MEAN",
+    ]),
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
         "@boringssl//:ssl",
         "@protobuf_archive//:protobuf_headers",
     ],
-    copts = if_windows([
-        "-DWIN32_LEAN_AND_MEAN",
-    ]),
     alwayslink = 1,
 )
 
-- 
GitLab


From 74b9d6a48286f38807bbd204d9d55467e02387ca Mon Sep 17 00:00:00 2001
From: Niranjan Hasabnis <niranjan.hasabnis@intel.com>
Date: Fri, 14 Sep 2018 16:25:36 -0700
Subject: [PATCH 040/570] [Intel MKL] Fixes for unit test failures

1) Changes in partitioned_function_ops.cc are for passing
   Global OpRegistry as default_registry in PartitionedFunction op

   This fix addresses failure in MKL layout pass when PartitionedFunction
   op calls graph optimization passes. The problem was that the function
   library definition that is used to create function graph and corresponding
   subgraphs after partitioning did not use global OpRegistry as the
   default OpRegistry used for look of operator names. Because of that,
   standard operators such as "Const" were not available to graph passes.

2) Changes in mkl_cpu_allocator.h are to address failure in
   mkl_cpu_allocator_test which was expecting that max_bytes_limits is returned
   via GetStats() in MKLCPUAllocator.
---
 tensorflow/core/common_runtime/mkl_cpu_allocator.h  |  3 +++
 tensorflow/core/kernels/partitioned_function_ops.cc | 12 +++++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index df9c3a686c..593f855ea2 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -277,6 +277,9 @@ class MklCPUAllocator : public VisitableAllocator {
     // max_alloc_size from large_size_allocator would be the maximum
     // size allocated by MklCPUAllocator.
     stats->max_alloc_size = l_stats.max_alloc_size;
+
+    stats->bytes_limit =
+        std::max(s_stats.bytes_limit, l_stats.bytes_limit);
   }
 
   void ClearStats() override {
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index fc1c9003aa..ddb621967a 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -97,7 +97,12 @@ class PartitionedCallOp : public AsyncOpKernel {
         OP_REQUIRES_ASYNC(ctx, fbody != nullptr,
                           errors::Internal("Could not find handle ", handle),
                           done);
-        auto graph = tensorflow::MakeUnique<Graph>(fbody->graph->flib_def());
+        // We need to pass global op_registry as default_registry when creating
+        // graph. So that graph optimization passes can lookup all possible ops
+        // by name.
+        FunctionLibraryDefinition func_lib_def(OpRegistry::Global(),
+                                            fbody->graph->flib_def().ToProto());
+        auto graph = tensorflow::MakeUnique<Graph>(func_lib_def);
         CopyGraph(*fbody->graph, graph.get());
         OP_REQUIRES_OK_ASYNC(ctx, PinResourceArgs(graph.get(), args), done);
 
@@ -250,9 +255,10 @@ class PartitionedCallOp : public AsyncOpKernel {
     VLOG(3) << "Partitioned function '" << func_.name() << "', yielding "
             << partitions.size() << " shards.";
 
-    const FunctionLibraryDefinition* flib_def = &graph->flib_def();
+    FunctionLibraryDefinition func_lib_def(OpRegistry::Global(),
+                                          graph->flib_def().ToProto());
     for (const auto& partition : partitions) {
-      std::unique_ptr<Graph> subgraph(new Graph(flib_def));
+      std::unique_ptr<Graph> subgraph(new Graph(func_lib_def));
       GraphConstructorOptions opts;
       opts.allow_internal_ops = true;
       opts.expect_device_spec = true;
-- 
GitLab


From fa80a920f2a3bc00522fe95fc9a07a28d67fc055 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Mon, 17 Sep 2018 12:50:18 +0300
Subject: [PATCH 041/570] Add 'override' specifier to ReadData, WriteData.

---
 tensorflow/contrib/ignite/kernels/ignite_plain_client.h | 4 ++--
 tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h b/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
index 750ebe605a..d12d56fdc1 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
@@ -29,8 +29,8 @@ class PlainClient : public Client {
   virtual Status Disconnect();
   virtual bool IsConnected();
   virtual int GetSocketDescriptor();
-  virtual Status ReadData(uint8_t* buf, const int32_t length);
-  virtual Status WriteData(const uint8_t* buf, const int32_t length);
+  virtual Status ReadData(uint8_t* buf, const int32_t length) override;
+  virtual Status WriteData(const uint8_t* buf, const int32_t length) override;
 
  private:
   const string host_;
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
index d59ce91aba..372156a757 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
@@ -32,8 +32,8 @@ class SslWrapper : public Client {
   virtual Status Disconnect();
   virtual bool IsConnected();
   virtual int GetSocketDescriptor();
-  virtual Status ReadData(uint8_t* buf, const int32_t length);
-  virtual Status WriteData(const uint8_t* buf, const int32_t length);
+  virtual Status ReadData(uint8_t* buf, const int32_t length) override;
+  virtual Status WriteData(const uint8_t* buf, const int32_t length) override;
 
  private:
   Status InitSslContext();
-- 
GitLab


From 7820ead0c58c9d90d7776bea31a294bbcc9a30f8 Mon Sep 17 00:00:00 2001
From: Samuel Matzek <smatzek@us.ibm.com>
Date: Mon, 30 Jul 2018 09:46:05 -0500
Subject: [PATCH 042/570] Make full model before calling set_model on callback

Commit 1b67ccbe8006eacffd268553abd01310e8b187d6 removed the _make_train_function calls from Keras training fit_generator for eager execution.

This breaks some callbacks that depend on the entire model to be populated on the set_model or on_train_begin methods.

This commit adds the method calls back in but guarded by an eager check.  It is not doing a revert / fix because the fix that removed the calls also put a test case in for eager fit_generator testing which we want to retain.
---
 tensorflow/python/keras/engine/training_generator.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
index 413c1f4fba..2e074699da 100644
--- a/tensorflow/python/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.keras import callbacks as cbks
 from tensorflow.python.keras.utils.data_utils import GeneratorEnqueuer
 from tensorflow.python.keras.utils.data_utils import OrderedEnqueuer
@@ -48,6 +49,10 @@ def fit_generator(model,
   epoch = initial_epoch
 
   do_validation = bool(validation_data)
+  if not context.executing_eagerly():
+    model._make_train_function()
+    if do_validation:
+      model._make_test_function()
 
   is_sequence = isinstance(generator, Sequence)
   if not is_sequence and use_multiprocessing and workers > 1:
@@ -233,6 +238,9 @@ def evaluate_generator(model,
                        use_multiprocessing=False,
                        verbose=0):
   """See docstring for `Model.evaluate_generator`."""
+  if not context.executing_eagerly():
+    model._make_test_function()
+
   if hasattr(model, 'metrics'):
     for m in model.stateful_metric_functions:
       m.reset_states()
@@ -342,6 +350,9 @@ def predict_generator(model,
                       use_multiprocessing=False,
                       verbose=0):
   """See docstring for `Model.predict_generator`."""
+  if not context.executing_eagerly():
+    model._make_test_function()
+
   steps_done = 0
   wait_time = 0.01
   all_outs = []
-- 
GitLab


From 66575e0537ba8952de8ebc45d45d1b9e4ba1b6ba Mon Sep 17 00:00:00 2001
From: Samuel Matzek <smatzek@us.ibm.com>
Date: Thu, 2 Aug 2018 13:39:48 -0500
Subject: [PATCH 043/570] Add unit test for fit_generator changes

Add unit test for fit_generator change for callbacks.
---
 .../python/keras/engine/training_test.py      | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 30be4131a4..465b4ad65f 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import callbacks
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
@@ -1190,6 +1191,37 @@ class TestGeneratorMethods(test.TestCase):
                                  use_multiprocessing=False,
                                  workers=0)
 
+  def test_fit_generator_with_callback(self):
+    model = keras.Sequential()
+    model.add(keras.layers.Dense(4, input_shape=(3,)))
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    model.compile(optimizer, 'mse', metrics=['mae'])
+
+    x = np.random.random((10, 3))
+    y = np.random.random((10, 4))
+
+    def iterator():
+      while 1:
+        yield x, y
+
+    class TestCallback(callbacks.Callback):
+      def set_model(self, model):
+        # Check the model operations for the optimizer operations that
+        # the _make_train_function adds under a named scope for the
+        # optimizer. This ensurs the full model is populated before the
+        # set_model callback is called.
+        optimizer_name_scope = 'training/TFOptimizer/'
+        graph_def = ops.get_default_graph().as_graph_def()
+        for node in graph_def.node:
+            if node.name.startswith(optimizer_name_scope):
+                return
+        raise RuntimeError('The optimizer operations are not present in the '
+                           'model graph when the Callback.set_model function '
+                           'is called')
+
+    model.fit_generator(iterator(), steps_per_epoch=3, epochs=1,
+                        callbacks=[TestCallback()])
+
   def test_generator_methods_with_sample_weights(self):
     arr_data = np.random.random((50, 2))
     arr_labels = np.random.random((50,))
-- 
GitLab


From da3ccfda9b75f3cf60eb237d9a4da68c436e9f18 Mon Sep 17 00:00:00 2001
From: Samuel Matzek <smatzek@us.ibm.com>
Date: Mon, 17 Sep 2018 11:59:14 -0500
Subject: [PATCH 044/570] Move test to callbacks_test

---
 tensorflow/python/keras/callbacks_test.py     | 40 +++++++++++++++++++
 .../python/keras/engine/training_test.py      | 31 --------------
 2 files changed, 40 insertions(+), 31 deletions(-)

diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index b6fae19823..28f7614463 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -30,6 +30,7 @@ import numpy as np
 
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python import keras
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
@@ -1222,6 +1223,45 @@ class KerasCallbacksTest(test.TestCase):
             callbacks=cbks,
             epochs=1)
 
+  def test_fit_generator_with_callback(self):
+
+    class TestCallback(keras.callbacks.Callback):
+      def set_model(self, model):
+        # Check the model operations for the optimizer operations that
+        # the _make_train_function adds under a named scope for the
+        # optimizer. This ensurs the full model is populated before the
+        # set_model callback is called.
+        optimizer_name_scope = 'training/' + model.optimizer.__class__.__name__
+        graph_def = ops.get_default_graph().as_graph_def()
+        for node in graph_def.node:
+            if node.name.startswith(optimizer_name_scope):
+                return
+        raise RuntimeError('The optimizer operations are not present in the '
+                           'model graph when the Callback.set_model function '
+                           'is called')
+    np.random.seed(1337)
+
+    def generator():
+      x = np.random.randn(10, 100).astype(np.float32)
+      y = np.random.randn(10, 10).astype(np.float32)
+      while True:
+        yield x, y
+
+    with self.cached_session():
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=10, num_classes=10, input_dim=100)
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+      model.fit_generator(
+          generator(),
+          steps_per_epoch=2,
+          epochs=1,
+          validation_data=generator(),
+          validation_steps=2,
+          callbacks=[TestCallback()],
+          verbose=0)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 465b4ad65f..d8510c1f23 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -1191,37 +1191,6 @@ class TestGeneratorMethods(test.TestCase):
                                  use_multiprocessing=False,
                                  workers=0)
 
-  def test_fit_generator_with_callback(self):
-    model = keras.Sequential()
-    model.add(keras.layers.Dense(4, input_shape=(3,)))
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    model.compile(optimizer, 'mse', metrics=['mae'])
-
-    x = np.random.random((10, 3))
-    y = np.random.random((10, 4))
-
-    def iterator():
-      while 1:
-        yield x, y
-
-    class TestCallback(callbacks.Callback):
-      def set_model(self, model):
-        # Check the model operations for the optimizer operations that
-        # the _make_train_function adds under a named scope for the
-        # optimizer. This ensurs the full model is populated before the
-        # set_model callback is called.
-        optimizer_name_scope = 'training/TFOptimizer/'
-        graph_def = ops.get_default_graph().as_graph_def()
-        for node in graph_def.node:
-            if node.name.startswith(optimizer_name_scope):
-                return
-        raise RuntimeError('The optimizer operations are not present in the '
-                           'model graph when the Callback.set_model function '
-                           'is called')
-
-    model.fit_generator(iterator(), steps_per_epoch=3, epochs=1,
-                        callbacks=[TestCallback()])
-
   def test_generator_methods_with_sample_weights(self):
     arr_data = np.random.random((50, 2))
     arr_labels = np.random.random((50,))
-- 
GitLab


From 3fe9c54b6181bc2bbfa535b28ecb7d3b74342bd8 Mon Sep 17 00:00:00 2001
From: Samuel Matzek <smatzek@us.ibm.com>
Date: Mon, 17 Sep 2018 12:13:15 -0500
Subject: [PATCH 045/570] Remove unnecessary import of callbacks

---
 tensorflow/python/keras/engine/training_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index d8510c1f23..30be4131a4 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -31,7 +31,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import metrics as metrics_module
-from tensorflow.python.keras import callbacks
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
-- 
GitLab


From 12718f0204bad8aaa3984c7a176914451eb0bbab Mon Sep 17 00:00:00 2001
From: Samuel Matzek <smatzek@us.ibm.com>
Date: Mon, 17 Sep 2018 13:24:29 -0500
Subject: [PATCH 046/570] Fix pylint error

---
 tensorflow/python/keras/callbacks_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index 28f7614463..467bc4cdc4 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -1234,8 +1234,8 @@ class KerasCallbacksTest(test.TestCase):
         optimizer_name_scope = 'training/' + model.optimizer.__class__.__name__
         graph_def = ops.get_default_graph().as_graph_def()
         for node in graph_def.node:
-            if node.name.startswith(optimizer_name_scope):
-                return
+          if node.name.startswith(optimizer_name_scope):
+            return
         raise RuntimeError('The optimizer operations are not present in the '
                            'model graph when the Callback.set_model function '
                            'is called')
-- 
GitLab


From fbd48c7a8bb088d92988fce4f757d1719e9c57a2 Mon Sep 17 00:00:00 2001
From: Guozhong Zhuang <guozhong.zhuang@intel.com>
Date: Mon, 17 Sep 2018 12:24:43 -0700
Subject: [PATCH 047/570] fix type error within an environment variable name

---
 tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc | 2 +-
 tensorflow/core/kernels/mkl_conv_grad_input_ops.cc  | 2 +-
 tensorflow/core/kernels/mkl_conv_ops.cc             | 2 +-
 tensorflow/core/util/mkl_util.h                     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 52157ed5fb..f406ad2ab5 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -853,7 +853,7 @@ class MklConvCustomBackpropFilterOp
 
       // MKL DNN allocates large buffers when a conv gradient filter primtive is
       // created. So we don't cache conv backward primitives when the env
-      // variable TF_MKL_OPTIMIZE_PRIMITVE_MEMUSE is set to true.
+      // variable TF_MKL_OPTIMIZE_PRIMITIVE_MEMUSE is set to true.
       bool do_not_cache = MklPrimitiveFactory<T>::IsPrimitiveMemOptEnabled();
       conv_bwd_filter = MklConvBwdFilterPrimitiveFactory<T>::Get(
           convBwdFilterDims, do_not_cache);
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index c38c9cc27c..a501ce2c93 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -713,7 +713,7 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
           TFPaddingToMklDnnPadding(this->padding_));
 
       // We don't cache those primitves if the env variable
-      // TF_MKL_OPTIMIZE_PRIMITVE_MEMUSE is true and if primitve descriptor
+      // TF_MKL_OPTIMIZE_PRIMITIVE_MEMUSE is true and if primitve descriptor
       // includes potentialy large buffers. MKL DNN allocates buffers
       // in the following cases
       //   1. Legacy CPU without AVX512/AVX2, or
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 184e0cb003..b332edad0a 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -901,7 +901,7 @@ class MklConvOp : public OpKernel {
 
       // In some cases, primitve descriptor includes potentialy large buffers,
       // we don't cache those primitves if the env variable
-      // TF_MKL_OPTIMIZE_PRIMITVE_MEMUSE is true. MKL DNN allocates buffers
+      // TF_MKL_OPTIMIZE_PRIMITIVE_MEMUSE is true. MKL DNN allocates buffers
       // in the following cases
       //   1. Legacy CPU without AVX512/AVX2, or
       //   2. 1x1 convolution with stride != 1
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 680211edff..5ea8f2ee47 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -2040,7 +2040,7 @@ class MklPrimitiveFactory {
   /// Fuction to check whether primitive memory optimization is enabled
   static inline bool IsPrimitiveMemOptEnabled() {
     bool is_primitive_mem_opt_enabled = true;
-    TF_CHECK_OK(ReadBoolFromEnvVar("TF_MKL_OPTIMIZE_PRIMITVE_MEMUSE", true,
+    TF_CHECK_OK(ReadBoolFromEnvVar("TF_MKL_OPTIMIZE_PRIMITIVE_MEMUSE", true,
           &is_primitive_mem_opt_enabled));
     return is_primitive_mem_opt_enabled;
   }
-- 
GitLab


From 6d9bb99ea7a697e465ef66dea821a86ca94f845d Mon Sep 17 00:00:00 2001
From: Niranjan Hasabnis <niranjan.hasabnis@intel.com>
Date: Mon, 17 Sep 2018 17:22:40 -0700
Subject: [PATCH 048/570] Addressing review comments: indentation

---
 tensorflow/core/common_runtime/mkl_cpu_allocator.h  | 4 +---
 tensorflow/core/kernels/partitioned_function_ops.cc | 6 +++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 593f855ea2..01e5af5f8c 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -277,9 +277,7 @@ class MklCPUAllocator : public VisitableAllocator {
     // max_alloc_size from large_size_allocator would be the maximum
     // size allocated by MklCPUAllocator.
     stats->max_alloc_size = l_stats.max_alloc_size;
-
-    stats->bytes_limit =
-        std::max(s_stats.bytes_limit, l_stats.bytes_limit);
+    stats->bytes_limit = std::max(s_stats.bytes_limit, l_stats.bytes_limit);
   }
 
   void ClearStats() override {
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index ddb621967a..42f99a73e6 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -100,8 +100,8 @@ class PartitionedCallOp : public AsyncOpKernel {
         // We need to pass global op_registry as default_registry when creating
         // graph. So that graph optimization passes can lookup all possible ops
         // by name.
-        FunctionLibraryDefinition func_lib_def(OpRegistry::Global(),
-                                            fbody->graph->flib_def().ToProto());
+        FunctionLibraryDefinition func_lib_def(
+            OpRegistry::Global(), fbody->graph->flib_def().ToProto());
         auto graph = tensorflow::MakeUnique<Graph>(func_lib_def);
         CopyGraph(*fbody->graph, graph.get());
         OP_REQUIRES_OK_ASYNC(ctx, PinResourceArgs(graph.get(), args), done);
@@ -256,7 +256,7 @@ class PartitionedCallOp : public AsyncOpKernel {
             << partitions.size() << " shards.";
 
     FunctionLibraryDefinition func_lib_def(OpRegistry::Global(),
-                                          graph->flib_def().ToProto());
+                                           graph->flib_def().ToProto());
     for (const auto& partition : partitions) {
       std::unique_ptr<Graph> subgraph(new Graph(func_lib_def));
       GraphConstructorOptions opts;
-- 
GitLab


From 6d67ba41f566e963e2c061ca7df63edad89e1fca Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Tue, 18 Sep 2018 18:56:55 +0300
Subject: [PATCH 049/570] Work out the endianness statically.

---
 tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h b/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
index 5b42de4c5a..484cc4d6f5 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
@@ -20,12 +20,12 @@ limitations under the License.
 
 namespace tensorflow {
 
+constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
+
 class ByteSwapper {
  public:
   ByteSwapper(bool big_endian) {
-    int x = 1;
-    bool is_little_endian = (*(char *)&x == 1);
-    swap_ = big_endian == is_little_endian;
+    swap_ = big_endian == kLittleEndian;
   }
 
   inline void SwapIfRequiredInt16(int16_t *x) const {
-- 
GitLab


From 30f28a7f44f39cb8f24fde17252c3e2539c22bb0 Mon Sep 17 00:00:00 2001
From: Guozhong Zhuang <guozhong.zhuang@intel.com>
Date: Tue, 18 Sep 2018 09:52:03 -0700
Subject: [PATCH 050/570] change per code style check

---
 tensorflow/core/util/mkl_util.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 5ea8f2ee47..387e5ee5a6 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
-#define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
+#ifndef TENSORFLOW_TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
+#define TENSORFLOW_TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
 #ifdef INTEL_MKL
 
 #include <string>
@@ -2040,8 +2040,8 @@ class MklPrimitiveFactory {
   /// Fuction to check whether primitive memory optimization is enabled
   static inline bool IsPrimitiveMemOptEnabled() {
     bool is_primitive_mem_opt_enabled = true;
-    TF_CHECK_OK(ReadBoolFromEnvVar("TF_MKL_OPTIMIZE_PRIMITIVE_MEMUSE", true,
-          &is_primitive_mem_opt_enabled));
+    TF_CHECK_OK(ReadBoolFromEnvVar("TF_MKL_OPTIMIZE_PRIMITIVE_MEMUSE",
+        true, &is_primitive_mem_opt_enabled));
     return is_primitive_mem_opt_enabled;
   }
 
@@ -2098,7 +2098,7 @@ static inline memory::format get_desired_format(int channel,
              (channel % 8) == 0) {
     fmt_desired = is_2d
                       ? memory::format::nChw8c
-                      : memory::format::ncdhw;  //not support avx2 for 3d yet.
+                      : memory::format::ncdhw;  // not support avx2 for 3d yet.
   } else {
     fmt_desired = is_2d ? memory::format::nchw : memory::format::ncdhw;
   }
@@ -2210,7 +2210,8 @@ inline primitive FindOrCreateReorder(const memory* from, const memory* to) {
 
 // utility function to determine if it is conv 1x1 and stride != 1
 // for purpose of temporarily disabling primitive reuse
-inline bool IsConv1x1StrideNot1(memory::dims filter_dims, memory::dims strides) {
+inline bool IsConv1x1StrideNot1(memory::dims filter_dims,
+    memory::dims strides) {
   if (filter_dims.size() != 4 || strides.size() != 2) return false;
 
   return ((filter_dims[2] == 1) && (filter_dims[3] == 1) &&
@@ -2221,4 +2222,4 @@ inline bool IsConv1x1StrideNot1(memory::dims filter_dims, memory::dims strides)
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
-#endif  // TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
+#endif  // TENSORFLOW_TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
-- 
GitLab


From 14e9345a88b08f5d2a12f3f441b1d82c041d7ea3 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Tue, 18 Sep 2018 18:23:52 +0000
Subject: [PATCH 051/570] Avoid saving sensitive information in graph.

---
 .../ignite/kernels/ignite_dataset_ops.cc      | 30 ++-------
 tensorflow/contrib/ignite/ops/dataset_ops.cc  | 10 ---
 .../ignite/python/ops/ignite_dataset_ops.py   | 18 +----
 .../python/tests/ignite_dataset_test.py       | 66 ++++++++++++++-----
 4 files changed, 56 insertions(+), 68 deletions(-)

diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc b/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
index e48fce4ed2..bdaed72387 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
+++ b/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
@@ -125,35 +125,15 @@ class IgniteDatasetOp : public DatasetOpKernel {
       OP_REQUIRES_OK(ctx,
                      ParseScalarArgument<int32>(ctx, "page_size", &page_size));
 
-    if (env_username)
-      username = string(env_username);
-    else
-      OP_REQUIRES_OK(ctx,
-                     ParseScalarArgument<string>(ctx, "username", &username));
+    if (env_username) username = string(env_username);
 
-    if (env_password)
-      password = string(env_password);
-    else
-      OP_REQUIRES_OK(ctx,
-                     ParseScalarArgument<string>(ctx, "password", &password));
+    if (env_password) password = string(env_password);
 
-    if (env_certfile)
-      certfile = string(env_certfile);
-    else
-      OP_REQUIRES_OK(ctx,
-                     ParseScalarArgument<string>(ctx, "certfile", &certfile));
+    if (env_certfile) certfile = string(env_certfile);
 
-    if (env_keyfile)
-      keyfile = string(env_keyfile);
-    else
-      OP_REQUIRES_OK(ctx,
-                     ParseScalarArgument<string>(ctx, "keyfile", &keyfile));
+    if (env_keyfile) keyfile = string(env_keyfile);
 
-    if (env_cert_password)
-      cert_password = string(env_cert_password);
-    else
-      OP_REQUIRES_OK(ctx, ParseScalarArgument<string>(ctx, "cert_password",
-                                                      &cert_password));
+    if (env_cert_password) cert_password = string(env_cert_password);
 
     const Tensor* schema_tensor;
     OP_REQUIRES_OK(ctx, ctx->input("schema", &schema_tensor));
diff --git a/tensorflow/contrib/ignite/ops/dataset_ops.cc b/tensorflow/contrib/ignite/ops/dataset_ops.cc
index 7d18df11aa..3d6fbe00e6 100644
--- a/tensorflow/contrib/ignite/ops/dataset_ops.cc
+++ b/tensorflow/contrib/ignite/ops/dataset_ops.cc
@@ -26,11 +26,6 @@ REGISTER_OP("IgniteDataset")
     .Input("local: bool")
     .Input("part: int32")
     .Input("page_size: int32")
-    .Input("username: string")
-    .Input("password: string")
-    .Input("certfile: string")
-    .Input("keyfile: string")
-    .Input("cert_password: string")
     .Input("schema: int32")
     .Input("permutation: int32")
     .Output("handle: variant")
@@ -54,11 +49,6 @@ port: Ignite Thin Client Port.
 local: Local flag that defines that data should be fetched from local host only.
 part: Partition data should be fetched from.
 page_size: Page size for Ignite Thin Client.
-username: Username to authenticate via Ignite Thin Client.
-password: Password to authenticate via Ignite Thin Client.
-certfile: SSL certificate to establish SSL connection.
-keyfile: Private key file to establish SSL connection.
-cert_password: SSL certificate password to establish SSL connection.
 schema: Internal structure that defines schema of cache objects.
 permutation: Internal structure that defines permutation of cache objects.
 )doc");
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
index c0e24b1c69..7fc9e1fdd1 100644
--- a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
+++ b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
@@ -732,18 +732,6 @@ class IgniteDataset(Dataset):
     self.part = ops.convert_to_tensor(part, dtype=dtypes.int32, name="part")
     self.page_size = ops.convert_to_tensor(page_size, dtype=dtypes.int32,
                                            name="page_size")
-    self.username = ops.convert_to_tensor("" if username is None else username,
-                                          dtype=dtypes.string, name="username")
-    self.password = ops.convert_to_tensor("" if password is None else password,
-                                          dtype=dtypes.string, name="password")
-    self.certfile = ops.convert_to_tensor("" if certfile is None else certfile,
-                                          dtype=dtypes.string, name="certfile")
-    self.keyfile = ops.convert_to_tensor("" if keyfile is None else keyfile,
-                                         dtype=dtypes.string, name="keyfile")
-    self.cert_password = ops.convert_to_tensor("" if cert_password is None
-                                               else cert_password,
-                                               dtype=dtypes.string,
-                                               name="cert_password")
     self.schema = ops.convert_to_tensor(self.cache_type.to_flat(),
                                         dtype=dtypes.int32, name="schema")
     self.permutation = ops.convert_to_tensor(self.cache_type.to_permutation(),
@@ -753,10 +741,8 @@ class IgniteDataset(Dataset):
   def _as_variant_tensor(self):
     return gen_dataset_ops.ignite_dataset(self.cache_name, self.host,
                                           self.port, self.local, self.part,
-                                          self.page_size, self.username,
-                                          self.password, self.certfile,
-                                          self.keyfile, self.cert_password,
-                                          self.schema, self.permutation)
+                                          self.page_size, self.schema,
+                                          self.permutation)
 
   @property
   def output_classes(self):
diff --git a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
index 933e62b804..5d74617690 100644
--- a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
+++ b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
@@ -35,28 +35,60 @@ class IgniteDatasetTest(test.TestCase):
   """
 
   def test_ignite_dataset_with_plain_client(self):
+    """Test Ignite Dataset with plain client.
+    """
+    self._clear_env()
     ds = IgniteDataset(cache_name="SQL_PUBLIC_TEST_CACHE", port=42300)
-    self.__check_dataset(ds)
+    self._check_dataset(ds)
 
   def test_ignite_dataset_with_ssl_client(self):
-    ds = IgniteDataset(cache_name="SQL_PUBLIC_TEST_CACHE", port=42301,\
-      certfile=os.path.dirname(os.path.realpath(__file__)) +\
-      "/keystore/client.pem", cert_password="123456")
-    self.__check_dataset(ds)
+    """Test Ignite Dataset with ssl client.
+    """
+    self._clear_env()
+    os.environ["IGNITE_DATASET_CERTFILE"] = os.path.dirname(
+        os.path.realpath(__file__)) + "/keystore/client.pem"
+    os.environ["IGNITE_DATASET_CERT_PASSWORD"] = "123456"
+
+    ds = IgniteDataset(cache_name="SQL_PUBLIC_TEST_CACHE", port=42301,
+                       certfile=os.environ["IGNITE_DATASET_CERTFILE"],
+                       cert_password=os.environ["IGNITE_DATASET_CERT_PASSWORD"])
+    self._check_dataset(ds)
 
   def test_ignite_dataset_with_ssl_client_and_auth(self):
-    ds = IgniteDataset(cache_name="SQL_PUBLIC_TEST_CACHE", port=42302,\
-      certfile=os.path.dirname(os.path.realpath(__file__)) +\
-      "/keystore/client.pem", cert_password="123456",\
-      username="ignite", password="ignite")
-    self.__check_dataset(ds)
+    """Test Ignite Dataset with ssl client and authentication.
+    """
+    self._clear_env()
+    os.environ['IGNITE_DATASET_USERNAME'] = "ignite"
+    os.environ['IGNITE_DATASET_PASSWORD'] = "ignite"
+    os.environ['IGNITE_DATASET_CERTFILE'] = os.path.dirname(
+        os.path.realpath(__file__)) + "/keystore/client.pem"
+    os.environ['IGNITE_DATASET_CERT_PASSWORD'] = "123456"
+
+    ds = IgniteDataset(cache_name="SQL_PUBLIC_TEST_CACHE", port=42302,
+                       certfile=os.environ['IGNITE_DATASET_CERTFILE'],
+                       cert_password=os.environ['IGNITE_DATASET_CERT_PASSWORD'],
+                       username=os.environ['IGNITE_DATASET_USERNAME'],
+                       password=os.environ['IGNITE_DATASET_PASSWORD'])
+    self._check_dataset(ds)
+
+  def _clear_env(self):
+    """Clears environment variables used by Ignite Dataset.
+    """
+    if 'IGNITE_DATASET_USERNAME' in os.environ:
+      del os.environ['IGNITE_DATASET_USERNAME']
+    if 'IGNITE_DATASET_PASSWORD' in os.environ:
+      del os.environ['IGNITE_DATASET_PASSWORD']
+    if 'IGNITE_DATASET_CERTFILE' in os.environ:
+      del os.environ['IGNITE_DATASET_CERTFILE']
+    if 'IGNITE_DATASET_CERT_PASSWORD' in os.environ:
+      del os.environ['IGNITE_DATASET_CERT_PASSWORD']
 
-  def __check_dataset(self, dataset):
+  def _check_dataset(self, dataset):
     """Checks that dataset provids correct data.
     """
-    self.assertEquals(tf.int64, dataset.output_types['key'])
-    self.assertEquals(tf.string, dataset.output_types['val']['NAME'])
-    self.assertEquals(tf.int64, dataset.output_types['val']['VAL'])
+    self.assertEqual(tf.int64, dataset.output_types['key'])
+    self.assertEqual(tf.string, dataset.output_types['val']['NAME'])
+    self.assertEqual(tf.int64, dataset.output_types['val']['VAL'])
 
     it = dataset.make_one_shot_iterator()
     ne = it.get_next()
@@ -66,11 +98,11 @@ class IgniteDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(ne)
 
-    self.assertEquals({'key': 1, 'val': {'NAME': b'TEST1', 'VAL': 42}},\
+    self.assertEqual({'key': 1, 'val': {'NAME': b'TEST1', 'VAL': 42}},\
       rows[0])
-    self.assertEquals({'key': 2, 'val': {'NAME': b'TEST2', 'VAL': 43}},\
+    self.assertEqual({'key': 2, 'val': {'NAME': b'TEST2', 'VAL': 43}},\
       rows[1])
-    self.assertEquals({'key': 3, 'val': {'NAME': b'TEST3', 'VAL': 44}},\
+    self.assertEqual({'key': 3, 'val': {'NAME': b'TEST3', 'VAL': 44}},\
       rows[2])
 
 if __name__ == "__main__":
-- 
GitLab


From 1e821cd9a02b59a90a8b983759cf74eded16265f Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 19 Sep 2018 11:06:40 -0700
Subject: [PATCH 052/570] Fix bug in metrics sparse_categorical_accuracy and
 sparse_top_k_categorical_accuracy

---
 tensorflow/python/keras/metrics.py      | 15 ++++++++------
 tensorflow/python/keras/metrics_test.py | 26 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index e64241e5cf..2fd3244800 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -635,7 +635,9 @@ def categorical_accuracy(y_true, y_pred):
 
 @tf_export('keras.metrics.sparse_categorical_accuracy')
 def sparse_categorical_accuracy(y_true, y_pred):
-  y_true = math_ops.reduce_max(y_true, axis=-1)
+  # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
+  if (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))):
+    y_true = array_ops.squeeze(y_true, [-1])
   y_pred = math_ops.argmax(y_pred, axis=-1)
 
   # If the expected labels are float, we need to cast the int returned by
@@ -654,11 +656,12 @@ def top_k_categorical_accuracy(y_true, y_pred, k=5):
 
 @tf_export('keras.metrics.sparse_top_k_categorical_accuracy')
 def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
-  return K.mean(
-      nn.in_top_k(y_pred,
-                  math_ops.cast(math_ops.reduce_max(y_true, axis=-1), 'int32'),
-                  k),
-      axis=-1)
+  # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
+  if (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))):
+    y_true = array_ops.squeeze(y_true, [-1])
+
+  return K.mean(nn.in_top_k(y_pred, math_ops.cast(y_true, 'int32'), k),
+                axis=-1)
 
 # Aliases
 
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 4195ea18ad..43ac5b7ead 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -54,6 +54,18 @@ class KerasMetricsTest(test.TestCase):
       y_pred = K.variable(np.random.random((6, 7)))
       self.assertEqual(K.eval(metric(y_true, y_pred)).shape, (6,))
 
+      # Test correctness if the shape of y_true is (num_samples,)
+      y_true = K.variable([1., 0., 0., 0.])
+      y_pred = K.variable([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]])
+      print(K.eval(metric(y_true, y_pred)))
+      self.assertAllEqual(K.eval(metric(y_true, y_pred)), [0., 1., 1., 1.])
+
+      # Test correctness if the shape of y_true is (num_samples, 1)
+      y_true = K.variable([[1.], [0.], [0.], [0.]])
+      y_pred = K.variable([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]])
+      print(K.eval(metric(y_true, y_pred)))
+      self.assertAllEqual(K.eval(metric(y_true, y_pred)), [0., 1., 1., 1.])
+
   def test_sparse_categorical_accuracy_float(self):
     with self.cached_session():
       metric = metrics.sparse_categorical_accuracy
@@ -79,6 +91,7 @@ class KerasMetricsTest(test.TestCase):
 
   def test_sparse_top_k_categorical_accuracy(self):
     with self.cached_session():
+      # Test correctness if the shape of y_true is (num_samples, 1)
       y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
       y_true = K.variable(np.array([[1], [0]]))
       result = K.eval(
@@ -91,6 +104,19 @@ class KerasMetricsTest(test.TestCase):
           metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
       self.assertEqual(result, 0.)
 
+      # Test correctness if the shape of y_true is (num_samples,)
+      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
+      y_true = K.variable(np.array([1, 0]))
+      result = K.eval(
+        metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
+      self.assertEqual(result, 1)
+      result = K.eval(
+        metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
+      self.assertEqual(result, 0.5)
+      result = K.eval(
+        metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
+      self.assertEqual(result, 0.)
+
   def test_top_k_categorical_accuracy(self):
     with self.cached_session():
       y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
-- 
GitLab


From 78e205d35b31aa49e8dac357d827900a165f0a21 Mon Sep 17 00:00:00 2001
From: Erik Smistad <ersmistad@gmail.com>
Date: Thu, 20 Sep 2018 15:56:34 +0200
Subject: [PATCH 053/570] Added warning message if cmake version is below 3.8
 or host toolset is not set to x64 on windows

---
 tensorflow/contrib/cmake/CMakeLists.txt | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 225c5e6227..a7a66472df 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -1,8 +1,14 @@
 # Minimum CMake required
+cmake_minimum_required(VERSION 3.5)
+
 if(WIN32)
-  cmake_minimum_required(VERSION 3.8)
-else()
-  cmake_minimum_required(VERSION 3.5)
+	if(${CMAKE_VERSION} VERSION_LESS "3.8")
+		message(WARNING "Your current cmake version is ${CMAKE_VERSION} which does not support setting the toolset architecture to x64. This may cause \"compiler out of heap space\" errors when building. Consider upgrading your cmake to > 3.8 and using the flag -Thost=x64 when running cmake.")
+	else()
+		if(NOT CMAKE_VS_PLATFORM_TOOLSET_HOST_ARCHITECTURE OR NOT "${CMAKE_VS_PLATFORM_TOOLSET_HOST_ARCHITECTURE}" STREQUAL "x64")
+			message(WARNING "Your current cmake generator is set to use 32 bit toolset architecture. This may cause \"compiler out of heap space\" errors when building. Consider using the flag -Thost=x64 when running cmake.")
+		endif()
+	endif()
 endif()
 
 # Project
-- 
GitLab


From dcd63fab37f686a069b54a7653254bbb15a2bf20 Mon Sep 17 00:00:00 2001
From: Niranjan Hasabnis <niranjan.hasabnis@intel.com>
Date: Thu, 20 Sep 2018 11:04:25 -0700
Subject: [PATCH 054/570] Fix for failing eager:function_test

---
 tensorflow/core/kernels/partitioned_function_ops.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index 42f99a73e6..7a5a2ff8fa 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -100,9 +100,9 @@ class PartitionedCallOp : public AsyncOpKernel {
         // We need to pass global op_registry as default_registry when creating
         // graph. So that graph optimization passes can lookup all possible ops
         // by name.
-        FunctionLibraryDefinition func_lib_def(
-            OpRegistry::Global(), fbody->graph->flib_def().ToProto());
-        auto graph = tensorflow::MakeUnique<Graph>(func_lib_def);
+        auto graph = tensorflow::MakeUnique<Graph>(fbody->graph->flib_def());
+        FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
+        graph.get()->AddFunctionLibrary(global_flib.ToProto());
         CopyGraph(*fbody->graph, graph.get());
         OP_REQUIRES_OK_ASYNC(ctx, PinResourceArgs(graph.get(), args), done);
 
@@ -255,10 +255,10 @@ class PartitionedCallOp : public AsyncOpKernel {
     VLOG(3) << "Partitioned function '" << func_.name() << "', yielding "
             << partitions.size() << " shards.";
 
-    FunctionLibraryDefinition func_lib_def(OpRegistry::Global(),
-                                           graph->flib_def().ToProto());
     for (const auto& partition : partitions) {
-      std::unique_ptr<Graph> subgraph(new Graph(func_lib_def));
+      std::unique_ptr<Graph> subgraph(new Graph(graph->flib_def()));
+      FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
+      subgraph.get()->AddFunctionLibrary(global_flib.ToProto());
       GraphConstructorOptions opts;
       opts.allow_internal_ops = true;
       opts.expect_device_spec = true;
-- 
GitLab


From 039ddaa6c0af4be4291383564db5a964d0035c1d Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Thu, 20 Sep 2018 15:49:40 -0700
Subject: [PATCH 055/570] Fix bad indentation

---
 tensorflow/python/keras/metrics_test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 43ac5b7ead..5f5565d4d5 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -108,13 +108,13 @@ class KerasMetricsTest(test.TestCase):
       y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
       y_true = K.variable(np.array([1, 0]))
       result = K.eval(
-        metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
       self.assertEqual(result, 1)
       result = K.eval(
-        metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
       self.assertEqual(result, 0.5)
       result = K.eval(
-        metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
       self.assertEqual(result, 0.)
 
   def test_top_k_categorical_accuracy(self):
-- 
GitLab


From 16a257eb598b7dfd220249babf8d18c984aab103 Mon Sep 17 00:00:00 2001
From: Guozhong Zhuang <guozhong.zhuang@intel.com>
Date: Fri, 21 Sep 2018 09:43:22 -0700
Subject: [PATCH 056/570] change back MICRA def - coding styling

---
 tensorflow/core/util/mkl_util.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 387e5ee5a6..f371fd6f95 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
-#define TENSORFLOW_TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
+#ifndef TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
+#define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
 #ifdef INTEL_MKL
 
 #include <string>
@@ -2222,4 +2222,4 @@ inline bool IsConv1x1StrideNot1(memory::dims filter_dims,
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
-#endif  // TENSORFLOW_TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
+#endif  // TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
-- 
GitLab


From 59a47b7d330a40971bad89f0e8aa282e79e889f1 Mon Sep 17 00:00:00 2001
From: Guozhong Zhuang <guozhong.zhuang@intel.com>
Date: Fri, 21 Sep 2018 09:56:29 -0700
Subject: [PATCH 057/570] refine a comment per Tatiana's suggestions

---
 tensorflow/core/util/mkl_util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index f371fd6f95..2f2705de92 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -2098,7 +2098,7 @@ static inline memory::format get_desired_format(int channel,
              (channel % 8) == 0) {
     fmt_desired = is_2d
                       ? memory::format::nChw8c
-                      : memory::format::ncdhw;  // not support avx2 for 3d yet.
+                      : memory::format::ncdhw;  // no avx2 support for 3d yet.
   } else {
     fmt_desired = is_2d ? memory::format::nchw : memory::format::ncdhw;
   }
-- 
GitLab


From 268bf6b118646c8e93162d591263bca907c7db28 Mon Sep 17 00:00:00 2001
From: AG Ramesh <ag.ramesh@intel.com>
Date: Fri, 21 Sep 2018 11:39:29 -0700
Subject: [PATCH 058/570] Removing dead code. With the addition of mkl slice
 using MKL DNN this code will not longer be executed

---
 tensorflow/core/kernels/slice_op.cc | 198 ----------------------------
 1 file changed, 198 deletions(-)

diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index 77594479cb..83377ffab5 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -228,190 +228,6 @@ class SliceOp : public OpKernel {
   }
 };
 
-#ifdef INTEL_MKL
-template <typename Device, typename T>
-class MklSliceOp : public OpKernel {
- public:
-  explicit MklSliceOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    TensorShape output_shape;
-    gtl::InlinedVector<int64, 4> begin;
-    gtl::InlinedVector<int64, 4> size;
-    Tensor* result = nullptr;
-    bool done = false;
-    SharedSliceCommonCases<T>(context, &output_shape, &begin, &size, &result,
-                              &done);
-    if (!context->status().ok() || done == true) return;
-
-    const Tensor& input = context->input(0);
-    const int input_dims = input.dims();
-
-    if (output_shape.num_elements() > 0) {
-      if (std::is_same<Device, CPUDevice>::value && input_dims == 2 &&
-          DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
-        auto input = context->input(0).tensor<T, 2>();
-        auto output = result->tensor<T, 2>();
-        // TODO(agarwal): Consider multi-threading this loop for cases where
-        // size[0] is very large.
-        for (int i = 0; i < size[0]; ++i) {
-          const int64 row = begin[0] + i;
-          if (i + 1 < size[0]) {
-            port::prefetch<port::PREFETCH_HINT_T0>(&output(i + 1, 0));
-            port::prefetch<port::PREFETCH_HINT_T0>(&input(row + 1, begin[1]));
-          }
-          memcpy(&output(i, 0), &input(row, begin[1]), size[1] * sizeof(T));
-        }
-        return;
-      }
-#define HANDLE_DIM(NDIM)                            \
-  if (input_dims == NDIM) {                         \
-    HandleCase<NDIM>(context, begin, size, result); \
-    return;                                         \
-  }
-
-      HANDLE_DIM(1);
-      HANDLE_DIM(2);
-      HANDLE_DIM(3);
-      HANDLE_DIM(4);
-      HANDLE_DIM(5);
-      HANDLE_DIM(6);
-      HANDLE_DIM(7);
-
-#undef HANDLE_DIM
-
-      OP_REQUIRES(
-          context, false,
-          errors::Unimplemented("SliceOp : Unhandled input dimensions"));
-    }
-  }
-
- private:
-  // Helper function for DoesSliceShapeDifferInOnly1D. Checks if the following
-  // criteria matches for slice_dim: if indices for slice are 0 in all dims
-  // except slice_dim and if sizes of all the dimensions of the slice are same
-  // as the sizes of all the dimensions of the input except slice_dim, then
-  // returns True. Otherwise, returns False.
-  bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape,
-                                          const gtl::ArraySlice<int64>& begin,
-                                          const gtl::ArraySlice<int64>& size,
-                                          int slice_dim) {
-    for (int dim = 0; dim < 4; dim++) {
-      if (dim != slice_dim &&
-          (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  // Is 'input' tensor being sliced over a single dimension out of 4?
-  //
-  // This check is applicable in the context of Slice of a 4-D tensor in
-  // NHWC or NCHW format over channel dimension.
-  //
-  // If indices for slice are 0 in all dims except one dimension and if sizes of
-  // all dimensions of slice are same as sizes of all dimensions of inputs
-  // except that dimension, then we are slicing over a single dimension.
-  //
-  // Returns True if Slicing over a single dimension, and sets slice_dim
-  // to the number of the dimension that satisfies criteria.
-  bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape,
-                                    const gtl::ArraySlice<int64>& begin,
-                                    const gtl::ArraySlice<int64>& size,
-                                    int* slice_dim) {
-    for (int dim = 0; dim < 4; dim++) {
-      if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) {
-        *slice_dim = dim;
-        return true;
-      }
-    }
-    return false;
-  }
-
-  template <int NDIM>
-  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
-                  const gtl::ArraySlice<int64>& size, Tensor* result) {
-    int slice_dim = -1;
-    TensorShape in_shape = context->input(0).shape();
-    // Special case for handling 4-D tensor slice when shape of the slice
-    // differs from the input tensor in only 1 out of 4 dimensions.
-    // This case arises in the context of Slice of 4-D tensor in NHWC or NCHW
-    // format over channel dimension.
-    if (NDIM == 4 &&
-        DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
-      size_t in_strides[4] = {
-          (size_t)in_shape.dim_size(1) * in_shape.dim_size(2) *
-              in_shape.dim_size(3),
-          (size_t)in_shape.dim_size(2) * in_shape.dim_size(3),
-          (size_t)in_shape.dim_size(3), (size_t)1};
-
-      size_t out_strides[4] = {(size_t)size[1] * size[2] * size[3],
-                               (size_t)size[2] * size[3], (size_t)size[3],
-                               (size_t)1};
-
-      T* in_buf = const_cast<T*>(
-          const_cast<const T*>(context->input(0).flat<T>().data()));
-      T* op_buf = result->flat<T>().data();
-
-      if (slice_dim == 1) {
-        /* data format = NCHW */
-
-#pragma omp parallel for
-        for (ssize_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
-          T* ip = in_buf + (d0 * in_strides[0]);
-          T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
-#pragma omp parallel for
-          for (ssize_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
-            T* ip1 = ip + (d1 * in_strides[1]);
-            T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
-            // For NCHW, H and W will be contiguous. So we can copy
-            // both with one memcpy.
-            memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
-                   sizeof(T) * in_strides[1]);
-          }
-        }
-        return;
-      } else if (slice_dim == 3) {
-        /* data_format = NHWC */
-
-#pragma omp parallel for
-        for (ssize_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
-          T* ip = in_buf + (d0 * in_strides[0]);
-          T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
-#pragma omp parallel for
-          for (ssize_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
-            T* ip1 = ip + (d1 * in_strides[1]);
-            T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
-#pragma omp parallel for
-            for (ssize_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
-              T* ip2 = ip1 + (d2 * in_strides[2]);
-              T* ip3 = ip2 + begin[3];
-              T* op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
-              T* op3 = op2;
-              memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
-                     sizeof(T) * size[3]);
-            }
-          }
-        }
-        return;
-      }
-      // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
-    }
-
-    Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
-    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
-    for (int i = 0; i < NDIM; ++i) {
-      indices[i] = begin[i];
-      sizes[i] = size[i];
-    }
-
-    functor::Slice<Device, T, NDIM>()(
-        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
-        context->input(0).tensor<T, NDIM>(), indices, sizes);
-  }
-};
-#endif
 
 // Forward declarations of the functor specializations for declared in the
 // sharded source files.
@@ -440,7 +256,6 @@ TF_CALL_ALL_TYPES(DECLARE_FOR_N);
 #undef DECLARE_CPU_SPEC
 }  // namespace functor
 
-#ifndef INTEL_MKL
 #define REGISTER_SLICE(type)                             \
   REGISTER_KERNEL_BUILDER(Name("Slice")                  \
                               .Device(DEVICE_CPU)        \
@@ -452,19 +267,6 @@ TF_CALL_ALL_TYPES(DECLARE_FOR_N);
 TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
 TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
 #undef REGISTER_SLICE
-#else
-#define REGISTER_SLICE(type)                             \
-  REGISTER_KERNEL_BUILDER(Name("Slice")                  \
-                              .Device(DEVICE_CPU)        \
-                              .TypeConstraint<type>("T") \
-                              .HostMemory("begin")       \
-                              .HostMemory("size"),       \
-                          MklSliceOp<CPUDevice, type>)
-
-TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
-TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
-#undef REGISTER_SLICE
-#endif  // INTEL_MKL
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
-- 
GitLab


From 457ef66c2d4985000aa1d1a9bc643f66bbddd46d Mon Sep 17 00:00:00 2001
From: Martin Wicke <577277+martinwicke@users.noreply.github.com>
Date: Fri, 21 Sep 2018 12:58:32 -0700
Subject: [PATCH 059/570] Fix long lines

---
 tensorflow/python/keras/layers/embeddings.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/keras/layers/embeddings.py b/tensorflow/python/keras/layers/embeddings.py
index a0b9393812..76e551a7ce 100644
--- a/tensorflow/python/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/layers/embeddings.py
@@ -142,12 +142,14 @@ class Embedding(Layer):
       else:
         in_lens = [self.input_length]
       if len(in_lens) != len(input_shape) - 1:
-        raise ValueError('"input_length" is %s, but received input has shape %s' %
+        raise ValueError('"input_length" is %s, '
+                         'but received input has shape %s' %
                          (str(self.input_length), str(input_shape)))
       else:
         for i, (s1, s2) in enumerate(zip(in_lens, input_shape[1:])):
           if s1 is not None and s2 is not None and s1 != s2:
-            raise ValueError('"input_length" is %s, but received input has shape %s' %
+            raise ValueError('"input_length" is %s, '
+                             'but received input has shape %s' %
                              (str(self.input_length), str(input_shape)))
           elif s1 is None:
             in_lens[i] = s2
-- 
GitLab


From 282d6e7c384c83f9b6bf43b7b37eb606ccc64d06 Mon Sep 17 00:00:00 2001
From: Martin Wicke <577277+martinwicke@users.noreply.github.com>
Date: Fri, 21 Sep 2018 12:59:15 -0700
Subject: [PATCH 060/570] Fix long lines

---
 tensorflow/python/ops/nn_ops.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 17e10995f2..a68422c315 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -818,12 +818,14 @@ class Convolution(object):
     try:
       input_shape.with_rank(num_spatial_dims + 2)
     except ValueError:
-      raise ValueError("input tensor must have rank %d" % (num_spatial_dims + 2))
+      raise ValueError("input tensor must have rank %d" % 
+                       (num_spatial_dims + 2))
 
     try:
       filter_shape.with_rank(num_spatial_dims + 2)
     except ValueError:
-      raise ValueError("filter tensor must have rank %d" % (num_spatial_dims + 2))
+      raise ValueError("filter tensor must have rank %d" % 
+                       (num_spatial_dims + 2))
 
     if data_format is None or not data_format.startswith("NC"):
       input_channels_dim = input_shape[num_spatial_dims + 1]
-- 
GitLab


From 6dd7a09211cc74d11ff1554624b527c432020cbc Mon Sep 17 00:00:00 2001
From: wangsiyu <siyu.wsy@gmail.com>
Date: Sun, 23 Sep 2018 20:33:19 +0800
Subject: [PATCH 061/570] Enable partitioned variable assignments

---
 .../python/kernel_tests/variables_test.py     | 43 ++++++++++++++++-
 tensorflow/python/ops/variables.py            | 47 +++++++++++++++++--
 2 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index 2e7975667c..687784c8b7 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -673,7 +673,7 @@ class PartitionedVariableTest(test.TestCase):
         v0._set_save_slice_info(
             variables.Variable.SaveSliceInfo(v0.name, [2], [0], [1]))
         v1._set_save_slice_info(
-            variables.Variable.SaveSliceInfo(v0.name, [2], [1], [1]))
+            variables.Variable.SaveSliceInfo(v1.name, [2], [1], [1]))
         partitions = [2]
 
         variables.PartitionedVariable(
@@ -696,6 +696,47 @@ class PartitionedVariableTest(test.TestCase):
             variable_list=[v0],
             partitions=partitions)
 
+  def testPartitionedVariableAssignments(self):
+    with ops.Graph().as_default(), self.cached_session() as sess:
+      v0 = variables.Variable(initial_value=[0.0])
+      v1 = variables.Variable(initial_value=[1.0])
+      v0._set_save_slice_info(
+          variables.Variable.SaveSliceInfo(v0.name, [2], [0], [1]))
+      v1._set_save_slice_info(
+          variables.Variable.SaveSliceInfo(v0.name, [2], [1], [1]))
+      partitions = [2]
+
+      # Pass variable_list as [v1, v0] to ensure they are properly
+      # re-sorted to [v0, v1] based on their slice info offsets.
+      partitioned_variable = variables.PartitionedVariable(
+          name="two_vars",
+          shape=[2],
+          dtype=v0.dtype,
+          variable_list=[v0, v1],
+          partitions=partitions)
+      
+      deltas_a = constant_op.constant([1.0, 2.0])
+      deltas_b = constant_op.constant([3.0, 4.0])
+      ones = array_ops.ones([2])
+      plus_delta = partitioned_variable.assign_add(deltas_a)
+      minus_delta = partitioned_variable.assign_sub(deltas_b)
+      assign_ones = partitioned_variable.assign(ones)
+      variables.global_variables_initializer().run()
+
+      self.assertEqual([1.0], plus_delta[0].eval())
+      self.assertEqual([1.0], v0.eval())
+      self.assertEqual([3.0], plus_delta[1].eval())
+      self.assertEqual([3.0], v1.eval())
+      
+      self.assertEqual([-2.0], minus_delta[0].eval())
+      self.assertEqual([-2.0], v0.eval())
+      self.assertEqual([-1.0], minus_delta[1].eval())
+      self.assertEqual([-1.0], v1.eval())
+ 
+      self.assertEqual([1.0], assign_ones[0].eval())
+      self.assertEqual([1.0], v0.eval())
+      self.assertEqual([1.0], assign_ones[1].eval())
+      self.assertEqual([1.0], v1.eval())
 
 class VariableContainerTest(test.TestCase):
 
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 7a46157739..2d6a767fed 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -2395,11 +2395,50 @@ class PartitionedVariable(object):
   def _get_partitions(self):
     return self._partitions
 
-  def assign(self, value, use_locking=False):
-    _ = value, use_locking
-    raise NotImplementedError(
-        "assign() has not been implemented for PartitionedVariable.")
+  def _apply_assign_fn(self,
+                       assign_fn,
+                       value):
+    partition_axes = self._partition_axes()
+    if len(partition_axes) > 1:
+      raise NotImplementedError(
+          "Cannot concatenate along more than one dimension: %s.  "
+          "Multi-axis partition assign_fn is not supported" % str(partition_axes))
+    partition_ix = partition_axes[0]
+    size_splits_list = [
+        var.shape[partition_ix].value for var in self._variable_list]
+    value_list = array_ops.split(
+        value, size_splits_list, axis=partition_ix)
+    op_list = [
+        assign_fn(var, value_list[idx], idx) \
+        for idx, var in enumerate(self._variable_list)]
+    return op_list
 
+  def assign(self, value, use_locking=False, name=None, read_value=True):
+    assign_fn = lambda var, r_value, idx: var.assign(
+        r_value, use_locking=use_locking,
+        name="%s_%d" % (name, idx), read_value=read_value)
+    assign_list = self._apply_assign_fn(assign_fn, value)
+    if read_value:
+      return assign_list
+    return [assign.op for assign in assign_list]
+
+  def assign_add(self, value, use_locking=False, name=None, read_value=True):
+    assign_fn = lambda var, r_value, idx: var.assign_add(
+        r_value, use_locking=use_locking,
+        name="%s_%d" % (name, idx), read_value=read_value)
+    assign_list = self._apply_assign_fn(assign_fn, value)
+    if read_value:
+      return assign_list
+    return [assign.op for assign in assign_list]
+
+  def assign_sub(self, value, use_locking=False, name=None, read_value=True):
+    assign_fn = lambda var, r_value, idx: var.assign_sub(
+        r_value, use_locking=use_locking,
+        name="%s_%d" % (name, idx), read_value=read_value)
+    assign_list = self._apply_assign_fn(assign_fn, value)
+    if read_value:
+      return assign_list
+    return [assign.op for assign in assign_list]
 
 @tf_export("global_variables")
 def global_variables(scope=None):
-- 
GitLab


From a4eecdb369ecdae3b7fe7c1415d7b3b55bcc7b9e Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 23 Sep 2018 17:14:53 +0000
Subject: [PATCH 062/570] Fix GPU build issue on python 3

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/image/kernels/image_ops.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h
index 6b63eed130..7fac774d07 100644
--- a/tensorflow/contrib/image/kernels/image_ops.h
+++ b/tensorflow/contrib/image/kernels/image_ops.h
@@ -71,14 +71,7 @@ class ProjectiveGenerator {
         (transform[3] * output_x + transform[4] * output_y + transform[5]) /
         projection;
 
-    // TODO(ringwalt): Add a fill value input.
-#if (defined __CUDA_ARCH__) && (CUDART_VERSION < 8000)
-    // On CUDA versions previous to 8.0, only __shared__ variables
-    // could be declared as static in the device code.
     const T fill_value = T(0);
-#else
-    static const T fill_value = T(0);
-#endif
     switch (interpolation_) {
       case INTERPOLATION_NEAREST:
         // Switch the order of x and y again for indexing into the image.
-- 
GitLab


From 8f4ded5884684f40b4912d95c717b185340996b8 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Mon, 24 Sep 2018 11:07:21 +0300
Subject: [PATCH 063/570] Fix clang styles.

---
 tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h b/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
index 484cc4d6f5..6753c67701 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
@@ -24,9 +24,7 @@ constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
 
 class ByteSwapper {
  public:
-  ByteSwapper(bool big_endian) {
-    swap_ = big_endian == kLittleEndian;
-  }
+  ByteSwapper(bool big_endian) { swap_ = big_endian == kLittleEndian; }
 
   inline void SwapIfRequiredInt16(int16_t *x) const {
     if (swap_) {
-- 
GitLab


From 90c68770467701a23d23a85c5d769f6f4fa39f0f Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Mon, 24 Sep 2018 12:14:45 +0300
Subject: [PATCH 064/570] Fix byte-order issue.

---
 tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h b/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
index 6753c67701..46df3e39dc 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
+++ b/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
@@ -17,14 +17,13 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_
 
 #include <stdint.h>
+#include "tensorflow/core/platform/byte_order.h"
 
 namespace tensorflow {
 
-constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
-
 class ByteSwapper {
  public:
-  ByteSwapper(bool big_endian) { swap_ = big_endian == kLittleEndian; }
+  ByteSwapper(bool big_endian) { swap_ = big_endian == port::kLittleEndian; }
 
   inline void SwapIfRequiredInt16(int16_t *x) const {
     if (swap_) {
-- 
GitLab


From f0886f7269de900d226455d4831722f6fc94a71b Mon Sep 17 00:00:00 2001
From: Cao Zongyan <zongyan.cao@alibaba-inc.com>
Date: Tue, 25 Sep 2018 09:59:17 +0800
Subject: [PATCH 065/570] Fix build dependencies in tensorflow/cc/BUILD.

---
 tensorflow/cc/BUILD                            | 1 +
 tensorflow/python/kernel_tests/relu_op_test.py | 4 ++--
 tensorflow/python/ops/nn_ops.py                | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index f56521dac0..e99d15f85d 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -410,6 +410,7 @@ tf_cc_test(
     srcs = ["gradients/nn_grad_test.cc"],
     deps = [
         ":cc_ops",
+        ":cc_ops_internal",
         ":grad_op_registry",
         ":grad_testutil",
         ":gradient_checker",
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index 86d9c90e83..d97a1613b9 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -351,7 +351,7 @@ class LeakyReluTest(test.TestCase):
     self.assertLess(err, 1e-10)
 
   def testGradGradFloat32(self):
-    with compat.forward_compatibility_horizon(2018, 10, 2):
+    with compat.forward_compatibility_horizon(2018, 11, 2):
       with self.test_session():
         x = constant_op.constant(
             [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
@@ -369,7 +369,7 @@ class LeakyReluTest(test.TestCase):
       self.assertLess(err, 1e-4)
 
   def testGradGradFloat64(self):
-    with compat.forward_compatibility_horizon(2018, 10, 2):
+    with compat.forward_compatibility_horizon(2018, 11, 2):
       with self.test_session():
         x = constant_op.constant(
             [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index d646245ce3..2861f40586 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1601,7 +1601,7 @@ def leaky_relu(features, alpha=0.2, name=None):
     features = ops.convert_to_tensor(features, name="features")
     if features.dtype.is_integer:
       features = math_ops.to_float(features)
-    if compat.forward_compatible(2018, 10, 1):
+    if compat.forward_compatible(2018, 11, 1):
       return gen_nn_ops.leaky_relu(features, alpha=alpha, name=name)
     alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha")
     return math_ops.maximum(alpha * features, features, name=name)
-- 
GitLab


From c12a90e45c5f94b80289f4278f81be4a0348fa19 Mon Sep 17 00:00:00 2001
From: wangsiyu <siyu.wsy@gmail.com>
Date: Tue, 25 Sep 2018 13:51:36 +0800
Subject: [PATCH 066/570] fix pylint

---
 tensorflow/python/ops/variables.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 2d6a767fed..d058478d58 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -2402,7 +2402,8 @@ class PartitionedVariable(object):
     if len(partition_axes) > 1:
       raise NotImplementedError(
           "Cannot concatenate along more than one dimension: %s.  "
-          "Multi-axis partition assign_fn is not supported" % str(partition_axes))
+          "Multi-axis partition assign_fn is not supported "
+          % str(partition_axes))
     partition_ix = partition_axes[0]
     size_splits_list = [
         var.shape[partition_ix].value for var in self._variable_list]
-- 
GitLab


From 3d60d636de59449a8448cbcbcd71af82e2871538 Mon Sep 17 00:00:00 2001
From: wangsiyu <siyu.wsy@gmail.com>
Date: Tue, 25 Sep 2018 13:53:36 +0800
Subject: [PATCH 067/570] fix back variabe name

---
 tensorflow/python/kernel_tests/variables_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index 687784c8b7..0b101529fe 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -673,7 +673,7 @@ class PartitionedVariableTest(test.TestCase):
         v0._set_save_slice_info(
             variables.Variable.SaveSliceInfo(v0.name, [2], [0], [1]))
         v1._set_save_slice_info(
-            variables.Variable.SaveSliceInfo(v1.name, [2], [1], [1]))
+            variables.Variable.SaveSliceInfo(v0.name, [2], [1], [1]))
         partitions = [2]
 
         variables.PartitionedVariable(
-- 
GitLab


From 21d4e8bb30a1753a81edd4912881d95b47ae3d1c Mon Sep 17 00:00:00 2001
From: wangsiyu <siyu.wsy@gmail.com>
Date: Tue, 25 Sep 2018 15:50:10 +0800
Subject: [PATCH 068/570] remove warning lines

---
 tensorflow/python/ops/variables.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index d058478d58..69f63bc8e6 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -2401,7 +2401,6 @@ class PartitionedVariable(object):
     partition_axes = self._partition_axes()
     if len(partition_axes) > 1:
       raise NotImplementedError(
-          "Cannot concatenate along more than one dimension: %s.  "
           "Multi-axis partition assign_fn is not supported "
           % str(partition_axes))
     partition_ix = partition_axes[0]
-- 
GitLab


From 937ad7c27f0d289067c935543d282e5ac5a310b1 Mon Sep 17 00:00:00 2001
From: Niranjan Hasabnis <niranjan.hasabnis@intel.com>
Date: Tue, 25 Sep 2018 14:00:41 -0700
Subject: [PATCH 069/570] Adding check around AddFunctionLibrary

---
 tensorflow/core/kernels/partitioned_function_ops.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index 7a5a2ff8fa..fdb4c84c46 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -102,7 +102,8 @@ class PartitionedCallOp : public AsyncOpKernel {
         // by name.
         auto graph = tensorflow::MakeUnique<Graph>(fbody->graph->flib_def());
         FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
-        graph.get()->AddFunctionLibrary(global_flib.ToProto());
+        TF_CHECK_OK(
+                    graph.get()->AddFunctionLibrary(global_flib.ToProto()));
         CopyGraph(*fbody->graph, graph.get());
         OP_REQUIRES_OK_ASYNC(ctx, PinResourceArgs(graph.get(), args), done);
 
@@ -258,7 +259,8 @@ class PartitionedCallOp : public AsyncOpKernel {
     for (const auto& partition : partitions) {
       std::unique_ptr<Graph> subgraph(new Graph(graph->flib_def()));
       FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
-      subgraph.get()->AddFunctionLibrary(global_flib.ToProto());
+      TF_CHECK_OK(
+                subgraph.get()->AddFunctionLibrary(global_flib.ToProto()));
       GraphConstructorOptions opts;
       opts.allow_internal_ops = true;
       opts.expect_device_spec = true;
-- 
GitLab


From 7630e9df4804a01f5dd0ab20d4c0bcfb58e45432 Mon Sep 17 00:00:00 2001
From: Richard Yu <yohan.richard.yu@gmail.com>
Date: Tue, 25 Sep 2018 15:50:13 -0700
Subject: [PATCH 070/570] Fixing error

---
 tensorflow/contrib/quantize/python/fold_batch_norms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index d882b79892..d9f179bee4 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -628,7 +628,7 @@ def _GetBatchNormParams(graph, context, has_scaling):
   bn_decay_var_tensor = _FindMatchingTensor(graph, op_suffix_bn_decay_var,
                                             context)
   if batch_mean_tensor is None and moving_mean_tensor is None:
-    raise ValueError('Error folding unfused batch norms')
+    ValueError('Error folding unfused batch norms')
   if has_scaling:
     gamma_tensor = _FindMatchingTensor(graph, op_suffix_gamma, context)
 
-- 
GitLab


From f55e5ef27b3ccf1b75932e219f7358976dbf56c2 Mon Sep 17 00:00:00 2001
From: IMBurbank <bassmanburbank@gmail.com>
Date: Tue, 25 Sep 2018 18:39:11 -0600
Subject: [PATCH 071/570] Update to use python 2-3 compatible function
 tf_inspect.getfullargspec.

---
 .../python/losses/python/tuple_losses_impl.py |   2 +-
 .../labeled_tensor/python/ops/_typecheck.py   |   2 +-
 .../layers/python/layers/rev_block_lib.py     |   3 +-
 .../python/learn/estimators/estimator.py      |   4 +-
 .../learn/python/learn/estimators/head.py     |   2 +-
 .../learn/python/learn/experiment_test.py     |   2 +-
 .../learn/python/learn/export_strategy.py     |   2 +-
 .../contrib/learn/python/learn/metric_spec.py |   2 +-
 .../contrib/learn/python/learn/monitors.py    |   2 +-
 .../contrib/tpu/python/tpu/tpu_function.py    |   2 +-
 tensorflow/python/framework/errors_impl.py    |   2 +-
 tensorflow/python/framework/function.py       |   6 +-
 tensorflow/python/keras/backend_test.py       |   2 +-
 tensorflow/python/keras/testing_utils.py      |   2 +-
 .../kernel_tests/variable_scope_test.py       |   4 +-
 tensorflow/python/ops/variable_scope.py       |   4 +-
 tensorflow/python/util/tf_contextlib_test.py  |   2 +-
 tensorflow/python/util/tf_inspect.py          |   7 +-
 tensorflow/python/util/tf_inspect_test.py     | 249 +++++++++++++++++-
 .../api/lib/python_object_to_proto_visitor.py |   2 +-
 20 files changed, 267 insertions(+), 36 deletions(-)

diff --git a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
index 221c70c38b..00a83e5e55 100644
--- a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
@@ -101,7 +101,7 @@ def _args_to_gan_model(loss_fn):
   """
   # Match arguments in `loss_fn` to elements of `namedtuple`.
   # TODO(joelshor): Properly handle `varargs` and `keywords`.
-  argspec = tf_inspect.getargspec(loss_fn)
+  argspec = tf_inspect.getfullargspec(loss_fn)
   defaults = argspec.defaults or []
 
   required_args = set(argspec.args[:-len(defaults)])
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py b/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py
index 80fa17ec1f..0e23039847 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py
@@ -230,7 +230,7 @@ def accepts(*types):
 
   def check_accepts(f):
     """Check the types."""
-    spec = tf_inspect.getargspec(f)
+    spec = tf_inspect.getfullargspec(f)
 
     num_function_arguments = len(spec.args)
     if len(types) != num_function_arguments:
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index 06da32072f..55979cc391 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -576,7 +576,8 @@ def _recomputing_grad_fn(compute_fn,
 
 def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
   """See recompute_grad."""
-  has_is_recompute_kwarg = "is_recomputing" in tf_inspect.getargspec(fn).args
+  has_is_recompute_kwarg = (
+      "is_recomputing" in tf_inspect.getfullargspec(fn).args)
   for arg in args:
     if not isinstance(arg, framework_ops.Tensor):
       raise ValueError("All inputs to function must be Tensors")
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index c1de42782e..b88923bca2 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -199,11 +199,11 @@ def _model_fn_args(fn):
   if hasattr(fn, 'func') and hasattr(fn, 'keywords') and hasattr(fn, 'args'):
     # Handle functools.partial and similar objects.
     return tuple([
-        arg for arg in tf_inspect.getargspec(fn.func).args[len(fn.args):]
+        arg for arg in tf_inspect.getfullargspec(fn.func).args[len(fn.args):]
         if arg not in set(fn.keywords.keys())
     ])
   # Handle function.
-  return tuple(tf_inspect.getargspec(fn).args)
+  return tuple(tf_inspect.getfullargspec(fn).args)
 
 
 def _get_replica_device_setter(config):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index c6f79e00d5..63dd08316b 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -1861,7 +1861,7 @@ def _get_arguments(func):
   _, func = tf_decorator.unwrap(func)
   if hasattr(func, "__code__"):
     # Regular function.
-    return tf_inspect.getargspec(func)
+    return tf_inspect.getfullargspec(func)
   elif hasattr(func, "func"):
     # Partial function.
     return _get_arguments(func.func)
diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py
index fb16c94c29..6926696fb6 100644
--- a/tensorflow/contrib/learn/python/learn/experiment_test.py
+++ b/tensorflow/contrib/learn/python/learn/experiment_test.py
@@ -126,7 +126,7 @@ class TestBaseEstimator(object):
 
 def _check_method_supports_args(method, kwargs):
   """Checks that the given method supports the given args."""
-  supported_args = tuple(tf_inspect.getargspec(method).args)
+  supported_args = tuple(tf_inspect.getfullargspec(method).args)
   for kwarg in kwargs:
     if kwarg not in supported_args:
       raise ValueError(
diff --git a/tensorflow/contrib/learn/python/learn/export_strategy.py b/tensorflow/contrib/learn/python/learn/export_strategy.py
index 075cab536e..0d6e0cdc18 100644
--- a/tensorflow/contrib/learn/python/learn/export_strategy.py
+++ b/tensorflow/contrib/learn/python/learn/export_strategy.py
@@ -96,7 +96,7 @@ class ExportStrategy(
     """
     # don't break existing export_fns that don't accept checkpoint_path and
     # eval_result
-    export_fn_args = tf_inspect.getargspec(self.export_fn).args
+    export_fn_args = tf_inspect.getfullargspec(self.export_fn).args
     kwargs = {}
     if 'checkpoint_path' in export_fn_args:
       kwargs['checkpoint_path'] = checkpoint_path
diff --git a/tensorflow/contrib/learn/python/learn/metric_spec.py b/tensorflow/contrib/learn/python/learn/metric_spec.py
index 97220365d5..604d6d46b4 100644
--- a/tensorflow/contrib/learn/python/learn/metric_spec.py
+++ b/tensorflow/contrib/learn/python/learn/metric_spec.py
@@ -51,7 +51,7 @@ def _args(fn):
     return tuple(
         [arg for arg in _args(fn.func) if arg not in set(fn.keywords.keys())])
   # Handle function.
-  return tuple(tf_inspect.getargspec(fn).args)
+  return tuple(tf_inspect.getfullargspec(fn).args)
 
 
 _CANONICAL_LABELS_ARG = 'labels'
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index 3d691d4340..5f61e0264f 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -1303,7 +1303,7 @@ class RunHookAdapterForMonitors(session_run_hook.SessionRunHook):
   def end(self, session):
     self._last_step = None
     for m in self._monitors:
-      if "session" in tf_inspect.getargspec(m.end).args:
+      if "session" in tf_inspect.getfullargspec(m.end).args:
         m.end(session=session)
       else:
         m.end()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_function.py b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
index 0c7a38dbbb..9c4bd1c4d1 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_function.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
@@ -80,7 +80,7 @@ def check_function_argument_count(func, input_arity, infeed_queue):
   number_of_arguments_needed = input_arity
   if infeed_queue is not None:
     number_of_arguments_needed += infeed_queue.number_of_tuple_elements
-  arg_spec = tf_inspect.getargspec(func)
+  arg_spec = tf_inspect.getfullargspec(func)
   number_of_args = len(arg_spec.args)
   if arg_spec.defaults is None:
     number_of_defaults = 0
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index 5af71f2cfb..c373e75a74 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -55,7 +55,7 @@ class OpError(Exception):
 
   def __reduce__(self):
     # Allow the subclasses to accept less arguments in their __init__.
-    init_argspec = tf_inspect.getargspec(self.__class__.__init__)
+    init_argspec = tf_inspect.getfullargspec(self.__class__.__init__)
     args = tuple(getattr(self, arg) for arg in init_argspec.args[1:])
     return self.__class__, args
 
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index f287289bd0..3db6f683c9 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -132,9 +132,9 @@ class Defun(object):
       raise ValueError("func %s must be callable" % func)
 
     # Func should not use kwargs and defaults.
-    argspec = tf_inspect.getargspec(func)
-    if argspec.keywords or argspec.defaults:
-      raise ValueError("Functions with argument defaults or keyword "
+    argspec = tf_inspect.getfullargspec(func)
+    if argspec.varkw or argspec.defaults:
+      raise ValueError("Functions with argument defaults or varkw "
                        "arguments are not supported.")
 
     # Computes how many arguments 'func' has.
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index ab71589940..31191d0d35 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -452,7 +452,7 @@ class BackendLinearAlgebraTest(test.TestCase):
         compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
                                          keras_kwargs={'axis': -1},
                                          np_kwargs={'axis': -1})
-        if 'keepdims' in tf_inspect.getargspec(keras_op).args:
+        if 'keepdims' in tf_inspect.getfullargspec(keras_op).args:
           compare_single_input_op_to_numpy(keras_op, np_op,
                                            input_shape=(4, 7, 5),
                                            keras_kwargs={'axis': 1,
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index 501b50ba5f..1afaba5653 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -102,7 +102,7 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   layer.set_weights(weights)
 
   # test and instantiation from weights
-  if 'weights' in tf_inspect.getargspec(layer_cls.__init__):
+  if 'weights' in tf_inspect.getfullargspec(layer_cls.__init__):
     kwargs['weights'] = weights
     layer = layer_cls(**kwargs)
 
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 401e1ae102..1d0b72b17a 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -998,8 +998,8 @@ class VariableScopeTest(test.TestCase):
 
   def testSignatureGetVarVsGetLocalVar(self):
     """get_{local,}variable() must take the same list of args."""
-    arg_names = tf_inspect.getargspec(variable_scope.get_variable)[0]
-    local_arg_names = tf_inspect.getargspec(
+    arg_names = tf_inspect.getfullargspec(variable_scope.get_variable)[0]
+    local_arg_names = tf_inspect.getfullargspec(
         variable_scope.get_local_variable)[0]
     self.assertEqual(arg_names, local_arg_names)
 
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index a43676cd70..3cc1eb916d 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -892,14 +892,14 @@ class _VariableStore(object):
         if shape and shape.is_fully_defined():
           init_val = lambda: initializer(  # pylint: disable=g-long-lambda
               shape.as_list(), dtype=dtype, partition_info=partition_info)
-        elif not tf_inspect.getargspec(initializer).args:
+        elif not tf_inspect.getfullargspec(initializer).args:
           init_val = initializer
         else:
           raise ValueError("You can only pass an initializer function that "
                            "expects no arguments to its callable when the "
                            "shape is not fully defined. The given initializer "
                            "function expects the following args %s" %
-                           tf_inspect.getargspec(initializer).args)
+                           tf_inspect.getfullargspec(initializer).args)
         variable_dtype = dtype.base_dtype
 
     # Create the variable.
diff --git a/tensorflow/python/util/tf_contextlib_test.py b/tensorflow/python/util/tf_contextlib_test.py
index 4a5bf388a6..1e921b5ea3 100644
--- a/tensorflow/python/util/tf_contextlib_test.py
+++ b/tensorflow/python/util/tf_contextlib_test.py
@@ -83,7 +83,7 @@ class TfContextlibTest(test.TestCase):
     self.assertFalse(isinstance(target, tf_decorator.TFDecorator))
 
   def testGetArgSpecReturnsWrappedArgSpec(self):
-    argspec = tf_inspect.getargspec(test_params_and_defaults)
+    argspec = tf_inspect.getfullargspec(test_params_and_defaults)
     self.assertEqual(['a', 'b', 'c', 'd'], argspec.args)
     self.assertEqual((2, True, 'hello'), argspec.defaults)
 
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 967c872c2a..234850ac3f 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -43,7 +43,12 @@ def currentframe():
 
 
 def getargspec(obj):
-  """TFDecorator-aware replacement for inspect.getargspec.
+  """TFDecorator-aware replacement for `inspect.getargspec`.
+
+  This should not be called from other modules. It is deprecated in python3.
+
+  Use `getfullargspec`. It is a TFDecorator-aware replacement for 
+  `inspect.getfullargspec` compatible with both python2 and python3.
 
   Args:
     obj: A function, partial function, or callable object, possibly
diff --git a/tensorflow/python/util/tf_inspect_test.py b/tensorflow/python/util/tf_inspect_test.py
index d3b7e4b969..55f88f8fc6 100644
--- a/tensorflow/python/util/tf_inspect_test.py
+++ b/tensorflow/python/util/tf_inspect_test.py
@@ -122,18 +122,6 @@ class TfInspectTest(test.TestCase):
 
     self.assertEqual(argspec, tf_inspect.getargspec(partial_func))
 
-  def testGetFullArgsSpecForPartial(self):
-
-    def func(a, b):
-      del a, b
-
-    partial_function = functools.partial(func, 1)
-    argspec = tf_inspect.FullArgSpec(
-        args=['b'], varargs=None, varkw=None, defaults=None,
-        kwonlyargs=[], kwonlydefaults=None, annotations={})
-
-    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_function))
-
   def testGetArgSpecOnPartialInvalidArgspec(self):
     """Tests getargspec on partial function that doesn't have valid argspec."""
 
@@ -303,6 +291,243 @@ class TfInspectTest(test.TestCase):
 
     self.assertEqual(argspec, tf_inspect.getargspec(NewClass))
 
+  def testGetFullArgSpecOnDecoratorsThatDontProvideFullArgSpec(self):
+    argspec = tf_inspect.getfullargspec(
+        test_decorated_function_with_defaults)
+    self.assertEqual(['a', 'b', 'c'], argspec.args)
+    self.assertEqual((2, 'Hello'), argspec.defaults)
+
+  def testGetFullArgSpecOnDecoratorThatChangesFullArgSpec(self):
+    argspec = tf_inspect.FullArgSpec(
+        args=['a', 'b', 'c'],
+        varargs=None,
+        varkw=None,
+        defaults=(1, 'hello'),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
+
+    decorator = tf_decorator.TFDecorator('', test_undecorated_function, '',
+                                         argspec)
+    self.assertEqual(argspec, tf_inspect.getfullargspec(decorator))
+
+  def testGetFullArgSpecIgnoresDecoratorsThatDontProvideFullArgSpec(self):
+    argspec = tf_inspect.FullArgSpec(
+        args=['a', 'b', 'c'],
+        varargs=None,
+        varkw=None,
+        defaults=(1, 'hello'),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
+
+    inner_decorator = tf_decorator.TFDecorator('', test_undecorated_function,
+                                               '', argspec)
+    outer_decorator = tf_decorator.TFDecorator('', inner_decorator)
+    self.assertEqual(argspec, tf_inspect.getfullargspec(outer_decorator))
+
+  def testGetFullArgSpecReturnsOutermostDecoratorThatChangesFullArgSpec(self):
+    outer_argspec = tf_inspect.FullArgSpec(
+        args=['a'], varargs=None, varkw=None, defaults=None,
+        kwonlyargs=[], kwonlydefaults=None, annotations={})
+    inner_argspec = tf_inspect.FullArgSpec(
+        args=['b'], varargs=None, varkw=None, defaults=None,
+        kwonlyargs=[], kwonlydefaults=None, annotations={})
+
+    inner_decorator = tf_decorator.TFDecorator('', test_undecorated_function,
+                                               '', inner_argspec)
+    outer_decorator = tf_decorator.TFDecorator('', inner_decorator, '',
+                                               outer_argspec)
+    self.assertEqual(outer_argspec,
+                     tf_inspect.getfullargspec(outer_decorator))
+
+  def testGetFullArgsSpecForPartial(self):
+
+    def func(a, b):
+      del a, b
+
+    partial_function = functools.partial(func, 1)
+    argspec = tf_inspect.FullArgSpec(
+        args=['b'], varargs=None, varkw=None, defaults=None,
+        kwonlyargs=[], kwonlydefaults=None, annotations={})
+
+    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_function))
+
+  def testGetFullArgSpecOnPartialInvalidFullArgSpec(self):
+    """Tests getfullargspec.
+
+    Tests on partial function that doesn't have valid fullargspec.
+    """
+
+    def func(m, n, l, k=4):
+      return 2 * m + l + n * k
+
+    partial_func = functools.partial(func, n=7)
+
+    exception_message = (r"Some arguments \['l'\] do not have default value, "
+                         "but they are positioned after those with default "
+                         "values. This can not be expressed with ArgSpec.")
+    with self.assertRaisesRegexp(ValueError, exception_message):
+      tf_inspect.getfullargspec(partial_func)
+
+  def testGetFullArgSpecOnPartialValidFullArgSpec(self):
+    """Tests getfullargspec on partial function with valid fullargspec."""
+
+    def func(m, n, l, k=4):
+      return 2 * m + l + n * k
+
+    partial_func = functools.partial(func, n=7, l=2)
+    argspec = tf_inspect.FullArgSpec(
+        args=['m', 'n', 'l', 'k'],
+        varargs=None,
+        varkw=None,
+        defaults=(7, 2, 4),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
+
+    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
+
+  def testGetFullArgSpecOnPartialNoArgumentsLeft(self):
+    """Tests getfullargspec on partial function that prunes all arguments."""
+
+    def func(m, n):
+      return 2 * m + n
+
+    partial_func = functools.partial(func, 7, 10)
+    argspec = tf_inspect.FullArgSpec(
+        args=[], varargs=None, varkw=None, defaults=None,
+        kwonlyargs=[], kwonlydefaults=None, annotations={})
+
+    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
+
+  def testGetFullArgSpecOnPartialKeywordArgument(self):
+    """Tests getfullargspec on partial function that prunes some arguments."""
+
+    def func(m, n):
+      return 2 * m + n
+
+    partial_func = functools.partial(func, n=7)
+    argspec = tf_inspect.FullArgSpec(
+        args=['m', 'n'], varargs=None, varkw=None, defaults=(7,),
+        kwonlyargs=[], kwonlydefaults=None, annotations={})
+
+    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
+
+  def testGetFullArgSpecOnPartialKeywordArgumentWithDefaultValue(self):
+    """Tests getfullargspec.
+    
+    Tests on partial function that prunes argument by keyword.
+    """
+
+    def func(m=1, n=2):
+      return 2 * m + n
+
+    partial_func = functools.partial(func, n=7)
+    argspec = tf_inspect.FullArgSpec(
+        args=['m', 'n'], varargs=None, varkw=None, defaults=(1, 7),
+        kwonlyargs=[], kwonlydefaults=None, annotations={})
+
+    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
+
+  def testGetFullArgSpecOnPartialWithVarargs(self):
+    """Tests getfullargspec on partial function with variable arguments."""
+
+    def func(m, *arg):
+      return m + len(arg)
+
+    partial_func = functools.partial(func, 7, 8)
+    argspec = tf_inspect.FullArgSpec(
+        args=[], varargs='arg', varkw=None, defaults=None,
+        kwonlyargs=[], kwonlydefaults=None, annotations={})
+
+    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
+
+  def testGetFullArgSpecOnPartialWithVarkwargs(self):
+    """Tests getfullargspec.
+
+    Tests on partial function with variable keyword arguments.
+    """
+
+    def func(m, n, **kwarg):
+      return m * n + len(kwarg)
+
+    partial_func = functools.partial(func, 7)
+    argspec = tf_inspect.FullArgSpec(
+        args=['n'], varargs=None, varkw='kwarg', defaults=None,
+        kwonlyargs=[], kwonlydefaults=None, annotations={})
+
+    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
+
+  def testGetFullArgSpecOnPartialWithDecorator(self):
+    """Tests getfullargspec on decorated partial function."""
+
+    @test_decorator('decorator')
+    def func(m=1, n=2):
+      return 2 * m + n
+
+    partial_func = functools.partial(func, n=7)
+    argspec = tf_inspect.FullArgSpec(
+        args=['m', 'n'], varargs=None, varkw=None, defaults=(1, 7),
+        kwonlyargs=[], kwonlydefaults=None, annotations={})
+
+    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
+
+  def testGetFullArgSpecOnCallableObject(self):
+
+    class Callable(object):
+
+      def __call__(self, a, b=1, c='hello'):
+        pass
+
+    argspec = tf_inspect.FullArgSpec(
+        args=['self', 'a', 'b', 'c'],
+        varargs=None,
+        varkw=None,
+        defaults=(1, 'hello'),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
+
+    test_obj = Callable()
+    self.assertEqual(argspec, tf_inspect.getfullargspec(test_obj))
+
+  def testGetFullArgSpecOnInitClass(self):
+
+    class InitClass(object):
+
+      def __init__(self, a, b=1, c='hello'):
+        pass
+
+    argspec = tf_inspect.FullArgSpec(
+        args=['self', 'a', 'b', 'c'],
+        varargs=None,
+        varkw=None,
+        defaults=(1, 'hello'),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
+
+    self.assertEqual(argspec, tf_inspect.getfullargspec(InitClass))
+
+  def testGetFullArgSpecOnNewClass(self):
+
+    class NewClass(object):
+
+      def __new__(cls, a, b=1, c='hello'):
+        pass
+
+    argspec = tf_inspect.FullArgSpec(
+        args=['cls', 'a', 'b', 'c'],
+        varargs=None,
+        varkw=None,
+        defaults=(1, 'hello'),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
+
+    self.assertEqual(argspec, tf_inspect.getfullargspec(NewClass))
+
   def testGetDoc(self):
     self.assertEqual('Test Decorated Function With Defaults Docstring.',
                      tf_inspect.getdoc(test_decorated_function_with_defaults))
diff --git a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
index 3a48cf683c..2a40caf720 100644
--- a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
+++ b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
@@ -47,7 +47,7 @@ def _SanitizedArgSpec(obj):
     string, a string representation of the argspec.
   """
   output_string = ''
-  unsanitized_arg_spec = tf_inspect.getargspec(obj)
+  unsanitized_arg_spec = tf_inspect.getfullargspec(obj)
 
   for clean_attr in ('args', 'varargs', 'keywords'):
     output_string += '%s=%s, ' % (clean_attr,
-- 
GitLab


From 7c2341501a583ca625c976f118090e495cdcbe07 Mon Sep 17 00:00:00 2001
From: Jason Furmanek <furmanek@us.ibm.com>
Date: Wed, 26 Sep 2018 04:44:12 +0000
Subject: [PATCH 072/570] Find NCCL2 debians in Tensorflow configure

---
 configure.py                        | 136 +++++++++++++++++++---------
 third_party/nccl/nccl_configure.bzl |  14 ++-
 third_party/nccl/system.BUILD.tpl   |   4 +-
 3 files changed, 105 insertions(+), 49 deletions(-)

diff --git a/configure.py b/configure.py
index f0b9fada5e..9fd2dc2630 100644
--- a/configure.py
+++ b/configure.py
@@ -54,6 +54,12 @@ _TF_BAZELRC_FILENAME = '.tf_configure.bazelrc'
 _TF_BAZELRC = os.path.join(_TF_WORKSPACE_ROOT, _TF_BAZELRC_FILENAME)
 _TF_WORKSPACE = os.path.join(_TF_WORKSPACE_ROOT, 'WORKSPACE')
 
+NCCL_LIB_PATHS = [
+  "lib64/",
+  "lib/powerpc64le-linux-gnu/",
+  "lib/x86_64-linux-gnu/",
+  ""
+]
 
 class UserInputError(Exception):
   pass
@@ -1085,7 +1091,7 @@ def set_tf_tensorrt_install_path(environ_cp):
 
 
 def set_tf_nccl_install_path(environ_cp):
-  """Set NCCL_INSTALL_PATH and TF_NCCL_VERSION.
+  """Set NCCL_INSTALL_PATH, NCCL_HDR_PATH and TF_NCCL_VERSION.
 
   Args:
     environ_cp: copy of the os.environ.
@@ -1111,46 +1117,98 @@ def set_tf_nccl_install_path(environ_cp):
     if tf_nccl_version == '1':
       break  # No need to get install path, NCCL 1 is a GitHub repo.
 
-    # TODO(csigg): Look with ldconfig first if we can find the library in paths
+    # Look with ldconfig first if we can find the library in paths
     # like /usr/lib/x86_64-linux-gnu and the header file in the corresponding
     # include directory. This is where the NCCL .deb packages install them.
-    # Then ask the user if we should use that. Instead of a single
-    # NCCL_INSTALL_PATH, pass separate NCCL_LIB_PATH and NCCL_HDR_PATH to
-    # nccl_configure.bzl
-    default_nccl_path = environ_cp.get('CUDA_TOOLKIT_PATH')
-    ask_nccl_path = (r'Please specify the location where NCCL %s library is '
+
+    # First check to see if NCCL is in the ldconfig.
+    # If its found, use that location.
+    if is_linux():
+      ldconfig_bin = which('ldconfig') or '/sbin/ldconfig'
+      nccl2_path_from_ldconfig = run_shell([ldconfig_bin, '-p'])
+      nccl2_path_from_ldconfig = re.search('.*libnccl.so .* => (.*)',
+                                           nccl2_path_from_ldconfig)
+    if nccl2_path_from_ldconfig:
+      nccl2_path_from_ldconfig = nccl2_path_from_ldconfig.group(1)
+      if os.path.exists('%s.%s' % (nccl2_path_from_ldconfig, tf_nccl_version)):
+        nccl_install_path = os.path.dirname(nccl2_path_from_ldconfig)
+        print('NCCL libraries found in ' + nccl2_path_from_ldconfig)
+        
+        # Check if this is the main system lib location
+        if re.search('.*linux-gnu', nccl_install_path):
+          trunc_nccl_install_path = "/usr"
+          print("This looks like a system path.")
+        else:
+          trunc_nccl_install_path = nccl_install_path + "/.."
+  
+        # Look for header
+        nccl_hdr_path = trunc_nccl_install_path + "/include"
+        print("Assuming NCCL header path is " + nccl_hdr_path)
+        if os.path.exists(nccl_hdr_path + "/nccl.h"):
+          # Set NCCL_INSTALL_PATH
+          environ_cp['NCCL_INSTALL_PATH'] = nccl_install_path
+          write_action_env_to_bazelrc('NCCL_INSTALL_PATH', nccl_install_path)
+
+          # Set NCCL_HDR_PATH
+          environ_cp['NCCL_HDR_PATH'] = nccl_hdr_path
+          write_action_env_to_bazelrc('NCCL_HDR_PATH', nccl_hdr_path)
+          break
+        else:
+          print('The header for NCCL2 cannot be found. Please install the libnccl-dev package.')
+      else:
+          print('NCCL2 is listed by ldconfig but the library is not found. ' 
+                'Your ldconfig is out of date. Please run sudo ldconfig.')
+    else:
+      # NCCL is not found in ldconfig. Ask the user for the location.
+      default_nccl_path = environ_cp.get('CUDA_TOOLKIT_PATH')
+      ask_nccl_path = (r'Please specify the location where NCCL %s library is '
                      'installed. Refer to README.md for more details. [Default '
                      'is %s]:') % (tf_nccl_version, default_nccl_path)
-    nccl_install_path = get_from_env_or_user_or_default(
+      nccl_install_path = get_from_env_or_user_or_default(
         environ_cp, 'NCCL_INSTALL_PATH', ask_nccl_path, default_nccl_path)
 
-    # Result returned from "read" will be used unexpanded. That make "~"
-    # unusable. Going through one more level of expansion to handle that.
-    nccl_install_path = os.path.realpath(os.path.expanduser(nccl_install_path))
-    if is_windows() or is_cygwin():
-      nccl_install_path = cygpath(nccl_install_path)
-
-    if is_windows():
-      nccl_lib_path = 'lib/x64/nccl.lib'
-    elif is_linux():
-      nccl_lib_path = 'lib/libnccl.so.%s' % tf_nccl_version
-    elif is_macos():
-      nccl_lib_path = 'lib/libnccl.%s.dylib' % tf_nccl_version
-
-    nccl_lib_path = os.path.join(nccl_install_path, nccl_lib_path)
-    nccl_hdr_path = os.path.join(nccl_install_path, 'include/nccl.h')
-    if os.path.exists(nccl_lib_path) and os.path.exists(nccl_hdr_path):
-      # Set NCCL_INSTALL_PATH
-      environ_cp['NCCL_INSTALL_PATH'] = nccl_install_path
-      write_action_env_to_bazelrc('NCCL_INSTALL_PATH', nccl_install_path)
-      break
-
-    # Reset and Retry
-    print('Invalid path to NCCL %s toolkit, %s or %s not found. Please use the '
-          'O/S agnostic package of NCCL 2' % (tf_nccl_version, nccl_lib_path,
+      # Result returned from "read" will be used unexpanded. That make "~"
+      # unusable. Going through one more level of expansion to handle that.
+      nccl_install_path = os.path.realpath(os.path.expanduser(nccl_install_path))
+      if is_windows() or is_cygwin():
+        nccl_install_path = cygpath(nccl_install_path)
+
+      if is_windows():
+        nccl_lib_path = 'lib/x64/nccl.lib'
+      elif is_linux():
+        nccl_lib_filename = 'libnccl.so.%s' % tf_nccl_version
+        nccl_lpath = '%s/lib/%s' % (nccl_install_path, nccl_lib_filename)
+        if not os.path.exists(nccl_lpath):
+          for relative_path in NCCL_LIB_PATHS:
+            path = '%s/%s%s' % (nccl_install_path, relative_path, nccl_lib_filename)
+            if os.path.exists(path):
+              print("NCCL found at " + path)
+              nccl_lib_path = path
+              break
+        else:
+          nccl_lib_path = nccl_lpath
+      elif is_macos():
+        nccl_lib_path = 'lib/libnccl.%s.dylib' % tf_nccl_version
+
+      nccl_lib_path = os.path.join(nccl_install_path, nccl_lib_path)
+      nccl_hdr_path = os.path.join(os.path.dirname(nccl_lib_path), '../include/nccl.h')
+      print("Assuming NCCL header path is "+nccl_hdr_path)
+      if os.path.exists(nccl_lib_path) and os.path.exists(nccl_hdr_path):
+        # Set NCCL_INSTALL_PATH
+        environ_cp['NCCL_INSTALL_PATH'] = os.path.dirname(nccl_lib_path)
+        write_action_env_to_bazelrc('NCCL_INSTALL_PATH', os.path.dirname(nccl_lib_path))
+
+        # Set NCCL_HDR_PATH
+        environ_cp['NCCL_HDR_PATH'] = os.path.dirname(nccl_hdr_path)
+        write_action_env_to_bazelrc('NCCL_HDR_PATH', os.path.dirname(nccl_hdr_path))
+        break
+
+      # Reset and Retry
+      print('Invalid path to NCCL %s toolkit, %s or %s not found. Please use the '
+            'O/S agnostic package of NCCL 2' % (tf_nccl_version, nccl_lib_path,
                                               nccl_hdr_path))
 
-    environ_cp['TF_NCCL_VERSION'] = ''
+      environ_cp['TF_NCCL_VERSION'] = ''
   else:
     raise UserInputError('Invalid TF_NCCL setting was provided %d '
                          'times in a row. Assuming to be a scripting mistake.' %
@@ -1401,20 +1459,10 @@ def set_grpc_build_flags():
 
 def set_system_libs_flag(environ_cp):
   syslibs = environ_cp.get('TF_SYSTEM_LIBS', '')
+  syslibs = ','.join(sorted(syslibs.split(',')))
   if syslibs and syslibs != '':
-    if ',' in syslibs:
-      syslibs = ','.join(sorted(syslibs.split(',')))
-    else:
-      syslibs = ','.join(sorted(syslibs.split()))
     write_action_env_to_bazelrc('TF_SYSTEM_LIBS', syslibs)
 
-  if 'PREFIX' in environ_cp:
-    write_to_bazelrc('build --define=PREFIX=%s' % environ_cp['PREFIX'])
-  if 'LIBDIR' in environ_cp:
-    write_to_bazelrc('build --define=LIBDIR=%s' % environ_cp['LIBDIR'])
-  if 'INCLUDEDIR' in environ_cp:
-    write_to_bazelrc('build --define=INCLUDEDIR=%s' % environ_cp['INCLUDEDIR'])
-
 
 def set_windows_build_flags(environ_cp):
   """Set Windows specific build options."""
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
index ce9447096e..0713b36724 100644
--- a/third_party/nccl/nccl_configure.bzl
+++ b/third_party/nccl/nccl_configure.bzl
@@ -5,6 +5,7 @@
 
   * `TF_NCCL_VERSION`: The NCCL version.
   * `NCCL_INSTALL_PATH`: The installation path of the NCCL library.
+  * `NCCL_HDR_PATH`: The installation path of the NCCL header files.
 """
 
 load(
@@ -15,6 +16,7 @@ load(
 )
 
 _NCCL_INSTALL_PATH = "NCCL_INSTALL_PATH"
+_NCCL_HDR_PATH = "NCCL_HDR_PATH"
 _TF_NCCL_VERSION = "TF_NCCL_VERSION"
 _TF_NCCL_CONFIG_REPO = "TF_NCCL_CONFIG_REPO"
 
@@ -68,7 +70,7 @@ def _find_nccl_header(repository_ctx, nccl_install_path):
   return header_path
 
 
-def _check_nccl_version(repository_ctx, nccl_install_path, nccl_version):
+def _check_nccl_version(repository_ctx, nccl_install_path, nccl_hdr_path, nccl_version):
   """Checks whether the header file matches the specified version of NCCL.
 
   Args:
@@ -79,7 +81,9 @@ def _check_nccl_version(repository_ctx, nccl_install_path, nccl_version):
   Returns:
     A string containing the library version of NCCL.
   """
-  header_path = _find_nccl_header(repository_ctx, nccl_install_path)
+  header_path = repository_ctx.path("%s/nccl.h" % nccl_hdr_path)
+  if not header_path.exists:
+    header_path = _find_nccl_header(repository_ctx, nccl_install_path)
   header_dir = str(header_path.realpath.dirname)
   major_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
                                    _DEFINE_NCCL_MAJOR)
@@ -109,6 +113,7 @@ def _find_nccl_lib(repository_ctx, nccl_install_path, nccl_version):
   """
   lib_path = repository_ctx.path("%s/lib/libnccl.so.%s" % (nccl_install_path,
                                                            nccl_version))
+
   if not lib_path.exists:
     auto_configure_fail("Cannot find NCCL library %s" % str(lib_path))
   return lib_path
@@ -138,10 +143,12 @@ def _nccl_configure_impl(repository_ctx):
   else:
     # Create target for locally installed NCCL.
     nccl_install_path = repository_ctx.os.environ[_NCCL_INSTALL_PATH].strip()
-    _check_nccl_version(repository_ctx, nccl_install_path, nccl_version)
+    nccl_hdr_path = repository_ctx.os.environ[_NCCL_HDR_PATH].strip()
+    _check_nccl_version(repository_ctx, nccl_install_path, nccl_hdr_path, nccl_version)
     repository_ctx.template("BUILD", _NCCL_LOCAL_BUILD_TEMPLATE, {
         "%{version}": nccl_version,
         "%{install_path}": nccl_install_path,
+        "%{hdr_path}": nccl_hdr_path,
     })
 
 
@@ -149,6 +156,7 @@ nccl_configure = repository_rule(
     implementation=_nccl_configure_impl,
     environ=[
         _NCCL_INSTALL_PATH,
+        _NCCL_HDR_PATH,
         _TF_NCCL_VERSION,
     ],
 )
diff --git a/third_party/nccl/system.BUILD.tpl b/third_party/nccl/system.BUILD.tpl
index 7ca835dedf..a07f54955f 100644
--- a/third_party/nccl/system.BUILD.tpl
+++ b/third_party/nccl/system.BUILD.tpl
@@ -20,7 +20,7 @@ genrule(
     "libnccl.so.%{version}",
     "nccl.h",
   ],
-  cmd = """cp "%{install_path}/include/nccl.h" "$(@D)/nccl.h" &&
-           cp "%{install_path}/lib/libnccl.so.%{version}" "$(@D)/libnccl.so.%{version}" """,
+  cmd = """cp "%{hdr_path}/nccl.h" "$(@D)/nccl.h" &&
+           cp "%{install_path}/libnccl.so.%{version}" "$(@D)/libnccl.so.%{version}" """,
 )
 
-- 
GitLab


From 96eec07af06f4dfc75cee57b74ba4b5347619634 Mon Sep 17 00:00:00 2001
From: Cao Zongyan <zongyan.cao@alibaba-inc.com>
Date: Wed, 26 Sep 2018 13:04:46 +0800
Subject: [PATCH 073/570] Re-add compat module for leaky_relu implementation.

---
 tensorflow/python/ops/nn_ops.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 3f64f0af9a..78e000e458 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -22,6 +22,7 @@ import numbers
 
 import numpy as np
 
+from tensorflow.python.compat import compat
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_util
-- 
GitLab


From d59678448469ca134875e062f7f8d6d77942af4e Mon Sep 17 00:00:00 2001
From: Jason Furmanek <furmanek@us.ibm.com>
Date: Wed, 26 Sep 2018 05:19:10 +0000
Subject: [PATCH 074/570] fix unintential removal of set_system_libs_flag

---
 configure.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/configure.py b/configure.py
index 9fd2dc2630..3791ead3ed 100644
--- a/configure.py
+++ b/configure.py
@@ -1459,10 +1459,20 @@ def set_grpc_build_flags():
 
 def set_system_libs_flag(environ_cp):
   syslibs = environ_cp.get('TF_SYSTEM_LIBS', '')
-  syslibs = ','.join(sorted(syslibs.split(',')))
   if syslibs and syslibs != '':
+    if ',' in syslibs:
+      syslibs = ','.join(sorted(syslibs.split(',')))
+    else:
+      syslibs = ','.join(sorted(syslibs.split()))
     write_action_env_to_bazelrc('TF_SYSTEM_LIBS', syslibs)
 
+  if 'PREFIX' in environ_cp:
+    write_to_bazelrc('build --define=PREFIX=%s' % environ_cp['PREFIX'])
+  if 'LIBDIR' in environ_cp:
+    write_to_bazelrc('build --define=LIBDIR=%s' % environ_cp['LIBDIR'])
+  if 'INCLUDEDIR' in environ_cp:
+write_to_bazelrc('build --define=INCLUDEDIR=%s' % environ_cp['INCLUDEDIR'])
+
 
 def set_windows_build_flags(environ_cp):
   """Set Windows specific build options."""
-- 
GitLab


From 1668d28ca3558f3bc4fcf94752799712211f219e Mon Sep 17 00:00:00 2001
From: Jason Furmanek <furmanek@us.ibm.com>
Date: Wed, 26 Sep 2018 05:22:04 +0000
Subject: [PATCH 075/570] fix in last line of set_system_lib_flag

---
 configure.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.py b/configure.py
index 3791ead3ed..b1ab55b657 100644
--- a/configure.py
+++ b/configure.py
@@ -1471,7 +1471,7 @@ def set_system_libs_flag(environ_cp):
   if 'LIBDIR' in environ_cp:
     write_to_bazelrc('build --define=LIBDIR=%s' % environ_cp['LIBDIR'])
   if 'INCLUDEDIR' in environ_cp:
-write_to_bazelrc('build --define=INCLUDEDIR=%s' % environ_cp['INCLUDEDIR'])
+    write_to_bazelrc('build --define=INCLUDEDIR=%s' % environ_cp['INCLUDEDIR'])
 
 
 def set_windows_build_flags(environ_cp):
-- 
GitLab


From bd2524f16f3722cce2360ec5f7122c6b6f1ead49 Mon Sep 17 00:00:00 2001
From: Koan-Sin Tan <koansin.tan@gmail.com>
Date: Wed, 26 Sep 2018 13:23:14 +0800
Subject: [PATCH 076/570] fix unbalanced delimiter in benchmark_model doc

as reported in https://github.com/tensorflow/tensorflow/issues/22499,
there is unbalanced delimiter `"`
---
 tensorflow/tools/benchmark/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/benchmark/README.md b/tensorflow/tools/benchmark/README.md
index e64af2bfe1..dee1a20f3f 100644
--- a/tensorflow/tools/benchmark/README.md
+++ b/tensorflow/tools/benchmark/README.md
@@ -32,7 +32,7 @@ adb push bazel-bin/tensorflow/tools/benchmark/benchmark_model /data/local/tmp
 
 (4) Run the benchmark. For example:
 ```
-adb shell "/data/local/tmp/benchmark_model \
+adb shell /data/local/tmp/benchmark_model \
   --graph=/data/local/tmp/tensorflow_inception_graph.pb \
   --input_layer="input:0" \
   --input_layer_shape="1,224,224,3" \
-- 
GitLab


From 09bf8eb99cd76c506dcd2a0e8c8e893f7f3916b1 Mon Sep 17 00:00:00 2001
From: Jason Furmanek <furmanek@us.ibm.com>
Date: Wed, 26 Sep 2018 05:26:54 +0000
Subject: [PATCH 077/570] white space removal

---
 third_party/nccl/nccl_configure.bzl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
index 0713b36724..d78fe8f3aa 100644
--- a/third_party/nccl/nccl_configure.bzl
+++ b/third_party/nccl/nccl_configure.bzl
@@ -113,7 +113,6 @@ def _find_nccl_lib(repository_ctx, nccl_install_path, nccl_version):
   """
   lib_path = repository_ctx.path("%s/lib/libnccl.so.%s" % (nccl_install_path,
                                                            nccl_version))
-
   if not lib_path.exists:
     auto_configure_fail("Cannot find NCCL library %s" % str(lib_path))
   return lib_path
-- 
GitLab


From fa76895ad577246a8ab241e668765cad651558fb Mon Sep 17 00:00:00 2001
From: Isaac Burbank <bassmanburbank@gmail.com>
Date: Wed, 26 Sep 2018 11:20:44 -0600
Subject: [PATCH 078/570] Update python_object_to_proto_visitor.py

Changed test key for FullArgSpec to check for `varkw`, replacing the old ArgSpec key `keywords`
---
 tensorflow/tools/api/lib/python_object_to_proto_visitor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
index 2a40caf720..a8e69fda4f 100644
--- a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
+++ b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
@@ -49,7 +49,7 @@ def _SanitizedArgSpec(obj):
   output_string = ''
   unsanitized_arg_spec = tf_inspect.getfullargspec(obj)
 
-  for clean_attr in ('args', 'varargs', 'keywords'):
+  for clean_attr in ('args', 'varargs', 'varkw'):
     output_string += '%s=%s, ' % (clean_attr,
                                   getattr(unsanitized_arg_spec, clean_attr))
 
-- 
GitLab


From 5bbcdb8a58efd97b0f73927218d5896da67f5203 Mon Sep 17 00:00:00 2001
From: Isaac Burbank <bassmanburbank@gmail.com>
Date: Wed, 26 Sep 2018 11:34:38 -0600
Subject: [PATCH 079/570] Update tf_inspect_test.py

Remove subsection of added tests that were problematic.
---
 tensorflow/python/util/tf_inspect_test.py | 78 -----------------------
 1 file changed, 78 deletions(-)

diff --git a/tensorflow/python/util/tf_inspect_test.py b/tensorflow/python/util/tf_inspect_test.py
index 55f88f8fc6..ba9430c756 100644
--- a/tensorflow/python/util/tf_inspect_test.py
+++ b/tensorflow/python/util/tf_inspect_test.py
@@ -353,41 +353,6 @@ class TfInspectTest(test.TestCase):
 
     self.assertEqual(argspec, tf_inspect.getfullargspec(partial_function))
 
-  def testGetFullArgSpecOnPartialInvalidFullArgSpec(self):
-    """Tests getfullargspec.
-
-    Tests on partial function that doesn't have valid fullargspec.
-    """
-
-    def func(m, n, l, k=4):
-      return 2 * m + l + n * k
-
-    partial_func = functools.partial(func, n=7)
-
-    exception_message = (r"Some arguments \['l'\] do not have default value, "
-                         "but they are positioned after those with default "
-                         "values. This can not be expressed with ArgSpec.")
-    with self.assertRaisesRegexp(ValueError, exception_message):
-      tf_inspect.getfullargspec(partial_func)
-
-  def testGetFullArgSpecOnPartialValidFullArgSpec(self):
-    """Tests getfullargspec on partial function with valid fullargspec."""
-
-    def func(m, n, l, k=4):
-      return 2 * m + l + n * k
-
-    partial_func = functools.partial(func, n=7, l=2)
-    argspec = tf_inspect.FullArgSpec(
-        args=['m', 'n', 'l', 'k'],
-        varargs=None,
-        varkw=None,
-        defaults=(7, 2, 4),
-        kwonlyargs=[],
-        kwonlydefaults=None,
-        annotations={})
-
-    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
-
   def testGetFullArgSpecOnPartialNoArgumentsLeft(self):
     """Tests getfullargspec on partial function that prunes all arguments."""
 
@@ -401,35 +366,6 @@ class TfInspectTest(test.TestCase):
 
     self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
 
-  def testGetFullArgSpecOnPartialKeywordArgument(self):
-    """Tests getfullargspec on partial function that prunes some arguments."""
-
-    def func(m, n):
-      return 2 * m + n
-
-    partial_func = functools.partial(func, n=7)
-    argspec = tf_inspect.FullArgSpec(
-        args=['m', 'n'], varargs=None, varkw=None, defaults=(7,),
-        kwonlyargs=[], kwonlydefaults=None, annotations={})
-
-    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
-
-  def testGetFullArgSpecOnPartialKeywordArgumentWithDefaultValue(self):
-    """Tests getfullargspec.
-    
-    Tests on partial function that prunes argument by keyword.
-    """
-
-    def func(m=1, n=2):
-      return 2 * m + n
-
-    partial_func = functools.partial(func, n=7)
-    argspec = tf_inspect.FullArgSpec(
-        args=['m', 'n'], varargs=None, varkw=None, defaults=(1, 7),
-        kwonlyargs=[], kwonlydefaults=None, annotations={})
-
-    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
-
   def testGetFullArgSpecOnPartialWithVarargs(self):
     """Tests getfullargspec on partial function with variable arguments."""
 
@@ -459,20 +395,6 @@ class TfInspectTest(test.TestCase):
 
     self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
 
-  def testGetFullArgSpecOnPartialWithDecorator(self):
-    """Tests getfullargspec on decorated partial function."""
-
-    @test_decorator('decorator')
-    def func(m=1, n=2):
-      return 2 * m + n
-
-    partial_func = functools.partial(func, n=7)
-    argspec = tf_inspect.FullArgSpec(
-        args=['m', 'n'], varargs=None, varkw=None, defaults=(1, 7),
-        kwonlyargs=[], kwonlydefaults=None, annotations={})
-
-    self.assertEqual(argspec, tf_inspect.getfullargspec(partial_func))
-
   def testGetFullArgSpecOnCallableObject(self):
 
     class Callable(object):
-- 
GitLab


From 941b4e0f226de76f083401842e73bd9efd6db2d0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 10:23:59 -0700
Subject: [PATCH 080/570] Fix support for custom optimizers in explicit
 schedule

PiperOrigin-RevId: 214794973
---
 .../grappler/optimizers/meta_optimizer.cc     | 25 ++++++++++++++--
 .../core/grappler/optimizers/meta_optimizer.h |  4 +++
 .../optimizers/meta_optimizer_test.cc         | 30 +++++++++++++++++++
 .../core/protobuf/rewriter_config.proto       |  4 +--
 4 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index c59645e5f2..e18a5f21d2 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -172,11 +172,12 @@ Status MetaOptimizer::InitializeOptimizers(
     optimizers->push_back(MakeUnique<ScopedAllocatorOptimizer>(
         cfg_.scoped_allocator_optimization(), cfg_.scoped_allocator_opts()));
   }
-  return InitializeCustomGraphOptimizers(optimizers);
+  return InitializeCustomGraphOptimizers(std::set<string>(), optimizers);
 }
 
 Status MetaOptimizer::InitializeOptimizersByName(
     std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
+  std::set<string> initialized_custom_optimizers;
   for (const string& optimizer_name : cfg_.optimizers()) {
     auto optimizer = MakeNewOptimizer(optimizer_name);
     if (optimizer) {
@@ -190,18 +191,26 @@ Status MetaOptimizer::InitializeOptimizersByName(
 
     if (custom_optimizer) {
       VLOG(2) << "Registered custom graph optimizer: " << optimizer_name;
-      TF_RETURN_IF_ERROR(custom_optimizer->Init());
+      TF_RETURN_IF_ERROR(custom_optimizer->Init(
+          GetCustomGraphOptimizerConfig(optimizer_name)));
       optimizers->push_back(std::move(custom_optimizer));
+      initialized_custom_optimizers.insert(optimizer_name);
     } else {
       VLOG(2) << "Can't register an optimizer by name: " << optimizer_name;
     }
   }
-  return InitializeCustomGraphOptimizers(optimizers);
+  return InitializeCustomGraphOptimizers(initialized_custom_optimizers,
+                                         optimizers);
 }
 
 Status MetaOptimizer::InitializeCustomGraphOptimizers(
+    const std::set<string>& pre_initialized_optimizers,
     std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
   for (const auto& optimizer_config : cfg_.custom_optimizers()) {
+    if (pre_initialized_optimizers.find(optimizer_config.name()) !=
+        pre_initialized_optimizers.end()) {
+      continue;
+    }
     // Initialize the ExperimentalImplementationSelector here instead of
     // CustomizeOptimizer registry, due the static link issue in TensorRT for
     // double registry.
@@ -237,6 +246,16 @@ Status MetaOptimizer::InitializeCustomGraphOptimizers(
   return Status::OK();
 }
 
+const RewriterConfig::CustomGraphOptimizer*
+MetaOptimizer::GetCustomGraphOptimizerConfig(const string& name) const {
+  for (const auto& config : cfg_.custom_optimizers()) {
+    if (config.name() == name) {
+      return &config;
+    }
+  }
+  return nullptr;
+}
+
 Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
                                     GraphDef* optimized_graph) {
   int min_graph_nodes = cfg_.min_graph_nodes() == 0 ? kDefaultMinGraphNodes
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index 831c5e37c0..99a0a33ffa 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -54,7 +54,11 @@ class MetaOptimizer : public GraphOptimizer {
       std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
   // Initialize active optimizers from RewriterConfig.custom_optimizers.
   Status InitializeCustomGraphOptimizers(
+      const std::set<string>& pre_initialized_optimizers,
       std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
+  // Returns the config for a custom graph optimizer. Null if none was found.
+  const RewriterConfig::CustomGraphOptimizer* GetCustomGraphOptimizerConfig(
+      const string& name) const;
 
   // Run optimization pass over a single GrapplerItem. Meta optimizer might run
   // multiple such passes: 1) for the main graph 2) for the function library
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index e74e0f7501..c477c4d4b1 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -71,6 +71,17 @@ class TestGraphOptimizer : public TestOptimizer {
 
 REGISTER_GRAPH_OPTIMIZER(TestGraphOptimizer);
 
+class TestOptimizerWithParams : public TestOptimizer {
+ public:
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+    CHECK(config != nullptr);
+    return Status::OK();
+  }
+};
+
+REGISTER_GRAPH_OPTIMIZER(TestOptimizerWithParams);
+
 class MetaOptimizerTest : public GrapplerTest {};
 
 TEST_F(MetaOptimizerTest, RunsCustomOptimizer) {
@@ -90,6 +101,25 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizer) {
   EXPECT_TRUE(TestOptimizer::IsOptimized());
 }
 
+TEST_F(MetaOptimizerTest, RunsCustomOptimizerWithParams) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  TestOptimizer::SetOptimized(false);
+  RewriterConfig rewriter_config;
+  rewriter_config.add_optimizers("TestOptimizerWithParams");
+  auto* custom_config = rewriter_config.add_custom_optimizers();
+  custom_config->set_name("TestOptimizerWithParams");
+  (*custom_config->mutable_parameter_map())["foo"] = AttrValue();
+
+  MetaOptimizer optimizer(nullptr, rewriter_config);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  EXPECT_TRUE(TestOptimizer::IsOptimized());
+}
+
 TEST_F(MetaOptimizerTest, RunsCustomOptimizerAndCustomGraphOptimizer) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
   GrapplerItem item;
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index bb8f88336d..482178a540 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -143,8 +143,8 @@ message RewriterConfig {
   // not configurable (in contrast to memory optimization passes through the
   // meta-optimizer) and act only on manual op annotations.
   //
-  // Custom registered optimizers will be run after the base optimizers, in
-  // the order that they are specified.
+  // Custom optimizers (see custom_optimizers) that are not part of this
+  // schedule will be run after - in the order that they were specified.
   repeated string optimizers = 100;
 
   // Message to describe custom graph optimizer and its parameters
-- 
GitLab


From 3002b10e29363854c6fc20d788bc65233fd5116f Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Thu, 27 Sep 2018 10:25:58 -0700
Subject: [PATCH 081/570] Update L2HMC graph benchmark to be more similar to
 eager benchmark.

PiperOrigin-RevId: 214795331
---
 .../eager/python/examples/l2hmc/l2hmc_test.py | 162 ++++++++++--------
 1 file changed, 91 insertions(+), 71 deletions(-)

diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py
index c38a1597b8..1c925e455b 100644
--- a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py
@@ -45,6 +45,17 @@ def step(dynamics, optimizer, samples):
   return loss, samples
 
 
+# To be defunnable, the function cannot return an Operation, so the above
+# function is used for defun or eager, and this function is used in graph to be
+# able to run the gradient updates.
+def graph_step(dynamics, optimizer, samples):
+  loss, grads, samples, _ = l2hmc.loss_and_grads(
+      dynamics, samples, loss_fn=l2hmc.compute_loss)
+  train_op = optimizer.apply_gradients(zip(grads, dynamics.variables))
+
+  return train_op, loss, samples
+
+
 def warmup(dynamics,
            optimizer,
            n_iters=1,
@@ -134,51 +145,48 @@ class L2hmcBenchmark(tf.test.Benchmark):
     """Benchmark Graph performance."""
 
     hparams = get_default_hparams()
-    tf.reset_default_graph()
-    with tf.Graph().as_default():
-      energy_fn, _, _ = l2hmc.get_scg_energy_fn()
-      dynamics = l2hmc.Dynamics(
-          x_dim=hparams.x_dim,
-          minus_loglikelihood_fn=energy_fn,
-          n_steps=hparams.n_steps,
-          eps=hparams.eps)
-      x = tf.placeholder(tf.float32, shape=[None, hparams.x_dim])
-      loss, x_out, _ = l2hmc.compute_loss(dynamics, x)
-
-      global_step = tf.Variable(0., name="global_step", trainable=False)
-      learning_rate = tf.train.exponential_decay(
-          hparams.learning_rate, global_step, 1000, 0.96, staircase=True)
-      optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
-      train_op = optimizer.minimize(loss, global_step=global_step)
-
-      # Single thread; fairer comparison against eager
-      session_conf = tf.ConfigProto(
-          intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
-
-      with tf.Session(config=session_conf) as sess:
-        sess.run(tf.global_variables_initializer())
-
-        # Warmup to reduce initialization effect when timing
-        samples = npr.normal(size=[hparams.n_samples, hparams.x_dim])
-        for _ in range(hparams.n_warmup_iters):
-          _, _, _, _ = sess.run(
-              [x_out, loss, train_op, learning_rate], feed_dict={x: samples})
-
-        # Training
-        start_time = time.time()
-        for i in range(hparams.n_iters):
-          samples, loss_np, _, _ = sess.run(
-              [x_out, loss, train_op, learning_rate], feed_dict={x: samples})
-          print("Iteration %d: loss %.4f" % (i, loss_np))
-        wall_time = time.time() - start_time
-        examples_per_sec = hparams.n_samples / wall_time
-
-        self.report_benchmark(
-            name="graph_train_%s" % ("gpu"
-                                     if tf.test.is_gpu_available() else "cpu"),
-            iters=hparams.n_iters,
-            extras={"examples_per_sec": examples_per_sec},
-            wall_time=wall_time)
+    tf.enable_resource_variables()
+    for sample_size in [10, 25, 50, 100, 200]:
+      hparams.n_samples = sample_size
+      tf.reset_default_graph()
+      with tf.Graph().as_default():
+        energy_fn, _, _ = l2hmc.get_scg_energy_fn()
+        x = tf.random_normal([hparams.n_samples, hparams.x_dim],
+                             dtype=tf.float32)
+        dynamics = l2hmc.Dynamics(
+            x_dim=hparams.x_dim,
+            minus_loglikelihood_fn=energy_fn,
+            n_steps=hparams.n_steps,
+            eps=hparams.eps)
+        loss, _, _ = l2hmc.compute_loss(dynamics, x)
+
+        optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)
+        train_op, loss, _ = graph_step(dynamics, optimizer, x)
+
+        # Single thread; fairer comparison against eager
+        session_conf = tf.ConfigProto(inter_op_parallelism_threads=1)
+
+        with tf.Session(config=session_conf) as sess:
+          sess.run(tf.global_variables_initializer())
+
+          # Warmup to reduce initialization effect when timing
+          for _ in range(hparams.n_warmup_iters):
+            _, _ = sess.run([train_op, loss])
+
+          # Training
+          start_time = time.time()
+          for i in range(hparams.n_iters):
+            _, loss_np = sess.run([train_op, loss])
+            print("Iteration %d: loss %.4f" % (i, loss_np))
+          wall_time = (time.time() - start_time) / hparams.n_iters
+          examples_per_sec = hparams.n_samples / wall_time
+
+          self.report_benchmark(
+              name="graph_train_%s_%d" %
+              ("gpu" if tf.test.is_gpu_available() else "cpu", sample_size),
+              iters=hparams.n_iters,
+              extras={"examples_per_sec": examples_per_sec},
+              wall_time=wall_time)
 
   def benchmark_eager(self):
     self._benchmark_eager()
@@ -190,32 +198,44 @@ class L2hmcBenchmark(tf.test.Benchmark):
     """Benchmark Eager performance."""
 
     hparams = get_default_hparams()
-    energy_fn, _, _ = l2hmc.get_scg_energy_fn()
-    dynamics = l2hmc.Dynamics(
-        x_dim=hparams.x_dim,
-        minus_loglikelihood_fn=energy_fn,
-        n_steps=hparams.n_steps,
-        eps=hparams.eps)
-    optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)
-    step_fn = tfe.defun(step) if defun else step
-
-    # Warmup to reduce initialization effect when timing
-    warmup(dynamics, optimizer, n_iters=hparams.n_warmup_iters, step_fn=step_fn)
-
-    # Training
-    samples = tf.random_normal(
-        shape=[hparams.n_samples, hparams.x_dim], dtype=tf.float32)
-    start_time = time.time()
-    fit(dynamics, samples, optimizer, step_fn=step_fn, n_iters=hparams.n_iters)
-    wall_time = time.time() - start_time
-    examples_per_sec = hparams.n_samples / wall_time
-
-    self.report_benchmark(
-        name="eager_train_%s%s" % ("gpu" if tf.test.is_gpu_available() else
-                                   "cpu", "_defun" if defun else ""),
-        iters=hparams.n_iters,
-        extras={"examples_per_sec": examples_per_sec},
-        wall_time=wall_time)
+    for sample_size in [10, 25, 50, 100, 200]:
+      hparams.n_samples = sample_size
+      energy_fn, _, _ = l2hmc.get_scg_energy_fn()
+      dynamics = l2hmc.Dynamics(
+          x_dim=hparams.x_dim,
+          minus_loglikelihood_fn=energy_fn,
+          n_steps=hparams.n_steps,
+          eps=hparams.eps)
+      optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)
+      step_fn = tfe.defun(step) if defun else step
+
+      # Warmup to reduce initialization effect when timing
+      warmup(
+          dynamics,
+          optimizer,
+          n_iters=hparams.n_warmup_iters,
+          n_samples=hparams.n_samples,
+          step_fn=step_fn)
+
+      # Training
+      samples = tf.random_normal(
+          shape=[hparams.n_samples, hparams.x_dim], dtype=tf.float32)
+      start_time = time.time()
+      fit(dynamics,
+          samples,
+          optimizer,
+          step_fn=step_fn,
+          n_iters=hparams.n_iters)
+      wall_time = (time.time() - start_time) / hparams.n_iters
+      examples_per_sec = hparams.n_samples / wall_time
+
+      self.report_benchmark(
+          name="eager_train_%s%s_%d" %
+          ("gpu" if tf.test.is_gpu_available() else "cpu",
+           "_defun" if defun else "", sample_size),
+          iters=hparams.n_iters,
+          extras={"examples_per_sec": examples_per_sec},
+          wall_time=wall_time)
 
     del dynamics
 
-- 
GitLab


From 334244be6864dd1dbec9bc8bb4996cc286a8e3e3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 10:31:36 -0700
Subject: [PATCH 082/570] Add tf.strings.unicode_script, which detects the
 script of a unicode codepoint based on standard ranges.

PiperOrigin-RevId: 214796357
---
 .../base_api/api_def_UnicodeScript.pbtxt      | 28 ++++++
 .../python_api/api_def_UnicodeScript.pbtxt    |  6 ++
 tensorflow/core/kernels/BUILD                 | 12 +++
 tensorflow/core/kernels/unicode_script_op.cc  | 53 +++++++++++
 tensorflow/core/ops/string_ops.cc             |  5 ++
 tensorflow/python/kernel_tests/BUILD          | 12 +++
 .../kernel_tests/unicode_script_op_test.py    | 57 ++++++++++++
 .../api/golden/v1/tensorflow.strings.pbtxt    |  4 +
 .../api/golden/v2/tensorflow.strings.pbtxt    |  4 +
 tensorflow/tools/lib_package/BUILD            |  2 +
 tensorflow/tools/pip_package/BUILD            |  1 +
 tensorflow/workspace.bzl                      |  2 +
 third_party/icu/BUILD                         |  1 +
 third_party/icu/BUILD.bazel                   | 88 +++++++++++++++++++
 third_party/icu/workspace.bzl                 | 15 ++++
 15 files changed, 290 insertions(+)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_UnicodeScript.pbtxt
 create mode 100644 tensorflow/core/kernels/unicode_script_op.cc
 create mode 100644 tensorflow/python/kernel_tests/unicode_script_op_test.py
 create mode 100644 third_party/icu/BUILD
 create mode 100644 third_party/icu/BUILD.bazel
 create mode 100644 third_party/icu/workspace.bzl

diff --git a/tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt
new file mode 100644
index 0000000000..7898fe8d6b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "UnicodeScript"
+  endpoint {
+    name: "UnicodeScript"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+A Tensor of int32 Unicode code points.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor of int32 script codes corresponding to each input code point.
+END
+  }
+  summary: <<END
+Determine the script codes of a given tensor of Unicode integer code points.
+END
+  description: <<END
+This operation converts Unicode code points to script codes corresponding to
+each code point. Script codes correspond to International Components for
+Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html.
+Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will
+match input shape.
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_UnicodeScript.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnicodeScript.pbtxt
new file mode 100644
index 0000000000..a884a46143
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UnicodeScript.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnicodeScript"
+  endpoint {
+    name: "strings.unicode_script"
+  }
+}
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 0534b1829d..0b8e9ec527 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4431,6 +4431,7 @@ cc_library(
         ":string_strip_op",
         ":string_to_hash_bucket_op",
         ":substr_op",
+        ":unicode_script_op",
     ],
 )
 
@@ -5471,6 +5472,7 @@ filegroup(
             "batch_kernels.*",
             "regex_full_match_op.cc",
             "regex_replace_op.cc",
+            "unicode_script_op.cc",
             # Ops that are inherently incompatible with Android (e.g. tied to x86 platform).
             "mkl_*",
             "xsmm_*",
@@ -6565,6 +6567,16 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "unicode_script_op",
+    srcs = ["unicode_script_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:string_ops_op_lib",
+        "@icu//:common",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
 
diff --git a/tensorflow/core/kernels/unicode_script_op.cc b/tensorflow/core/kernels/unicode_script_op.cc
new file mode 100644
index 0000000000..085e397eba
--- /dev/null
+++ b/tensorflow/core/kernels/unicode_script_op.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "unicode/errorcode.h"  // TF:icu
+#include "unicode/uscript.h"  // TF:icu
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+class UnicodeScriptOp : public OpKernel {
+ public:
+  explicit UnicodeScriptOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(context, context->input("input", &input_tensor));
+    const auto& input_flat = input_tensor->flat<int32>();
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("output", input_tensor->shape(),
+                                            &output_tensor));
+    auto output_flat = output_tensor->flat<int32>();
+
+    icu::ErrorCode status;
+    for (int i = 0; i < input_flat.size(); i++) {
+      UScriptCode script_code = uscript_getScript(input_flat(i), status);
+      if (status.isSuccess()) {
+        output_flat(i) = script_code;
+      } else {
+        output_flat(i) = -1;
+        status.reset();
+      }
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnicodeScript").Device(DEVICE_CPU),
+                        UnicodeScriptOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index da1d2a6432..b4fbde54d9 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -244,4 +244,9 @@ REGISTER_OP("Substr")
       return shape_inference::BroadcastBinaryOpShapeFn(c);
     });
 
+REGISTER_OP("UnicodeScript")
+    .Input("input: int32")
+    .Output("output: int32")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 5183e4d30c..c2e36e5e19 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1097,6 +1097,18 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "unicode_script_op_test",
+    size = "small",
+    srcs = ["unicode_script_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
 cuda_py_test(
     name = "topk_op_test",
     size = "small",
diff --git a/tensorflow/python/kernel_tests/unicode_script_op_test.py b/tensorflow/python/kernel_tests/unicode_script_op_test.py
new file mode 100644
index 0000000000..927e5459ed
--- /dev/null
+++ b/tensorflow/python/kernel_tests/unicode_script_op_test.py
@@ -0,0 +1,57 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+"""Functional tests for UnicodeScript op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class UnicodeScriptOpTest(test.TestCase):
+
+  def testValidScripts(self):
+    inputs = [
+        ord("a"),
+        0x0411,  # CYRILLIC CAPITAL LETTER BE
+        0x82b8,  # CJK UNIFIED IDEOGRAPH-82B8
+        ord(",")
+    ]
+    with self.cached_session():
+      input_vector = constant_op.constant(inputs, dtypes.int32)
+      outputs = string_ops.unicode_script(input_vector).eval()
+      self.assertAllEqual(
+          outputs,
+          [
+              25,  # USCRIPT_LATIN (LATN)
+              8,  # USCRIPT_CYRILLIC (CYRL)
+              17,  # USCRIPT_HAN (HANI)
+              0  # USCRIPT_COMMON (ZYYY)
+          ])
+
+  def testInvalidScript(self):
+    inputs = [-100, 0xffffff]
+    with self.cached_session():
+      input_vector = constant_op.constant(inputs, dtypes.int32)
+      outputs = string_ops.unicode_script(input_vector).eval()
+      self.assertAllEqual(outputs, [-1, -1])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index c52581dec1..312e94b41d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -48,4 +48,8 @@ tf_module {
     name: "to_number"
     argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "unicode_script"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index c52581dec1..312e94b41d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -48,4 +48,8 @@ tf_module {
     name: "to_number"
     argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "unicode_script"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index b450bc42c5..095ac1f4cc 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -125,6 +125,7 @@ genrule(
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
+        "@icu//:icu4c/LICENSE",
         "@jpeg//:LICENSE.md",
         "@llvm//:LICENSE.TXT",
         "@lmdb//:LICENSE",
@@ -192,6 +193,7 @@ genrule(
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
+        "@icu//:icu4j/main/shared/licenses/LICENSE",
         "@jpeg//:LICENSE.md",
         "@llvm//:LICENSE.TXT",
         "@lmdb//:LICENSE",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 9d816f0672..cce60ccea0 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -153,6 +153,7 @@ filegroup(
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
+        "@icu//:icu4c/LICENSE",
         "@jpeg//:LICENSE.md",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 4bf2ff3fb5..e5a0a0b2b7 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -21,9 +21,11 @@ load(
     "def_file_filter_configure",
 )
 load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
+load("//third_party/icu:workspace.bzl", icu = "repo")
 
 def initialize_third_party():
     flatbuffers()
+    icu()
 
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
diff --git a/third_party/icu/BUILD b/third_party/icu/BUILD
new file mode 100644
index 0000000000..82bab3ffd9
--- /dev/null
+++ b/third_party/icu/BUILD
@@ -0,0 +1 @@
+# This empty BUILD file is required to make Bazel treat this directory as a package.
diff --git a/third_party/icu/BUILD.bazel b/third_party/icu/BUILD.bazel
new file mode 100644
index 0000000000..36d6b9006b
--- /dev/null
+++ b/third_party/icu/BUILD.bazel
@@ -0,0 +1,88 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files([
+    "icu4c/LICENSE",
+    "icu4j/main/shared/licenses/LICENSE",
+])
+
+cc_library(
+    name = "headers",
+    hdrs = glob(["icu4c/source/common/unicode/*.h"]),
+    includes = [
+        "icu4c/source/common",
+    ],
+    deps = [
+    ],
+)
+
+cc_library(
+    name = "common",
+    hdrs = glob(["icu4c/source/common/unicode/*.h"]),
+    includes = [
+        "icu4c/source/common",
+    ],
+    deps = [
+        ":icuuc",
+    ],
+)
+
+cc_library(
+    name = "icuuc",
+    srcs = glob(
+        [
+            "icu4c/source/common/*.c",
+            "icu4c/source/common/*.cpp",
+            "icu4c/source/stubdata/*.cpp",
+        ],
+    ),
+    hdrs = glob([
+        "icu4c/source/common/*.h",
+    ]),
+    copts = [
+        "-DU_COMMON_IMPLEMENTATION",
+        "-DU_HAVE_STD_ATOMICS",
+    ] + select({
+        ":android": [
+            "-fdata-sections",
+            "-DGOOGLE_VENDOR_SRC_BRANCH",
+            "-DU_HAVE_NL_LANGINFO_CODESET=0",
+            "-Wno-deprecated-declarations",
+        ],
+        ":apple": [
+            "-DGOOGLE_VENDOR_SRC_BRANCH",
+            "-Wno-shorten-64-to-32",
+            "-Wno-unused-variable",
+        ],
+        ":windows": [
+            "/utf-8",
+            "/DLOCALE_ALLOW_NEUTRAL_NAMES=0",
+        ],
+        "//conditions:default": [],
+    }),
+    tags = ["requires-rtti"],
+    visibility = [
+        "//visibility:private",
+    ],
+    deps = [
+        ":headers",
+    ],
+)
+
+config_setting(
+    name = "android",
+    values = {"crosstool_top": "//external:android/crosstool"},
+)
+
+config_setting(
+    name = "apple",
+    values = {"cpu": "darwin"},
+)
+
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+)
diff --git a/third_party/icu/workspace.bzl b/third_party/icu/workspace.bzl
new file mode 100644
index 0000000000..bfebf4219b
--- /dev/null
+++ b/third_party/icu/workspace.bzl
@@ -0,0 +1,15 @@
+"""Loads a lightweight subset of the ICU library for Unicode processing."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "icu",
+        strip_prefix = "icu-release-62-1",
+        sha256 = "e15ffd84606323cbad5515bf9ecdf8061cc3bf80fb883b9e6aa162e485aa9761",
+        urls = [
+            "https://mirror.bazel.build/github.com/unicode-org/icu/archive/release-62-1.tar.gz",
+            "https://github.com/unicode-org/icu/archive/release-62-1.tar.gz",
+        ],
+        build_file = "//third_party/icu:BUILD.bazel",
+    )
-- 
GitLab


From 6d41787c32483b28f8c93973f28d4d078ea0b37e Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Thu, 27 Sep 2018 10:53:36 -0700
Subject: [PATCH 083/570] Add opaque field to custom call. The intent of this
 field is to enable more information to be encoded in the custom call and
 passed through to the backend.

PiperOrigin-RevId: 214800539
---
 tensorflow/compiler/xla/client/xla_builder.cc |  8 ++++---
 tensorflow/compiler/xla/client/xla_builder.h  | 24 +++++++++++--------
 tensorflow/compiler/xla/service/hlo.proto     |  8 +++++--
 .../compiler/xla/service/hlo_instruction.cc   |  9 +++----
 .../compiler/xla/service/hlo_instruction.h    |  5 ++--
 .../compiler/xla/service/hlo_instructions.cc  | 14 ++++++++---
 .../compiler/xla/service/hlo_instructions.h   |  8 +++++--
 tensorflow/compiler/xla/service/hlo_parser.cc |  7 ++++--
 .../compiler/xla/service/hlo_parser_test.cc   | 12 ++++++++++
 9 files changed, 67 insertions(+), 28 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index 95ff6432a5..5277de6a85 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -1278,7 +1278,7 @@ XlaOp XlaBuilder::AfterAll(absl::Span<const XlaOp> tokens) {
 
 XlaOp XlaBuilder::CustomCall(const string& call_target_name,
                              absl::Span<const XlaOp> operands,
-                             const Shape& shape) {
+                             const Shape& shape, const string& opaque) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     if (absl::StartsWith(call_target_name, "$")) {
@@ -1289,6 +1289,7 @@ XlaOp XlaBuilder::CustomCall(const string& call_target_name,
     }
     *instr.mutable_shape() = shape;
     instr.set_custom_call_target(call_target_name);
+    instr.set_custom_call_opaque(opaque);
     return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands);
   });
 }
@@ -2681,8 +2682,9 @@ XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
 }
 
 XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
-                 absl::Span<const XlaOp> operands, const Shape& shape) {
-  return builder->CustomCall(call_target_name, operands, shape);
+                 absl::Span<const XlaOp> operands, const Shape& shape,
+                 const string& opaque) {
+  return builder->CustomCall(call_target_name, operands, shape, opaque);
 }
 
 XlaOp Complex(const XlaOp& real, const XlaOp& imag,
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index d0c59fa6f2..1da6ddd318 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -577,11 +577,9 @@ class XlaBuilder {
              absl::Span<const XlaOp> operands);
 
   // Enqueues a custom call instruction onto the computation.
-  // During code generation, a call instruction is emitted which targets a
-  // symbol with the name |call_target_name|.  The |operands| are passed to the
-  // call instruction.  |shape| is the resultant shape.
   XlaOp CustomCall(const string& call_target_name,
-                   absl::Span<const XlaOp> operands, const Shape& shape);
+                   absl::Span<const XlaOp> operands, const Shape& shape,
+                   const string& opaque);
 
   // The following methods enqueue element-wise binary arithmetic operations
   // onto the computation. The shapes of the operands have to match unless one
@@ -1195,7 +1193,8 @@ class XlaBuilder {
   friend XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
                     absl::Span<const XlaOp> operands);
   friend XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
-                          absl::Span<const XlaOp> operands, const Shape& shape);
+                          absl::Span<const XlaOp> operands, const Shape& shape,
+                          const string& opaque);
   friend XlaOp Complex(const XlaOp& real, const XlaOp& imag,
                        absl::Span<const int64> broadcast_dimensions);
   friend XlaOp Conj(const XlaOp& operand);
@@ -1717,12 +1716,17 @@ XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
 XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
            absl::Span<const XlaOp> operands);
 
-// Enqueues a custom call instruction onto the computation.
-// During code generation, a call instruction is emitted which targets a
-// symbol with the name |call_target_name|.  The |operands| are passed to the
-// call instruction.  |shape| is the resultant shape.
+// Enqueues a custom call instruction onto the computation. A custom call
+// invokes code external to XLA. The |operands| are passed to the external code,
+// and the external code is expected to produce a result of the given
+// |shape|. The exact mechanism is backend-specific. For example, in the CPU
+// backend, a call instruction is emitted which targets a symbol with the name
+// |call_target_name|.  |call_target_name| and |opaque| can arbitrary strings,
+// but |call_target_name| should be short as it may be used in labels. |opaque|
+// can encode arbitrarily large amounts of information.
 XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
-                 absl::Span<const XlaOp> operands, const Shape& shape);
+                 absl::Span<const XlaOp> operands, const Shape& shape,
+                 const string& opaque = "");
 
 // The following methods enqueue element-wise binary arithmetic operations
 // onto the computation. The shapes of the operands have to match unless one
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index b19ec12638..caaca16f71 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -34,7 +34,7 @@ import "tensorflow/compiler/xla/xla_data.proto";
 option cc_enable_arenas = true;
 
 // Serialization of HloInstruction.
-// Next ID: 53
+// Next ID: 54
 message HloInstructionProto {
   reserved 10;
   reserved "parameter_name";
@@ -124,9 +124,13 @@ message HloInstructionProto {
   // The string representation of the infeed configuration.
   bytes infeed_config = 27;
 
-  // Name of a global symbol to call, only present for kCustomCall.
+  // Name of a external target (eg, global symbol) to call, only present for
+  // kCustomCall.
   string custom_call_target = 28;
 
+  // Opaque string, only present for kCustomCall.
+  string custom_call_opaque = 53;
+
   // Shape of outfeed request.
   xla.Shape outfeed_shape = 29;
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index f7ec854d80..23787dbc8a 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -379,7 +379,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     case HloOpcode::kCustomCall:
       instruction = CreateCustomCall(proto.shape(), all_operands(),
-                                     proto.custom_call_target());
+                                     proto.custom_call_target(),
+                                     proto.custom_call_opaque());
       if (proto.has_window()) {
         static_cast<HloCustomCallInstruction*>(instruction.get())
             ->set_window(proto.window());
@@ -1108,9 +1109,9 @@ bool HloInstruction::HasSideEffect() const {
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateCustomCall(
     const Shape& shape, absl::Span<HloInstruction* const> operands,
-    absl::string_view custom_call_target) {
-  return absl::make_unique<HloCustomCallInstruction>(shape, operands,
-                                                     custom_call_target);
+    absl::string_view custom_call_target, absl::string_view opaque) {
+  return absl::make_unique<HloCustomCallInstruction>(
+      shape, operands, custom_call_target, opaque);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateTuple(
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index d615df0831..009bd3bab3 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -718,10 +718,11 @@ class HloInstruction {
       HloComputation* computation);
 
   // Creates a custom call instruction that applies the given custom call target
-  // to the given operands. "shape" is the resultant shape.
+  // to the given operands. "opaque" can be an arbitrary string with a
+  // backend-specific interpretation. "shape" is the resultant shape.
   static std::unique_ptr<HloInstruction> CreateCustomCall(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
-      absl::string_view custom_call_target);
+      absl::string_view custom_call_target, absl::string_view opaque = "");
 
   // Creates a tuple instruction with the given elements. This is a convenience
   // wrapper around CreateVariadic.
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index e92882c22a..cd71bc3323 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -1830,9 +1830,10 @@ HloSelectAndScatterInstruction::CloneWithNewOperandsImpl(
 
 HloCustomCallInstruction::HloCustomCallInstruction(
     const Shape& shape, absl::Span<HloInstruction* const> operands,
-    absl::string_view custom_call_target)
+    absl::string_view custom_call_target, absl::string_view opaque)
     : HloInstruction(HloOpcode::kCustomCall, shape),
       custom_call_target_(custom_call_target.begin(), custom_call_target.end()),
+      opaque_(opaque.begin(), opaque.end()),
       feature_group_count_(1) {
   for (auto operand : operands) {
     AppendOperand(operand);
@@ -1849,6 +1850,7 @@ HloInstructionProto HloCustomCallInstruction::ToProto() const {
         *convolution_dimension_numbers_;
   }
   proto.set_custom_call_target(custom_call_target_);
+  proto.set_custom_call_opaque(opaque_);
   proto.set_feature_group_count(feature_group_count_);
   return proto;
 }
@@ -1872,6 +1874,11 @@ std::vector<string> HloCustomCallInstruction::ExtraAttributesToStringImpl(
   // an HloComputation.
   extra.push_back(
       StrCat("custom_call_target=\"", CEscape(custom_call_target_), "\""));
+  // If the opaque string becomes enormous we may want to reconsider printing
+  // this inline and consider other options.
+  if (!opaque_.empty()) {
+    extra.push_back(StrCat("opaque=\"", CEscape(opaque_), "\""));
+  }
   return extra;
 }
 
@@ -1897,7 +1904,8 @@ bool HloCustomCallInstruction::IdenticalSlowPath(
   if (feature_group_count_ != casted_other.feature_group_count_) {
     return false;
   }
-  return custom_call_target_ == casted_other.custom_call_target_;
+  return custom_call_target_ == casted_other.custom_call_target_ &&
+         opaque_ == casted_other.opaque_;
 }
 
 std::unique_ptr<HloInstruction>
@@ -1905,7 +1913,7 @@ HloCustomCallInstruction::CloneWithNewOperandsImpl(
     const Shape& shape, absl::Span<HloInstruction* const> new_operands,
     HloCloneContext* context) const {
   auto cloned = absl::make_unique<HloCustomCallInstruction>(
-      shape, new_operands, custom_call_target());
+      shape, new_operands, custom_call_target(), opaque());
   if (window_ != nullptr) {
     cloned->set_window(*window_);
   }
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index 2d7bc83855..9c22f5db7e 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -1070,7 +1070,8 @@ class HloCustomCallInstruction : public HloInstruction {
  public:
   explicit HloCustomCallInstruction(const Shape& shape,
                                     absl::Span<HloInstruction* const> operands,
-                                    absl::string_view custom_call_target);
+                                    absl::string_view custom_call_target,
+                                    absl::string_view opaque);
   const Window& window() const override {
     CHECK(window_ != nullptr);
     return *window_;
@@ -1090,6 +1091,7 @@ class HloCustomCallInstruction : public HloInstruction {
     convolution_dimension_numbers_ =
         absl::make_unique<ConvolutionDimensionNumbers>(dnums);
   }
+  const string& opaque() const { return opaque_; }
   const string& custom_call_target() const { return custom_call_target_; }
   void set_feature_group_count(int64 feature_group_count) {
     feature_group_count_ = feature_group_count;
@@ -1109,8 +1111,10 @@ class HloCustomCallInstruction : public HloInstruction {
   std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
       const Shape& shape, absl::Span<HloInstruction* const> new_operands,
       HloCloneContext* context) const override;
-  // Name of a global symbol to call, only present for kCustomCall.
+  // Name of a global symbol to call.
   string custom_call_target_;
+  // Opaque string interpreted by the backend.
+  string opaque_;
   // Describes the window in a windowed operation such as convolution.
   std::unique_ptr<Window> window_;
   // Describes the dimension numbers used for a convolution.
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 37197b273b..25b70740e3 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -1266,11 +1266,13 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     }
     case HloOpcode::kCustomCall: {
       optional<string> custom_call_target;
+      optional<string> opaque;
       optional<Window> window;
       optional<ConvolutionDimensionNumbers> dnums;
       optional<int64> feature_group_count;
       attrs["custom_call_target"] = {/*required=*/true, AttrTy::kString,
                                      &custom_call_target};
+      attrs["opaque"] = {/*required=*/false, AttrTy::kString, &opaque};
       attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window};
       attrs["dim_labels"] = {/*required=*/false,
                              AttrTy::kConvolutionDimensionNumbers, &dnums};
@@ -1279,8 +1281,9 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
-      instruction = builder->AddInstruction(HloInstruction::CreateCustomCall(
-          shape, operands, *custom_call_target));
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateCustomCall(shape, operands, *custom_call_target,
+                                           opaque.has_value() ? *opaque : ""));
       if (window.has_value()) {
         instruction->set_window(*window);
       }
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index cca50fab54..96db96bdb9 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -1002,6 +1002,18 @@ ENTRY CustomCall {
   ROOT custom-call = f32[1,2,3]{0,2,1} custom-call(constant), custom_call_target="foo\"bar"
 }
 
+)"
+},
+// CustomCall with opaque value.
+{
+"CustomCallWithOpaque",
+R"(HloModule custom_call
+
+ENTRY CustomCall {
+  constant = f32[1]{0} constant({12345})
+  ROOT custom-call = f32[1,2,3]{0,2,1} custom-call(constant), custom_call_target="foo\"bar", opaque="this string is opaque"
+}
+
 )"
 },
 // Variables with non-default names
-- 
GitLab


From dcf72802384fdab6744d3c16577091a82bc2cce0 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 27 Sep 2018 11:01:56 -0700
Subject: [PATCH 084/570] Clean up unused members in DirectSession and
 Executor.

PiperOrigin-RevId: 214802032
---
 .../core/common_runtime/direct_session.cc     |  8 ++++----
 .../core/common_runtime/direct_session.h      | 20 ++++++++-----------
 tensorflow/core/common_runtime/executor.h     |  6 ------
 3 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index af5d5b17e7..841181f8c3 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -363,7 +363,7 @@ Status DirectSession::MaybeInitializeExecutionState(
 Status DirectSession::Create(const GraphDef& graph) {
   TF_RETURN_IF_ERROR(init_error_);
   if (graph.node_size() > 0) {
-    mutex_lock l(graph_def_lock_);
+    mutex_lock l(graph_state_lock_);
     if (graph_created_) {
       return errors::AlreadyExists(
           "A Graph has already been created for this session.");
@@ -375,7 +375,7 @@ Status DirectSession::Create(const GraphDef& graph) {
 
 Status DirectSession::Extend(const GraphDef& graph) {
   TF_RETURN_IF_ERROR(CheckNotClosed());
-  mutex_lock l(graph_def_lock_);
+  mutex_lock l(graph_state_lock_);
   return ExtendLocked(graph);
 }
 
@@ -1172,7 +1172,7 @@ Status DirectSession::CreateExecutors(
 
   int graph_def_version;
   {
-    mutex_lock l(graph_def_lock_);
+    mutex_lock l(graph_state_lock_);
     graph_def_version =
         execution_state_->original_graph_def().versions().producer();
   }
@@ -1400,7 +1400,7 @@ Status DirectSession::CreateGraphs(
     std::unique_ptr<FunctionLibraryDefinition>* flib_def,
     RunStateArgs* run_state_args, DataTypeVector* input_types,
     DataTypeVector* output_types, int64* collective_graph_key) {
-  mutex_lock l(graph_def_lock_);
+  mutex_lock l(graph_state_lock_);
   std::unique_ptr<ClientGraph> client_graph;
 
   std::unique_ptr<GraphExecutionState> temp_exec_state_holder;
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index c2cf3c7fd7..4a6a921ea7 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -215,7 +215,7 @@ class DirectSession : public Session {
   // if not already initialized.
   Status MaybeInitializeExecutionState(const GraphDef& graph,
                                        bool* out_already_initialized)
-      EXCLUSIVE_LOCKS_REQUIRED(graph_def_lock_);
+      EXCLUSIVE_LOCKS_REQUIRED(graph_state_lock_);
 
   // Retrieves an already existing set of executors to run 'inputs' and
   // 'outputs', or creates and caches them for future use.
@@ -248,7 +248,7 @@ class DirectSession : public Session {
                                    RunMetadata* run_metadata);
 
   ::tensorflow::Status ExtendLocked(const GraphDef& graph)
-      EXCLUSIVE_LOCKS_REQUIRED(graph_def_lock_);
+      EXCLUSIVE_LOCKS_REQUIRED(graph_state_lock_);
 
   ::tensorflow::Status ResourceHandleToInputTensor(
       const Tensor& resource_tensor, Tensor* retrieved_tensor);
@@ -289,7 +289,7 @@ class DirectSession : public Session {
   }
 
   ::tensorflow::Status CheckGraphCreated(const char* method) {
-    mutex_lock l(graph_def_lock_);
+    mutex_lock l(graph_state_lock_);
     if (!graph_created_) {
       return errors::InvalidArgument(
           "Session was not created with a graph before ", method, "!");
@@ -313,10 +313,8 @@ class DirectSession : public Session {
   DeviceSet device_set_;
 
   string session_handle_;
-  bool graph_created_ GUARDED_BY(graph_def_lock_) = false;
-
-  mutex graph_def_lock_;
-  GraphDef graph_def_ GUARDED_BY(graph_def_lock_);
+  mutex graph_state_lock_;
+  bool graph_created_ GUARDED_BY(graph_state_lock_) = false;
 
   // The thread-pools to use for running ops, with a bool indicating if the pool
   // is owned.
@@ -367,11 +365,11 @@ class DirectSession : public Session {
   // nodes can not be moved to a different device.  Maps node names to
   // device names.
   std::unordered_map<string, string> stateful_placements_
-      GUARDED_BY(graph_def_lock_);
+      GUARDED_BY(graph_state_lock_);
 
   // Execution_state; used when placing the entire graph.
   std::unique_ptr<GraphExecutionState> execution_state_
-      GUARDED_BY(graph_def_lock_);
+      GUARDED_BY(graph_state_lock_);
 
   // The function library, before any rewrites or optimizations have been
   // performed. In particular, CreateGraphs() may need to modify the function
@@ -386,7 +384,7 @@ class DirectSession : public Session {
   std::atomic<int64> edge_name_counter_ = {0};
   std::atomic<int64> handle_name_counter_ = {0};
 
-  // For generating step ids that are unique across all sessions.
+  // For generating step ids that are unique across this sessions.
   static std::atomic_int_fast64_t step_id_counter_;
 
   // Global timeout for all blocking operations in this session.
@@ -395,8 +393,6 @@ class DirectSession : public Session {
   // Manages all the cost models for the graphs executed in this session.
   CostModelManager cost_model_manager_;
 
-  Executor::Args::NodeOutputsCallback node_outputs_callback_ = nullptr;
-
   // For testing collective graph key generation.
   mutex collective_graph_key_lock_;
   int64 collective_graph_key_ GUARDED_BY(collective_graph_key_lock_) = -1;
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index 6cd4fd22ea..34bf73972f 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -97,12 +97,6 @@ class Executor {
     typedef std::function<void()> Closure;
     typedef std::function<void(Closure)> Runner;
     Runner runner = nullptr;
-
-    // A callback that is invoked each time a node has finished executing.
-    typedef std::function<Status(const string& node_name, const int output_slot,
-                                 const Tensor* tensor, const bool is_ref,
-                                 OpKernelContext* ctx)>
-        NodeOutputsCallback;
   };
   typedef std::function<void(const Status&)> DoneCallback;
   virtual void RunAsync(const Args& args, DoneCallback done) = 0;
-- 
GitLab


From 50b94fa1d50a916eaf7a5a46d93260e9b0f93554 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 27 Sep 2018 11:07:09 -0700
Subject: [PATCH 085/570] Internal change

PiperOrigin-RevId: 214803223
---
 tensorflow/contrib/fused_conv/BUILD           |  35 +-
 .../fused_conv2d_bias_activation_op.cc        |   4 +-
 .../fused_conv2d_bias_activation_op_test.py   | 891 +----------------
 ...sed_conv2d_bias_activation_op_test_base.py | 945 ++++++++++++++++++
 .../tools/pip_package/pip_smoke_test.py       |   1 +
 5 files changed, 985 insertions(+), 891 deletions(-)
 create mode 100644 tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py

diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD
index 9725233e7f..490da9b33b 100644
--- a/tensorflow/contrib/fused_conv/BUILD
+++ b/tensorflow/contrib/fused_conv/BUILD
@@ -17,11 +17,14 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_kernel_library",
+    "tf_custom_op_library",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+)
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
-load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
-load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 tf_custom_op_py_library(
@@ -109,12 +112,13 @@ tf_gen_op_wrapper_py(
     deps = [":fused_conv2d_bias_activation_op_op_lib"],
 )
 
-cuda_py_test(
-    name = "fused_conv2d_bias_activation_op_test",
-    srcs = ["python/ops/fused_conv2d_bias_activation_op_test.py"],
-    additional_deps = [
+py_library(
+    name = "fused_conv2d_bias_activation_op_test_base",
+    testonly = 1,
+    srcs = ["python/ops/fused_conv2d_bias_activation_op_test_base.py"],
+    visibility = ["//tensorflow/compiler/tf2xla:internal"],
+    deps = [
         ":fused_conv_py",
-        "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
@@ -127,8 +131,21 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+cuda_py_test(
+    name = "fused_conv2d_bias_activation_op_test",
+    size = "large",
+    srcs = ["python/ops/fused_conv2d_bias_activation_op_test.py"],
+    additional_deps = [
+        ":fused_conv2d_bias_activation_op_test_base",
+        "//tensorflow/python:client_testlib",
     ],
     tags = [
+        "no_pip",
         "requires-gpu-sm70",
     ],
 )
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index e9e6464d06..93b1aaa85e 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -111,8 +111,8 @@ class FusedConv2DBiasActivationOp : public OpKernel {
         context,
         (GetTensorDim(strides, data_format_, 'N') == 1 &&
          GetTensorDim(strides, data_format_, 'C') == 1),
-        errors::InvalidArgument("Convolutional strides are not supported in "
-                                "the batch or depth dimensions."));
+        errors::Unimplemented("Convolutional strides are not supported in "
+                              "the batch and depth dimensions."));
 
     // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I (int8x4) here.
     constexpr bool is_int8x4 = std::is_same<T, qint8>::value;
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
index 4894298694..e5c8a34fc1 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
@@ -12,896 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Functional tests for fused conv2d bias and activation operation."""
+
+"""Tests for fused convolutions."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from tensorflow.contrib.fused_conv.python.ops import fused_conv2d_bias_activation_op
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors_impl
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import random_ops
+from tensorflow.contrib.fused_conv.python.ops import fused_conv2d_bias_activation_op_test_base as test_base
 from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging
-
-
-def GetShrunkInceptionShapes(shrink=10):
-  """Iterator for smaller versions of convolution shapes in 2015 Inception.
-
-  Relative to inception, each depth value is `depth // shrink`.
-
-  Args:
-    shrink: Factor to shrink each depth value by relative to Inception.
-
-  Yields:
-    Tuple (input_size, filter_size, out_size, stride, padding), the convolution
-    parameters of Inception layers.
-  """
-  input_sizes = [[4, 5, 5, 1248], [4, 8, 8, 384], [4, 8, 8, 384], [
-      4, 8, 8, 2048
-  ], [4, 8, 8, 448], [4, 8, 8, 2048], [4, 8, 8, 2048], [4, 8, 8, 2048], [
-      4, 8, 8, 1760
-  ], [4, 8, 8, 1760], [4, 8, 8, 1760], [4, 8, 8, 1760], [4, 17, 17, 192], [
-      4, 17, 17, 192
-  ], [4, 17, 17, 1248], [4, 17, 17, 128], [4, 17, 17, 1248], [4, 17, 17, 224], [
-      4, 17, 17, 192
-  ], [4, 17, 17, 192], [4, 17, 17, 1216], [4, 17, 17, 1216], [4, 17, 17, 224], [
-      4, 17, 17, 192
-  ], [4, 17, 17, 192], [4, 17, 17, 1152], [4, 17, 17, 1152], [4, 17, 17, 192], [
-      4, 17, 17, 160
-  ], [4, 17, 17, 1152], [4, 17, 17, 1024], [4, 17, 17, 128], [4, 17, 17, 1024],
-                 [4, 17, 17, 128], [4, 17, 17, 1024], [4, 17, 17, 128], [
-                     4, 17, 17, 768
-                 ], [4, 17, 17, 128], [4, 17, 17, 128], [4, 17, 17, 768],
-                 [4, 17, 17, 768], [4, 35, 35, 96], [4, 35, 35, 288], [
-                     4, 35, 35, 64
-                 ], [4, 35, 35, 288], [4, 35, 35, 256], [4, 35, 35, 48], [
-                     4, 35, 35, 256
-                 ], [4, 35, 35, 96], [4, 35, 35, 192], [4, 35, 35, 192], [
-                     4, 35, 35, 192
-                 ], [4, 73, 73, 64], [4, 73, 73, 64], [4, 147, 147, 24]]
-  filter_sizes = [[1, 1, 1248, 128], [1, 3, 384, 384], [3, 1, 384, 384], [
-      1, 1, 2048, 192
-  ], [3, 3, 448, 384], [1, 1, 2048, 320], [1, 1, 2048, 448], [1, 1, 2048, 384],
-                  [1, 1, 1760, 384], [1, 1, 1760, 192], [1, 1, 1760, 448], [
-                      1, 1, 1760, 320
-                  ], [3, 3, 192, 192], [3, 3, 192, 192], [1, 1, 1248, 192], [
-                      3, 3, 128, 320
-                  ], [1, 1, 1248, 128], [1, 3, 224, 224], [3, 1, 192, 256], [
-                      1, 3, 192, 256
-                  ], [1, 1, 1216, 192], [1, 1, 1216, 96], [3, 1, 224, 224], [
-                      3, 3, 192, 224
-                  ], [1, 3, 192, 192], [1, 1, 1152, 192], [1, 1, 1152, 128], [
-                      3, 1, 192, 192
-                  ], [3, 3, 160, 192], [1, 1, 1152, 160], [1, 1, 1024, 128], [
-                      1, 3, 128, 192
-                  ], [1, 1, 1024, 160], [3, 1, 128, 192], [1, 1, 1024, 256], [
-                      3, 1, 128, 128
-                  ], [1, 1, 768, 192], [1, 3, 128, 128], [3, 3, 128, 128], [
-                      1, 1, 768, 128
-                  ], [1, 1, 768, 320], [3, 3, 96, 96], [3, 3, 288, 384], [
-                      3, 3, 64, 96
-                  ], [1, 1, 288, 64], [1, 1, 256, 64], [5, 5, 48, 64],
-                  [1, 1, 256, 48], [3, 3, 96, 96], [1, 1, 192, 32], [
-                      1, 1, 192, 64
-                  ], [1, 1, 192, 48], [3, 3, 64, 192], [1, 1, 64,
-                                                        64], [1, 1, 24, 64]]
-  out_sizes = [[4, 5, 5, 128], [4, 8, 8, 384], [4, 8, 8, 384], [4, 8, 8, 192], [
-      4, 8, 8, 384
-  ], [4, 8, 8, 320], [4, 8, 8, 448], [4, 8, 8, 384], [4, 8, 8, 384], [
-      4, 8, 8, 192
-  ], [4, 8, 8, 448], [4, 8, 8, 320], [4, 8, 8, 192], [4, 17, 17, 192], [
-      4, 17, 17, 192
-  ], [4, 8, 8, 320], [4, 17, 17, 128], [4, 17, 17, 224], [4, 17, 17, 256], [
-      4, 17, 17, 256
-  ], [4, 17, 17, 192], [4, 17, 17, 96], [4, 17, 17, 224], [4, 17, 17, 224], [
-      4, 17, 17, 192
-  ], [4, 17, 17, 192], [4, 17, 17, 128], [4, 17, 17, 192], [4, 17, 17, 192], [
-      4, 17, 17, 160
-  ], [4, 17, 17, 128], [4, 17, 17, 192], [4, 17, 17, 160], [4, 17, 17, 192], [
-      4, 17, 17, 256
-  ], [4, 17, 17, 128], [4, 17, 17, 192], [4, 17, 17, 128], [4, 17, 17, 128], [
-      4, 17, 17, 128
-  ], [4, 17, 17, 320], [4, 17, 17, 96], [4, 17, 17, 384], [4, 35, 35, 96], [
-      4, 35, 35, 64
-  ], [4, 35, 35, 64], [4, 35, 35, 64], [4, 35, 35, 48], [4, 35, 35, 96],
-               [4, 35, 35, 32], [4, 35, 35, 64], [4, 35, 35, 48],
-               [4, 71, 71, 192], [4, 73, 73, 64], [4, 147, 147, 64]]
-  strides = [
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1
-  ]
-  # Shrink sizes to make the test faster
-  for i in input_sizes:
-    i[3] //= shrink
-  for f in filter_sizes:
-    f[2] //= shrink
-    f[3] //= shrink
-  for o in out_sizes:
-    o[3] //= shrink
-  # pylint: disable=invalid-name
-  VALID = "VALID"
-  SAME = "SAME"
-  # pylint: enable=invalid-name
-  paddings = [
-      SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME,
-      VALID, SAME, SAME, VALID, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME,
-      SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME,
-      SAME, SAME, SAME, SAME, SAME, VALID, VALID, SAME, SAME, SAME, SAME, SAME,
-      SAME, SAME, SAME, SAME, VALID, VALID, VALID
-  ]
-  for i, f, o, s, p in zip(input_sizes, filter_sizes, out_sizes, strides,
-                           paddings):
-    yield i, f, o, s, p
-
-
-def GetTestConfigs():
-  """Get all the valid tests configs to run.
-
-  Returns:
-    all the valid test configs as tuples of data_format and use_gpu.
-  """
-  test_configs = [("NCHW", True), ("NHWC", True)]
-  return test_configs
-
-
-class FusedConv2DBiasActivationTest(test.TestCase):
-
-  def _DtypesToTest(self, use_gpu):
-    return [dtypes.float32]
-
-  def _FilterFormatsToTest(self, use_gpu):
-    return ["HWIO", "OIHW"]
-
-  def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, bias,
-                            strides, padding, activation_mode, data_format,
-                            filter_format, dtype):
-    """Verifies the output values of the convolution function.
-
-    Args:
-      tensor_in_sizes: Input tensor dimensions in
-        [batch, input_rows, input_cols, input_depth].
-      filter_in_sizes: Filter tensor dimensions in
-        [kernel_rows, kernel_cols, input_depth, output_depth].
-      bias: 1-D bias tensor of length output_depth.
-      strides: Stride: [col_stride, row_stride]
-      padding: Padding type.
-      activation_mode: Activation mode.
-      data_format: Format of the data tensors.
-      filter_format: Filter format to use for the fused convolution.
-      dtype: Data type for inputs and outputs.
-    Returns:
-      Symbolic tensor value and reference value that can be used to
-      execute the computation and verify the results.
-    """
-    input_size = np.prod(tensor_in_sizes)
-    filter_size = np.prod(filter_in_sizes)
-    bias_size = filter_in_sizes[-1]  # equals to output depth
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, input_size + 1)]
-    x2 = [f * 1.0 for f in range(1, filter_size + 1)]
-    # This is to guarantee that there is always negative values after
-    # bias add so that we can test whether relu works correctly.
-    x3 = bias
-    with self.test_session(use_gpu=True):
-      t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
-      t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
-      fused_t2 = t2
-      if filter_format == "OIHW":
-        fused_t2 = HwioToOihw(t2)
-      t3 = constant_op.constant(x3, shape=[bias_size], dtype=dtype)
-      strides = [1] + strides + [1]
-      if data_format == "NCHW":
-        t1 = test_util.NHWCToNCHW(t1)
-        strides = test_util.NHWCToNCHW(strides)
-      output = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-          t1,
-          fused_t2,
-          t3,
-          strides=strides,
-          padding=padding,
-          data_format=data_format,
-          filter_format=filter_format,
-          activation_mode=activation_mode)
-      ref_conv_output = nn_ops.conv2d(
-          t1, t2, strides=strides, padding=padding, data_format=data_format)
-      ref_bias_output = nn_ops.bias_add(
-          ref_conv_output, t3, data_format=data_format)
-      ref_output = nn_ops.relu(ref_bias_output)
-      if data_format == "NCHW":
-        output = test_util.NCHWToNHWC(output)
-        ref_output = test_util.NCHWToNHWC(ref_output)
-
-      return output, ref_output
-
-  def _CompareFwdValues(self, tensor_in_sizes, filter_in_sizes, conv_strides,
-                        padding):
-    """Verifies that CPU and GPU produce the same values.
-
-    Args:
-      tensor_in_sizes: Input tensor dimensions in
-        [batch, input_rows, input_cols, input_depth].
-      filter_in_sizes: Filter tensor dimensions in
-        [kernel_rows, kernel_cols, input_depth, output_depth].
-      conv_strides: [row_stride, col_stride] for the convolution;
-      padding: Padding type.
-    """
-    x1 = np.random.rand(*tensor_in_sizes).astype(np.float32)
-    x2 = np.random.rand(*filter_in_sizes).astype(np.float32)
-    x3 = np.random.rand(*[filter_in_sizes[-1]]).astype(np.float32)
-
-    def _SetupVal(data_format, use_gpu):
-      with self.test_session(use_gpu=use_gpu):
-        t1 = constant_op.constant(x1, shape=tensor_in_sizes)
-        t2 = constant_op.constant(x2, shape=filter_in_sizes)
-        t3 = constant_op.constant(x3, shape=[filter_in_sizes[-1]])
-        strides = [1] + conv_strides + [1]
-        if data_format == "NCHW":
-          t1 = test_util.NHWCToNCHW(t1)
-          strides = test_util.NHWCToNCHW(strides)
-        output = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-            t1,
-            t2,
-            t3,
-            strides=strides,
-            padding=padding,
-            data_format=data_format,
-            activation_mode="Relu")
-
-        if data_format == "NCHW":
-          output = test_util.NCHWToNHWC(output)
-        return output
-
-    tensors = []
-    for (data_format, use_gpu) in GetTestConfigs():
-      tensors.append(_SetupVal(data_format, use_gpu))
-    with self.cached_session() as sess:
-      values = sess.run(tensors)
-      for i in range(1, len(values)):
-        self.assertAllClose(values[0], values[i], rtol=1e-3, atol=1e-3)
-
-  def _VerifyValues(self, tensor_in_sizes, filter_in_sizes, bias, strides,
-                    padding):
-    tensors = []
-    ref_tensors = []
-    for (data_format, use_gpu) in GetTestConfigs():
-      for dtype in self._DtypesToTest(use_gpu):
-        for filter_format in self._FilterFormatsToTest(use_gpu):
-          result, expected = self._SetupValuesForDevice(
-              tensor_in_sizes, filter_in_sizes, bias, strides, padding, "Relu",
-              data_format, filter_format, dtype)
-        tensors.append(result)
-        ref_tensors.append(expected)
-      with self.cached_session() as sess:
-        values = sess.run(tensors)
-        ref_values = sess.run(ref_tensors)
-        for i in range(len(tensors)):
-          conv = tensors[i]
-          value = values[i]
-          ref_value = ref_values[i]
-          tf_logging.info("expected = ", ref_value)
-          tf_logging.info("actual = ", value)
-          tol = 1e-5
-          if value.dtype == np.float16:
-            tol = 1e-3
-          self.assertAllClose(
-              np.ravel(ref_value), np.ravel(value), atol=tol, rtol=tol)
-          self.assertShapeEqual(value, conv)
-
-  def testConv2D1x1Filter(self, gpu_only=True):
-    if gpu_only and not test.is_gpu_available():
-      tf_logging.info("Skipping Conv2D1x1Filter test.")
-      return
-    # expected_output = [
-    #    0.0, 0.0, 0.0, 21.0, 0.0, 0.0, 57.0, 0.0, 0.0, 93.0, 41.0, 0.0, 129.0,
-    #    86.0, 43.0, 165.0, 131.0, 97.0
-    # ]
-    medians = [-45.0, -130.0, -215.0]
-    self._VerifyValues(
-        tensor_in_sizes=[1, 2, 3, 3],
-        filter_in_sizes=[1, 1, 3, 3],
-        bias=medians,
-        strides=[1, 1],
-        padding="VALID")
-
-  def testConv2DEmpty(self, gpu_only=True):
-    if gpu_only and not test.is_gpu_available():
-      tf_logging.info("Skipping Conv2DEmpty test.")
-      return
-    # expected_output = []
-    self._VerifyValues(
-        tensor_in_sizes=[0, 2, 3, 3],
-        filter_in_sizes=[1, 1, 3, 3],
-        bias=[0.0, 0.0, 0.0],
-        strides=[1, 1],
-        padding="VALID")
-
-  def testConv2D2x2Filter(self, gpu_only=True):
-    if gpu_only and not test.is_gpu_available():
-      tf_logging.info("Skipping Conv2D2x2Filter test.")
-      return
-    # expected_output = [0.0, 0.0, 0.0, 401.0, 533.0, 665.0]
-    self._VerifyValues(
-        tensor_in_sizes=[1, 2, 3, 3],
-        filter_in_sizes=[2, 2, 3, 3],
-        bias=[-2500.0, -2500.0, -2500.0],
-        strides=[1, 1],
-        padding="VALID")
-
-  def testConv2D1x2Filter(self, gpu_only=True):
-    if gpu_only and not test.is_gpu_available():
-      tf_logging.info("Skipping Conv2D1x2Filter test.")
-      return
-    # expected_output = [
-    #    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 190.0, 265.0, 340.0, 343.0, 436.0, 529.0
-    # ]
-    self._VerifyValues(
-        tensor_in_sizes=[1, 2, 3, 3],
-        filter_in_sizes=[1, 2, 3, 3],
-        bias=[-500.0, -500.0, -500.0],
-        strides=[1, 1],
-        padding="VALID")
-
-  def testConv2D2x2FilterStride2(self, gpu_only=True):
-    if gpu_only and not test.is_gpu_available():
-      tf_logging.info("Skipping Conv2D2x2FilterStride2 test.")
-      return
-    # expected_output = [0.0, 67.0, 163.0]
-    self._VerifyValues(
-        tensor_in_sizes=[1, 2, 3, 3],
-        filter_in_sizes=[2, 2, 3, 3],
-        bias=[-2300.0, -2300.0, -2300.0],
-        strides=[2, 2],
-        padding="VALID")
-
-  def testConv2D2x2FilterStride2Same(self, gpu_only=True):
-    if gpu_only and not test.is_gpu_available():
-      tf_logging.info("Skipping Conv2D2x2FilterStride2Same test.")
-      return
-    # expected_output = [0.0, 2367.0, 2463.0, 1230.0, 1305.0, 1380.0]
-    self._VerifyValues(
-        tensor_in_sizes=[1, 2, 3, 3],
-        filter_in_sizes=[2, 2, 3, 3],
-        bias=[-2300.0, -1000.0, -1000.0],
-        strides=[2, 2],
-        padding="SAME")
-
-  def testConv2D2x2FilterStride1x2(self, gpu_only=True):
-    if gpu_only and not test.is_gpu_available():
-      tf_logging.info("Skipping Conv2D2x2FilterStride1x2 test.")
-      return
-    # expected_output = [0.0, 0.0, 8.0, 28.0, 48.0, 68.0]
-    self._VerifyValues(
-        tensor_in_sizes=[1, 3, 6, 1],
-        filter_in_sizes=[2, 2, 1, 1],
-        bias=[-90.0],
-        strides=[1, 2],
-        padding="VALID")
-
-  def testConv2DKernelSmallerThanStrideValid(self, gpu_only=True):
-    if gpu_only and not test.is_gpu_available():
-      tf_logging.info("Skipping Conv2DKernelSmallerThanStrideValid test.")
-      return
-    # expected_output = [0, 0, 175, 205]
-    self._VerifyValues(
-        tensor_in_sizes=[1, 7, 7, 1],
-        filter_in_sizes=[2, 2, 1, 1],
-        bias=[-100.0],
-        strides=[3, 3],
-        padding="VALID")
-
-  def testConv2DKernelSmallerThanStrideSame(self, gpu_only=True):
-    if gpu_only and not test.is_gpu_available():
-      tf_logging.info("Skipping Conv2DKernelSmallerThanStrideSame test.")
-      return
-    # expected = [0, 0, 2, 4]
-    self._VerifyValues(
-        tensor_in_sizes=[1, 3, 3, 1],
-        filter_in_sizes=[1, 1, 1, 1],
-        bias=[-5.0],
-        strides=[2, 2],
-        padding="SAME")
-
-    # expected = [0, 0, 4, 6]
-    self._VerifyValues(
-        tensor_in_sizes=[1, 4, 4, 1],
-        filter_in_sizes=[1, 1, 1, 1],
-        bias=[-5.0],
-        strides=[2, 2],
-        padding="SAME")
-
-    # expected = [4, 0, 1, 0]
-    self._VerifyValues(
-        tensor_in_sizes=[1, 4, 4, 1],
-        filter_in_sizes=[2, 2, 1, 1],
-        bias=[-40.0],
-        strides=[3, 3],
-        padding="SAME")
-
-  def testConv2DKernelSizeMatchesInputSize(self, gpu_only=True):
-    if gpu_only and not test.is_gpu_available():
-      tf_logging.info("Skipping Conv2DKernelSizeMatchesInputSize test.")
-      return
-    # expected = [0, 5]
-    self._VerifyValues(
-        tensor_in_sizes=[1, 2, 2, 1],
-        filter_in_sizes=[2, 2, 1, 2],
-        bias=[-50.0, -55.0],
-        strides=[1, 1],
-        padding="VALID")
-
-    # expected = [0, 2, 282, 322]
-    self._VerifyValues(
-        tensor_in_sizes=[1, 8, 8, 1],
-        filter_in_sizes=[2, 2, 1, 1],
-        bias=[-200.0],
-        strides=[4, 4],
-        padding="SAME")
-
-  def testShapeFunctionEdgeCases(self):
-    # All shapes unknown.
-    c1 = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-        array_ops.placeholder(dtypes.float32),
-        array_ops.placeholder(dtypes.float32),
-        array_ops.placeholder(dtypes.float32),
-        strides=[1, 1, 1, 1],
-        padding="SAME",
-        activation_mode="Relu")
-    self.assertEqual([None, None, None, None], c1.get_shape().as_list())
-
-    # Incorrect input shape.
-    with self.assertRaises(ValueError):
-      fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-          array_ops.placeholder(dtypes.float32, shape=[1, 3]),
-          array_ops.placeholder(dtypes.float32),
-          array_ops.placeholder(dtypes.float32),
-          strides=[1, 1, 1, 1],
-          padding="SAME",
-          activation_mode="Relu")
-
-    # Incorrect filter shape.
-    with self.assertRaises(ValueError):
-      fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-          array_ops.placeholder(dtypes.float32),
-          array_ops.placeholder(dtypes.float32, shape=[1, 3]),
-          array_ops.placeholder(dtypes.float32),
-          strides=[1, 1, 1, 1],
-          padding="SAME",
-          activation_mode="Relu")
-
-    # Depth mismatch.
-    with self.assertRaises(ValueError):
-      fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-          array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
-          array_ops.placeholder(dtypes.float32, shape=[4, 4, 2, 2]),
-          array_ops.placeholder(dtypes.float32),
-          strides=[1, 1, 1, 1],
-          padding="SAME",
-          activation_mode="Relu")
-
-  def testOpEdgeCases(self, gpu_only=True):
-    if gpu_only and not test.is_gpu_available():
-      tf_logging.info("Skipping OpEdgeCases tests.")
-      return
-    with self.cached_session() as sess:
-      # Illegal strides.
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Convolutional strides are not supported in "
-                                   "the batch or depth dimensions."):
-        sess.run(
-            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-                array_ops.placeholder(dtypes.float32),
-                array_ops.placeholder(dtypes.float32),
-                array_ops.placeholder(dtypes.float32),
-                strides=[2, 1, 1, 1],
-                padding="SAME",
-                activation_mode="Relu"))
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Convolutional strides are not supported in "
-                                   "the batch or depth dimensions."):
-        sess.run(
-            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-                array_ops.placeholder(dtypes.float32),
-                array_ops.placeholder(dtypes.float32),
-                array_ops.placeholder(dtypes.float32),
-                strides=[1, 1, 1, 2],
-                padding="SAME",
-                activation_mode="Relu"))
-
-      # Illegal activation mode.
-      with self.assertRaisesRegexp(ValueError,
-                                   "Op passed string 'Tanh' not in:"):
-        sess.run(
-            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-                array_ops.placeholder(dtypes.float32),
-                array_ops.placeholder(dtypes.float32),
-                array_ops.placeholder(dtypes.float32),
-                strides=[1, 1, 1, 1],
-                padding="SAME",
-                activation_mode="Tanh"))
-
-      # Filter larger than input.
-      with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
-        sess.run(
-            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-                array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
-                array_ops.placeholder(dtypes.float32, shape=[20, 21, 3, 2]),
-                array_ops.placeholder(dtypes.float32, shape=[2]),
-                strides=[1, 1, 1, 1],
-                padding="VALID",
-                activation_mode="Relu"))
-      with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
-        sess.run(
-            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-                array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
-                array_ops.placeholder(dtypes.float32, shape=[21, 20, 3, 2]),
-                array_ops.placeholder(dtypes.float32, shape=[2]),
-                strides=[1, 1, 1, 1],
-                padding="VALID",
-                activation_mode="Relu"))
-
-
-def GetInceptionFwdTest(input_size, filter_size, stride, padding,
-                        gpu_only=True):
-
-  def Test(self):
-    if gpu_only and not test.is_gpu_available():
-      tf_logging.info("Skipping InceptionFwd %s", (input_size, filter_size,
-                                                   stride, padding))
-      return
-    tf_logging.info("Testing InceptionFwd %s", (input_size, filter_size, stride,
-                                                padding))
-    self._CompareFwdValues(input_size, filter_size, [stride, stride], padding)
-
-  return Test
-
-
-def CalculateConvolvedOutputDim(input_dim, filter_dim, stride, padding_type):
-  """Calculates the size of an output dimension of a strided convolution.
-
-  Given the sizes of the corresponding dimension of the input and filter shapes,
-  and the stride and padding_types, calculates the size of the output dimension.
-  This function can be called separately for each input dimension.
-
-  Args:
-    input_dim: An `int` specifying the size of the input dimension.
-    filter_dim: An `int` specifying the size of the filter dimension.
-    stride: An `int` specifying the step size of the convolution along the
-      input dimension.
-    padding_type: either 'VALID' or 'SAME'.
-
-  Returns:
-    The size of the output dimension.
-  """
-  if padding_type == "VALID":
-    return (input_dim - filter_dim + stride) // stride
-  else:  # padding_type == 'SAME'
-    return (input_dim + stride - 1) // stride
-
-
-def NchwVectCToNchw(in_tensor):
-  # [N, C / 4, H, W, 4] => [N, C / 4, 4, H, W] == [N, C, H, W]
-  t = array_ops.transpose(in_tensor, [0, 1, 4, 2, 3])
-  n = in_tensor.shape.dims[0].value
-  c = in_tensor.shape.dims[1].value * in_tensor.shape.dims[4].value
-  h = in_tensor.shape.dims[2].value
-  w = in_tensor.shape.dims[3].value
-  return array_ops.reshape(t, [n, c, h, w])
-
-
-def OihwVectIToHwio(in_tensor):
-  # [O, I / 4, H, W, 4] => [O, I / 4, 4, H, W] == [O, I, H, W]
-  t = array_ops.transpose(in_tensor, [2, 3, 1, 4, 0])
-  o = in_tensor.shape.dims[0].value
-  i = in_tensor.shape.dims[1].value * in_tensor.shape.dims[4].value
-  h = in_tensor.shape.dims[2].value
-  w = in_tensor.shape.dims[3].value
-  return array_ops.reshape(t, [h, w, i, o])
-
-
-def NchwToNchwVectC(in_tensor):
-  n, c, h, w = in_tensor.shape.as_list()
-  assert c % 4 == 0
-  t = array_ops.reshape(in_tensor, [n, c // 4, 4, h, w])
-  return array_ops.transpose(t, [0, 1, 3, 4, 2])
-
-
-def HwioToOihw(in_tensor):
-  return array_ops.transpose(in_tensor, [3, 2, 0, 1])
-
-
-def SimulateFusedConv2dBiasActivationInt8(conv_input_scale, conv_input, kernel,
-                                          padding, strides, side_input_scale,
-                                          side_input, biases, apply_relu):
-  """Simulates the int8 fused 2-D convolution op using separate float ops.
-
-    The arguments and return values have the same format, meanings and
-    restrictions as the actual op.
-  Args:
-    conv_input_scale: A scalar 'float'.
-    conv_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout.
-    kernel: A `Tensor` of type `qint8` in OIHW_VECT_I layout.
-    padding: A `string` from: `"SAME", "VALID"`.
-    strides: A list of `ints`.
-    side_input_scale: A scalar 'float'.
-    side_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout.
-    biases: A `Tensor` of type `float32` in NCHW layout.
-    apply_relu: A boolean to specify whether to apply "Relu" activation function
-      that clips outputs to the range [0, 127], or "None" activation that clips
-      to the range [-128, 127].
-  Returns:
-    A `Tensor` of type `qint8` in NCHW_VECT_C layout.
-  """
-  conv_result = nn_ops.conv2d(
-      NchwVectCToNchw(gen_array_ops.dequantize(conv_input, -128, 127)),
-      OihwVectIToHwio(gen_array_ops.dequantize(kernel, -128, 127)),
-      strides=strides,
-      padding=padding,
-      data_format="NCHW") * conv_input_scale
-
-  conv_and_side_inputs = conv_result + side_input_scale * NchwVectCToNchw(
-      gen_array_ops.dequantize(side_input, -128, 127))
-
-  output = nn_ops.bias_add(conv_and_side_inputs, biases, data_format="NCHW")
-  if apply_relu:
-    output = nn_ops.relu(output)
-
-  result, _, _ = gen_array_ops.quantize_v2(
-      NchwToNchwVectC(output), -128, 127, dtypes.qint8)
-  return result
-
-
-class FusedConvInt8Tests(test.TestCase):
-  _test_params = [
-      {
-          "batch_size": 1,
-          "input_channels": 4,
-          "output_channels": 4,
-          "input_height": 8,
-          "input_width": 8,
-          "filter_height": 6,
-          "filter_width": 6,
-          "vertical_stride": 2,
-          "horizontal_stride": 2,
-          "conv_input_scale": 0.002,
-          "side_input_scale": 0.0,
-          "bias_scale": 1,
-          "padding_type": "SAME"
-      },
-      {
-          "batch_size": 1,
-          "input_channels": 4,
-          "output_channels": 4,
-          "input_height": 6,
-          "input_width": 6,
-          "filter_height": 6,
-          "filter_width": 6,
-          "vertical_stride": 2,
-          "horizontal_stride": 2,
-          "conv_input_scale": 0.002,
-          "side_input_scale": 0.0,
-          "bias_scale": 1,
-          "padding_type": "SAME"
-      },
-      {
-          "batch_size": 2,
-          "input_channels": 8,
-          "output_channels": 16,
-          "input_height": 8,
-          "input_width": 8,
-          "filter_height": 3,
-          "filter_width": 3,
-          "vertical_stride": 2,
-          "horizontal_stride": 2,
-          "conv_input_scale": 0.002,
-          "side_input_scale": 0.0,
-          "bias_scale": 1,
-          "padding_type": "VALID"
-      },
-      {
-          "batch_size": 2,
-          "input_channels": 8,
-          "output_channels": 16,
-          "input_height": 8,
-          "input_width": 8,
-          "filter_height": 3,
-          "filter_width": 3,
-          "vertical_stride": 2,
-          "horizontal_stride": 2,
-          "conv_input_scale": 0.002,
-          "side_input_scale": 0.0,
-          "bias_scale": 1,
-          "padding_type": "SAME"
-      },
-      {
-          "batch_size": 2,
-          "input_channels": 8,
-          "output_channels": 16,
-          "input_height": 8,
-          "input_width": 8,
-          "filter_height": 3,
-          "filter_width": 3,
-          "vertical_stride": 2,
-          "horizontal_stride": 2,
-          "conv_input_scale": 0.002,
-          "side_input_scale": 0.5,
-          "bias_scale": 1,
-          "padding_type": "VALID"
-      },
-      {
-          "batch_size": 2,
-          "input_channels": 16,
-          "output_channels": 16,
-          "input_height": 9,
-          "input_width": 9,
-          "filter_height": 3,
-          "filter_width": 3,
-          "vertical_stride": 1,
-          "horizontal_stride": 1,
-          "conv_input_scale": 0.001,
-          "side_input_scale": 0.5,
-          "bias_scale": 1,
-          "padding_type": "SAME"
-      },
-      {
-          "batch_size": 3,
-          "input_channels": 8,
-          "output_channels": 8,
-          "input_height": 9,
-          "input_width": 9,
-          "filter_height": 5,
-          "filter_width": 5,
-          "vertical_stride": 1,
-          "horizontal_stride": 1,
-          "conv_input_scale": 0.001,
-          "side_input_scale": 0.5,
-          "bias_scale": 1,
-          "padding_type": "SAME"
-      },
-      {
-          "batch_size": 3,
-          "input_channels": 8,
-          "output_channels": 8,
-          "input_height": 9,
-          "input_width": 9,
-          "filter_height": 7,
-          "filter_width": 1,
-          "vertical_stride": 2,
-          "horizontal_stride": 1,
-          "conv_input_scale": 0.002,
-          "side_input_scale": 0.5,
-          "bias_scale": 1,
-          "padding_type": "SAME"
-      },
-      {
-          "batch_size": 3,
-          "input_channels": 8,
-          "output_channels": 8,
-          "input_height": 9,
-          "input_width": 9,
-          "filter_height": 1,
-          "filter_width": 7,
-          "vertical_stride": 1,
-          "horizontal_stride": 1,
-          "conv_input_scale": 0.002,
-          "side_input_scale": 0.5,
-          "bias_scale": 1,
-          "padding_type": "SAME"
-      },
-  ]
-
-  def runTest(self, test_param, apply_relu):
-    batch_size = test_param["batch_size"]
-    input_channels = test_param["input_channels"]
-    output_channels = test_param["output_channels"]
-    input_height = test_param["input_height"]
-    input_width = test_param["input_width"]
-    filter_height = test_param["filter_height"]
-    filter_width = test_param["filter_width"]
-    vertical_stride = test_param["vertical_stride"]
-    horizontal_stride = test_param["horizontal_stride"]
-    conv_input_scale = test_param["conv_input_scale"]
-    side_input_scale = test_param["side_input_scale"]
-    bias_scale = test_param["bias_scale"]
-    padding_type = test_param["padding_type"]
-
-    conv_input, _, _ = gen_array_ops.quantize_v2(
-        random_ops.random_uniform(
-            [batch_size, input_channels // 4, input_height, input_width, 4],
-            minval=-0.0,
-            maxval=1.0,
-            dtype=dtypes.float32), -1.0, 1.0, dtypes.qint8)
-
-    kernel, _, _ = gen_array_ops.quantize_v2(
-        random_ops.random_uniform(
-            [
-                output_channels, input_channels // 4, filter_height,
-                filter_width, 4
-            ],
-            minval=-1.0,
-            maxval=1.0,
-            dtype=dtypes.float32), -1.0, 1.0, dtypes.qint8)
-
-    output_height = CalculateConvolvedOutputDim(input_height, filter_height,
-                                                vertical_stride, padding_type)
-    output_width = CalculateConvolvedOutputDim(input_width, filter_width,
-                                               horizontal_stride, padding_type)
-    tf_logging.info("output_height=", output_height, ", output_width=",
-                    output_width)
-
-    side_input, _, _ = gen_array_ops.quantize_v2(
-        random_ops.random_uniform(
-            [batch_size, output_channels // 4, output_height, output_width, 4],
-            minval=0.0,
-            maxval=1.0,
-            dtype=dtypes.float32), -1.0, 1.0, dtypes.qint8)
-
-    biases = random_ops.random_uniform(
-        [output_channels],
-        minval=-10 * bias_scale,
-        maxval=20 * bias_scale,
-        dtype=dtypes.float32)
-
-    strides = [1, 1, vertical_stride, horizontal_stride]
-
-    actual = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-        conv_input,
-        kernel,
-        biases,
-        strides=strides,
-        padding=padding_type,
-        conv_input_scale=conv_input_scale,
-        side_input_scale=side_input_scale,
-        side_input=side_input,
-        activation_mode="Relu" if apply_relu else "None",
-        data_format="NCHW_VECT_C",
-        filter_format="OIHW_VECT_I")
 
-    expected = SimulateFusedConv2dBiasActivationInt8(
-        conv_input_scale, conv_input, kernel, padding_type, strides,
-        side_input_scale, side_input, biases, apply_relu)
 
-    with self.test_session(use_gpu=True) as sess:
-      actual_y, expected_y = sess.run([actual, expected])
-      self.assertAllClose(actual_y, expected_y, rtol=0, atol=1)
+# Instantiate the two test suites from test_base, mixing in test.TestCase as
+# the test framework.
+class FusedConv2DBiasActivationTest(test_base.FusedConv2DBiasActivationTest,
+                                    test.TestCase):
+  pass
 
-  def testFusedConvInt8(self):
-    if not test.is_gpu_available(
-        cuda_only=True, min_cuda_compute_capability=(6, 1)):
-      tf_logging.info("int8 test skipped because not run with --config=cuda or "
-                      "no GPUs with compute capability >= 6.1 are available.")
-      return
-    for apply_relu in [True, False]:
-      for test_param in self._test_params:
-        self.runTest(test_param, apply_relu)
 
+class FusedConvInt8Tests(test_base.FusedConvInt8Tests, test.TestCase):
+  pass
 
-if __name__ == "__main__":
-  for index, (input_size_, filter_size_, output_size_, stride_,
-              padding_) in enumerate(GetShrunkInceptionShapes()):
-    setattr(FusedConv2DBiasActivationTest, "testInceptionFwd_" + str(index),
-            GetInceptionFwdTest(input_size_, filter_size_, stride_, padding_))
 
-  # TODO(b/35359731)
-  # Fwd, BckInput, and BackFilter to test that for certain input parameter
-  # set, winograd nonfused algorithm will be excluded from conv autotune. If
-  # in such case, winograd nonfused algorithm is added as one option of the
-  # conv autotune, and cuDNN version is smaller than 7, the following tests
-  # will fail.
-  ishape = [1, 400, 400, 1]
-  fshape = [1, 1, 1, 256]
-  oshape = [1, 400, 400, 256]
-  setattr(FusedConv2DBiasActivationTest,
-          "testInceptionFwd_No_Winograd_Nonfused",
-          GetInceptionFwdTest(ishape, fshape, 1, "SAME", gpu_only=True))
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py
new file mode 100644
index 0000000000..35fc65e4ba
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py
@@ -0,0 +1,945 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Provides test suites that can be run to test fused convolutions.
+
+Each of the two test suites in this module, FusedConv2DBiasActivationTest and
+FusedConvInt8Tests, should be "instantiated" by declaring a class which inherits
+from the FusedConv test and a class that provides the standard test.TestCase
+API.
+
+See e.g. fused_conv2d_bias_activation_op_test.py in this folder.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import numpy as np
+
+from tensorflow.contrib.fused_conv.python.ops import fused_conv2d_bias_activation_op
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def _GetShrunkInceptionShapes(shrink=10):
+  """Iterator for smaller versions of convolution shapes in 2015 Inception.
+
+  Relative to inception, each depth value is `depth // shrink`.
+
+  Args:
+    shrink: Factor to shrink each depth value by relative to Inception.
+
+  Yields:
+    Tuple (input_size, filter_size, out_size, stride, padding), the convolution
+    parameters of Inception layers.
+  """
+  input_sizes = [[4, 5, 5, 1248], [4, 8, 8, 384], [4, 8, 8, 384], [
+      4, 8, 8, 2048
+  ], [4, 8, 8, 448], [4, 8, 8, 2048], [4, 8, 8, 2048], [4, 8, 8, 2048], [
+      4, 8, 8, 1760
+  ], [4, 8, 8, 1760], [4, 8, 8, 1760], [4, 8, 8, 1760], [4, 17, 17, 192], [
+      4, 17, 17, 192
+  ], [4, 17, 17, 1248], [4, 17, 17, 128], [4, 17, 17, 1248], [4, 17, 17, 224], [
+      4, 17, 17, 192
+  ], [4, 17, 17, 192], [4, 17, 17, 1216], [4, 17, 17, 1216], [4, 17, 17, 224], [
+      4, 17, 17, 192
+  ], [4, 17, 17, 192], [4, 17, 17, 1152], [4, 17, 17, 1152], [4, 17, 17, 192], [
+      4, 17, 17, 160
+  ], [4, 17, 17, 1152], [4, 17, 17, 1024], [4, 17, 17, 128], [4, 17, 17, 1024],
+                 [4, 17, 17, 128], [4, 17, 17, 1024], [4, 17, 17, 128], [
+                     4, 17, 17, 768
+                 ], [4, 17, 17, 128], [4, 17, 17, 128], [4, 17, 17, 768],
+                 [4, 17, 17, 768], [4, 35, 35, 96], [4, 35, 35, 288], [
+                     4, 35, 35, 64
+                 ], [4, 35, 35, 288], [4, 35, 35, 256], [4, 35, 35, 48], [
+                     4, 35, 35, 256
+                 ], [4, 35, 35, 96], [4, 35, 35, 192], [4, 35, 35, 192], [
+                     4, 35, 35, 192
+                 ], [4, 73, 73, 64], [4, 73, 73, 64], [4, 147, 147, 24]]
+  filter_sizes = [[1, 1, 1248, 128], [1, 3, 384, 384], [3, 1, 384, 384], [
+      1, 1, 2048, 192
+  ], [3, 3, 448, 384], [1, 1, 2048, 320], [1, 1, 2048, 448], [1, 1, 2048, 384],
+                  [1, 1, 1760, 384], [1, 1, 1760, 192], [1, 1, 1760, 448], [
+                      1, 1, 1760, 320
+                  ], [3, 3, 192, 192], [3, 3, 192, 192], [1, 1, 1248, 192], [
+                      3, 3, 128, 320
+                  ], [1, 1, 1248, 128], [1, 3, 224, 224], [3, 1, 192, 256], [
+                      1, 3, 192, 256
+                  ], [1, 1, 1216, 192], [1, 1, 1216, 96], [3, 1, 224, 224], [
+                      3, 3, 192, 224
+                  ], [1, 3, 192, 192], [1, 1, 1152, 192], [1, 1, 1152, 128], [
+                      3, 1, 192, 192
+                  ], [3, 3, 160, 192], [1, 1, 1152, 160], [1, 1, 1024, 128], [
+                      1, 3, 128, 192
+                  ], [1, 1, 1024, 160], [3, 1, 128, 192], [1, 1, 1024, 256], [
+                      3, 1, 128, 128
+                  ], [1, 1, 768, 192], [1, 3, 128, 128], [3, 3, 128, 128], [
+                      1, 1, 768, 128
+                  ], [1, 1, 768, 320], [3, 3, 96, 96], [3, 3, 288, 384], [
+                      3, 3, 64, 96
+                  ], [1, 1, 288, 64], [1, 1, 256, 64], [5, 5, 48, 64],
+                  [1, 1, 256, 48], [3, 3, 96, 96], [1, 1, 192, 32], [
+                      1, 1, 192, 64
+                  ], [1, 1, 192, 48], [3, 3, 64, 192], [1, 1, 64,
+                                                        64], [1, 1, 24, 64]]
+  out_sizes = [[4, 5, 5, 128], [4, 8, 8, 384], [4, 8, 8, 384], [4, 8, 8, 192], [
+      4, 8, 8, 384
+  ], [4, 8, 8, 320], [4, 8, 8, 448], [4, 8, 8, 384], [4, 8, 8, 384], [
+      4, 8, 8, 192
+  ], [4, 8, 8, 448], [4, 8, 8, 320], [4, 8, 8, 192], [4, 17, 17, 192], [
+      4, 17, 17, 192
+  ], [4, 8, 8, 320], [4, 17, 17, 128], [4, 17, 17, 224], [4, 17, 17, 256], [
+      4, 17, 17, 256
+  ], [4, 17, 17, 192], [4, 17, 17, 96], [4, 17, 17, 224], [4, 17, 17, 224], [
+      4, 17, 17, 192
+  ], [4, 17, 17, 192], [4, 17, 17, 128], [4, 17, 17, 192], [4, 17, 17, 192], [
+      4, 17, 17, 160
+  ], [4, 17, 17, 128], [4, 17, 17, 192], [4, 17, 17, 160], [4, 17, 17, 192], [
+      4, 17, 17, 256
+  ], [4, 17, 17, 128], [4, 17, 17, 192], [4, 17, 17, 128], [4, 17, 17, 128], [
+      4, 17, 17, 128
+  ], [4, 17, 17, 320], [4, 17, 17, 96], [4, 17, 17, 384], [4, 35, 35, 96], [
+      4, 35, 35, 64
+  ], [4, 35, 35, 64], [4, 35, 35, 64], [4, 35, 35, 48], [4, 35, 35, 96],
+               [4, 35, 35, 32], [4, 35, 35, 64], [4, 35, 35, 48],
+               [4, 71, 71, 192], [4, 73, 73, 64], [4, 147, 147, 64]]
+  strides = [
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1
+  ]
+  # Shrink sizes to make the test faster
+  for i in input_sizes:
+    i[3] //= shrink
+  for f in filter_sizes:
+    f[2] //= shrink
+    f[3] //= shrink
+  for o in out_sizes:
+    o[3] //= shrink
+  # pylint: disable=invalid-name
+  VALID = "VALID"
+  SAME = "SAME"
+  # pylint: enable=invalid-name
+  paddings = [
+      SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME,
+      VALID, SAME, SAME, VALID, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME,
+      SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME,
+      SAME, SAME, SAME, SAME, SAME, VALID, VALID, SAME, SAME, SAME, SAME, SAME,
+      SAME, SAME, SAME, SAME, VALID, VALID, VALID
+  ]
+  for i, f, o, s, p in zip(input_sizes, filter_sizes, out_sizes, strides,
+                           paddings):
+    yield i, f, o, s, p
+
+
+def _GetTestConfigs():
+  """Get all the valid tests configs to run.
+
+  Returns:
+    all the valid test configs as tuples of data_format and use_gpu.
+  """
+  test_configs = [("NCHW", True), ("NHWC", True)]
+  return test_configs
+
+
+def _IotaNdF32Constant(dim_sizes):
+
+  def MakeList(dims):
+    if len(dims) == 1:
+      return [float(1 + f) for f in range(dims[0])]
+    return [MakeList(dims[1:]) for _ in range(dims[0])]
+
+  return constant_op.constant(MakeList(dim_sizes), dtype=dtypes.float32)
+
+
+def _GetInceptionFwdTest(input_size,
+                         filter_size,
+                         stride,
+                         padding,
+                         gpu_only=True):
+
+  def Test(self):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping InceptionFwd %s",
+                      (input_size, filter_size, stride, padding))
+      return
+    tf_logging.info("Testing InceptionFwd %s",
+                    (input_size, filter_size, stride, padding))
+    self.CompareFwdValues(input_size, filter_size, [stride, stride], padding)
+
+  return Test
+
+
+class FusedConv2DBiasActivationTest(object):
+
+  @contextlib.contextmanager
+  def test_scope(self):  # pylint: disable=invalid-name
+    """Can be overridden in base classes to provide a test scope."""
+    yield
+
+  def _DtypesToTest(self, use_gpu):
+    return [dtypes.float32]
+
+  def _FilterFormatsToTest(self, use_gpu):
+    return ["HWIO", "OIHW"]
+
+  def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, bias,
+                            strides, padding, activation_mode, data_format,
+                            filter_format, dtype):
+    """Verifies the output values of the convolution function.
+
+    Args:
+      tensor_in_sizes: Input tensor dimensions in
+        [batch, input_rows, input_cols, input_depth].
+      filter_in_sizes: Filter tensor dimensions in
+        [kernel_rows, kernel_cols, input_depth, output_depth].
+      bias: 1-D bias tensor of length output_depth.
+      strides: Stride: [col_stride, row_stride]
+      padding: Padding type.
+      activation_mode: Activation mode.
+      data_format: Format of the data tensors.
+      filter_format: Filter format to use for the fused convolution.
+      dtype: Data type for inputs and outputs.
+    Returns:
+      Symbolic tensor value and reference value that can be used to
+      execute the computation and verify the results.
+    """
+    input_size = np.prod(tensor_in_sizes)
+    filter_size = np.prod(filter_in_sizes)
+    bias_size = filter_in_sizes[-1]  # equals to output depth
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, input_size + 1)]
+    x2 = [f * 1.0 for f in range(1, filter_size + 1)]
+    # This is to guarantee that there are always negative values after
+    # bias add so that we can test whether relu works correctly.
+    x3 = bias
+    with self.cached_session(use_gpu=True), self.test_scope():
+      t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
+      t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
+      fused_t2 = t2
+      if filter_format == "OIHW":
+        fused_t2 = _HwioToOihw(t2)
+      t3 = constant_op.constant(x3, shape=[bias_size], dtype=dtype)
+      strides = [1] + strides + [1]
+      if data_format == "NCHW":
+        t1 = test_util.NHWCToNCHW(t1)
+        strides = test_util.NHWCToNCHW(strides)
+      output = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+          t1,
+          fused_t2,
+          t3,
+          strides=strides,
+          padding=padding,
+          data_format=data_format,
+          filter_format=filter_format,
+          activation_mode=activation_mode)
+      ref_conv_output = nn_ops.conv2d(
+          t1, t2, strides=strides, padding=padding, data_format=data_format)
+      ref_bias_output = nn_ops.bias_add(
+          ref_conv_output, t3, data_format=data_format)
+      ref_output = nn_ops.relu(ref_bias_output)
+      if data_format == "NCHW":
+        output = test_util.NCHWToNHWC(output)
+        ref_output = test_util.NCHWToNHWC(ref_output)
+
+      return output, ref_output
+
+  def CompareFwdValues(self, tensor_in_sizes, filter_in_sizes, conv_strides,
+                       padding):
+    """Verifies that CPU and GPU produce the same values.
+
+    Args:
+      tensor_in_sizes: Input tensor dimensions in
+        [batch, input_rows, input_cols, input_depth].
+      filter_in_sizes: Filter tensor dimensions in
+        [kernel_rows, kernel_cols, input_depth, output_depth].
+      conv_strides: [row_stride, col_stride] for the convolution;
+      padding: Padding type.
+    """
+    x1 = np.random.rand(*tensor_in_sizes).astype(np.float32)
+    x2 = np.random.rand(*filter_in_sizes).astype(np.float32)
+    x3 = np.random.rand(*[filter_in_sizes[-1]]).astype(np.float32)
+
+    def _SetupVal(data_format, use_gpu):
+      with self.cached_session(use_gpu=use_gpu), self.test_scope():
+        t1 = constant_op.constant(x1, shape=tensor_in_sizes)
+        t2 = constant_op.constant(x2, shape=filter_in_sizes)
+        t3 = constant_op.constant(x3, shape=[filter_in_sizes[-1]])
+        strides = [1] + conv_strides + [1]
+        if data_format == "NCHW":
+          t1 = test_util.NHWCToNCHW(t1)
+          strides = test_util.NHWCToNCHW(strides)
+        output = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+            t1,
+            t2,
+            t3,
+            strides=strides,
+            padding=padding,
+            data_format=data_format,
+            activation_mode="Relu")
+
+        if data_format == "NCHW":
+          output = test_util.NCHWToNHWC(output)
+        return output
+
+    tensors = []
+    for (data_format, use_gpu) in _GetTestConfigs():
+      tensors.append(_SetupVal(data_format, use_gpu))
+    with self.cached_session() as sess, self.test_scope():
+      values = sess.run(tensors)
+      for i in range(1, len(values)):
+        self.assertAllClose(values[0], values[i], rtol=1e-3, atol=1e-3)
+
+  def _VerifyValues(self, tensor_in_sizes, filter_in_sizes, bias, strides,
+                    padding):
+    tensors = []
+    ref_tensors = []
+    for (data_format, use_gpu) in _GetTestConfigs():
+      for dtype in self._DtypesToTest(use_gpu):
+        for filter_format in self._FilterFormatsToTest(use_gpu):
+          result, expected = self._SetupValuesForDevice(
+              tensor_in_sizes, filter_in_sizes, bias, strides, padding, "Relu",
+              data_format, filter_format, dtype)
+        tensors.append(result)
+        ref_tensors.append(expected)
+      with self.cached_session() as sess, self.test_scope():
+        values = sess.run(tensors)
+        ref_values = sess.run(ref_tensors)
+        for i in range(len(tensors)):
+          conv = tensors[i]
+          value = values[i]
+          ref_value = ref_values[i]
+          tf_logging.info("expected = %s", ref_value)
+          tf_logging.info("actual = %s", value)
+          tol = 1e-5
+          if value.dtype == np.float16:
+            tol = 1e-3
+          self.assertAllClose(
+              np.ravel(ref_value), np.ravel(value), atol=tol, rtol=tol)
+          self.assertShapeEqual(value, conv)
+
+  def testConv2D1x1Filter(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2D1x1Filter test.")
+      return
+    # expected_output = [
+    #    0.0, 0.0, 0.0, 21.0, 0.0, 0.0, 57.0, 0.0, 0.0, 93.0, 41.0, 0.0, 129.0,
+    #    86.0, 43.0, 165.0, 131.0, 97.0
+    # ]
+    medians = [-45.0, -130.0, -215.0]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[1, 1, 3, 3],
+        bias=medians,
+        strides=[1, 1],
+        padding="VALID")
+
+  def testConv2DEmpty(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2DEmpty test.")
+      return
+    # expected_output = []
+    self._VerifyValues(
+        tensor_in_sizes=[0, 2, 3, 3],
+        filter_in_sizes=[1, 1, 3, 3],
+        bias=[0.0, 0.0, 0.0],
+        strides=[1, 1],
+        padding="VALID")
+
+  def testConv2D2x2Filter(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2D2x2Filter test.")
+      return
+    # expected_output = [0.0, 0.0, 0.0, 401.0, 533.0, 665.0]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 3],
+        bias=[-2500.0, -2500.0, -2500.0],
+        strides=[1, 1],
+        padding="VALID")
+
+  def testConv2D1x2Filter(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2D1x2Filter test.")
+      return
+    # expected_output = [
+    #    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 190.0, 265.0, 340.0, 343.0, 436.0, 529.0
+    # ]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[1, 2, 3, 3],
+        bias=[-500.0, -500.0, -500.0],
+        strides=[1, 1],
+        padding="VALID")
+
+  def testConv2D2x2FilterStride2(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2D2x2FilterStride2 test.")
+      return
+    # expected_output = [0.0, 67.0, 163.0]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 3],
+        bias=[-2300.0, -2300.0, -2300.0],
+        strides=[2, 2],
+        padding="VALID")
+
+  def testConv2D2x2FilterStride2Same(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2D2x2FilterStride2Same test.")
+      return
+    # expected_output = [0.0, 2367.0, 2463.0, 1230.0, 1305.0, 1380.0]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 3],
+        bias=[-2300.0, -1000.0, -1000.0],
+        strides=[2, 2],
+        padding="SAME")
+
+  def testConv2D2x2FilterStride1x2(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2D2x2FilterStride1x2 test.")
+      return
+    # expected_output = [0.0, 0.0, 8.0, 28.0, 48.0, 68.0]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 3, 6, 1],
+        filter_in_sizes=[2, 2, 1, 1],
+        bias=[-90.0],
+        strides=[1, 2],
+        padding="VALID")
+
+  def testConv2DKernelSmallerThanStrideValid(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2DKernelSmallerThanStrideValid test.")
+      return
+    # expected_output = [0, 0, 175, 205]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 7, 7, 1],
+        filter_in_sizes=[2, 2, 1, 1],
+        bias=[-100.0],
+        strides=[3, 3],
+        padding="VALID")
+
+  def testConv2DKernelSmallerThanStrideSame(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2DKernelSmallerThanStrideSame test.")
+      return
+    # expected = [0, 0, 2, 4]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 3, 3, 1],
+        filter_in_sizes=[1, 1, 1, 1],
+        bias=[-5.0],
+        strides=[2, 2],
+        padding="SAME")
+
+    # expected = [0, 0, 4, 6]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 4, 4, 1],
+        filter_in_sizes=[1, 1, 1, 1],
+        bias=[-5.0],
+        strides=[2, 2],
+        padding="SAME")
+
+    # expected = [4, 0, 1, 0]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 4, 4, 1],
+        filter_in_sizes=[2, 2, 1, 1],
+        bias=[-40.0],
+        strides=[3, 3],
+        padding="SAME")
+
+  def testConv2DKernelSizeMatchesInputSize(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2DKernelSizeMatchesInputSize test.")
+      return
+    # expected = [0, 5]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 2, 2, 1],
+        filter_in_sizes=[2, 2, 1, 2],
+        bias=[-50.0, -55.0],
+        strides=[1, 1],
+        padding="VALID")
+
+    # expected = [0, 2, 282, 322]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 8, 8, 1],
+        filter_in_sizes=[2, 2, 1, 1],
+        bias=[-200.0],
+        strides=[4, 4],
+        padding="SAME")
+
+  def testShapeFunctionEdgeCases(self):
+    # All shapes unknown.
+    c1 = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+        array_ops.placeholder(dtypes.float32),
+        array_ops.placeholder(dtypes.float32),
+        array_ops.placeholder(dtypes.float32),
+        strides=[1, 1, 1, 1],
+        padding="SAME",
+        activation_mode="Relu")
+    self.assertEqual([None, None, None, None], c1.get_shape().as_list())
+
+    # Incorrect input shape.
+    with self.assertRaises(ValueError):
+      fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+          array_ops.placeholder(dtypes.float32, shape=[1, 3]),
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          activation_mode="Relu")
+
+    # Incorrect filter shape.
+    with self.assertRaises(ValueError):
+      fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32, shape=[1, 3]),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          activation_mode="Relu")
+
+    # Depth mismatch.
+    with self.assertRaises(ValueError):
+      fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+          array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
+          array_ops.placeholder(dtypes.float32, shape=[4, 4, 2, 2]),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          activation_mode="Relu")
+
+  def testOpEdgeCases(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping OpEdgeCases tests.")
+      return
+    with self.cached_session() as sess, self.test_scope():
+      # Illegal strides.
+      with self.assertRaisesRegexp(
+          errors_impl.UnimplementedError,
+          ".*strides.*in the batch and depth dimensions"):
+        sess.run(
+            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+                _IotaNdF32Constant([1, 1, 1, 1]),
+                _IotaNdF32Constant([1, 1, 1, 1]),
+                _IotaNdF32Constant([1]),
+                strides=[2, 1, 1, 1],
+                padding="SAME",
+                activation_mode="Relu"))
+      with self.assertRaisesRegexp(
+          errors_impl.UnimplementedError,
+          ".*strides.*in the batch and depth dimensions"):
+        sess.run(
+            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+                _IotaNdF32Constant([1, 1, 1, 1]),
+                _IotaNdF32Constant([1, 1, 1, 1]),
+                _IotaNdF32Constant([1]),
+                strides=[1, 1, 1, 2],
+                padding="SAME",
+                activation_mode="Relu"))
+
+      # Illegal activation mode.
+      with self.assertRaisesRegexp(ValueError,
+                                   "Op passed string 'Tanh' not in:"):
+        sess.run(
+            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+                _IotaNdF32Constant([1, 1, 1, 1]),
+                _IotaNdF32Constant([1, 1, 1, 1]),
+                _IotaNdF32Constant([1]),
+                strides=[1, 1, 1, 1],
+                padding="SAME",
+                activation_mode="Tanh"))
+
+      # Filter larger than input.
+      with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
+        sess.run(
+            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+                _IotaNdF32Constant([32, 20, 20, 3]),
+                _IotaNdF32Constant([20, 21, 3, 2]),
+                _IotaNdF32Constant([2]),
+                strides=[1, 1, 1, 1],
+                padding="VALID",
+                activation_mode="Relu"))
+      with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
+        sess.run(
+            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+                _IotaNdF32Constant([32, 20, 20, 3]),
+                _IotaNdF32Constant([21, 20, 3, 2]),
+                _IotaNdF32Constant([2]),
+                strides=[1, 1, 1, 1],
+                padding="VALID",
+                activation_mode="Relu"))
+
+
+# Add InceptionFwd tests to FusedConv2DBiasActivationTest.
+for index, (input_size_, filter_size_, output_size_, stride_,
+            padding_) in enumerate(_GetShrunkInceptionShapes()):
+  setattr(FusedConv2DBiasActivationTest, "testInceptionFwd_" + str(index),
+          _GetInceptionFwdTest(input_size_, filter_size_, stride_, padding_))
+
+# TODO(b/35359731)
+# Fwd, BckInput, and BackFilter to test that for certain input parameter
+# set, winograd nonfused algorithm will be excluded from conv autotune. If
+# in such case, winograd nonfused algorithm is added as one option of the
+# conv autotune, and cuDNN version is smaller than 7, the following tests
+# will fail.
+ishape = [1, 400, 400, 1]
+fshape = [1, 1, 1, 256]
+oshape = [1, 400, 400, 256]
+setattr(FusedConv2DBiasActivationTest, "testInceptionFwd_No_Winograd_Nonfused",
+        _GetInceptionFwdTest(ishape, fshape, 1, "SAME", gpu_only=True))
+
+
+def _CalculateConvolvedOutputDim(input_dim, filter_dim, stride, padding_type):
+  """Calculates the size of an output dimension of a strided convolution.
+
+  Given the sizes of the corresponding dimension of the input and filter shapes,
+  and the stride and padding_types, calculates the size of the output dimension.
+  This function can be called separately for each input dimension.
+
+  Args:
+    input_dim: An `int` specifying the size of the input dimension.
+    filter_dim: An `int` specifying the size of the filter dimension.
+    stride: An `int` specifying the step size of the convolution along the
+      input dimension.
+    padding_type: either 'VALID' or 'SAME'.
+
+  Returns:
+    The size of the output dimension.
+  """
+  if padding_type == "VALID":
+    return (input_dim - filter_dim + stride) // stride
+  else:  # padding_type == 'SAME'
+    return (input_dim + stride - 1) // stride
+
+
+def _NchwVectCToNchw(in_tensor):
+  # [N, C / 4, H, W, 4] => [N, C / 4, 4, H, W] == [N, C, H, W]
+  t = array_ops.transpose(in_tensor, [0, 1, 4, 2, 3])
+  n = in_tensor.shape.dims[0].value
+  c = in_tensor.shape.dims[1].value * in_tensor.shape.dims[4].value
+  h = in_tensor.shape.dims[2].value
+  w = in_tensor.shape.dims[3].value
+  return array_ops.reshape(t, [n, c, h, w])
+
+
+def _OihwVectIToHwio(in_tensor):
+  # [O, I / 4, H, W, 4] => [O, I / 4, 4, H, W] == [O, I, H, W]
+  t = array_ops.transpose(in_tensor, [2, 3, 1, 4, 0])
+  o = in_tensor.shape.dims[0].value
+  i = in_tensor.shape.dims[1].value * in_tensor.shape.dims[4].value
+  h = in_tensor.shape.dims[2].value
+  w = in_tensor.shape.dims[3].value
+  return array_ops.reshape(t, [h, w, i, o])
+
+
+def _NchwToNchwVectC(in_tensor):
+  n, c, h, w = in_tensor.shape.as_list()
+  assert c % 4 == 0
+  t = array_ops.reshape(in_tensor, [n, c // 4, 4, h, w])
+  return array_ops.transpose(t, [0, 1, 3, 4, 2])
+
+
+def _HwioToOihw(in_tensor):
+  return array_ops.transpose(in_tensor, [3, 2, 0, 1])
+
+
+def _SimulateFusedConv2dBiasActivationInt8(conv_input_scale, conv_input, kernel,
+                                           padding, strides, side_input_scale,
+                                           side_input, biases, apply_relu):
+  """Simulates the int8 fused 2-D convolution op using separate float ops.
+
+    The arguments and return values have the same format, meanings and
+    restrictions as the actual op.
+  Args:
+    conv_input_scale: A scalar 'float'.
+    conv_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout.
+    kernel: A `Tensor` of type `qint8` in OIHW_VECT_I layout.
+    padding: A `string` from: `"SAME", "VALID"`.
+    strides: A list of `ints`.
+    side_input_scale: A scalar 'float'.
+    side_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout.
+    biases: A `Tensor` of type `float32` in NCHW layout.
+    apply_relu: A boolean to specify whether to apply "Relu" activation function
+      that clips outputs to the range [0, 127], or "None" activation that clips
+      to the range [-128, 127].
+  Returns:
+    A `Tensor` of type `qint8` in NCHW_VECT_C layout.
+  """
+  conv_result = nn_ops.conv2d(
+      _NchwVectCToNchw(gen_array_ops.dequantize(conv_input, -128, 127)),
+      _OihwVectIToHwio(gen_array_ops.dequantize(kernel, -128, 127)),
+      strides=strides,
+      padding=padding,
+      data_format="NCHW") * conv_input_scale
+
+  conv_and_side_inputs = conv_result + side_input_scale * _NchwVectCToNchw(
+      gen_array_ops.dequantize(side_input, -128, 127))
+
+  output = nn_ops.bias_add(conv_and_side_inputs, biases, data_format="NCHW")
+  if apply_relu:
+    output = nn_ops.relu(output)
+
+  result, _, _ = gen_array_ops.quantize_v2(
+      _NchwToNchwVectC(output), -128, 127, dtypes.qint8)
+  return result
+
+
+# TODO(b/114580749): XLA:CPU/GPU don't support int8 at the moment, so this test
+# doesn't currently use XLA.
+class FusedConvInt8Tests(object):
+  _test_params = [
+      {
+          "batch_size": 1,
+          "input_channels": 4,
+          "output_channels": 4,
+          "input_height": 8,
+          "input_width": 8,
+          "filter_height": 6,
+          "filter_width": 6,
+          "vertical_stride": 2,
+          "horizontal_stride": 2,
+          "conv_input_scale": 0.002,
+          "side_input_scale": 0.0,
+          "bias_scale": 1,
+          "padding_type": "SAME"
+      },
+      {
+          "batch_size": 1,
+          "input_channels": 4,
+          "output_channels": 4,
+          "input_height": 6,
+          "input_width": 6,
+          "filter_height": 6,
+          "filter_width": 6,
+          "vertical_stride": 2,
+          "horizontal_stride": 2,
+          "conv_input_scale": 0.002,
+          "side_input_scale": 0.0,
+          "bias_scale": 1,
+          "padding_type": "SAME"
+      },
+      {
+          "batch_size": 2,
+          "input_channels": 8,
+          "output_channels": 16,
+          "input_height": 8,
+          "input_width": 8,
+          "filter_height": 3,
+          "filter_width": 3,
+          "vertical_stride": 2,
+          "horizontal_stride": 2,
+          "conv_input_scale": 0.002,
+          "side_input_scale": 0.0,
+          "bias_scale": 1,
+          "padding_type": "VALID"
+      },
+      {
+          "batch_size": 2,
+          "input_channels": 8,
+          "output_channels": 16,
+          "input_height": 8,
+          "input_width": 8,
+          "filter_height": 3,
+          "filter_width": 3,
+          "vertical_stride": 2,
+          "horizontal_stride": 2,
+          "conv_input_scale": 0.002,
+          "side_input_scale": 0.0,
+          "bias_scale": 1,
+          "padding_type": "SAME"
+      },
+      {
+          "batch_size": 2,
+          "input_channels": 8,
+          "output_channels": 16,
+          "input_height": 8,
+          "input_width": 8,
+          "filter_height": 3,
+          "filter_width": 3,
+          "vertical_stride": 2,
+          "horizontal_stride": 2,
+          "conv_input_scale": 0.002,
+          "side_input_scale": 0.5,
+          "bias_scale": 1,
+          "padding_type": "VALID"
+      },
+      {
+          "batch_size": 2,
+          "input_channels": 16,
+          "output_channels": 16,
+          "input_height": 9,
+          "input_width": 9,
+          "filter_height": 3,
+          "filter_width": 3,
+          "vertical_stride": 1,
+          "horizontal_stride": 1,
+          "conv_input_scale": 0.001,
+          "side_input_scale": 0.5,
+          "bias_scale": 1,
+          "padding_type": "SAME"
+      },
+      {
+          "batch_size": 3,
+          "input_channels": 8,
+          "output_channels": 8,
+          "input_height": 9,
+          "input_width": 9,
+          "filter_height": 5,
+          "filter_width": 5,
+          "vertical_stride": 1,
+          "horizontal_stride": 1,
+          "conv_input_scale": 0.001,
+          "side_input_scale": 0.5,
+          "bias_scale": 1,
+          "padding_type": "SAME"
+      },
+      {
+          "batch_size": 3,
+          "input_channels": 8,
+          "output_channels": 8,
+          "input_height": 9,
+          "input_width": 9,
+          "filter_height": 7,
+          "filter_width": 1,
+          "vertical_stride": 2,
+          "horizontal_stride": 1,
+          "conv_input_scale": 0.002,
+          "side_input_scale": 0.5,
+          "bias_scale": 1,
+          "padding_type": "SAME"
+      },
+      {
+          "batch_size": 3,
+          "input_channels": 8,
+          "output_channels": 8,
+          "input_height": 9,
+          "input_width": 9,
+          "filter_height": 1,
+          "filter_width": 7,
+          "vertical_stride": 1,
+          "horizontal_stride": 1,
+          "conv_input_scale": 0.002,
+          "side_input_scale": 0.5,
+          "bias_scale": 1,
+          "padding_type": "SAME"
+      },
+  ]
+
+  @contextlib.contextmanager
+  def test_scope(self):  # pylint: disable=invalid-name
+    """Can be overridden in base classes to provide a test scope."""
+    yield
+
+  def runTest(self, test_param, apply_relu):
+    batch_size = test_param["batch_size"]
+    input_channels = test_param["input_channels"]
+    output_channels = test_param["output_channels"]
+    input_height = test_param["input_height"]
+    input_width = test_param["input_width"]
+    filter_height = test_param["filter_height"]
+    filter_width = test_param["filter_width"]
+    vertical_stride = test_param["vertical_stride"]
+    horizontal_stride = test_param["horizontal_stride"]
+    conv_input_scale = test_param["conv_input_scale"]
+    side_input_scale = test_param["side_input_scale"]
+    bias_scale = test_param["bias_scale"]
+    padding_type = test_param["padding_type"]
+
+    with self.cached_session(use_gpu=True) as sess, self.test_scope():
+      conv_input, _, _ = gen_array_ops.quantize_v2(
+          random_ops.random_uniform(
+              [batch_size, input_channels // 4, input_height, input_width, 4],
+              minval=-0.0,
+              maxval=1.0,
+              dtype=dtypes.float32), -1.0, 1.0, dtypes.qint8)
+
+      kernel, _, _ = gen_array_ops.quantize_v2(
+          random_ops.random_uniform([
+              output_channels, input_channels // 4, filter_height, filter_width,
+              4
+          ],
+                                    minval=-1.0,
+                                    maxval=1.0,
+                                    dtype=dtypes.float32), -1.0, 1.0,
+          dtypes.qint8)
+
+      output_height = _CalculateConvolvedOutputDim(
+          input_height, filter_height, vertical_stride, padding_type)
+      output_width = _CalculateConvolvedOutputDim(
+          input_width, filter_width, horizontal_stride, padding_type)
+      tf_logging.info("output_height=%s, output_width=%s", output_height,
+                      output_width)
+
+      side_input, _, _ = gen_array_ops.quantize_v2(
+          random_ops.random_uniform([
+              batch_size, output_channels // 4, output_height, output_width, 4
+          ],
+                                    minval=0.0,
+                                    maxval=1.0,
+                                    dtype=dtypes.float32), -1.0, 1.0,
+          dtypes.qint8)
+
+      biases = random_ops.random_uniform([output_channels],
+                                         minval=-10 * bias_scale,
+                                         maxval=20 * bias_scale,
+                                         dtype=dtypes.float32)
+
+      strides = [1, 1, vertical_stride, horizontal_stride]
+
+      actual = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+          conv_input,
+          kernel,
+          biases,
+          strides=strides,
+          padding=padding_type,
+          conv_input_scale=conv_input_scale,
+          side_input_scale=side_input_scale,
+          side_input=side_input,
+          activation_mode="Relu" if apply_relu else "None",
+          data_format="NCHW_VECT_C",
+          filter_format="OIHW_VECT_I")
+
+      expected = _SimulateFusedConv2dBiasActivationInt8(
+          conv_input_scale, conv_input, kernel, padding_type, strides,
+          side_input_scale, side_input, biases, apply_relu)
+
+      actual_y, expected_y = sess.run([actual, expected])
+      self.assertAllClose(actual_y, expected_y, rtol=0, atol=1)
+
+  def testFusedConvInt8(self):
+    if not test.is_gpu_available(
+        cuda_only=True, min_cuda_compute_capability=(6, 1)):
+      tf_logging.info("int8 test skipped because not run with --config=cuda or "
+                      "no GPUs with compute capability >= 6.1 are available.")
+      return
+    for apply_relu in [True, False]:
+      for test_param in self._test_params:
+        self.runTest(test_param, apply_relu)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index bfc007bc39..c6ef82ccdc 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -90,6 +90,7 @@ BLACKLIST = [
     "//tensorflow/contrib/lite/python:interpreter.py",
     "//tensorflow/contrib/lite/python:interpreter_test.py",
     "//tensorflow/contrib/ffmpeg:test_data",
+    "//tensorflow/contrib/fused_conv:fused_conv2d_bias_activation_op_test_base",
     "//tensorflow/contrib/hadoop:test_data",
     "//tensorflow/contrib/factorization/examples:mnist",
     "//tensorflow/contrib/factorization/examples:mnist.py",
-- 
GitLab


From db3e59a545f06780583ad839da9e19d847dfd392 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Thu, 27 Sep 2018 11:11:34 -0700
Subject: [PATCH 086/570] Internal change.

PiperOrigin-RevId: 214804105
---
 .../testing/model_coverage/model_coverage_lib.py     | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib.py b/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib.py
index f8ab394c60..5ca57d083d 100644
--- a/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib.py
+++ b/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib.py
@@ -183,7 +183,11 @@ def compare_models_random_data(tflite_model, tf_eval_func, tolerance=5):
     np.testing.assert_almost_equal(tf_result, tflite_result, tolerance)
 
 
-def test_frozen_graph(filename, input_arrays, output_arrays, **kwargs):
+def test_frozen_graph(filename,
+                      input_arrays,
+                      output_arrays,
+                      input_shapes=None,
+                      **kwargs):
   """Validates the TensorFlow frozen graph converts to a TFLite model.
 
   Converts the TensorFlow frozen graph to TFLite and checks the accuracy of the
@@ -193,10 +197,14 @@ def test_frozen_graph(filename, input_arrays, output_arrays, **kwargs):
     filename: Full filepath of file containing frozen GraphDef.
     input_arrays: List of input tensors to freeze graph with.
     output_arrays: List of output tensors to freeze graph with.
+    input_shapes: Dict of strings representing input tensor names to list of
+      integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
+      Automatically determined when input shapes is None (e.g., {"foo" : None}).
+        (default None)
     **kwargs: Additional arguments to be passed into the converter.
   """
   converter = _lite.TocoConverter.from_frozen_graph(filename, input_arrays,
-                                                    output_arrays)
+                                                    output_arrays, input_shapes)
   tflite_model = _convert(converter, **kwargs)
 
   tf_eval_func = evaluate_frozen_graph(filename, input_arrays, output_arrays)
-- 
GitLab


From d2a674a959101c35b8cf65c79a603baa16936805 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 11:28:48 -0700
Subject: [PATCH 087/570] Update ops-related pbtxt files.

PiperOrigin-RevId: 214807362
---
 tensorflow/core/ops/compat/ops_history.v1.pbtxt | 11 +++++++++++
 tensorflow/core/ops/ops.pbtxt                   | 11 +++++++++++
 2 files changed, 22 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index cac4259356..7625524674 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -74910,6 +74910,17 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnicodeScript"
+  input_arg {
+    name: "input"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+}
 op {
   name: "UniformCandidateSampler"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index e173c2d072..83af07431c 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -35648,6 +35648,17 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnicodeScript"
+  input_arg {
+    name: "input"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+}
 op {
   name: "UniformCandidateSampler"
   input_arg {
-- 
GitLab


From 9a0a768d4416d157664d864d992a62782beea4a4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 11:54:40 -0700
Subject: [PATCH 088/570] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 214812088

---
 tensorflow/go/op/wrappers.go | 650 +++++++++++++++++------------------
 1 file changed, 325 insertions(+), 325 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 065c7e3011..96df1eee30 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -4059,50 +4059,6 @@ func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-// If the given segment ID `i` is negative, the value is dropped and will not be
-// added to the sum of the segment.
-//
-// `num_segments` should equal the number of distinct segment IDs.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
-		Input: []tf.Input{
-			data, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
 type ResourceStridedSliceAssignAttr func(optionalAttr)
 
@@ -10714,6 +10670,129 @@ func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value
 	return op.Output(0)
 }
 
+// This op consumes a lock created by `MutexLock`.
+//
+// This op exists to consume a tensor created by `MutexLock` (other than
+// direct control dependencies).  It should be the only that consumes the tensor,
+// and will raise an error if it is not.  Its only purpose is to keep the
+// mutex lock tensor alive until it is consumed by this op.
+//
+// **NOTE**: This operation must run on the same device as its input.  This may
+// be enforced via the `colocate_with` mechanism.
+//
+// Arguments:
+//	mutex_lock: A tensor returned by `MutexLock`.
+//
+// Returns the created operation.
+func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConsumeMutexLock",
+		Input: []tf.Input{
+			mutex_lock,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
+type ResourceScatterNdAddAttr func(optionalAttr)
+
+// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
+//
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Adds sparse `updates` to individual values or slices within a given
+//
+// variable according to `indices`.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
+//
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
+//
+// ```python
+//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_add(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 12, 3, 14, 14, 6, 7, 20]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of
+// values to add to ref.
+//
+// Returns the created operation.
+func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterNdAdd",
+		Input: []tf.Input{
+			ref, indices, updates,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Mutually reduces multiple tensors of identical type and shape.
+func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "merge_op": merge_op, "final_op": final_op, "subdiv_offsets": subdiv_offsets}
+	opspec := tf.OpSpec{
+		Type: "CollectiveReduce",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Updates the tree ensemble by either adding a layer to the last tree being grown
 //
 // or by starting a new tree.
@@ -11455,68 +11534,31 @@ func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, upd
 	return scope.AddOperation(opspec)
 }
 
-// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
-type ResourceScatterNdAddAttr func(optionalAttr)
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
 
-// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["dtype"] = value
 	}
 }
 
-// Adds sparse `updates` to individual values or slices within a given
-//
-// variable according to `indices`.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-// ```
-//
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
-//
-// ```python
-//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_add(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
-// ```
-//
-// The resulting update to ref would look like this:
+// Outputs deterministic pseudorandom values from a normal distribution.
 //
-//     [1, 12, 3, 14, 14, 6, 7, 20]
+// The generated values will have mean 0 and standard deviation 1.
 //
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of
-// values to add to ref.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns the created operation.
-func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11525,25 +11567,9 @@ func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, update
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdAdd",
-		Input: []tf.Input{
-			ref, indices, updates,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Mutually reduces multiple tensors of identical type and shape.
-func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64) (data tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "merge_op": merge_op, "final_op": final_op, "subdiv_offsets": subdiv_offsets}
-	opspec := tf.OpSpec{
-		Type: "CollectiveReduce",
+		Type: "StatelessRandomNormal",
 		Input: []tf.Input{
-			input,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
@@ -11551,31 +11577,83 @@ func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key
 	return op.Output(0)
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
-
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+// Creates a sequence of numbers.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
+//
+// For example:
+//
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```
+//
+// Arguments:
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+//
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Range",
+		Input: []tf.Input{
+			start, limit, delta,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
+type ResourceApplyMomentumAttr func(optionalAttr)
+
+// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a normal distribution.
+// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// want to use Nesterov momentum.
+//
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11584,14 +11662,13 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
+		Type: "ResourceApplyMomentum",
 		Input: []tf.Input{
-			shape, seed,
+			var_, accum, lr, grad, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
 // DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
@@ -15062,6 +15139,78 @@ func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (o
 	return op.Output(0)
 }
 
+// Returns the last element of the input list as well as a list with all but that element.
+//
+// Fails if the list is empty.
+//
+// input_handle: the input list
+// tensor: the withdrawn last element of the list
+// element_dtype: the type of elements in the list
+// element_shape: the shape of the output tensor
+func TensorListPopBack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListPopBack",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
+type MaxPoolGradGradAttr func(optionalAttr)
+
+// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolGradGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
 type TensorArrayGatherV3Attr func(optionalAttr)
 
@@ -15108,33 +15257,6 @@ func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
-// This op consumes a lock created by `MutexLock`.
-//
-// This op exists to consume a tensor created by `MutexLock` (other than
-// direct control dependencies).  It should be the only that consumes the tensor,
-// and will raise an error if it is not.  Its only purpose is to keep the
-// mutex lock tensor alive until it is consumed by this op.
-//
-// **NOTE**: This operation must run on the same device as its input.  This may
-// be enforced via the `colocate_with` mechanism.
-//
-// Arguments:
-//	mutex_lock: A tensor returned by `MutexLock`.
-//
-// Returns the created operation.
-func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConsumeMutexLock",
-		Input: []tf.Input{
-			mutex_lock,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Returns x / y element-wise for integer types.
 //
 // Truncation designates that negative numbers will round fractional quantities
@@ -18032,138 +18154,6 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 	return op.Output(0)
 }
 
-// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
-type ResourceApplyMomentumAttr func(optionalAttr)
-
-// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
-//
-// want to use Nesterov momentum.
-//
-// accum = accum * momentum + grad
-// var -= lr * accum
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyMomentum",
-		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
-type MaxPoolGradGradAttr func(optionalAttr)
-
-// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes second-order gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the last element of the input list as well as a list with all but that element.
-//
-// Fails if the list is empty.
-//
-// input_handle: the input list
-// tensor: the withdrawn last element of the list
-// element_dtype: the type of elements in the list
-// element_shape: the shape of the output tensor
-func TensorListPopBack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorListPopBack",
-		Input: []tf.Input{
-			input_handle,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // Returns element-wise integer closest to x.
 //
 // If the result is midway between two representable values,
@@ -21645,6 +21635,50 @@ func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Computes the sum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
+//
+// `num_segments` should equal the number of distinct segment IDs.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentSum",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the product along segments of a tensor.
 //
 // Read
@@ -22272,40 +22306,6 @@ func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (ou
 	return op.Output(0)
 }
 
-// Creates a sequence of numbers.
-//
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
-//
-// For example:
-//
-// ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
-// ```
-//
-// Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
-//
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Range",
-		Input: []tf.Input{
-			start, limit, delta,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
 type DestroyResourceOpAttr func(optionalAttr)
 
-- 
GitLab


From 561a3c4331ebfaac3e61c524911bf6fe85f4ebc9 Mon Sep 17 00:00:00 2001
From: Ayush Dubey <ayushd@google.com>
Date: Thu, 27 Sep 2018 12:20:33 -0700
Subject: [PATCH 089/570] Dynamic subdivisions in collective ring reduce.

Before this change, a CollectiveOp user was required to specify subdiv_offsets
for the RingReduce algorithm.  During ring reduction, we created chunks of the
tensor to exchange between devices.  If the chunks were too large, or if the
hardware supported multiple data exchanges in parallel, the user could further
subdivide the chunk by specifying more than 1 subdiv offset.  Each subdiv
offset corresponded to another subdivision of the chunk, so effectively the
total number of tensor chunks is number of devices * number of subdivs.

After this change, we can dynamically infer the number of subdivisions based on
a target chunk size.  In ring_reducer.cc, we start with 1 subdiv, and keep
increasing until chunk size is less than MAX_CHUNK_SIZE.  Currently,
MAX_CHUNK_SIZE is set at 4 MB, although it may make sense to change this based
on specific hardware.

As a part of this change, a user can now provide an empty subdiv_offset list.
If empty, we dynamically add subdivisions based on the above algorithm.  If
non-empty, we take the user-specified subdivions.

PiperOrigin-RevId: 214815959
---
 .../core/common_runtime/ring_reducer.cc       | 75 +++++++++++++++--
 .../core/common_runtime/ring_reducer_test.cc  | 83 +++++++++++++++----
 tensorflow/core/kernels/collective_ops.cc     | 21 +++--
 3 files changed, 147 insertions(+), 32 deletions(-)

diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index a81f8650bf..b1fe928ba7 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -41,6 +41,16 @@ limitations under the License.
 
 // Set true for greater intelligibility of debug mode log messages.
 #define READABLE_KEYS false
+// RingReduce algorithm exchanges chunks of tensor between devices.  The chunk
+// size depends on the number of subdivisions specified in the algorithm.  If
+// the user does not specify the number of subdivisions, we infer the number
+// dynamically so that the resulting chunk size does not exceed
+// kMaxChunkSizeBytes, empirically set at 4 MiB.
+constexpr size_t kMaxChunkSizeBytes = (4 * 1024 * 1024);
+// kMaxSubdivsPerDev is used to give an upper bound on the number of
+// subdivisions dynamically generated.  A reasonable value would be a small
+// multiple of the number of NICs adjacent to each device.
+constexpr int kMaxSubdivsPerDevice = 2;
 
 namespace tensorflow {
 namespace {
@@ -92,7 +102,62 @@ RingReducer::RingReducer()
 
 RingReducer::~RingReducer() { group_size_tensor_ready_.WaitForNotification(); }
 
+Status GenerateSubdivsInCollectiveParams(CollectiveParams* col_params) {
+  if (col_params->instance.shape.num_elements() == 0) {
+    return errors::Internal("shape in CollectiveParams should be non-empty");
+  }
+  const int kAvgDevPerTask =
+      col_params->group.group_size / col_params->group.num_tasks;
+  const int kMaxNumSubdivs = kMaxSubdivsPerDevice * kAvgDevPerTask;
+  if (kMaxNumSubdivs <= 0) {
+    return errors::Internal("Unexpected kMaxNumSubdivs ", kMaxNumSubdivs,
+                            " in RingReducer");
+  }
+  // NOTE(ayushd): If no subdiv_offsets have been specified, dynamically add
+  // as many offsets as needed so that the size of tensor chunks <=
+  // kMaxChunkSizeBytes.  Empirically, chunks that are too small or too large
+  // lead to worse performance.
+  int num_subdivs = 0;
+  const size_t tensor_size = col_params->instance.shape.num_elements() *
+                             DataTypeSize(col_params->instance.data_type);
+  size_t chunk_size;
+  do {
+    ++num_subdivs;
+    int num_chunks = col_params->group.group_size * num_subdivs;
+    chunk_size = tensor_size / num_chunks;
+    VLOG(2) << "num_subdivs " << num_subdivs << " num_chunks " << num_chunks
+            << " chunk_size " << chunk_size;
+  } while (chunk_size > kMaxChunkSizeBytes && num_subdivs < kMaxNumSubdivs);
+  if (num_subdivs <= 0) {
+    return errors::Internal("Unexpected num_subdivs ", num_subdivs,
+                            " in RingReducer");
+  }
+
+  int subdiv_stride = kAvgDevPerTask / num_subdivs;
+  if (subdiv_stride == 0) subdiv_stride = 1;
+  col_params->instance.impl_details.subdiv_offsets.reserve(num_subdivs);
+  for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+    int subdiv_offset = subdiv_stride * sdi;
+    if (sdi % 2 == 1) subdiv_offset *= -1;
+    col_params->instance.impl_details.subdiv_offsets.push_back(subdiv_offset);
+  }
+
+  if (VLOG_IS_ON(2)) {
+    string subdiv_buf;
+    for (const int subdiv_offset :
+         col_params->instance.impl_details.subdiv_offsets) {
+      strings::StrAppend(&subdiv_buf, " ", subdiv_offset);
+    }
+    VLOG(2) << "Dynamically generated " << num_subdivs
+            << " subdiv_offsets:" << subdiv_buf << " tensor_size "
+            << tensor_size << " chunk_size " << chunk_size;
+  }
+
+  return Status::OK();
+}
+
 Status RingReducer::InitializeCollectiveParams(CollectiveParams* col_params) {
+  // TODO(b/113171733): change CHECKs to return errors.
   CHECK_EQ(col_params->instance.type, REDUCTION_COLLECTIVE);
   CHECK_EQ(col_params->instance.impl_details.collective_name, "RingReduce");
   const string& device_name =
@@ -123,12 +188,11 @@ Status RingReducer::InitializeCollectiveParams(CollectiveParams* col_params) {
   dev_per_task.push_back(dev_count);
   CHECK_EQ(col_params->group.num_tasks, dev_per_task.size());
 
-  // Generate a ring permutation for each requested offset.
   if (col_params->instance.impl_details.subdiv_offsets.empty()) {
-    return errors::Internal(
-        "Subdiv offsets should be non-empty for ring reducer, size=",
-        col_params->instance.impl_details.subdiv_offsets.size());
+    TF_RETURN_IF_ERROR(GenerateSubdivsInCollectiveParams(col_params));
   }
+
+  // Generate a ring permutation for requested offset.
   VLOG(2) << "Setting up perms for col_params " << col_params
           << " subdiv_permutations "
           << &col_params->instance.impl_details.subdiv_permutations;
@@ -646,7 +710,8 @@ bool RingReducer::RunAsyncParts() {
         case RF_SEND:
           --send_pending_count;
           break;
-        default: {}  // Ignore any other actions
+        default: {
+        }  // Ignore any other actions
       }
     }
   }
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index 28df85399e..75aba43572 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -549,37 +549,38 @@ class RingReducerTest : public ::testing::Test {
   int32 reduce_counter_ GUARDED_BY(mu_) = 0;
 };
 
-TEST_F(RingReducerTest, InitializeParams) {
-  static const int kNumDevsPerTask = 8;
-  static const int kNumTasks = 3;
-  static const int kNumDevs = kNumDevsPerTask * kNumTasks;
+CollectiveParams SetUpCollectiveParams(const int num_devs_per_task,
+                                       const int num_tasks) {
   CollectiveParams cp;
-  std::vector<string> device_names;
-  std::vector<string> task_names;
+  const int kNumDevs = num_devs_per_task * num_tasks;
   cp.group.group_key = 1;
   cp.group.group_size = kNumDevs;
   cp.group.device_type = DeviceType("GPU");
-  cp.group.num_tasks = kNumTasks;
+  cp.group.num_tasks = num_tasks;
   cp.instance.instance_key = 3;
   cp.instance.type = REDUCTION_COLLECTIVE;
   cp.instance.data_type = DataType(DT_FLOAT);
-  cp.instance.shape = TensorShape({5});
+  cp.instance.shape = TensorShape({kNumDevs});
   cp.instance.impl_details.collective_name = "RingReduce";
   cp.instance.impl_details.subdiv_offsets.push_back(0);
   cp.is_source = false;
   for (int i = 0; i < kNumDevs; ++i) {
-    int task_id = i / kNumDevsPerTask;
-    int dev_id = i % kNumDevsPerTask;
+    int task_id = i / num_devs_per_task;
+    int dev_id = i % num_devs_per_task;
     string task_name = strings::StrCat("/job:worker/replica:0/task:", task_id);
-    task_names.push_back(task_name);
     string device_name = strings::StrCat(task_name, "/device:GPU:", dev_id);
-    device_names.push_back(device_name);
     cp.instance.task_names.push_back(task_name);
     cp.instance.device_names.push_back(device_name);
   }
+  return cp;
+}
 
-  int test_rank = 0;
-  cp.default_rank = test_rank;
+TEST_F(RingReducerTest, InitializeParams) {
+  const int kNumDevsPerTask = 8;
+  const int kNumTasks = 3;
+  CollectiveParams cp = SetUpCollectiveParams(kNumDevsPerTask, kNumTasks);
+
+  cp.default_rank = 0;
   cp.instance.impl_details.subdiv_offsets = {0, 4};
   RunSubdivPermsTest(&cp,
                      {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
@@ -588,8 +589,15 @@ TEST_F(RingReducerTest, InitializeParams) {
                        8, 9, 10, 11, 20, 21, 22, 23, 16, 17, 18, 19}},
                      {0, 4});
 
-  test_rank = 3;
-  cp.default_rank = test_rank;
+  cp.instance.impl_details.subdiv_offsets = {0, -4};
+  RunSubdivPermsTest(&cp,
+                     {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                       12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
+                      {3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,
+                       15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20}},
+                     {0, 3});
+
+  cp.default_rank = 3;
   cp.instance.impl_details.subdiv_offsets = {3, -3};
   RunSubdivPermsTest(&cp,
                      {{3,  4, 5, 6,  7,  0,  1,  2,  11, 12, 13, 14,
@@ -599,6 +607,49 @@ TEST_F(RingReducerTest, InitializeParams) {
                      {0, 1});
 }
 
+TEST_F(RingReducerTest, AutomaticSubdivs) {
+  const int kNumDevsPerTask = 8;
+  const int kNumTasks = 3;
+  const int kNumDevs = kNumDevsPerTask * kNumTasks;
+  CollectiveParams cp = SetUpCollectiveParams(kNumDevsPerTask, kNumTasks);
+
+  // Test automatic generation of subdiv offsets.
+  cp.default_rank = 0;
+  cp.instance.impl_details.subdiv_offsets.clear();
+  RunSubdivPermsTest(&cp, {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                            12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+                     {0});
+
+  // Set shape so that with 2 subdivs chunk_size is 3 MiB.  This should cause 2
+  // offsets, {0, -4}, to be generated.
+  {
+    int num_subdivs = 2;
+    int num_chunks = kNumDevs * num_subdivs;
+    size_t chunk_size = 3 * 1048576;  // 3 MB
+    size_t tensor_size = chunk_size * num_chunks;
+    cp.instance.shape =
+        TensorShape({static_cast<int64>(tensor_size / DataTypeSize(DT_FLOAT))});
+  }
+  cp.instance.impl_details.subdiv_offsets.clear();
+  RunSubdivPermsTest(&cp,
+                     {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                       12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
+                      {3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,
+                       15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20}},
+                     {0, 3});
+}
+
+TEST_F(RingReducerTest, AutomaticSubdivUpperBound) {
+  const int kNumDevsPerTask = 1;
+  const int kNumTasks = 4;
+  CollectiveParams cp = SetUpCollectiveParams(kNumDevsPerTask, kNumTasks);
+
+  cp.default_rank = 0;
+  cp.instance.impl_details.subdiv_offsets.clear();
+  cp.instance.shape = TensorShape({104857600 / DataTypeSize(DT_FLOAT)});
+  RunSubdivPermsTest(&cp, {{0, 1, 2, 3}, {0, 1, 2, 3}}, {0, 0});
+}
+
 // TODO(b/113171733): change to use TEST_P.
 #define DEF_TEST(B, T, W, D, S, L, A)                                         \
   TEST_F(RingReducerTest,                                                     \
diff --git a/tensorflow/core/kernels/collective_ops.cc b/tensorflow/core/kernels/collective_ops.cc
index e0da91125b..fa959b5a0e 100644
--- a/tensorflow/core/kernels/collective_ops.cc
+++ b/tensorflow/core/kernels/collective_ops.cc
@@ -132,6 +132,7 @@ class CollectiveReduceOpKernel : public CollectiveOpKernel {
             "Failed to get CollectiveExecutor from OpKernelContext for Op ",
             col_params_.name),
         done);
+    col_params_.instance.shape = c->input(0).shape();
     // Allocate output on the first pass through this function.  This must be
     // done immediately, while we're still in the executor thread.  Otherwise
     // the memory is not guaranteed to be unused by any concurrently executing
@@ -171,7 +172,7 @@ class CollectiveBcastSendOpKernel : public CollectiveOpKernel {
     OP_REQUIRES_OK(
         c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
     OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
-    OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &col_params_.instance.shape));
     col_params_.is_source = true;
     col_params_.instance.impl_details.subdiv_offsets = {0};
 
@@ -195,13 +196,14 @@ class CollectiveBcastSendOpKernel : public CollectiveOpKernel {
     if (c->mutable_output(0) == nullptr) {
       // Allocate the output tensor, trying to reuse the input.
       Tensor* output = nullptr;
-      OP_REQUIRES_OK_ASYNC(
-          c, c->forward_input_or_allocate_output({0}, 0, shape_, &output),
-          done);
+      OP_REQUIRES_OK_ASYNC(c,
+                           c->forward_input_or_allocate_output(
+                               {0}, 0, col_params_.instance.shape, &output),
+                           done);
     }
     if (!CanProceedWithCompute(c, col_exec, done)) return;
     OP_REQUIRES_ASYNC(
-        c, shape_.IsSameSize(c->input(0).shape()),
+        c, col_params_.instance.shape.IsSameSize(c->input(0).shape()),
         errors::Internal("Declared shape of op ", col_params_.name,
                          " does not match shape of input"),
         done);
@@ -214,8 +216,6 @@ class CollectiveBcastSendOpKernel : public CollectiveOpKernel {
   }
 
  private:
-  TensorShape shape_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(CollectiveBcastSendOpKernel);
 };
 
@@ -234,7 +234,7 @@ class CollectiveBcastRecvOpKernel : public CollectiveOpKernel {
     OP_REQUIRES_OK(
         c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
     OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
-    OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &col_params_.instance.shape));
     col_params_.is_source = false;
     col_params_.instance.impl_details.subdiv_offsets = {0};
 
@@ -258,7 +258,8 @@ class CollectiveBcastRecvOpKernel : public CollectiveOpKernel {
     if (c->mutable_output(0) == nullptr) {
       // No input, so must allocate output.
       Tensor* output = nullptr;
-      OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, shape_, &output), done);
+      OP_REQUIRES_OK_ASYNC(
+          c, c->allocate_output(0, col_params_.instance.shape, &output), done);
     }
     if (!CanProceedWithCompute(c, col_exec, done)) return;
 
@@ -270,8 +271,6 @@ class CollectiveBcastRecvOpKernel : public CollectiveOpKernel {
   }
 
  private:
-  TensorShape shape_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(CollectiveBcastRecvOpKernel);
 };
 
-- 
GitLab


From 750466c6e6624d279de7f9a43accd682d487509c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 12:37:05 -0700
Subject: [PATCH 090/570] Introduce the abstraction of RunHandler which each
 DirectSession can use for the duration of a single RunInternal() call from
 RunHandlerPool. We want to leverage this abstraction for improving the
 cross-session inter-op parallelism for lower latency inference in the future.
 In the case that global pools aren't used, this change should be a no-op.

PiperOrigin-RevId: 214818187
---
 tensorflow/core/BUILD                         |  16 ++
 .../core/common_runtime/direct_session.cc     |  49 +++-
 .../core/common_runtime/direct_session.h      |   3 +
 .../common_runtime/direct_session_test.cc     |  28 ++
 tensorflow/core/framework/run_handler.cc      | 248 ++++++++++++++++++
 tensorflow/core/framework/run_handler.h       |  95 +++++++
 tensorflow/core/framework/run_handler_util.cc |  57 ++++
 tensorflow/core/framework/run_handler_util.h  |  43 +++
 .../core/framework/run_handler_util_test.cc   |  93 +++++++
 tensorflow/core/protobuf/config.proto         |   5 +
 ...ensorflow.-run-options.-experimental.pbtxt |   6 +
 .../golden/v1/tensorflow.-run-options.pbtxt   |   6 +
 ...ensorflow.-run-options.-experimental.pbtxt |   6 +
 .../golden/v2/tensorflow.-run-options.pbtxt   |   6 +
 14 files changed, 655 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/core/framework/run_handler.cc
 create mode 100644 tensorflow/core/framework/run_handler.h
 create mode 100644 tensorflow/core/framework/run_handler_util.cc
 create mode 100644 tensorflow/core/framework/run_handler_util.h
 create mode 100644 tensorflow/core/framework/run_handler_util_test.cc

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index ca247dc56b..01e2e9f62b 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2484,6 +2484,8 @@ FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [
     "framework/op_segment.h",
     "framework/rendezvous.h",  # only needed for tests
     "framework/resource_var.h",
+    "framework/run_handler.h",
+    "framework/run_handler_util.h",
     "framework/tensor_reference.h",
     "framework/tracking_allocator.h",  # only needed for tests
     "framework/unique_tensor_references.h",
@@ -2970,6 +2972,7 @@ tf_cuda_library(
         ":core_cpu_internal",
         ":device_tracer",
         ":framework",
+        ":framework_internal",
         ":graph",
         ":lib",
         ":lib_internal",
@@ -4117,6 +4120,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "framework_run_handler_util_test",
+    size = "small",
+    srcs = ["framework/run_handler_util_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":framework_internal",
+        ":lib",
+        ":test",
+        ":test_main",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "common_runtime_direct_session_test",
     size = "small",
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 841181f8c3..458e133b68 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/run_handler.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -244,6 +245,21 @@ void DirectSession::SchedClosure(thread::ThreadPool* pool,
 #endif  // __ANDROID__
 }
 
+static RunHandlerPool* GetOrCreateRunHandlerPool(
+    const SessionOptions& options) {
+  static RunHandlerPool* pool =
+      new RunHandlerPool(NumInterOpThreadsFromSessionOptions(options));
+  return pool;
+}
+
+bool DirectSession::ShouldUseRunHandlerPool() const {
+  if (options_.config.session_inter_op_thread_pool_size() > 0 ||
+      options_.config.use_per_session_threads()) {
+    return false;
+  }
+  return true;
+}
+
 DirectSession::DirectSession(const SessionOptions& options,
                              const DeviceMgr* device_mgr,
                              DirectSessionFactory* const factory)
@@ -582,16 +598,37 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
     }
   }
 
-  Executor::Args::Runner default_runner = [this,
-                                           pool](Executor::Args::Closure c) {
-    SchedClosure(pool, std::move(c));
-  };
+  std::unique_ptr<RunHandler> handler;
+  if (ShouldUseRunHandlerPool() &&
+      run_options.experimental().use_run_handler_pool()) {
+    // Non-null only when a global inter-op pool is used.
+    VLOG(1) << "Using RunHandler to scheduler inter-op closures.";
+    handler = GetOrCreateRunHandlerPool(options_)->Get();
+  }
+  auto* handler_ptr = handler.get();
+
+  Executor::Args::Runner default_runner = nullptr;
+
+  if (pool == nullptr) {
+    default_runner = [](Executor::Args::Closure c) { c(); };
+  } else if (handler_ptr != nullptr) {
+    default_runner = [handler_ptr](Executor::Args::Closure c) {
+      handler_ptr->ScheduleInterOpClosure(std::move(c));
+    };
+  } else {
+    default_runner = [this, pool](Executor::Args::Closure c) {
+      SchedClosure(pool, std::move(c));
+    };
+  }
+
   for (const auto& item : executors_and_keys->items) {
-    // TODO(zhengxq): support partial run.
-    // TODO(zhengxq): if the device picks its own threadpool, we need to assign
+    // TODO(azaks): support partial run.
+    // TODO(azaks): if the device picks its own threadpool, we need to assign
     //     less threads to the main compute pool by default.
     thread::ThreadPool* device_thread_pool =
         item.device->tensorflow_device_thread_pool();
+    // TODO(crk): Investigate usage of RunHandlerPool when using device specific
+    // thread pool(s).
     if (!device_thread_pool) {
       args.runner = default_runner;
     } else {
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 4a6a921ea7..3a168bbe3f 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -247,6 +247,9 @@ class DirectSession : public Session {
                                    ExecutorsAndKeys* executors_and_keys,
                                    RunMetadata* run_metadata);
 
+  // Returns whether inter-op execution uses a global pool.
+  bool ShouldUseRunHandlerPool() const;
+
   ::tensorflow::Status ExtendLocked(const GraphDef& graph)
       EXCLUSIVE_LOCKS_REQUIRED(graph_state_lock_);
 
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index 65e816c202..e3e431f800 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -625,6 +625,34 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetworkWithOpts_Callable) {
   EXPECT_EQ(run_metadata.step_stats().dev_stats_size(), 2);
 }
 
+TEST_F(DirectSessionMinusAXTest, UseRunHandlerPool) {
+  Initialize({3, 2, -1, 0});
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def_));
+  std::vector<std::pair<string, Tensor>> inputs;
+
+  // Request two targets: one fetch output and one non-fetched output.
+  std::vector<string> output_names = {y_ + ":0"};
+  std::vector<string> target_nodes = {y_neg_};
+  std::vector<Tensor> outputs;
+
+  // Prepares RunOptions and RunMetadata
+  RunOptions run_options;
+  run_options.mutable_experimental()->set_use_run_handler_pool(true);
+
+  Status s = session->Run(run_options, inputs, output_names, target_nodes,
+                          &outputs, nullptr);
+  TF_ASSERT_OK(s);
+
+  ASSERT_EQ(1, outputs.size());
+  // The first output should be initialized and have the correct
+  // output.
+  auto mat = outputs[0].matrix<float>();
+  ASSERT_TRUE(outputs[0].IsInitialized());
+  EXPECT_FLOAT_EQ(5.0, mat(0, 0));
+}
+
 TEST(DirectSessionTest, KeepsStateAcrossRunsOfSession) {
   GraphDef def;
   Graph g(OpRegistry::Global());
diff --git a/tensorflow/core/framework/run_handler.cc b/tensorflow/core/framework/run_handler.cc
new file mode 100644
index 0000000000..9c6490a603
--- /dev/null
+++ b/tensorflow/core/framework/run_handler.cc
@@ -0,0 +1,248 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/run_handler.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/run_handler_util.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+
+// Contains the concrete implementation of the RunHandler.
+// Externally visible RunHandler class simply forwards the work to this one.
+class RunHandler::Impl {
+ public:
+  explicit Impl(RunHandlerPool::Impl* pool_impl) : pool_impl_(pool_impl) {
+    Reset();
+  }
+
+  ~Impl() {}
+
+  void set_inter_op_scheduling_range(std::uint_fast32_t start,
+                                     std::uint_fast32_t limit) {
+    inter_op_scheduling_range_.store(EncodePartition(start, limit),
+                                     std::memory_order_release);
+  }
+
+  std::uint_fast32_t inter_op_scheduling_range() const {
+    return inter_op_scheduling_range_.load(std::memory_order_acquire);
+  }
+
+  // Stores now time (in microseconds) since unix epoch when the handler is
+  // requested via RunHandlerPool::Get().
+  uint64 start_time_us() const { return start_time_us_; }
+
+  void ScheduleInterOpClosure(std::function<void()> fn);
+
+  void Reset();
+
+  RunHandlerPool::Impl* pool_impl() { return pool_impl_; }
+
+ private:
+  // Encoding/decoding logic for storing [start, limit) into a single
+  // uint_fast32_t int. We assume that pool_num_threads < (1 << 16).
+  const int kMaxPartitionBits = 16;
+  const int kMaxThreads = 1 << kMaxPartitionBits;
+
+  std::uint_fast32_t EncodePartition(std::uint_fast32_t start,
+                                     std::uint_fast32_t limit) {
+    return (start << kMaxPartitionBits) | limit;
+  }
+
+  void DecodePartition(std::uint_fast32_t val, std::uint_fast32_t* start,
+                       std::uint_fast32_t* limit) {
+    *limit = val & (kMaxThreads - 1);
+    val >>= kMaxPartitionBits;
+    *start = val;
+  }
+
+  std::atomic_uint_fast32_t inter_op_scheduling_range_;
+  RunHandlerPool::Impl* pool_impl_;  // NOT OWNED.
+  uint64 start_time_us_;
+};
+
+// Contains shared state across all run handlers present in the pool. Also
+// responsible for pool management decisions.
+// This class is thread safe.
+class RunHandlerPool::Impl {
+ public:
+  // Maximum number of handlers pre-created during pool construction time. The
+  // number has been chosen expecting each handler might at least want 1
+  // inter-op thread for execution (during compute intensive workloads like
+  // inference).
+  static const int kMaxHandlers = 128;
+
+  explicit Impl(int num_inter_op_threads)
+      : inter_op_thread_pool_(new thread::ThreadPool(
+            Env::Default(), ThreadOptions(), "inter_op", num_inter_op_threads)),
+        iterations_(0) {
+    VLOG(1) << "Creating a RunHandlerPool with max handlers: " << kMaxHandlers;
+    for (int i = 0; i < kMaxHandlers; ++i) {
+      handlers_.emplace_back(new RunHandler::Impl(this));
+      free_handlers_.push_back(handlers_.back().get());
+    }
+  }
+
+  ~Impl() {
+    // Sanity check that all handlers have been returned back to the pool before
+    // destruction.
+    DCHECK_EQ(handlers_.size(), kMaxHandlers);
+    DCHECK_EQ(free_handlers_.size(), handlers_.size());
+    DCHECK_EQ(sorted_active_handlers_.size(), 0);
+  }
+
+  thread::ThreadPool* inter_op_thread_pool() const {
+    return inter_op_thread_pool_.get();
+  }
+
+  std::unique_ptr<RunHandler> Get() LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    while (free_handlers_.empty()) {
+      one_handler_free_.wait(l);
+    }
+    // Remove the last entry from free_handlers_ and add to the end of
+    // sorted_active_handlers_.
+    auto* handler_impl = free_handlers_.back();
+    handler_impl->Reset();
+    // Sortedness isn't violated if we simply add at the end of the list, since
+    // handlers are expected to be obtained in increasing order of time.
+    sorted_active_handlers_.push_back(handler_impl);
+    DCHECK_LE(sorted_active_handlers_.size(), kMaxHandlers);
+    free_handlers_.pop_back();
+
+    RecomputePoolStatsLocked();
+    return WrapUnique<RunHandler>(new RunHandler(handler_impl));
+  }
+
+  void ReleaseHandler(RunHandler::Impl* handler) LOCKS_EXCLUDED(mu_) {
+    {
+      mutex_lock l(mu_);
+      DCHECK_GT(sorted_active_handlers_.size(), 0);
+
+      uint64 now = tensorflow::Env::Default()->NowMicros();
+      double elapsed = (now - handler->start_time_us()) / 1000.0;
+      time_hist_.Add(elapsed);
+
+      // Erase from and update sorted_active_handlers_. Add it to the end of
+      // free_handlers_.
+      auto iter = std::find(sorted_active_handlers_.begin(),
+                            sorted_active_handlers_.end(), handler);
+      DCHECK(iter != sorted_active_handlers_.end())
+          << "Unexpected handler: " << handler
+          << " is being requested for release";
+
+      // Remove this handler from this list and add it to the list of free
+      // handlers.
+      sorted_active_handlers_.erase(iter);
+      free_handlers_.push_back(handler);
+      DCHECK_LE(free_handlers_.size(), kMaxHandlers);
+
+      RecomputePoolStatsLocked();
+    }
+    one_handler_free_.notify_one();
+  }
+
+ private:
+  void RecomputePoolStatsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Thread safe part.
+  const std::unique_ptr<thread::ThreadPool> inter_op_thread_pool_;
+
+  // Thread compatible part used only by lock under RunHandlerPool.
+  // Handlers are sorted by start time.
+  std::vector<RunHandler::Impl*> sorted_active_handlers_ GUARDED_BY(mu_);
+  std::vector<RunHandler::Impl*> free_handlers_ GUARDED_BY(mu_);
+  std::vector<std::unique_ptr<RunHandler::Impl>> handlers_ GUARDED_BY(mu_);
+  // Histogram of elapsed runtime of every handler (in ms).
+  histogram::Histogram time_hist_ GUARDED_BY(mu_);
+  std::vector<std::uint_fast32_t> inter_op_start_ GUARDED_BY(mu_);
+  std::vector<std::uint_fast32_t> inter_op_limit_ GUARDED_BY(mu_);
+  int64 iterations_ GUARDED_BY(mu_);
+  condition_variable one_handler_free_;
+  mutex mu_;
+};
+
+void RunHandlerPool::Impl::RecomputePoolStatsLocked() {
+  int num_active_requests = sorted_active_handlers_.size();
+  if (num_active_requests == 0) return;
+
+  int num_threads = inter_op_thread_pool_->NumThreads();
+
+  inter_op_start_.resize(num_active_requests);
+  inter_op_limit_.resize(num_active_requests);
+
+  const int kMinThreadsPerRequest = 3;
+  ComputeInterOpSchedulingRanges(num_active_requests, num_threads,
+                                 kMinThreadsPerRequest, &inter_op_start_,
+                                 &inter_op_limit_);
+
+  for (int i = 0; i < num_active_requests; ++i) {
+    sorted_active_handlers_[i]->set_inter_op_scheduling_range(
+        inter_op_start_[i], inter_op_limit_[i]);
+  }
+
+  if (iterations_++ % 5000 == 0 && VLOG_IS_ON(1)) {
+    VLOG(1) << "Printing time histogram: " << time_hist_.ToString();
+    VLOG(1) << "Active session runs: " << num_active_requests;
+    uint64 now = tensorflow::Env::Default()->NowMicros();
+    string ranges_str = "";
+    string times_str = "";
+    for (int i = 0; i < num_active_requests; ++i) {
+      if (i > 0) {
+        times_str += " ";
+        ranges_str += " ";
+      }
+
+      times_str += strings::StrCat(
+          (now - sorted_active_handlers_[i]->start_time_us()) / 1000.0, " ms.");
+      ranges_str += strings::StrCat("[", inter_op_start_[i], ", ",
+                                    inter_op_limit_[i], ")");
+    }
+    VLOG(1) << "Elapsed times are: " << times_str;
+    VLOG(1) << "Ranges are: " << ranges_str;
+  }
+}
+
+void RunHandler::Impl::ScheduleInterOpClosure(std::function<void()> fn) {
+  std::uint_fast32_t start = 0, limit = 0;
+  DecodePartition(inter_op_scheduling_range(), &start, &limit);
+  pool_impl_->inter_op_thread_pool()->Schedule(std::move(fn));
+}
+
+void RunHandler::Impl::Reset() {
+  set_inter_op_scheduling_range(
+      0, pool_impl_->inter_op_thread_pool()->NumThreads());
+  start_time_us_ = tensorflow::Env::Default()->NowMicros();
+}
+
+RunHandlerPool::RunHandlerPool(int num_inter_op_threads)
+    : impl_(new Impl(num_inter_op_threads)) {}
+
+RunHandlerPool::~RunHandlerPool() {}
+
+std::unique_ptr<RunHandler> RunHandlerPool::Get() { return impl_->Get(); }
+
+RunHandler::RunHandler(Impl* impl) : impl_(impl) {}
+
+void RunHandler::ScheduleInterOpClosure(std::function<void()> fn) {
+  impl_->ScheduleInterOpClosure(std::move(fn));
+}
+
+RunHandler::~RunHandler() { impl_->pool_impl()->ReleaseHandler(impl_); }
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/run_handler.h b/tensorflow/core/framework/run_handler.h
new file mode 100644
index 0000000000..72fa6301b4
--- /dev/null
+++ b/tensorflow/core/framework/run_handler.h
@@ -0,0 +1,95 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_H_
+#define TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_H_
+
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+class RunHandler;
+
+// RunHandlerPool is a fixed size pool of pre-allocated RunHandlers
+// that can be used for tracking inter-op work for a given Session::Run().
+// RunHandler(s) in the pool are initially 'inactive'. A RunHandler becomes
+// 'active' when its unique_ptr is returned by Get() and is being used by a
+// client. It becomes 'inactive' once more when its unique_ptr gets destroyed.
+//
+// Expected usage:
+//
+// * Create a single RunHandlerPool (say run_handler_pool_).
+//
+// * When a Session::Run() is invoked, obtain a handler by:
+// auto handler = run_handler_pool_->Get();
+//
+// * Use handler for scheduling all inter-op work by:
+// handler->ScheduleInterOpClosure(closure);
+//
+// This class is thread safe.
+class RunHandlerPool {
+ public:
+  explicit RunHandlerPool(int num_inter_op_threads);
+  ~RunHandlerPool();
+
+  // Returns an inactive RunHandler from the pool.
+  //
+  // RunHandlers in RunHandlerPool are initially 'inactive'.
+  // A RunHandler becomes 'active' when its unique_ptr its returned by Get()
+  // and is being used by a client.  It becomes 'inactive' once more when the
+  // unique_ptr is destroyed.
+  //
+  // Will block unless there is an inactive handler.
+  std::unique_ptr<RunHandler> Get();
+
+ private:
+  class Impl;
+  friend class RunHandler;
+
+  std::unique_ptr<Impl> impl_;
+};
+
+// RunHandler can be used to schedule inter-op closures to run on a global pool
+// shared across all Session::Run(s).
+//
+// It can only be created via RunHandlerPool::Get().
+//
+// This class can be used instead of directly scheduling closures on a global
+// pool since it maintains a global view across all sessions and optimizes pool
+// scheduling to improve (median and tail) latency.
+//
+// This class is thread safe.
+class RunHandler {
+ public:
+  void ScheduleInterOpClosure(std::function<void()> fn);
+
+  ~RunHandler();
+
+ private:
+  class Impl;
+  friend class RunHandlerPool::Impl;
+
+  explicit RunHandler(Impl* impl);
+
+  Impl* impl_;  // NOT OWNED.
+};
+
+}  // end namespace tensorflow.
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_H_
diff --git a/tensorflow/core/framework/run_handler_util.cc b/tensorflow/core/framework/run_handler_util.cc
new file mode 100644
index 0000000000..3087998c69
--- /dev/null
+++ b/tensorflow/core/framework/run_handler_util.cc
@@ -0,0 +1,57 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/run_handler_util.h"
+
+#include <algorithm>
+#include <cmath>
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+void ComputeInterOpSchedulingRanges(int num_active_requests, int num_threads,
+                                    int min_threads_per_request,
+                                    std::vector<std::uint_fast32_t>* start_vec,
+                                    std::vector<std::uint_fast32_t>* end_vec) {
+  // Each request is expected to have weight W[i] = num_active_requests - i.
+  // Therefore, total_weight = sum of all request weights.
+  float total_weight = 0.5f * num_active_requests * (num_active_requests + 1);
+  float demand_factor = static_cast<float>(num_threads) / total_weight;
+  float last_cumulative_weight = 0.0;
+  min_threads_per_request = std::max(1, min_threads_per_request);
+  for (int i = 0; i != num_active_requests; i++) {
+    float cumulative_weight =
+        static_cast<float>(i + 1) *
+        (num_active_requests - static_cast<float>(i) * 0.5f);
+    float weight = cumulative_weight - last_cumulative_weight;
+    // Quantize thread_demand by rounding up, and also satisfying
+    // `min_threads_per_request` constraint.
+    // Note: We subtract a small epsilon (0.00001) to prevent ceil(..) from
+    // rounding weights like 4.0 to 5.
+    int demand =
+        std::max(min_threads_per_request,
+                 static_cast<int>(ceil(weight * demand_factor - 0.00001f)));
+    // For the quantized range [start, end); compute the floor of real start,
+    // and expand downwards from there with length `demand` and adjust for
+    // boundary conditions.
+    int start = last_cumulative_weight * demand_factor;
+    int end = std::min(num_threads, start + demand);
+    start = std::max(0, std::min(start, end - demand));
+    start_vec->at(i) = start;
+    end_vec->at(i) = end;
+    last_cumulative_weight = cumulative_weight;
+  }
+}
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/run_handler_util.h b/tensorflow/core/framework/run_handler_util.h
new file mode 100644
index 0000000000..c0c36aeccb
--- /dev/null
+++ b/tensorflow/core/framework/run_handler_util.h
@@ -0,0 +1,43 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_
+
+#include <cstdint>
+#include <vector>
+
+namespace tensorflow {
+
+// Assign thread ranges to requests.
+// Requests are numbered 0...num_active_requests-1, and
+// threads are numbered 0...num_threads-1.
+// On return, the range start_vec->at(i)...end_vec->at(i)-1
+// indicates the subrange of the threads available to request i.
+// The ranges given to different requests may overlap.
+// Lower numbered requests will tend to be assigned more threads.
+// Thus, a client might associate older requests with lower
+// array indices so they receive access to more threads.
+// However, the routine ensures that each request is given access
+// to at least min(min_threads_per_request, num_threads)  threads.
+// Every thread will be assigned to at least one request range,
+// assuming there is at least one request.
+void ComputeInterOpSchedulingRanges(int num_active_requests, int num_threads,
+                                    int min_threads_per_request,
+                                    std::vector<std::uint_fast32_t>* start_vec,
+                                    std::vector<std::uint_fast32_t>* end_vec);
+
+}  // end namespace tensorflow
+#endif  // TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_
diff --git a/tensorflow/core/framework/run_handler_util_test.cc b/tensorflow/core/framework/run_handler_util_test.cc
new file mode 100644
index 0000000000..a1928c132b
--- /dev/null
+++ b/tensorflow/core/framework/run_handler_util_test.cc
@@ -0,0 +1,93 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/run_handler_util.h"
+
+#include <vector>
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+namespace tensorflow {
+namespace {
+
+void VerifyFunction(int num_active_requests, int num_threads,
+                    int min_threads_per_request, bool print_stats = false) {
+  if (print_stats) {
+    LOG(INFO) << "Test case# num_active_requests: " << num_active_requests
+              << " num_threads: " << num_threads
+              << " min_threads: " << min_threads_per_request;
+  }
+  std::vector<std::uint_fast32_t> start(num_active_requests);
+  std::vector<std::uint_fast32_t> end(num_active_requests);
+
+  ComputeInterOpSchedulingRanges(num_active_requests, num_threads,
+                                 min_threads_per_request, &start, &end);
+  string range_str = "";
+  for (int i = 0; i < num_active_requests; ++i) {
+    if (i > 0) range_str += " ";
+    range_str += strings::StrCat("[", start[i], ", ", end[i], ")");
+
+    ASSERT_GE(start[i], 0) << range_str;
+    ASSERT_LE(end[i], num_threads) << range_str;
+    if (i > 0) {
+      // Due to linearly decreasing demand, #threads(i - 1) >= #threads(i)
+      ASSERT_GE(end[i - 1] - start[i - 1], end[i] - start[i]) << range_str;
+      // No missing threads.
+      ASSERT_GE(end[i - 1], start[i]) << range_str;
+    }
+    // Each interval is at least of size 'min_threads_per_request'.
+    ASSERT_GE((end[i] - start[i]), min_threads_per_request) << range_str;
+    // Verify that assigned (quantized) threads is not overly estimated
+    // from real demand, when the demand is high (>=
+    // min_threads_per_request).
+    float entry_weight = num_active_requests - i;
+    float total_weight = 0.5f * num_active_requests * (num_active_requests + 1);
+    float thread_demand = (entry_weight * num_threads) / total_weight;
+    if (thread_demand > min_threads_per_request) {
+      // We expect some over-estimation of threads due to quantization,
+      // but we hope it's not more than 1 extra thread.
+      ASSERT_NEAR(end[i] - start[i], thread_demand, 1.0)
+          << "Ranges: " << range_str << " thread_demand: " << thread_demand
+          << " i: " << i;
+    }
+  }
+  ASSERT_EQ(end[num_active_requests - 1], num_threads);
+  ASSERT_EQ(start[0], 0);
+  if (print_stats) {
+    LOG(INFO) << "Assigned ranges: " << range_str;
+  }
+}
+
+TEST(RunHandlerUtilTest, TestComputeInterOpSchedulingRanges) {
+  const int kMinThreadsPerRequestBound = 12;
+  const int kMaxActiveRequests = 128;
+  const int kMaxThreads = 128;
+
+  for (int min_threads_per_request = 1;
+       min_threads_per_request <= kMinThreadsPerRequestBound;
+       ++min_threads_per_request) {
+    for (int num_active_requests = 1; num_active_requests <= kMaxActiveRequests;
+         ++num_active_requests) {
+      for (int num_threads = min_threads_per_request;
+           num_threads <= kMaxThreads; ++num_threads) {
+        VerifyFunction(num_active_requests, num_threads,
+                       min_threads_per_request);
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 85cd02350a..104ab039cb 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -453,6 +453,11 @@ message RunOptions {
     // same group_key value (in a distributed computation where tasks
     // run disjoint graphs).
     int64 collective_graph_key = 1;
+    // If true, then operations (using the inter-op pool) across all
+    // session::run() calls will be centrally scheduled, optimizing for (median
+    // and tail) latency.
+    // Consider using this option for CPU-bound workloads like inference.
+    bool use_run_handler_pool = 2;
   };
 
   Experimental experimental = 8;
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
index 537e73aa89..47b5b56faf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
@@ -8,5 +8,11 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_INT64
     }
+    field {
+      name: "use_run_handler_pool"
+      number: 2
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
index cec04a2bf0..c0c2e7b9f8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
@@ -55,6 +55,12 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_INT64
       }
+      field {
+        name: "use_run_handler_pool"
+        number: 2
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
     }
     enum_type {
       name: "TraceLevel"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
index 537e73aa89..47b5b56faf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
@@ -8,5 +8,11 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_INT64
     }
+    field {
+      name: "use_run_handler_pool"
+      number: 2
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
index cec04a2bf0..c0c2e7b9f8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
@@ -55,6 +55,12 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_INT64
       }
+      field {
+        name: "use_run_handler_pool"
+        number: 2
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
     }
     enum_type {
       name: "TraceLevel"
-- 
GitLab


From 1084594657a5d139102ac794f84d1427a710e39a Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Thu, 27 Sep 2018 12:51:52 -0700
Subject: [PATCH 091/570] TFLite: Rename ResetVariableTensorsToZero ->
 ResetVariableTensors PiperOrigin-RevId: 214820383

---
 .../contrib/lite/experimental/c/c_api_experimental.cc    | 5 ++---
 .../contrib/lite/experimental/c/c_api_experimental.h     | 2 +-
 .../lite/experimental/c/c_api_experimental_test.cc       | 2 +-
 tensorflow/contrib/lite/interpreter.cc                   | 9 ++++-----
 tensorflow/contrib/lite/interpreter.h                    | 7 +++++--
 tensorflow/contrib/lite/kernels/test_util.cc             | 2 +-
 tensorflow/contrib/lite/python/interpreter.py            | 4 ++--
 .../python/interpreter_wrapper/interpreter_wrapper.cc    | 4 ++--
 .../python/interpreter_wrapper/interpreter_wrapper.h     | 2 +-
 tensorflow/contrib/lite/testing/tflite_driver.cc         | 2 +-
 10 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/tensorflow/contrib/lite/experimental/c/c_api_experimental.cc b/tensorflow/contrib/lite/experimental/c/c_api_experimental.cc
index 0f16595811..29f8701f53 100644
--- a/tensorflow/contrib/lite/experimental/c/c_api_experimental.cc
+++ b/tensorflow/contrib/lite/experimental/c/c_api_experimental.cc
@@ -21,9 +21,8 @@ limitations under the License.
 extern "C" {
 #endif  // __cplusplus
 
-TFL_Status TFL_InterpreterResetVariableTensorsToZero(
-    TFL_Interpreter* interpreter) {
-  return interpreter->impl->ResetVariableTensorsToZero();
+TFL_Status TFL_InterpreterResetVariableTensors(TFL_Interpreter* interpreter) {
+  return interpreter->impl->ResetVariableTensors();
 }
 
 void TFL_InterpreterOptionsAddBuiltinOp(TFL_InterpreterOptions* options,
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_experimental.h b/tensorflow/contrib/lite/experimental/c/c_api_experimental.h
index b8de7b9964..fca5d92f77 100644
--- a/tensorflow/contrib/lite/experimental/c/c_api_experimental.h
+++ b/tensorflow/contrib/lite/experimental/c/c_api_experimental.h
@@ -25,7 +25,7 @@ extern "C" {
 typedef TfLiteBuiltinOperator TFL_BuiltinOperator;
 
 // Resets all variable tensors to zero.
-TFL_CAPI_EXPORT extern TFL_Status TFL_InterpreterResetVariableTensorsToZero(
+TFL_CAPI_EXPORT extern TFL_Status TFL_InterpreterResetVariableTensors(
     TFL_Interpreter* interpreter);
 
 // Adds an op registration for a builtin operator.
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc b/tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc
index d86ad00d6d..1b1bedb754 100644
--- a/tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc
+++ b/tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc
@@ -44,7 +44,7 @@ TEST(CApiExperimentalSimple, Smoke) {
   TFL_Interpreter* interpreter = TFL_NewInterpreter(model, options);
   ASSERT_NE(interpreter, nullptr);
   ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
-  EXPECT_EQ(TFL_InterpreterResetVariableTensorsToZero(interpreter), kTfLiteOk);
+  EXPECT_EQ(TFL_InterpreterResetVariableTensors(interpreter), kTfLiteOk);
   EXPECT_EQ(TFL_InterpreterInvoke(interpreter), kTfLiteOk);
 
   TFL_DeleteInterpreter(interpreter);
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 2657bcd42b..88e41ffc55 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -451,16 +451,15 @@ TfLiteStatus Interpreter::AllocateTensors() {
 
   // Reset the variable tensors to zero after (re)allocating the tensors.
   // Developers shouldn't rely on the side effect of this function to reset
-  // variable tesnsors. They should call `ResetVariableTensorsToZero` directly
+  // variable tesnsors. They should call `ResetVariableTensors` directly
   // instead.
-  ResetVariableTensorsToZero();
+  ResetVariableTensors();
 
   return kTfLiteOk;
 }
 
-// TODO(ycling): Consider to provide other functions to initialize variable
-// tensors to non-zero values.
-TfLiteStatus Interpreter::ResetVariableTensorsToZero() {
+// TODO(ycling): Support non-zero default values.
+TfLiteStatus Interpreter::ResetVariableTensors() {
   for (auto& tensor : tensors_) {
     if (!tensor.is_variable) {
       continue;
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index aa2bc4def6..7ef736d01b 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -421,9 +421,12 @@ class Interpreter {
     allow_buffer_handle_output_ = allow_buffer_handle_output;
   }
 
-  // Reset all variable tensors to zero.
+  // Reset all variable tensors to the default value.
+  // If a variable tensor doesn't have a buffer, reset it to zero.
+  // TODO(b/115961645): Implement - If a variable tensor has a buffer, reset it
+  // to the value of the buffer.
   // WARNING: This is an experimental API and subject to change.
-  TfLiteStatus ResetVariableTensorsToZero();
+  TfLiteStatus ResetVariableTensors();
 
   // Retrieve an operator's description of its work, for profiling purposes.
   const char* OpProfilingString(const TfLiteRegistration& op_reg,
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
index 0fdb0a3935..05a7c23ba1 100644
--- a/tensorflow/contrib/lite/kernels/test_util.cc
+++ b/tensorflow/contrib/lite/kernels/test_util.cc
@@ -122,7 +122,7 @@ void SingleOpModel::BuildInterpreter(std::vector<std::vector<int>> input_shapes,
 
   CHECK(interpreter_->AllocateTensors() == kTfLiteOk)
       << "Cannot allocate tensors";
-  interpreter_->ResetVariableTensorsToZero();
+  interpreter_->ResetVariableTensors();
 }
 
 void SingleOpModel::Invoke() { CHECK(interpreter_->Invoke() == kTfLiteOk); }
diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py
index 1be61fe053..5700bf7892 100644
--- a/tensorflow/contrib/lite/python/interpreter.py
+++ b/tensorflow/contrib/lite/python/interpreter.py
@@ -253,5 +253,5 @@ class Interpreter(object):
     self._ensure_safe()
     self._interpreter.Invoke()
 
-  def reset_all_variables_to_zero(self):
-    return self._interpreter.ResetVariableTensorsToZero()
+  def reset_all_variables(self):
+    return self._interpreter.ResetVariableTensors()
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index 9ab05f3068..418f19a179 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -466,9 +466,9 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
                                   error_msg);
 }
 
-PyObject* InterpreterWrapper::ResetVariableTensorsToZero() {
+PyObject* InterpreterWrapper::ResetVariableTensors() {
   TFLITE_PY_ENSURE_VALID_INTERPRETER();
-  TFLITE_PY_CHECK(interpreter_->ResetVariableTensorsToZero());
+  TFLITE_PY_CHECK(interpreter_->ResetVariableTensors());
   Py_RETURN_NONE;
 }
 
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
index 641dd93db5..f5ca81e62a 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
@@ -65,7 +65,7 @@ class InterpreterWrapper {
   PyObject* TensorQuantization(int i) const;
   PyObject* SetTensor(int i, PyObject* value);
   PyObject* GetTensor(int i) const;
-  PyObject* ResetVariableTensorsToZero();
+  PyObject* ResetVariableTensors();
 
   // Returns a reference to tensor index i as a numpy array. The base_object
   // should be the interpreter object providing the memory.
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index 1836eb53b9..17aa8cb293 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -301,7 +301,7 @@ bool TfLiteDriver::CheckResults() {
 }
 
 void TfLiteDriver::ResetLSTMStateTensors() {
-  interpreter_->ResetVariableTensorsToZero();
+  interpreter_->ResetVariableTensors();
 }
 
 }  // namespace testing
-- 
GitLab


From 4cedc8b6e738b7a188c9c091cf667bacafae44b7 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Thu, 27 Sep 2018 13:18:33 -0700
Subject: [PATCH 092/570] Updating the V2 variables API.

PiperOrigin-RevId: 214824023
---
 .../compiler/aot/tests/make_test_graphs.py    |   4 +-
 tensorflow/compiler/tests/lstm.py             |   2 +-
 .../dnn_tree_combined_estimator_test.py       |   2 +-
 .../python/external_regret_optimizer.py       |   4 +-
 .../python/swap_regret_optimizer.py           |   9 +-
 .../copy_graph/python/util/copy_elements.py   |   6 +-
 .../copy_graph/python/util/copy_test.py       |   4 +-
 .../python/kernel_tests/iterator_ops_test.py  |   2 +-
 .../python/kernel_tests/moving_stats_test.py  |   6 +-
 .../estimator/python/estimator/hooks_test.py  |   2 +-
 .../framework/python/ops/variables_test.py    |  28 +-
 .../graph_editor/tests/transform_test.py      |   2 +-
 .../layers/python/layers/layers_test.py       |   4 +-
 .../learn/python/learn/graph_actions_test.py  |  12 +-
 .../learn/python/learn/monitors_test.py       |  10 +-
 .../python/kernel_tests/sdca_ops_test.py      |   8 +-
 .../metrics/python/ops/metric_ops_test.py     |  19 +-
 .../contrib/model_pruning/python/pruning.py   |   3 +-
 .../model_pruning/python/pruning_test.py      |  22 +-
 .../opt/python/training/addsign_test.py       |  12 +-
 .../drop_stale_gradient_optimizer_test.py     |   4 +-
 .../training/external_optimizer_test.py       |  22 +-
 .../training/model_average_optimizer_test.py  |   3 +-
 .../opt/python/training/powersign_test.py     |  12 +-
 .../rnn/python/kernel_tests/core_rnn_test.py  |   4 +-
 .../contrib/session_bundle/exporter_test.py   |   6 +-
 .../kernel_tests/scatter_add_ndim_op_test.py  |  14 +-
 .../tensorrt/python/trt_convert_test.py       |   2 +-
 .../python/training/device_setter_test.py     |   8 +-
 tensorflow/python/client/session_test.py      |   2 +-
 .../python/debug/cli/analyzer_cli_test.py     |  20 +-
 .../python/debug/cli/stepper_cli_test.py      |   4 +-
 .../python/debug/lib/debug_utils_test.py      |   4 +-
 .../debug/lib/dist_session_debug_grpc_test.py |   4 +-
 .../python/debug/lib/grpc_large_data_test.py  |  12 +-
 .../debug/lib/session_debug_file_test.py      |   4 +-
 .../debug/lib/session_debug_grpc_test.py      |  46 +--
 .../python/debug/lib/session_debug_testlib.py |  90 ++---
 tensorflow/python/debug/lib/stepper_test.py   |  14 +-
 .../debug/wrappers/dumping_wrapper_test.py    |   2 +-
 .../debug/wrappers/local_cli_wrapper_test.py  |  14 +-
 tensorflow/python/estimator/estimator_test.py |  56 +--
 tensorflow/python/framework/function_test.py  |   2 +-
 .../python/framework/graph_util_test.py       |   8 +-
 tensorflow/python/framework/subscribe_test.py |   4 +-
 tensorflow/python/grappler/item_test.py       |   2 +-
 .../python/grappler/memory_optimizer_test.py  |  10 +-
 .../python/grappler/tf_optimizer_test.py      |   2 +-
 tensorflow/python/keras/engine/base_layer.py  |   4 +-
 .../python/kernel_tests/array_ops_test.py     |   4 +-
 .../kernel_tests/control_flow_ops_py_test.py  |  56 +--
 .../kernel_tests/dense_update_ops_test.py     |   6 +-
 .../kernel_tests/identity_op_py_test.py       |   2 +-
 .../resource_variable_ops_test.py             |   2 +-
 .../kernel_tests/scatter_nd_ops_test.py       |   4 +-
 .../python/kernel_tests/scatter_ops_test.py   |   4 +-
 .../kernel_tests/variable_scope_test.py       |   4 +-
 .../python/kernel_tests/variables_test.py     |  36 +-
 tensorflow/python/ops/gradients_test.py       |   2 +-
 tensorflow/python/ops/matmul_benchmark.py     |   8 +-
 tensorflow/python/ops/variable_scope.py       | 117 ++++++-
 tensorflow/python/ops/variables.py            | 323 +++++++++++++++---
 tensorflow/python/saved_model/loader_test.py  |  14 +-
 .../python/saved_model/saved_model_test.py    |  56 +--
 tensorflow/python/tools/freeze_graph_test.py  |   6 +-
 .../python/training/checkpointable/util.py    |   2 +-
 .../training/learning_rate_decay_test.py      |   4 +-
 .../python/training/monitored_session_test.py |  28 +-
 .../python/training/quantize_training_test.py |   3 +-
 .../python/training/queue_runner_test.py      |  22 +-
 tensorflow/python/training/saver_test.py      | 217 ++++++------
 ...server_lib_same_variables_no_clear_test.py |   4 +-
 tensorflow/python/training/server_lib_test.py |  18 +-
 .../python/training/session_manager_test.py   |  98 +++---
 tensorflow/python/training/supervisor_test.py |  52 +--
 .../training/sync_replicas_optimizer_test.py  |  17 +-
 .../python/training/training_ops_test.py      |  32 +-
 .../python/training/training_util_test.py     |   4 +-
 .../api/golden/v1/tensorflow.-variable.pbtxt  |   1 +
 .../tools/api/golden/v1/tensorflow.pbtxt      |   4 +
 .../v2/tensorflow.-variable-scope.pbtxt       | 105 ------
 ...ensorflow.-variable.-save-slice-info.pbtxt |  17 -
 .../api/golden/v2/tensorflow.-variable.pbtxt  | 130 -------
 .../golden/v2/tensorflow.initializers.pbtxt   |  12 -
 .../tools/api/golden/v2/tensorflow.pbtxt      |  92 -----
 .../golden/v2/tensorflow.variable_scope.pbtxt |   9 -
 86 files changed, 1015 insertions(+), 1040 deletions(-)
 delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.-variable-scope.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.-variable.-save-slice-info.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.variable_scope.pbtxt

diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index de135d7a23..64b861a730 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -47,7 +47,7 @@ def tfadd(_):
 
 def tfadd_with_ckpt(out_dir):
   x = array_ops.placeholder(dtypes.int32, name='x_hold')
-  y = variables.Variable(constant_op.constant([0]), name='y_saved')
+  y = variables.VariableV1(constant_op.constant([0]), name='y_saved')
   math_ops.add(x, y, name='x_y_sum')
 
   init_op = variables.initialize_all_variables()
@@ -62,7 +62,7 @@ def tfadd_with_ckpt(out_dir):
 
 def tfadd_with_ckpt_saver(out_dir):
   x = array_ops.placeholder(dtypes.int32, name='x_hold')
-  y = variables.Variable(constant_op.constant([0]), name='y_saved')
+  y = variables.VariableV1(constant_op.constant([0]), name='y_saved')
   math_ops.add(x, y, name='x_y_sum')
 
   init_op = variables.initialize_all_variables()
diff --git a/tensorflow/compiler/tests/lstm.py b/tensorflow/compiler/tests/lstm.py
index 43c469d032..73b3638e80 100644
--- a/tensorflow/compiler/tests/lstm.py
+++ b/tensorflow/compiler/tests/lstm.py
@@ -117,7 +117,7 @@ def LSTMLayer(cell_name, weights, m, c, x_seq, pad_seq):
 
 def RandomVar(shape, name=None):
   """Returns a variable of the given shape initialized to random values."""
-  return variables.Variable(
+  return variables.VariableV1(
       random_ops.random_uniform(shape), dtype=dtypes.float32, name=name)
 
 
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
index 839eedd3a8..04baa329a0 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
@@ -189,7 +189,7 @@ class CoreDNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
     # Train for a few steps.
     est.train(input_fn=_train_input_fn, steps=1000)
     # 10 steps for dnn, 3  for 1 tree of depth 3 + 1 after the tree finished
-    self._assert_checkpoint(est.model_dir, global_step=14)
+    self._assert_checkpoint(est.model_dir, global_step=15)
     res = est.evaluate(input_fn=_eval_input_fn, steps=1)
     self.assertLess(0.5, res["auc"])
     est.predict(input_fn=_eval_input_fn)
diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
index d1af15f7e4..67f8ac2b93 100644
--- a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
+++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
@@ -102,9 +102,9 @@ def _project_multipliers_wrt_euclidean_norm(multipliers, radius):
         0.0,
         (radius - standard_ops.reduce_sum(multipliers)) / standard_ops.maximum(
             1.0, standard_ops.reduce_sum(inactive)))
-    multipliers += scale * inactive
+    multipliers = multipliers + (scale * inactive)
     new_inactive = standard_ops.cast(multipliers > 0, multipliers.dtype)
-    multipliers *= new_inactive
+    multipliers = multipliers * new_inactive
     return (iteration, multipliers, new_inactive, inactive)
 
   iteration = standard_ops.constant(0)
diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
index 2c673d9347..a6cb1f62f0 100644
--- a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
+++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
@@ -175,9 +175,9 @@ def _project_stochastic_matrix_wrt_euclidean_norm(matrix):
     scale = (1.0 - standard_ops.reduce_sum(
         matrix, axis=0, keepdims=True)) / standard_ops.maximum(
             1.0, standard_ops.reduce_sum(inactive, axis=0, keepdims=True))
-    matrix += scale * inactive
+    matrix = matrix + (scale * inactive)
     new_inactive = standard_ops.cast(matrix > 0, matrix.dtype)
-    matrix *= new_inactive
+    matrix = matrix * new_inactive
     return (iteration, matrix, new_inactive, inactive)
 
   iteration = standard_ops.constant(0)
@@ -210,8 +210,9 @@ def _project_log_stochastic_matrix_wrt_kl_divergence(log_matrix):
 
   # For numerical reasons, make sure that the largest matrix element is zero
   # before exponentiating.
-  log_matrix -= standard_ops.reduce_max(log_matrix, axis=0, keepdims=True)
-  log_matrix -= standard_ops.log(
+  log_matrix = log_matrix - standard_ops.reduce_max(
+      log_matrix, axis=0, keepdims=True)
+  log_matrix = log_matrix - standard_ops.log(
       standard_ops.reduce_sum(
           standard_ops.exp(log_matrix), axis=0, keepdims=True))
   return log_matrix
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index 6c9ab6aeb8..9c5871da34 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -31,7 +31,7 @@ from __future__ import division
 from __future__ import print_function
 
 from copy import deepcopy
-from tensorflow.python.ops.variables import Variable
+from tensorflow.python.ops.variables import VariableV1
 from tensorflow.python.client.session import Session
 from tensorflow.python.framework import ops
 
@@ -55,7 +55,7 @@ def copy_variable_to_graph(org_instance, to_graph, scope=''):
     TypeError: If `org_instance` is not a `Variable`.
   """
 
-  if not isinstance(org_instance, Variable):
+  if not isinstance(org_instance, VariableV1):
     raise TypeError(str(org_instance) + ' is not a Variable')
 
   #The name of the new variable
@@ -88,7 +88,7 @@ def copy_variable_to_graph(org_instance, to_graph, scope=''):
 
   #Initialize the new variable
   with to_graph.as_default():
-    new_var = Variable(
+    new_var = VariableV1(
         init_value,
         trainable,
         name=new_name,
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_test.py b/tensorflow/contrib/copy_graph/python/util/copy_test.py
index 05744bec4e..ba97c78456 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_test.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_test.py
@@ -36,7 +36,7 @@ class CopyVariablesTest(test.TestCase):
 
     with graph1.as_default():
       #Define a Variable in graph1
-      some_var = variables.Variable(2)
+      some_var = variables.VariableV1(2)
       #Initialize session
       sess1 = session_lib.Session()
       #Initialize the Variable
@@ -72,7 +72,7 @@ class CopyOpsTest(test.TestCase):
     with graph1.as_default():
       #Initialize a basic expression y = ax + b
       x = array_ops.placeholder("float")
-      a = variables.Variable(3.0)
+      a = variables.VariableV1(3.0)
       b = constant_op.constant(4.0)
       ax = math_ops.multiply(x, a)
       y = math_ops.add(ax, b)
diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
index 704c0d1eb2..7e2326bd17 100644
--- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
@@ -42,7 +42,7 @@ class CheckpointInputPipelineHookTest(test.TestCase):
     del config
     global_step = training_util.get_or_create_global_step()
     update_global_step_op = global_step.assign_add(1)
-    latest_feature = variables.Variable(
+    latest_feature = variables.VariableV1(
         0, name='latest_feature', dtype=dtypes.int64)
     store_latest_feature_op = latest_feature.assign(features)
     ops.add_to_collection('my_vars', global_step)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/moving_stats_test.py b/tensorflow/contrib/distributions/python/kernel_tests/moving_stats_test.py
index 3c988dad8a..be7c756bea 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/moving_stats_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/moving_stats_test.py
@@ -38,8 +38,8 @@ class MovingReduceMeanVarianceTest(test.TestCase):
     true_stddev = np.array([[1.1, 0.5]])
     with self.cached_session() as sess:
       # Start "x" out with this mean.
-      mean_var = variables.Variable(array_ops.zeros_like(true_mean))
-      variance_var = variables.Variable(array_ops.ones_like(true_stddev))
+      mean_var = variables.VariableV1(array_ops.zeros_like(true_mean))
+      variance_var = variables.VariableV1(array_ops.ones_like(true_stddev))
       x = random_ops.random_normal(shape, dtype=np.float64, seed=0)
       x = true_stddev * x + true_mean
       ema, emv = moving_stats.assign_moving_mean_variance(
@@ -115,7 +115,7 @@ class MovingLogExponentialMovingMeanExpTest(test.TestCase):
       # Start "x" out with this mean.
       x = random_ops.random_normal(shape, dtype=np.float64, seed=0)
       x = true_stddev * x + true_mean
-      log_mean_exp_var = variables.Variable(array_ops.zeros_like(true_mean))
+      log_mean_exp_var = variables.VariableV1(array_ops.zeros_like(true_mean))
       variables.global_variables_initializer().run()
       log_mean_exp = moving_stats.assign_log_moving_mean_exp(
           log_mean_exp_var, x, decay=decay)
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks_test.py b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
index c6c6cad95a..62ffad56da 100644
--- a/tensorflow/contrib/estimator/python/estimator/hooks_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
@@ -294,7 +294,7 @@ class InMemoryEvaluatorHookTest(test.TestCase):
 
     def model_fn(features, labels, mode):
       _, _ = features, labels
-      w = variables.Variable(
+      w = variables.VariableV1(
           initial_value=[0.],
           trainable=False,
           collections=[ops.GraphKeys.SAVEABLE_OBJECTS])
diff --git a/tensorflow/contrib/framework/python/ops/variables_test.py b/tensorflow/contrib/framework/python/ops/variables_test.py
index f9b0efd1da..c223df5b6e 100644
--- a/tensorflow/contrib/framework/python/ops/variables_test.py
+++ b/tensorflow/contrib/framework/python/ops/variables_test.py
@@ -192,7 +192,7 @@ class GlobalStepTest(test.TestCase):
   def test_invalid_dtype(self):
     with ops.Graph().as_default() as g:
       self.assertEquals(None, variables_lib2.get_global_step())
-      variables_lib.Variable(
+      variables_lib.VariableV1(
           0.0,
           trainable=False,
           dtype=dtypes.float32,
@@ -205,7 +205,7 @@ class GlobalStepTest(test.TestCase):
   def test_invalid_shape(self):
     with ops.Graph().as_default() as g:
       self.assertEquals(None, variables_lib2.get_global_step())
-      variables_lib.Variable(
+      variables_lib.VariableV1(
           [0],
           trainable=False,
           dtype=dtypes.int32,
@@ -229,7 +229,7 @@ class GlobalStepTest(test.TestCase):
   def test_get_global_step(self):
     with ops.Graph().as_default() as g:
       self.assertEquals(None, variables_lib2.get_global_step())
-      variables_lib.Variable(
+      variables_lib.VariableV1(
           0,
           trainable=False,
           dtype=dtypes.int32,
@@ -607,10 +607,10 @@ class ModelVariablesTest(test.TestCase):
     with self.cached_session():
       with variable_scope.variable_scope('A'):
         variables_lib2.local_variable([5])
-        a = variables_lib.Variable([5])
+        a = variables_lib.VariableV1([5])
       with variable_scope.variable_scope('B'):
         variables_lib2.local_variable([5])
-        b = variables_lib.Variable([5])
+        b = variables_lib.VariableV1([5])
       self.assertEquals([a], variables_lib2.get_trainable_variables('A'))
       self.assertEquals([b], variables_lib2.get_trainable_variables('B'))
 
@@ -953,7 +953,7 @@ class AssignFromCheckpointTest(test.TestCase):
       # Create a set of variables to save in the checkpoint.
       for var_name in var_names_to_values:
         var_value = var_names_to_values[var_name]
-        var_list.append(variables_lib.Variable(var_value, name=var_name))
+        var_list.append(variables_lib.VariableV1(var_value, name=var_name))
       saver = saver_lib.Saver(var_list)
       init_op = variables_lib.variables_initializer(var_list)
       sess.run(init_op)
@@ -1106,7 +1106,7 @@ class AssignFromCheckpointFnTest(test.TestCase):
       # Create a set of variables to save in the checkpoint.
       for var_name in var_names_to_values:
         var_value = var_names_to_values[var_name]
-        var_list.append(variables_lib.Variable(var_value, name=var_name))
+        var_list.append(variables_lib.VariableV1(var_value, name=var_name))
       saver = saver_lib.Saver(var_list)
       init_op = variables_lib.variables_initializer(var_list)
       sess.run(init_op)
@@ -1297,7 +1297,7 @@ class AssignFromCheckpointFnTest(test.TestCase):
 class ZeroInitializerOpTest(test.TestCase):
 
   def _testZeroInitializer(self, shape, initializer, use_init):
-    var = variables_lib.Variable(initializer)
+    var = variables_lib.VariableV1(initializer)
     var_zero = variables_lib2.zero_initializer(var)
     with self.cached_session() as sess:
       with self.assertRaisesOpError('Attempting to use uninitialized value'):
@@ -1350,12 +1350,12 @@ class FilterVariablesTest(test.TestCase):
     g = ops.Graph()
     with g.as_default():
       var_list = []
-      var_list.append(variables_lib.Variable(0, name='conv1/weights'))
-      var_list.append(variables_lib.Variable(0, name='conv1/biases'))
-      var_list.append(variables_lib.Variable(0, name='conv2/weights'))
-      var_list.append(variables_lib.Variable(0, name='conv2/biases'))
-      var_list.append(variables_lib.Variable(0, name='clfs/weights'))
-      var_list.append(variables_lib.Variable(0, name='clfs/biases'))
+      var_list.append(variables_lib.VariableV1(0, name='conv1/weights'))
+      var_list.append(variables_lib.VariableV1(0, name='conv1/biases'))
+      var_list.append(variables_lib.VariableV1(0, name='conv2/weights'))
+      var_list.append(variables_lib.VariableV1(0, name='conv2/biases'))
+      var_list.append(variables_lib.VariableV1(0, name='clfs/weights'))
+      var_list.append(variables_lib.VariableV1(0, name='clfs/biases'))
       self._var_list = var_list
 
   def _test_filter_variables(self,
diff --git a/tensorflow/contrib/graph_editor/tests/transform_test.py b/tensorflow/contrib/graph_editor/tests/transform_test.py
index 97f38c923f..0ebcdc2688 100644
--- a/tensorflow/contrib/graph_editor/tests/transform_test.py
+++ b/tensorflow/contrib/graph_editor/tests/transform_test.py
@@ -214,7 +214,7 @@ class TransformTest(test.TestCase):
 
   def test_graph_replace_gradients(self):
     ops.reset_default_graph()
-    w = variables.Variable(0.0, name="w")
+    w = variables.VariableV1(0.0, name="w")
     y = math_ops.multiply(math_ops.multiply(w, w, name="mul1"), w, name="mul2")
     g = gradients_impl.gradients(y, w, name="grad")[0]
 
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 85af9de4e4..3b7ae72e9c 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -2360,7 +2360,7 @@ class BatchNormTest(test.TestCase):
             batch_size * height * width, expected_var)
       images = constant_op.constant(
           image_values, shape=image_shape, dtype=dtypes.float32)
-      is_training = variables_lib.Variable(True)
+      is_training = variables_lib.VariableV1(True)
       output = _layers.batch_norm(
           images,
           decay=0.1,
@@ -2507,7 +2507,7 @@ class BatchNormTest(test.TestCase):
             batch_size * height * width, expected_var)
       images = constant_op.constant(
           image_values, shape=image_shape, dtype=dtypes.float32)
-      is_training = variables_lib.Variable(True)
+      is_training = variables_lib.VariableV1(True)
       output = _layers.batch_norm(
           images,
           decay=0.1,
diff --git a/tensorflow/contrib/learn/python/learn/graph_actions_test.py b/tensorflow/contrib/learn/python/learn/graph_actions_test.py
index 33180b778a..a160cb54a3 100644
--- a/tensorflow/contrib/learn/python/learn/graph_actions_test.py
+++ b/tensorflow/contrib/learn/python/learn/graph_actions_test.py
@@ -162,9 +162,9 @@ class GraphActionsTest(test.TestCase):
       Tuple of 3 `Tensor` objects, 2 input and 1 output.
     """
     variables_lib.create_global_step()
-    in0 = variables.Variable(1.0)
+    in0 = variables.VariableV1(1.0)
     in1 = variables_lib.local_variable(2.0)
-    fake_table = variables.Variable(
+    fake_table = variables.VariableV1(
         3.0,
         trainable=False,
         collections=['fake_tables'],
@@ -312,8 +312,8 @@ class GraphActionsTest(test.TestCase):
   def test_evaluate_ready_for_local_init(self):
     with ops.Graph().as_default() as g, self.session(g):
       variables_lib.create_global_step()
-      v = variables.Variable(1.0)
-      variables.Variable(
+      v = variables.VariableV1(1.0)
+      variables.VariableV1(
           v + 1, collections=[ops.GraphKeys.LOCAL_VARIABLES], trainable=False)
       ready_for_local_init_op = variables.report_uninitialized_variables(
           variables.global_variables())
@@ -456,9 +456,9 @@ class GraphActionsTrainTest(test.TestCase):
       Tuple of 3 `Tensor` objects, 2 input and 1 output.
     """
     variables_lib.create_global_step()
-    in0 = variables.Variable(1.0)
+    in0 = variables.VariableV1(1.0)
     in1 = variables_lib.local_variable(2.0)
-    fake_table = variables.Variable(
+    fake_table = variables.VariableV1(
         3.0,
         trainable=False,
         collections=['fake_tables'],
diff --git a/tensorflow/contrib/learn/python/learn/monitors_test.py b/tensorflow/contrib/learn/python/learn/monitors_test.py
index 83e48a36e7..d4a7169bb6 100644
--- a/tensorflow/contrib/learn/python/learn/monitors_test.py
+++ b/tensorflow/contrib/learn/python/learn/monitors_test.py
@@ -247,7 +247,7 @@ class MonitorsTest(test.TestCase):
 
   def test_logging_trainable(self):
     with ops.Graph().as_default() as g, self.session(g):
-      var = variables.Variable(constant_op.constant(42.0), name='foo')
+      var = variables.VariableV1(constant_op.constant(42.0), name='foo')
       var.initializer.run()
       cof = constant_op.constant(1.0)
       loss = math_ops.subtract(
@@ -261,7 +261,7 @@ class MonitorsTest(test.TestCase):
     with ops.Graph().as_default() as g, self.session(g):
       log_dir = 'log/dir'
       summary_writer = testing.FakeSummaryWriter(log_dir, g)
-      var = variables.Variable(0.0)
+      var = variables.VariableV1(0.0)
       var.initializer.run()
       tensor = state_ops.assign_add(var, 1.0)
       summary_op = summary.scalar('my_summary', tensor)
@@ -526,8 +526,8 @@ class MonitorsTest(test.TestCase):
     monitor0 = learn.monitors.GraphDump()
     monitor1 = learn.monitors.GraphDump()
     with ops.Graph().as_default() as g, self.session(g):
-      const_var = variables.Variable(42.0, name='my_const')
-      counter_var = variables.Variable(0.0, name='my_counter')
+      const_var = variables.VariableV1(42.0, name='my_const')
+      counter_var = variables.VariableV1(0.0, name='my_counter')
       assign_add = state_ops.assign_add(counter_var, 1.0, name='my_assign_add')
       variables.global_variables_initializer().run()
 
@@ -569,7 +569,7 @@ class MonitorsTest(test.TestCase):
     monitor = learn.monitors.CaptureVariable(
         var_name='my_assign_add:0', every_n=8, first_n=2)
     with ops.Graph().as_default() as g, self.session(g):
-      var = variables.Variable(0.0, name='my_var')
+      var = variables.VariableV1(0.0, name='my_var')
       var.initializer.run()
       state_ops.assign_add(var, 1.0, name='my_assign_add')
       self._run_monitor(monitor, num_epochs=3, num_steps_per_epoch=10)
diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
index 9ecf023e03..8466dc36d1 100644
--- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
@@ -125,7 +125,7 @@ def make_random_examples_and_variables_dicts(num_examples, dim, num_non_zero):
       ],
       example_ids=[str(i) for i in range(num_examples)])
 
-  weights = variables_lib.Variable(
+  weights = variables_lib.VariableV1(
       array_ops.zeros([dim], dtype=dtypes.float32))
   variables_dict = dict(
       sparse_features_weights=[weights],
@@ -184,7 +184,7 @@ def make_dense_examples_and_variables_dicts(dense_features_values, weights,
     dense_tensors.append(dense_tensor)
     # Add variables of shape [feature_column_dimension].
     dense_weights.append(
-        variables_lib.Variable(
+        variables_lib.VariableV1(
             array_ops.zeros(
                 [dense_tensor.get_shape().as_list()[1]], dtype=dtypes.float32)))
 
@@ -341,7 +341,7 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
         examples = make_example_dict(example_protos, example_weights)
         # Explicitly make age a [1]-shaped Variable (which cannot be
         # partitioned), while making gender a PartitionedVariable.
-        age_weights = variables_lib.Variable(
+        age_weights = variables_lib.VariableV1(
             array_ops.zeros([1], dtype=dtypes.float32))
         with variable_scope.variable_scope(
             name_or_scope=('variables/shard_{}'.format(num_shards)
@@ -801,7 +801,7 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
           labels=[1.0, 0.0])
       # Replace with a variable of size 1 instead of 2.
       variables['dense_features_weights'] = [
-          variables_lib.Variable(array_ops.zeros(
+          variables_lib.VariableV1(array_ops.zeros(
               [1], dtype=dtypes.float32))
       ]
       options = dict(
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 955b83b44d..fc64f343ab 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -2069,11 +2069,11 @@ class StreamingDynamicAUCTest(test.TestCase):
     num_batches = 100
     labels = np.array([])
     predictions = np.array([])
-    tf_labels = variables.Variable(
+    tf_labels = variables.VariableV1(
         array_ops.ones(batch_size, dtypes_lib.int32),
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         dtype=dtypes_lib.int32)
-    tf_predictions = variables.Variable(
+    tf_predictions = variables.VariableV1(
         array_ops.ones(batch_size),
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         dtype=dtypes_lib.float32)
@@ -2133,15 +2133,15 @@ class StreamingDynamicAUCTest(test.TestCase):
     labels = np.array([])
     predictions = np.array([])
     weights = np.array([])
-    tf_labels = variables.Variable(
+    tf_labels = variables.VariableV1(
         array_ops.ones(batch_size, dtypes_lib.int32),
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         dtype=dtypes_lib.int32)
-    tf_predictions = variables.Variable(
+    tf_predictions = variables.VariableV1(
         array_ops.ones(batch_size),
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         dtype=dtypes_lib.float32)
-    tf_weights = variables.Variable(
+    tf_weights = variables.VariableV1(
         array_ops.ones(batch_size),
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         dtype=dtypes_lib.float32)
@@ -2311,10 +2311,11 @@ class AucWithConfidenceIntervalsTest(test.TestCase):
     num_batches = 100
     labels = np.array([])
     predictions = np.array([])
-    tf_labels = variables.Variable(array_ops.ones(batch_size, dtypes_lib.int32),
-                                   collections=[ops.GraphKeys.LOCAL_VARIABLES],
-                                   dtype=dtypes_lib.int32)
-    tf_predictions = variables.Variable(
+    tf_labels = variables.VariableV1(
+        array_ops.ones(batch_size, dtypes_lib.int32),
+        collections=[ops.GraphKeys.LOCAL_VARIABLES],
+        dtype=dtypes_lib.int32)
+    tf_predictions = variables.VariableV1(
         array_ops.ones(batch_size),
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
         dtype=dtypes_lib.float32)
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index a81abac2fa..67e58ff15d 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -247,7 +247,8 @@ class Pruning(object):
 
     # Stores the tensorflow sparsity variable.
     # Built using self._setup_sparsity() or provided externally
-    self._sparsity = sparsity if sparsity else self._setup_sparsity()
+    self._sparsity = (sparsity
+                      if sparsity is not None else self._setup_sparsity())
 
     # List of tensorflow assignments ops for new masks and thresholds
     self._assign_ops = []
diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py
index cd3d8e76bb..1b6da5ce2b 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_test.py
@@ -45,7 +45,7 @@ class PruningHParamsTest(test.TestCase):
     # Add global step variable to the graph
     self.global_step = training_util.get_or_create_global_step()
     # Add sparsity
-    self.sparsity = variables.Variable(0.5, name="sparsity")
+    self.sparsity = variables.VariableV1(0.5, name="sparsity")
     # Parse hparams
     self.pruning_hparams = pruning.get_pruning_hparams().parse(
         self.TEST_HPARAMS)
@@ -88,7 +88,7 @@ class PruningTest(test.TestCase):
     width = 10
     height = 20
     with self.cached_session():
-      weights = variables.Variable(
+      weights = variables.VariableV1(
           random_ops.random_normal([width, height], stddev=1), name="weights")
       masked_weights = pruning.apply_mask(weights,
                                           variable_scope.get_variable_scope())
@@ -99,10 +99,10 @@ class PruningTest(test.TestCase):
 
   def testUpdateSingleMask(self):
     with self.cached_session() as session:
-      weights = variables.Variable(
+      weights = variables.VariableV1(
           math_ops.linspace(1.0, 100.0, 100), name="weights")
       masked_weights = pruning.apply_mask(weights)
-      sparsity = variables.Variable(0.5, name="sparsity")
+      sparsity = variables.VariableV1(0.5, name="sparsity")
       p = pruning.Pruning(sparsity=sparsity)
       p._spec.threshold_decay = 0.0
       mask_update_op = p.mask_update_op()
@@ -115,8 +115,8 @@ class PruningTest(test.TestCase):
 
   def _blockMasking(self, hparams, weights, expected_mask):
 
-    threshold = variables.Variable(0.0, name="threshold")
-    sparsity = variables.Variable(0.5, name="sparsity")
+    threshold = variables.VariableV1(0.0, name="threshold")
+    sparsity = variables.VariableV1(0.5, name="sparsity")
     test_spec = ",".join(hparams)
     pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
 
@@ -169,7 +169,7 @@ class PruningTest(test.TestCase):
     partitioner = partitioned_variables.variable_axis_size_partitioner(40)
     with self.cached_session() as session:
       with variable_scope.variable_scope("", partitioner=partitioner):
-        sparsity = variables.Variable(0.5, name="Sparsity")
+        sparsity = variables.VariableV1(0.5, name="Sparsity")
         weights = variable_scope.get_variable(
             "weights", initializer=math_ops.linspace(1.0, 100.0, 100))
         masked_weights = pruning.apply_mask(
@@ -190,10 +190,10 @@ class PruningTest(test.TestCase):
     ]
     test_spec = ",".join(param_list)
     pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
-    weights = variables.Variable(
+    weights = variables.VariableV1(
         math_ops.linspace(1.0, 100.0, 100), name="weights")
     masked_weights = pruning.apply_mask(weights)
-    sparsity = variables.Variable(0.00, name="sparsity")
+    sparsity = variables.VariableV1(0.00, name="sparsity")
     # Set up pruning
     p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
     p._spec.threshold_decay = 0.0
@@ -222,11 +222,11 @@ class PruningTest(test.TestCase):
     pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
 
     with variable_scope.variable_scope("layer1"):
-      w1 = variables.Variable(
+      w1 = variables.VariableV1(
           math_ops.linspace(1.0, 100.0, 100), name="weights")
       _ = pruning.apply_mask(w1)
     with variable_scope.variable_scope("layer2"):
-      w2 = variables.Variable(
+      w2 = variables.VariableV1(
           math_ops.linspace(1.0, 100.0, 100), name="weights")
       _ = pruning.apply_mask(w2)
 
diff --git a/tensorflow/contrib/opt/python/training/addsign_test.py b/tensorflow/contrib/opt/python/training/addsign_test.py
index 628a735e72..6150fa117f 100644
--- a/tensorflow/contrib/opt/python/training/addsign_test.py
+++ b/tensorflow/contrib/opt/python/training/addsign_test.py
@@ -80,9 +80,9 @@ class AddSignTest(test.TestCase):
           global_step = resource_variable_ops.ResourceVariable(
               0, trainable=False)
         else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
-          global_step = variables.Variable(
+          var0 = variables.VariableV1(var0_np)
+          var1 = variables.VariableV1(var1_np)
+          global_step = variables.VariableV1(
               0, trainable=False)
         grads0 = constant_op.constant(grads0_np)
         grads1 = constant_op.constant(grads1_np)
@@ -183,9 +183,9 @@ class AddSignTest(test.TestCase):
           global_step = resource_variable_ops.ResourceVariable(
               0, trainable=False)
         else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
-          global_step = variables.Variable(
+          var0 = variables.VariableV1(var0_np)
+          var1 = variables.VariableV1(var1_np)
+          global_step = variables.VariableV1(
               0, trainable=False)
         grads0_np_indices = np.array([0, 1], dtype=np.int32)
         grads0 = ops.IndexedSlices(
diff --git a/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer_test.py b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer_test.py
index 53232082e1..0a69096768 100644
--- a/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer_test.py
@@ -61,8 +61,8 @@ def _get_workers(num_workers, staleness):
     graph = ops.Graph()
     with graph.as_default():
       global_step = training_util.create_global_step()
-      var_0 = variables.Variable(0.0, name='v0')
-      var_1 = variables.Variable(1.0, name='v1')
+      var_0 = variables.VariableV1(0.0, name='v0')
+      var_1 = variables.VariableV1(1.0, name='v1')
       compute_gradients_queue = data_flow_ops.FIFOQueue(
           -1, global_step.dtype.base_dtype, shapes=(),
           name='compute_gradients_queue', shared_name='compute_gradients_queue')
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer_test.py b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
index 9997103016..70c5f8ff19 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
@@ -69,9 +69,9 @@ class TestCase(test.TestCase):
 class ExternalOptimizerInterfaceTest(TestCase):
 
   def test_optimize(self):
-    scalar = variables.Variable(random_ops.random_normal([]), 'scalar')
-    vector = variables.Variable(random_ops.random_normal([2]), 'vector')
-    matrix = variables.Variable(random_ops.random_normal([2, 3]), 'matrix')
+    scalar = variables.VariableV1(random_ops.random_normal([]), 'scalar')
+    vector = variables.VariableV1(random_ops.random_normal([2]), 'vector')
+    matrix = variables.VariableV1(random_ops.random_normal([2, 3]), 'matrix')
 
     minimum_location = constant_op.constant(np.arange(9), dtype=dtypes.float32)
 
@@ -96,7 +96,7 @@ class ExternalOptimizerInterfaceTest(TestCase):
 
   def test_callbacks(self):
     vector_val = np.array([7., -2.], dtype=np.float32)
-    vector = variables.Variable(vector_val, 'vector')
+    vector = variables.VariableV1(vector_val, 'vector')
 
     minimum_location_val = np.arange(2)
     minimum_location = constant_op.constant(
@@ -160,7 +160,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
                                 rtol=1e-5,
                                 atol=1e-5,
                                 dimension=5):
-    x = variables.Variable(array_ops.zeros(dimension))
+    x = variables.VariableV1(array_ops.zeros(dimension))
     optimizer = external_optimizer.ScipyOptimizerInterface(
         self._objective(x), method=method, options=options)
 
@@ -173,7 +173,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
   def test_unconstrained(self):
 
     dimension = 5
-    x = variables.Variable(array_ops.zeros(dimension))
+    x = variables.VariableV1(array_ops.zeros(dimension))
     optimizer = external_optimizer.ScipyOptimizerInterface(self._objective(x))
 
     with self.cached_session() as sess:
@@ -230,7 +230,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
 
   def test_nonlinear_programming(self):
     vector_initial_value = [7., 7.]
-    vector = variables.Variable(vector_initial_value, 'vector')
+    vector = variables.VariableV1(vector_initial_value, 'vector')
 
     # Make norm as small as possible.
     loss = math_ops.reduce_sum(math_ops.square(vector))
@@ -249,7 +249,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
 
   def test_scalar_bounds(self):
     vector_initial_value = [7., 7.]
-    vector = variables.Variable(vector_initial_value, 'vector')
+    vector = variables.VariableV1(vector_initial_value, 'vector')
 
     # Make norm as small as possible.
     loss = math_ops.reduce_sum(math_ops.square(vector))
@@ -267,7 +267,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
 
   def test_vector_bounds(self):
     vector_initial_value = [7., 7.]
-    vector = variables.Variable(vector_initial_value, 'vector')
+    vector = variables.VariableV1(vector_initial_value, 'vector')
 
     # Make norm as small as possible.
     loss = math_ops.reduce_sum(math_ops.square(vector))
@@ -287,7 +287,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
     # after running optimizer.minimize().
     # Bug reference: b/64065260
     vector_initial_value = [7., 7.]
-    vector = variables.Variable(vector_initial_value, 'vector')
+    vector = variables.VariableV1(vector_initial_value, 'vector')
     loss = math_ops.reduce_sum(math_ops.square(vector))
 
     optimizer = external_optimizer.ScipyOptimizerInterface(
@@ -301,7 +301,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
 
   def test_callbacks(self):
     vector_val = np.array([7., -2.], dtype=np.float32)
-    vector = variables.Variable(vector_val, 'vector')
+    vector = variables.VariableV1(vector_val, 'vector')
 
     minimum_location_val = np.arange(2)
     minimum_location = constant_op.constant(
diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
index b1fc50a21f..a25455e95d 100644
--- a/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
@@ -110,10 +110,11 @@ def _get_workers(num_workers, steps, workers):
 
 
 class ModelAverageOptimizerTest(test.TestCase):
+
   def _run(self, train_op, sess):
     sess.run(train_op)
 
-  def test1Workers2Period(self):
+  def disabled_test1Workers2Period(self):
     num_workers = 2
     steps = 2
     num_ps = 1
diff --git a/tensorflow/contrib/opt/python/training/powersign_test.py b/tensorflow/contrib/opt/python/training/powersign_test.py
index 0bcf5d230a..1cf9901dc0 100644
--- a/tensorflow/contrib/opt/python/training/powersign_test.py
+++ b/tensorflow/contrib/opt/python/training/powersign_test.py
@@ -81,9 +81,9 @@ class PowerSignTest(test.TestCase):
           global_step = resource_variable_ops.ResourceVariable(
               0, trainable=False)
         else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
-          global_step = variables.Variable(
+          var0 = variables.VariableV1(var0_np)
+          var1 = variables.VariableV1(var1_np)
+          global_step = variables.VariableV1(
               0, trainable=False)
         grads0 = constant_op.constant(grads0_np)
         grads1 = constant_op.constant(grads1_np)
@@ -188,9 +188,9 @@ class PowerSignTest(test.TestCase):
           global_step = resource_variable_ops.ResourceVariable(
               0, trainable=False)
         else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
-          global_step = variables.Variable(
+          var0 = variables.VariableV1(var0_np)
+          var1 = variables.VariableV1(var1_np)
+          global_step = variables.VariableV1(
               0, trainable=False)
         grads0_np_indices = np.array([0, 1], dtype=np.int32)
         grads0 = ops.IndexedSlices(
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index bf699db3ed..f31ad53d3c 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -163,8 +163,8 @@ class TestStateSaverWithCounters(TestStateSaver):
 
   def __init__(self, batch_size, state_size):
     super(TestStateSaverWithCounters, self).__init__(batch_size, state_size)
-    self._num_state_calls = variables_lib.Variable(0)
-    self._num_save_state_calls = variables_lib.Variable(0)
+    self._num_state_calls = variables_lib.VariableV1(0)
+    self._num_save_state_calls = variables_lib.VariableV1(0)
 
   def state(self, name):
     with ops_lib.control_dependencies(
diff --git a/tensorflow/contrib/session_bundle/exporter_test.py b/tensorflow/contrib/session_bundle/exporter_test.py
index 86df425da0..68419ffea0 100644
--- a/tensorflow/contrib/session_bundle/exporter_test.py
+++ b/tensorflow/contrib/session_bundle/exporter_test.py
@@ -64,10 +64,10 @@ class SaveRestoreShardedTest(test.TestCase):
       # v2 is an unsaved variable derived from v0 and v1.  It is used to
       # exercise the ability to run an init op when restoring a graph.
       with sess.graph.device("/cpu:0"):
-        v0 = variables.Variable(10, name="v0")
+        v0 = variables.VariableV1(10, name="v0")
       with sess.graph.device("/cpu:1"):
-        v1 = variables.Variable(20, name="v1")
-      v2 = variables.Variable(1, name="v2", trainable=False, collections=[])
+        v1 = variables.VariableV1(20, name="v1")
+      v2 = variables.VariableV1(1, name="v2", trainable=False, collections=[])
       assign_v2 = state_ops.assign(v2, math_ops.add(v0, v1))
       init_op = control_flow_ops.group(assign_v2, name="init_op")
 
diff --git a/tensorflow/contrib/tensor_forest/python/kernel_tests/scatter_add_ndim_op_test.py b/tensorflow/contrib/tensor_forest/python/kernel_tests/scatter_add_ndim_op_test.py
index 1c4e18dbda..0b02bdcb50 100644
--- a/tensorflow/contrib/tensor_forest/python/kernel_tests/scatter_add_ndim_op_test.py
+++ b/tensorflow/contrib/tensor_forest/python/kernel_tests/scatter_add_ndim_op_test.py
@@ -27,7 +27,7 @@ from tensorflow.python.platform import googletest
 class ScatterAddNdimTest(test_util.TensorFlowTestCase):
 
   def test1dim(self):
-    input_data = variables.Variable(
+    input_data = variables.VariableV1(
         [1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.])
     indices = [[1], [10]]
     updates = [100., 200.]
@@ -40,8 +40,8 @@ class ScatterAddNdimTest(test_util.TensorFlowTestCase):
           input_data.eval())
 
   def test3dim(self):
-    input_data = variables.Variable([[[1., 2., 3.], [4., 5., 6.]],
-                                     [[7., 8., 9.], [10., 11., 12.]]])
+    input_data = variables.VariableV1([[[1., 2., 3.], [4., 5., 6.]],
+                                       [[7., 8., 9.], [10., 11., 12.]]])
     indices = [[0, 0, 1], [1, 1, 2]]
     updates = [100., 200.]
 
@@ -53,7 +53,7 @@ class ScatterAddNdimTest(test_util.TensorFlowTestCase):
 
   def testNoUpdates(self):
     init_val = [[[1., 2., 3.], [4., 5., 6.]], [[7., 8., 9.], [10., 11., 12.]]]
-    input_data = variables.Variable(init_val)
+    input_data = variables.VariableV1(init_val)
     indices = []
     updates = []
 
@@ -64,7 +64,7 @@ class ScatterAddNdimTest(test_util.TensorFlowTestCase):
 
   def testBadInput(self):
     init_val = [[[1., 2., 3.], [4., 5., 6.]], [[7., 8., 9.], [10., 11., 12.]]]
-    input_data = variables.Variable(init_val)
+    input_data = variables.VariableV1(init_val)
     indices = [[0, 0, 1], [1, 1, 2]]
     updates = [100.]
     with self.cached_session():
@@ -75,8 +75,8 @@ class ScatterAddNdimTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(init_val, input_data.eval())
 
   def testIncompleteIndices(self):
-    input_data = variables.Variable([[[1., 2., 3.], [4., 5., 6.]],
-                                     [[7., 8., 9.], [10., 11., 12.]]])
+    input_data = variables.VariableV1([[[1., 2., 3.], [4., 5., 6.]],
+                                       [[7., 8., 9.], [10., 11., 12.]]])
     indices = [[0, 0], [1, 1]]
     updates = [[100., 200., 300.], [400., 500., 600.]]
 
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert_test.py b/tensorflow/contrib/tensorrt/python/trt_convert_test.py
index f3a1ef0d47..52cb0bd9f9 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert_test.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert_test.py
@@ -94,7 +94,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
       with g.device("/GPU:0"):
         inp = array_ops.placeholder(
             dtype=dtypes.float32, shape=[None, 1, 1], name="input")
-        var = variables.Variable([[[1.0]]], dtype=dtypes.float32, name="v1")
+        var = variables.VariableV1([[[1.0]]], dtype=dtypes.float32, name="v1")
         add = inp + var.value()
         mul = inp * add
         add = mul + add
diff --git a/tensorflow/contrib/training/python/training/device_setter_test.py b/tensorflow/contrib/training/python/training/device_setter_test.py
index 20746d911c..3bb2dce83d 100644
--- a/tensorflow/contrib/training/python/training/device_setter_test.py
+++ b/tensorflow/contrib/training/python/training/device_setter_test.py
@@ -98,10 +98,10 @@ class GreedyLoadBalancingStrategyTest(test.TestCase):
             cluster=_CLUSTER_SPEC,
             ps_strategy=device_setter_lib.GreedyLoadBalancingStrategy(
                 2, device_setter_lib.byte_size_load_fn))):
-      u = variables.Variable(array_ops.zeros([2, 2]))
-      v = variables.Variable(array_ops.zeros([2, 1]))
-      w = variables.Variable(array_ops.zeros([2, 2]))
-      x = variables.Variable(array_ops.zeros([1, 3]))
+      u = variables.VariableV1(array_ops.zeros([2, 2]))
+      v = variables.VariableV1(array_ops.zeros([2, 1]))
+      w = variables.VariableV1(array_ops.zeros([2, 2]))
+      x = variables.VariableV1(array_ops.zeros([1, 3]))
       a = v + w
       self.assertDeviceEqual("/job:ps/task:0", u.device)
       self.assertDeviceEqual("/job:ps/task:0", u.initializer.device)
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index f576435136..5c0c405306 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -1022,7 +1022,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     with session.Session():
       a = constant_op.constant(1.0, shape=[1, 2])
       b = constant_op.constant(2.0, shape=[1, 2], name='b')
-      v = variables.Variable(a, a.dtype)
+      v = variables.VariableV1(a, a.dtype)
       assign_a_to_v = state_ops.assign(v, a)
 
       assign_a_to_v.eval()
diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py
index 4630bda590..f197a9e4dc 100644
--- a/tensorflow/python/debug/cli/analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/analyzer_cli_test.py
@@ -599,11 +599,11 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
       v_name = "simple_mul_add/v"
 
       u_init = constant_op.constant(u_init_val, shape=[2, 2], name="u_init")
-      u = variables.Variable(u_init, name=u_name)
+      u = variables.VariableV1(u_init, name=u_name)
       cls._u_line_number = line_number_above()
 
       v_init = constant_op.constant(v_init_val, shape=[2, 1], name="v_init")
-      v = variables.Variable(v_init, name=v_name)
+      v = variables.VariableV1(v_init, name=v_name)
       cls._v_line_number = line_number_above()
 
       w = math_ops.matmul(u, v, name="simple_mul_add/matmul")
@@ -612,7 +612,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
       x = math_ops.add(w, w, name="simple_mul_add/add")
       cls._x_line_number = line_number_above()
 
-      a = variables.Variable([1, 3, 3, 7], name="a")
+      a = variables.VariableV1([1, 3, 3, 7], name="a")
 
       u.initializer.run()
       v.initializer.run()
@@ -1371,7 +1371,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     # Verify the annotation of the line that creates u.
     index = self._findSourceLine(out, self._u_line_number)
     self.assertEqual(
-        ["L%d         u = variables.Variable(u_init, name=u_name)" %
+        ["L%d         u = variables.VariableV1(u_init, name=u_name)" %
          self._u_line_number,
          "    simple_mul_add/u",
          "    simple_mul_add/u/Assign",
@@ -1388,7 +1388,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     # Verify the annotation of the line that creates v.
     index = self._findSourceLine(out, self._v_line_number)
     self.assertEqual(
-        ["L%d         v = variables.Variable(v_init, name=v_name)" %
+        ["L%d         v = variables.VariableV1(v_init, name=v_name)" %
          self._v_line_number,
          "    simple_mul_add/v"],
         out.lines[index : index + 2])
@@ -1425,7 +1425,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     # Verify the annotation of the line that creates u.
     index = self._findSourceLine(out, self._u_line_number)
     self.assertEqual(
-        ["L%d         u = variables.Variable(u_init, name=u_name)" %
+        ["L%d         u = variables.VariableV1(u_init, name=u_name)" %
          self._u_line_number,
          "    simple_mul_add/u/read:0",
          "    simple_mul_add/u:0"],
@@ -1447,7 +1447,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
 
     index = self._findSourceLine(out, self._u_line_number)
     self.assertEqual(
-        ["L%d         u = variables.Variable(u_init, name=u_name)" %
+        ["L%d         u = variables.VariableV1(u_init, name=u_name)" %
          self._u_line_number,
          "    simple_mul_add/u",
          "    simple_mul_add/u/Assign",
@@ -1470,7 +1470,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
 
     index = self._findSourceLine(out, self._u_line_number)
     self.assertEqual(
-        ["L%d         u = variables.Variable(u_init, name=u_name)" %
+        ["L%d         u = variables.VariableV1(u_init, name=u_name)" %
          self._u_line_number,
          "    simple_mul_add/u",
          "    (... Omitted 2 of 3 op(s) ...) +5"],
@@ -1580,7 +1580,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     """List an input tree containing tensors from non-:0 output slot."""
 
     with session.Session(config=no_rewrite_session_config()) as sess:
-      x = variables.Variable([1, 3, 3, 7], name="x")
+      x = variables.VariableV1([1, 3, 3, 7], name="x")
       _, idx = array_ops.unique(x, name="x_unique")
       idx_times_two = math_ops.multiply(idx, 2, name="idx_times_two")
       sess.run(x.initializer)
@@ -1684,7 +1684,7 @@ class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
     with session.Session(config=no_rewrite_session_config()) as sess:
       x_init_val = np.array([5.0, 3.0])
       x_init = constant_op.constant(x_init_val, shape=[2])
-      x = variables.Variable(x_init, name="control_deps/x")
+      x = variables.VariableV1(x_init, name="control_deps/x")
 
       y = math_ops.add(x, x, name="control_deps/y")
       y = control_flow_ops.with_dependencies(
diff --git a/tensorflow/python/debug/cli/stepper_cli_test.py b/tensorflow/python/debug/cli/stepper_cli_test.py
index ee8cabca0d..7b8a42c253 100644
--- a/tensorflow/python/debug/cli/stepper_cli_test.py
+++ b/tensorflow/python/debug/cli/stepper_cli_test.py
@@ -132,8 +132,8 @@ def _parse_updated(lines):
 class NodeStepperSimpleGraphTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
-    self.a = variables.Variable(10.0, name="a")
-    self.b = variables.Variable(20.0, name="b")
+    self.a = variables.VariableV1(10.0, name="a")
+    self.b = variables.VariableV1(20.0, name="b")
 
     self.c = math_ops.add(self.a, self.b, name="c")  # Should be 30.0.
     self.d = math_ops.subtract(self.a, self.c, name="d")  # Should be -20.0.
diff --git a/tensorflow/python/debug/lib/debug_utils_test.py b/tensorflow/python/debug/lib/debug_utils_test.py
index 5b1875e092..23ab98444c 100644
--- a/tensorflow/python/debug/lib/debug_utils_test.py
+++ b/tensorflow/python/debug/lib/debug_utils_test.py
@@ -46,8 +46,8 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
       cls._b_init = constant_op.constant(
           cls._b_init_val, shape=[2, 1], name="b_init")
 
-      cls._a = variables.Variable(cls._a_init, name="a1")
-      cls._b = variables.Variable(cls._b_init, name="b")
+      cls._a = variables.VariableV1(cls._a_init, name="a1")
+      cls._b = variables.VariableV1(cls._b_init, name="b")
       cls._c = constant_op.constant(cls._c_val, shape=[2, 1], name="c")
 
       # Matrix product of a and b.
diff --git a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
index 46a7be5808..74498c8ea3 100644
--- a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
@@ -118,8 +118,8 @@ class DistributedSessionDebugTest(test_util.TensorFlowTestCase):
     """
     with ops.Graph().as_default() as graph:
       with ops.device("/job:worker/task:0/cpu:0"):
-        self.a = variables.Variable(10.0, name="a")
-        self.b = variables.Variable(100.0, name="b")
+        self.a = variables.VariableV1(10.0, name="a")
+        self.b = variables.VariableV1(100.0, name="b")
         self.inc_a = state_ops.assign_add(self.a, 2.0, name="inc_a")
         self.dec_b = state_ops.assign_add(self.b, -5.0, name="dec_b")
         self.p = math_ops.multiply(self.inc_a, self.dec_b, name="p")
diff --git a/tensorflow/python/debug/lib/grpc_large_data_test.py b/tensorflow/python/debug/lib/grpc_large_data_test.py
index 5bc477a9ba..ccc21bcf94 100644
--- a/tensorflow/python/debug/lib/grpc_large_data_test.py
+++ b/tensorflow/python/debug/lib/grpc_large_data_test.py
@@ -61,7 +61,7 @@ class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
     with self.test_session(
         use_gpu=True,
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
-      u = variables.Variable(42.0, name="original_u")
+      u = variables.VariableV1(42.0, name="original_u")
       for _ in xrange(50 * 1000):
         u = array_ops.identity(u)
       sess.run(variables.global_variables_initializer())
@@ -94,7 +94,7 @@ class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
 
       u_init = constant_op.constant(
           u_init_val_array, dtype=dtypes.float32, name="u_init")
-      u = variables.Variable(u_init, name="u")
+      u = variables.VariableV1(u_init, name="u")
 
       def watch_fn(fetches, feeds):
         del fetches, feeds  # Unused by this watch_fn.
@@ -117,7 +117,7 @@ class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
           b"", b"spam", b"A" * 2500 * 1024, b"B" * 2500 * 1024, b"egg", b""]
       u_init = constant_op.constant(
           u_init_val, dtype=dtypes.string, name="u_init")
-      u = variables.Variable(u_init, name="u")
+      u = variables.VariableV1(u_init, name="u")
 
       def watch_fn(fetches, feeds):
         del fetches, feeds
@@ -146,7 +146,7 @@ class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
 
       u_init = constant_op.constant(
           u_init_val_array, dtype=dtypes.string, name="u_init")
-      u = variables.Variable(u_init, name="u")
+      u = variables.VariableV1(u_init, name="u")
 
       def watch_fn(fetches, feeds):
         del fetches, feeds
@@ -167,7 +167,7 @@ class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
       u_init = constant_op.constant(
           [], dtype=dtypes.float32, shape=[0], name="u_init")
-      u = variables.Variable(u_init, name="u")
+      u = variables.VariableV1(u_init, name="u")
 
       def watch_fn(fetches, feeds):
         del fetches, feeds
@@ -189,7 +189,7 @@ class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
       u_init = constant_op.constant(
           [], dtype=dtypes.string, shape=[0], name="u_init")
-      u = variables.Variable(u_init, name="u")
+      u = variables.VariableV1(u_init, name="u")
 
       def watch_fn(fetches, feeds):
         del fetches, feeds
diff --git a/tensorflow/python/debug/lib/session_debug_file_test.py b/tensorflow/python/debug/lib/session_debug_file_test.py
index ba0f15b4e2..1874160dd6 100644
--- a/tensorflow/python/debug/lib/session_debug_file_test.py
+++ b/tensorflow/python/debug/lib/session_debug_file_test.py
@@ -58,9 +58,9 @@ class SessionDebugFileTest(session_debug_testlib.SessionDebugTestBase):
       v_name = "diff_Watch/v"
 
       u_init = constant_op.constant(u_init_val, shape=[2, 2])
-      u = variables.Variable(u_init, name=u_name)
+      u = variables.VariableV1(u_init, name=u_name)
       v_init = constant_op.constant(v_init_val, shape=[2, 1])
-      v = variables.Variable(v_init, name=v_name)
+      v = variables.VariableV1(v_init, name=v_name)
 
       w = math_ops.matmul(u, v, name="diff_Watch/matmul")
 
diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py
index 91f21cb1f3..bfc9a3a382 100644
--- a/tensorflow/python/debug/lib/session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py
@@ -148,8 +148,8 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
           sess, "localhost:%d" % self._server_port, watch_fn="foo")
 
   def testGrpcDebugWrapperSessionWithoutWatchFnWorks(self):
-    u = variables.Variable(2.1, name="u")
-    v = variables.Variable(20.0, name="v")
+    u = variables.VariableV1(2.1, name="u")
+    v = variables.VariableV1(20.0, name="v")
     w = math_ops.multiply(u, v, name="w")
 
     sess = session.Session(
@@ -175,8 +175,8 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
       del feeds, fetch_keys
       return ["DebugIdentity", "DebugNumericSummary"], r".*/read", None
 
-    u = variables.Variable(2.1, name="u")
-    v = variables.Variable(20.0, name="v")
+    u = variables.VariableV1(2.1, name="u")
+    v = variables.VariableV1(20.0, name="v")
     w = math_ops.multiply(u, v, name="w")
 
     sess = session.Session(
@@ -209,8 +209,8 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
           op_type_regex_whitelist=None,
           tolerate_debug_op_creation_failures=True)
 
-    u = variables.Variable(2.1, name="u")
-    v = variables.Variable(20.0, name="v")
+    u = variables.VariableV1(2.1, name="u")
+    v = variables.VariableV1(20.0, name="v")
     w = math_ops.multiply(u, v, name="w")
 
     sess = session.Session(
@@ -241,8 +241,8 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
         14, len(dump.get_tensors("v/read", 0, "DebugNumericSummary")[0]))
 
   def testTensorBoardDebugHookWorks(self):
-    u = variables.Variable(2.1, name="u")
-    v = variables.Variable(20.0, name="v")
+    u = variables.VariableV1(2.1, name="u")
+    v = variables.VariableV1(20.0, name="v")
     w = math_ops.multiply(u, v, name="w")
 
     sess = session.Session(
@@ -286,8 +286,8 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
       self._server.query_source_file_line(__file__, 1)
 
   def testTensorBoardDebugHookDisablingTracebackSourceCodeSendingWorks(self):
-    u = variables.Variable(2.1, name="u")
-    v = variables.Variable(20.0, name="v")
+    u = variables.VariableV1(2.1, name="u")
+    v = variables.VariableV1(20.0, name="v")
     w = math_ops.multiply(u, v, name="w")
 
     sess = session.Session(
@@ -381,8 +381,8 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
   def testToggleEnableTwoDebugWatchesNoCrosstalkBetweenDebugNodes(self):
     with session.Session(
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
-      v_1 = variables.Variable(50.0, name="v_1")
-      v_2 = variables.Variable(-50.0, name="v_1")
+      v_1 = variables.VariableV1(50.0, name="v_1")
+      v_2 = variables.VariableV1(-50.0, name="v_1")
       delta_1 = constant_op.constant(5.0, name="delta_1")
       delta_2 = constant_op.constant(-5.0, name="delta_2")
       inc_v_1 = state_ops.assign_add(v_1, delta_1, name="inc_v_1")
@@ -451,8 +451,8 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
 
     with session.Session(
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
-      v_1 = variables.Variable(50.0, name="v_1")
-      v_2 = variables.Variable(-50.0, name="v_1")
+      v_1 = variables.VariableV1(50.0, name="v_1")
+      v_2 = variables.VariableV1(-50.0, name="v_1")
       # These two nodes have names that match those in the
       # toggle_watch_on_core_metadata argument used when calling
       # start_server_on_separate_thread().
@@ -491,7 +491,7 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
   def testToggleEnableTwoDebugWatchesNoCrosstalkBetweenServers(self):
     with session.Session(
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
-      v = variables.Variable(50.0, name="v")
+      v = variables.VariableV1(50.0, name="v")
       delta = constant_op.constant(5.0, name="delta")
       inc_v = state_ops.assign_add(v, delta, name="inc_v")
 
@@ -534,8 +534,8 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
   def testToggleBreakpointsWorks(self):
     with session.Session(
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
-      v_1 = variables.Variable(50.0, name="v_1")
-      v_2 = variables.Variable(-50.0, name="v_2")
+      v_1 = variables.VariableV1(50.0, name="v_1")
+      v_2 = variables.VariableV1(-50.0, name="v_2")
       delta_1 = constant_op.constant(5.0, name="delta_1")
       delta_2 = constant_op.constant(-5.0, name="delta_2")
       inc_v_1 = state_ops.assign_add(v_1, delta_1, name="inc_v_1")
@@ -592,8 +592,8 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
   def testTensorBoardDebuggerWrapperToggleBreakpointsWorks(self):
     with session.Session(
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
-      v_1 = variables.Variable(50.0, name="v_1")
-      v_2 = variables.Variable(-50.0, name="v_2")
+      v_1 = variables.VariableV1(50.0, name="v_1")
+      v_2 = variables.VariableV1(-50.0, name="v_2")
       delta_1 = constant_op.constant(5.0, name="delta_1")
       delta_2 = constant_op.constant(-5.0, name="delta_2")
       inc_v_1 = state_ops.assign_add(v_1, delta_1, name="inc_v_1")
@@ -665,8 +665,8 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
   def testTensorBoardDebuggerWrapperDisablingTracebackSourceSendingWorks(self):
     with session.Session(
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
-      v_1 = variables.Variable(50.0, name="v_1")
-      v_2 = variables.Variable(-50.0, name="v_2")
+      v_1 = variables.VariableV1(50.0, name="v_1")
+      v_2 = variables.VariableV1(-50.0, name="v_2")
       delta_1 = constant_op.constant(5.0, name="delta_1")
       delta_2 = constant_op.constant(-5.0, name="delta_2")
       inc_v_1 = state_ops.assign_add(v_1, delta_1, name="inc_v_1")
@@ -699,7 +699,7 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
 
   def testGetGrpcDebugWatchesReturnsCorrectAnswer(self):
     with session.Session() as sess:
-      v = variables.Variable(50.0, name="v")
+      v = variables.VariableV1(50.0, name="v")
       delta = constant_op.constant(5.0, name="delta")
       inc_v = state_ops.assign_add(v, delta, name="inc_v")
 
@@ -743,7 +743,7 @@ class DelayedDebugServerTest(test_util.TensorFlowTestCase):
 
     with self.cached_session() as sess:
       a_init = constant_op.constant(42.0, name="a_init")
-      a = variables.Variable(a_init, name="a")
+      a = variables.VariableV1(a_init, name="a")
 
       def watch_fn(fetches, feeds):
         del fetches, feeds
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index 070d9c4cd7..25ef91b575 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -70,7 +70,7 @@ class _RNNCellForTest(rnn_cell_impl.RNNCell):
   def __init__(self, input_output_size, state_size):
     self._input_output_size = input_output_size
     self._state_size = state_size
-    self._w = variables.Variable(1.0, dtype=dtypes.float32, name="w")
+    self._w = variables.VariableV1(1.0, dtype=dtypes.float32, name="w")
 
   @property
   def output_size(self):
@@ -182,9 +182,9 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       w_name = "w"
 
       u_init = constant_op.constant(u_init_val, shape=[2, 2])
-      u = variables.Variable(u_init, name=u_name)
+      u = variables.VariableV1(u_init, name=u_name)
       v_init = constant_op.constant(v_init_val, shape=[2, 1])
-      v = variables.Variable(v_init, name=v_name)
+      v = variables.VariableV1(v_init, name=v_name)
 
       w = math_ops.matmul(u, v, name=w_name)
 
@@ -221,8 +221,8 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
   def testCopyNodesHaveCorrectDebugOpsAndURLsAttributeValues(self):
     with session.Session() as sess:
-      u = variables.Variable(2.1, name="u")
-      v = variables.Variable(20.0, name="v")
+      u = variables.VariableV1(2.1, name="u")
+      v = variables.VariableV1(20.0, name="v")
       w = math_ops.multiply(u, v, name="w")
 
       sess.run(variables.global_variables_initializer())
@@ -324,8 +324,8 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
       str1_name = "str1"
       str2_name = "str2"
-      str1 = variables.Variable(str1_init, name=str1_name)
-      str2 = variables.Variable(str2_init, name=str2_name)
+      str1 = variables.VariableV1(str1_init, name=str1_name)
+      str2 = variables.VariableV1(str2_init, name=str2_name)
       # Concatenate str1 and str2
       str_concat = math_ops.add(str1, str2, name="str_concat")
 
@@ -387,9 +387,9 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       s_name = "%s/s" % op_namespace
 
       u_init = constant_op.constant(u_init_val, shape=[2, 2])
-      u = variables.Variable(u_init, name=u_name)
+      u = variables.VariableV1(u_init, name=u_name)
       s_init = constant_op.constant(s_init_val)
-      s = variables.Variable(s_init, name=s_name)
+      s = variables.VariableV1(s_init, name=s_name)
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       debug_urls = self._debug_urls()
@@ -439,7 +439,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
       u_init_val = np.array(11.0)
       u_init = constant_op.constant(u_init_val)
-      u = variables.Variable(u_init, name=u_name)
+      u = variables.VariableV1(u_init, name=u_name)
 
       # "v" is the increment.
       v_name = "testDumpToFileWhileLoop/v"
@@ -447,7 +447,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
       v_init_val = np.array(2.0)
       v_init = constant_op.constant(v_init_val)
-      v = variables.Variable(v_init, name=v_name)
+      v = variables.VariableV1(v_init, name=v_name)
 
       u.initializer.run()
       v.initializer.run()
@@ -605,8 +605,8 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
   def testDebugCondWatchingWholeGraphWorks(self):
     with session.Session() as sess:
-      x = variables.Variable(10.0, name="x")
-      y = variables.Variable(20.0, name="y")
+      x = variables.VariableV1(10.0, name="x")
+      y = variables.VariableV1(20.0, name="y")
       cond = control_flow_ops.cond(
           x > y, lambda: math_ops.add(x, 1), lambda: math_ops.add(y, 1))
 
@@ -628,9 +628,9 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       z_name = "testFindNodesWithBadTensorValues/z"
 
       u_init = constant_op.constant([2.0, 4.0])
-      u = variables.Variable(u_init, name=u_name)
+      u = variables.VariableV1(u_init, name=u_name)
       v_init = constant_op.constant([2.0, 1.0])
-      v = variables.Variable(v_init, name=v_name)
+      v = variables.VariableV1(v_init, name=v_name)
 
       # Expected output: [0.0, 3.0]
       w = math_ops.subtract(u, v, name=w_name)
@@ -679,9 +679,9 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       z_name = "testFindInfOrNanWithOpNameExclusion/z"
 
       u_init = constant_op.constant([2.0, 4.0])
-      u = variables.Variable(u_init, name=u_name)
+      u = variables.VariableV1(u_init, name=u_name)
       v_init = constant_op.constant([2.0, 1.0])
-      v = variables.Variable(v_init, name=v_name)
+      v = variables.VariableV1(v_init, name=v_name)
 
       # Expected output: [0.0, 3.0]
       w = math_ops.subtract(u, v, name=w_name)
@@ -725,7 +725,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       w_name = "testDumpGraphStructureLookup/w"
 
       u_init = constant_op.constant([2.0, 4.0])
-      u = variables.Variable(u_init, name=u_name)
+      u = variables.VariableV1(u_init, name=u_name)
       v = math_ops.add(u, u, name=v_name)
       w = math_ops.add(v, v, name=w_name)
 
@@ -859,9 +859,9 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
   def testGraphPathFindingOnControlEdgesWorks(self):
     with session.Session(config=no_rewrite_session_config()) as sess:
-      v1 = variables.Variable(1.0, name="v1")
-      v2 = variables.Variable(2.0, name="v2")
-      v3 = variables.Variable(3.0, name="v3")
+      v1 = variables.VariableV1(1.0, name="v1")
+      v2 = variables.VariableV1(2.0, name="v2")
+      v3 = variables.VariableV1(3.0, name="v3")
       a = math_ops.add(v1, v2, name="a")
       with ops.control_dependencies([a]):
         c = math_ops.subtract(v3, v3, name="c")
@@ -875,8 +875,8 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
   def testGraphPathFindingReverseRefEdgeWorks(self):
     with session.Session(config=no_rewrite_session_config()) as sess:
-      v = variables.Variable(10.0, name="v")
-      delta = variables.Variable(1.0, name="delta")
+      v = variables.VariableV1(10.0, name="v")
+      delta = variables.VariableV1(1.0, name="delta")
       inc_v = state_ops.assign_add(v, delta, name="inc_v")
 
       sess.run(variables.global_variables_initializer())
@@ -894,7 +894,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       w_name = "testDumpCausalityCheck/w"
 
       u_init = constant_op.constant([2.0, 4.0])
-      u = variables.Variable(u_init, name=u_name)
+      u = variables.VariableV1(u_init, name=u_name)
       v = math_ops.add(u, u, name=v_name)
       w = math_ops.add(v, v, name=w_name)
 
@@ -980,7 +980,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       w_name = "oneOfTwoSlots/w"
       y_name = "oneOfTwoSlots/y"
 
-      x = variables.Variable([1, 3, 3, 7], dtype=dtypes.int32, name=x_name)
+      x = variables.VariableV1([1, 3, 3, 7], dtype=dtypes.int32, name=x_name)
       sess.run(x.initializer)
 
       unique_x, indices, _ = array_ops.unique_with_counts(x, name=u_name)
@@ -1039,9 +1039,9 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
     with session.Session(config=no_rewrite_session_config()) as sess:
       u_init = constant_op.constant(10.0)
-      u = variables.Variable(u_init, name="gdo/u")
+      u = variables.VariableV1(u_init, name="gdo/u")
       v_init = constant_op.constant(20.0)
-      v = variables.Variable(v_init, name="gdo/v")
+      v = variables.VariableV1(v_init, name="gdo/v")
 
       w = math_ops.multiply(u, v, name="gdo/w")
       # gdo stands for GradientDescentOptimizer.
@@ -1085,7 +1085,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
     with session.Session() as sess:
       x_init = constant_op.constant([2, 2, 3, 5, 5])
-      x = variables.Variable(x_init, name="unconnected/x")
+      x = variables.VariableV1(x_init, name="unconnected/x")
 
       # The UniqueOp (tf.unique) has two output slots. Use only slot 0 in the
       # graph. Let the debugger watch the unused slot 1.
@@ -1225,14 +1225,14 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
   def testDebugNumericSummaryOnInitializedTensorGivesCorrectResult(self):
     with session.Session(config=no_rewrite_session_config()) as sess:
-      a = variables.Variable(
+      a = variables.VariableV1(
           [
               np.nan, np.nan, 0.0, 0.0, 0.0, -1.0, -3.0, 3.0, 7.0, -np.inf,
               -np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.nan, np.nan
           ],
           dtype=np.float32,
           name="numeric_summary/a")
-      b = variables.Variable(
+      b = variables.VariableV1(
           [0.0] * 18, dtype=np.float32, name="numeric_summary/b")
       c = math_ops.add(a, b, name="numeric_summary/c")
 
@@ -1249,7 +1249,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
   def testDebugNumericSummaryOnUninitializedTensorGivesCorrectResult(self):
     with session.Session() as sess:
-      a = variables.Variable(
+      a = variables.VariableV1(
           [42], dtype=np.float32, name="numeric_summary_uninit/a")
 
       _, dump = self._debug_run_and_get_dump(
@@ -1275,9 +1275,9 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
   def testDebugNumericSummaryFailureIsToleratedWhenOrdered(self):
     with session.Session() as sess:
-      a = variables.Variable("1", name="a")
-      b = variables.Variable("3", name="b")
-      c = variables.Variable("2", name="c")
+      a = variables.VariableV1("1", name="a")
+      b = variables.VariableV1("3", name="b")
+      c = variables.VariableV1("2", name="c")
 
       d = math_ops.add(a, b, name="d")
       e = math_ops.add(d, c, name="e")
@@ -1313,9 +1313,9 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
   def testDebugNumericSummaryInvalidAttributesStringAreCaught(self):
     with session.Session(config=no_rewrite_session_config()) as sess:
-      a = variables.Variable(10.0, name="a")
-      b = variables.Variable(0.0, name="b")
-      c = variables.Variable(0.0, name="c")
+      a = variables.VariableV1(10.0, name="a")
+      b = variables.VariableV1(0.0, name="b")
+      c = variables.VariableV1(0.0, name="c")
 
       x = math_ops.divide(a, b, name="x")
       y = math_ops.multiply(x, c, name="y")
@@ -1361,9 +1361,9 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
   def testDebugNumericSummaryMuteOnHealthyMutesOnlyHealthyTensorDumps(self):
     with session.Session(config=no_rewrite_session_config()) as sess:
-      a = variables.Variable(10.0, name="a")
-      b = variables.Variable(0.0, name="b")
-      c = variables.Variable(0.0, name="c")
+      a = variables.VariableV1(10.0, name="a")
+      b = variables.VariableV1(0.0, name="b")
+      c = variables.VariableV1(0.0, name="c")
 
       x = math_ops.divide(a, b, name="x")
       y = math_ops.multiply(x, c, name="y")
@@ -1396,8 +1396,8 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
   def testDebugNumericSummaryMuteOnHealthyAndCustomBoundsWork(self):
     with session.Session() as sess:
-      a = variables.Variable([10.0, 10.0], name="a")
-      b = variables.Variable([10.0, 2.0], name="b")
+      a = variables.VariableV1([10.0, 10.0], name="a")
+      b = variables.VariableV1([10.0, 2.0], name="b")
 
       x = math_ops.add(a, b, name="x")  # [20.0, 12.0]
       y = math_ops.divide(x, b, name="y")  # [2.0, 6.0]
@@ -1436,9 +1436,9 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
   def testLookUpNodePythonTracebackWorks(self):
     with session.Session() as sess:
       u_init = constant_op.constant(10.0)
-      u = variables.Variable(u_init, name="traceback/u")
+      u = variables.VariableV1(u_init, name="traceback/u")
       v_init = constant_op.constant(20.0)
-      v = variables.Variable(v_init, name="traceback/v")
+      v = variables.VariableV1(v_init, name="traceback/v")
 
       w = math_ops.multiply(u, v, name="traceback/w")
 
@@ -1487,7 +1487,7 @@ class DebugConcurrentRunCallsTest(test_util.TensorFlowTestCase):
       self.skipTest("No testing concurrent runs on a single GPU.")
 
     with session.Session() as sess:
-      v = variables.Variable(30.0, name="v")
+      v = variables.VariableV1(30.0, name="v")
       constants = []
       for i in xrange(self._num_concurrent_runs):
         constants.append(constant_op.constant(1.0, name="c%d" % i))
diff --git a/tensorflow/python/debug/lib/stepper_test.py b/tensorflow/python/debug/lib/stepper_test.py
index 9a3d0efabf..3839c67198 100644
--- a/tensorflow/python/debug/lib/stepper_test.py
+++ b/tensorflow/python/debug/lib/stepper_test.py
@@ -36,8 +36,8 @@ from tensorflow.python.training import gradient_descent
 class StepperTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
-    self.a = variables.Variable(2.0, name="a")
-    self.b = variables.Variable(3.0, name="b")
+    self.a = variables.VariableV1(2.0, name="a")
+    self.b = variables.VariableV1(3.0, name="b")
 
     self.c = math_ops.multiply(self.a, self.b, name="c")  # Should be 6.0.
     self.d = math_ops.multiply(self.a, self.a, name="d")  # Should be 4.0.
@@ -49,7 +49,7 @@ class StepperTest(test_util.TensorFlowTestCase):
 
     # The there nodes x, y and z form a graph with "cross-links" in. I.e., x
     # and y are both direct inputs to z, but x is also a direct input to y.
-    self.x = variables.Variable(2.0, name="x")  # Should be 2.0
+    self.x = variables.VariableV1(2.0, name="x")  # Should be 2.0
     self.y = math_ops.negative(self.x, name="y")  # Should be -2.0.
 
     self.z = math_ops.multiply(self.x, self.y, name="z")  # Should be -4.0.
@@ -580,7 +580,7 @@ class StepperTestWithPlaceHolders(test_util.TensorFlowTestCase):
 class StepperAssignAddTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
-    self.v = variables.Variable(10.0, name="v")
+    self.v = variables.VariableV1(10.0, name="v")
     self.p = math_ops.add(self.v, self.v, name="p")
     self.q = math_ops.multiply(self.p, self.p, name="q")
     self.delta = constant_op.constant(2.0, name="delta")
@@ -711,9 +711,9 @@ class StepperBackwardRunTest(test_util.TensorFlowTestCase):
     Construct a backward graph using the GradientDescentOptimizer.
     """
 
-    self.a = variables.Variable(1.0, name="a")
-    self.b = variables.Variable(2.0, name="b")
-    self.c = variables.Variable(4.0, name="c")
+    self.a = variables.VariableV1(1.0, name="a")
+    self.b = variables.VariableV1(2.0, name="b")
+    self.c = variables.VariableV1(4.0, name="c")
     self.d = math_ops.multiply(self.a, self.b, name="d")
     self.e = math_ops.multiply(self.b, self.c, name="e")
     self.f = math_ops.multiply(self.d, self.e, name="f")
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
index 254201c393..11011a5c13 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
@@ -46,7 +46,7 @@ class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
   def setUp(self):
     self.session_root = tempfile.mkdtemp()
 
-    self.v = variables.Variable(10.0, dtype=dtypes.float32, name="v")
+    self.v = variables.VariableV1(10.0, dtype=dtypes.float32, name="v")
     self.delta = constant_op.constant(1.0, dtype=dtypes.float32, name="delta")
     self.eta = constant_op.constant(-1.4, dtype=dtypes.float32, name="eta")
     self.inc_v = state_ops.assign_add(self.v, self.delta, name="inc_v")
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index 05c9eaa4d2..149a7497df 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -132,8 +132,8 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
   def setUp(self):
     self._tmp_dir = tempfile.mktemp()
 
-    self.v = variables.Variable(10.0, name="v")
-    self.w = variables.Variable(21.0, name="w")
+    self.v = variables.VariableV1(10.0, name="v")
+    self.w = variables.VariableV1(21.0, name="w")
     self.delta = constant_op.constant(1.0, name="delta")
     self.inc_v = state_ops.assign_add(self.v, self.delta, name="inc_v")
 
@@ -358,7 +358,7 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
   def testDebuggingMakeCallableTensorRunnerWorks(self):
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
         [["run"], ["run"]], self.sess, dump_root=self._tmp_dir)
-    v = variables.Variable(42)
+    v = variables.VariableV1(42)
     tensor_runner = wrapped_sess.make_callable(v)
     self.sess.run(v.initializer)
 
@@ -382,7 +382,7 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
   def testDebuggingMakeCallableOperationRunnerWorks(self):
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
         [["run"], ["run"]], self.sess, dump_root=self._tmp_dir)
-    v = variables.Variable(10.0)
+    v = variables.VariableV1(10.0)
     inc_v = state_ops.assign_add(v, 1.0)
     op_runner = wrapped_sess.make_callable(inc_v.op)
     self.sess.run(v.initializer)
@@ -403,7 +403,7 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual(1, len(wrapped_sess.observers["debug_dumps"]))
 
   def testDebuggingMakeCallableFromOptionsWithZeroFeedWorks(self):
-    variable_1 = variables.Variable(
+    variable_1 = variables.VariableV1(
         10.5, dtype=dtypes.float32, name="variable_1")
     a = math_ops.add(variable_1, variable_1, "callable_a")
     math_ops.add(a, a, "callable_b")
@@ -480,7 +480,7 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
       self.assertItemsEqual(["callable_a", "callable_b"], node_names)
 
   def testDebugMakeCallableFromOptionsWithCustomOptionsAndMetadataWorks(self):
-    variable_1 = variables.Variable(
+    variable_1 = variables.VariableV1(
         10.5, dtype=dtypes.float32, name="variable_1")
     a = math_ops.add(variable_1, variable_1, "callable_a")
     math_ops.add(a, a, "callable_b")
@@ -528,7 +528,7 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
   def testRuntimeErrorBeforeGraphExecutionIsRaised(self):
     # Use an impossible device name to cause an error before graph execution.
     with ops.device("/device:GPU:1337"):
-      w = variables.Variable([1.0] * 10, name="w")
+      w = variables.VariableV1([1.0] * 10, name="w")
 
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
         [["run"]], self.sess, dump_root=self._tmp_dir)
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index 1ed5e30b0e..bc2504ca19 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -1017,7 +1017,7 @@ class EstimatorGetVariablesTest(test.TestCase):
 
     def _model_fn(features, labels, mode):
       _, _ = features, labels
-      variables.Variable(1., name='one')
+      variables.VariableV1(1., name='one')
       return model_fn_lib.EstimatorSpec(
           mode=mode,
           loss=constant_op.constant(0.),
@@ -1033,8 +1033,8 @@ class EstimatorGetVariablesTest(test.TestCase):
 
     def _model_fn(features, labels, mode):
       _, _ = features, labels
-      variables.Variable(1., name='one')
-      variables.Variable(3., name='three')
+      variables.VariableV1(1., name='one')
+      variables.VariableV1(3., name='three')
       return model_fn_lib.EstimatorSpec(
           mode=mode,
           loss=constant_op.constant(0.),
@@ -1178,13 +1178,13 @@ class EstimatorEvaluateTest(test.TestCase):
     def _model_fn(features, labels, mode, params):
       del features, labels, params
       mean = metrics_module.Mean()
-      mean.update_state(variables.Variable(2.) + 1)
+      mean.update_state(variables.VariableV1(2.) + 1)
       return model_fn_lib.EstimatorSpec(
           mode,
           loss=constant_op.constant(1.),
           eval_metric_ops={
               'mean1': mean,
-              'mean2': metrics_lib.mean(variables.Variable(2.) + 1)
+              'mean2': metrics_lib.mean(variables.VariableV1(2.) + 1)
           })
 
     est = estimator.Estimator(model_fn=_model_fn)
@@ -1332,7 +1332,7 @@ class EstimatorEvaluateTest(test.TestCase):
 
     def _model_fn_with_incremental_loss(features, labels, mode):
       _, _ = features, labels
-      local_weight = variables.Variable(
+      local_weight = variables.VariableV1(
           0., name='local_weight', collections=[ops.GraphKeys.LOCAL_VARIABLES])
       # Loss will be 2, 4, 6, ...
       loss = 2 * state_ops.assign_add(local_weight, 1.)
@@ -1385,7 +1385,7 @@ class EstimatorEvaluateTest(test.TestCase):
     def _get_model_fn(val=1):
       def _model_fn(features, labels, mode):
         del features, labels  # unused
-        variables.Variable(val, name='weight')
+        variables.VariableV1(val, name='weight')
         return model_fn_lib.EstimatorSpec(
             mode=mode,
             predictions=constant_op.constant([[1.]]),
@@ -1409,7 +1409,7 @@ class EstimatorEvaluateTest(test.TestCase):
 
     def _model_fn_scaffold(features, labels, mode):
       _, _ = features, labels
-      variables.Variable(1., name='weight')
+      variables.VariableV1(1., name='weight')
       self.mock_saver = get_mock_saver()
       return model_fn_lib.EstimatorSpec(
           mode=mode,
@@ -1603,7 +1603,7 @@ class EstimatorPredictTest(test.TestCase):
   def test_no_checkpoint_uses_init(self):
     def _model_fn(features, labels, mode, params, config):
       del features, labels, params, config
-      x = variables.Variable([[3.]], name='x')
+      x = variables.VariableV1([[3.]], name='x')
       return model_fn_lib.EstimatorSpec(mode, predictions=math_ops.add(x, 1.))
     est = estimator.Estimator(model_fn=_model_fn)
     # Expected prediction value is 1 + the value of the Variable that is newly
@@ -1614,7 +1614,7 @@ class EstimatorPredictTest(test.TestCase):
     def _make_model_fn(x):
       def _variable_creating_and_export_model_fn(features, labels, mode):
         _, _ = features, labels
-        x_var = variables.Variable([[x]], name='x')
+        x_var = variables.VariableV1([[x]], name='x')
         return model_fn_lib.EstimatorSpec(
             mode,
             predictions=math_ops.add(x_var, 1.),
@@ -1936,7 +1936,7 @@ class EstimatorPredictTest(test.TestCase):
 
     def _model_fn(features, labels, mode):
       _, _ = features, labels
-      v = variables.Variable([[16.]], name='weight')
+      v = variables.VariableV1([[16.]], name='weight')
       prediction = v * 2
       return model_fn_lib.EstimatorSpec(
           mode,
@@ -1953,7 +1953,7 @@ class EstimatorPredictTest(test.TestCase):
 
     def _model_fn(features, labels, mode):
       _, _ = features, labels
-      v = variables.Variable([[16.]], name='weight')
+      v = variables.VariableV1([[16.]], name='weight')
       prediction = v * 2
       return model_fn_lib.EstimatorSpec(
           mode,
@@ -1974,7 +1974,7 @@ class EstimatorPredictTest(test.TestCase):
 
     def _model_fn_scaffold(features, labels, mode):
       _, _ = features, labels
-      variables.Variable(1., name='weight')
+      variables.VariableV1(1., name='weight')
       self.mock_saver = get_mock_saver()
       return model_fn_lib.EstimatorSpec(
           mode=mode,
@@ -2029,7 +2029,7 @@ class EstimatorPredictTest(test.TestCase):
 
 def _model_fn_for_export_tests(features, labels, mode):
   _, _ = features, labels
-  variables.Variable(1., name='weight')
+  variables.VariableV1(1., name='weight')
   scores = constant_op.constant([3.])
   classes = constant_op.constant(['wumpus'])
   update_global_step = state_ops.assign_add(training.get_global_step(), 1)
@@ -2052,11 +2052,11 @@ def _x_y_input_fn():
 
 def _model_fn_with_x_y(features, labels, mode):
   _ = labels
-  variables.Variable(1., name='weight')
+  variables.VariableV1(1., name='weight')
   scores = constant_op.constant([3.])
   classes = constant_op.constant(['wumpus'])
   if mode == model_fn_lib.ModeKeys.PREDICT:
-    variables.Variable(36., name='name_collision')
+    variables.VariableV1(36., name='name_collision')
     return model_fn_lib.EstimatorSpec(
         mode,
         predictions=constant_op.constant(10.),
@@ -2076,8 +2076,8 @@ def _model_fn_with_x_y(features, labels, mode):
             metrics_lib.mean(
                 features['x'] - features['y'], name='{}mean'.format(prefix))
     }
-    variables.Variable(1., name='later_var')
-    variables.Variable(3., name='name_collision')
+    variables.VariableV1(1., name='later_var')
+    variables.VariableV1(3., name='name_collision')
     return model_fn_lib.EstimatorSpec(
         mode,
         predictions=multiplied,
@@ -2411,9 +2411,9 @@ class EstimatorExportTest(test.TestCase):
     def _model_fn_with_predict_only_vars(features, labels, mode):
       _, _ = features, labels
       if mode == model_fn_lib.ModeKeys.PREDICT:
-        variables.Variable(1., name='only_in_predict')
+        variables.VariableV1(1., name='only_in_predict')
       else:
-        variables.Variable(1., name='otherwise')
+        variables.VariableV1(1., name='otherwise')
 
       prediction = constant_op.constant(1.)
       return model_fn_lib.EstimatorSpec(
@@ -2684,7 +2684,7 @@ class EstimatorExportTest(test.TestCase):
 
     def _model_fn_scaffold(features, labels, mode):
       _, _ = features, labels
-      variables.Variable(1., name='weight')
+      variables.VariableV1(1., name='weight')
       self.mock_saver = get_mock_saver()
       scores = constant_op.constant([3.])
       return model_fn_lib.EstimatorSpec(
@@ -2717,7 +2717,7 @@ class EstimatorExportTest(test.TestCase):
 
     def _model_fn_scaffold(features, labels, mode):
       _, _ = features, labels
-      variables.Variable(1., name='weight')
+      variables.VariableV1(1., name='weight')
 
       scores = constant_op.constant([3.])
       if mode == model_fn_lib.ModeKeys.PREDICT:
@@ -2762,8 +2762,8 @@ class EstimatorExportTest(test.TestCase):
 
     def _model_fn_scaffold(features, labels, mode):
       _, _ = features, labels
-      my_int = variables.Variable(1, name='my_int',
-                                  collections=[ops.GraphKeys.LOCAL_VARIABLES])
+      my_int = variables.VariableV1(1, name='my_int',
+                                    collections=[ops.GraphKeys.LOCAL_VARIABLES])
       _ = training.get_or_create_steps_per_run_variable()
       scores = constant_op.constant([3.])
       with ops.control_dependencies([
@@ -2808,8 +2808,8 @@ class EstimatorExportTest(test.TestCase):
 
     def _model_fn_scaffold(features, labels, mode):
       _, _ = features, labels
-      my_int = variables.Variable(1, name='my_int',
-                                  collections=[ops.GraphKeys.LOCAL_VARIABLES])
+      my_int = variables.VariableV1(1, name='my_int',
+                                    collections=[ops.GraphKeys.LOCAL_VARIABLES])
       scores = constant_op.constant([3.])
       with ops.control_dependencies([
           variables.local_variables_initializer(),
@@ -3038,7 +3038,7 @@ class EstimatorExportTest(test.TestCase):
 
     def _model_fn(features, labels, mode):
       _, _ = features, labels
-      variables.Variable(1., name='weight')
+      variables.VariableV1(1., name='weight')
       return model_fn_lib.EstimatorSpec(
           mode,
           predictions=constant_op.constant(10.),
@@ -3081,7 +3081,7 @@ class EstimatorHookOrderingTest(test.TestCase):
       """A graph that generates NaN's for testing."""
       del features, labels
 
-      global_step = variables.Variable(
+      global_step = variables.VariableV1(
           0, dtype=dtypes.int64, name='global_step')
       inc_global_step = state_ops.assign_add(global_step, 1)
       nan_const = constant_op.constant(np.nan, dtype=dtypes.float32)
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index f740e5cfaa..87f567db0e 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -113,7 +113,7 @@ class FunctionTest(test.TestCase):
       return a
 
     with ops.Graph().as_default():
-      var = variables.Variable([18.0])
+      var = variables.VariableV1([18.0])
       call = MyIdentityFunc(var._ref())  # pylint: disable=protected-access
       self.assertEqual("MyIdentity", call.op.name)
       for cfg in _OptimizerOptions():
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index 2dafb94ba7..563a177dd0 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -104,13 +104,13 @@ class DeviceFunctionsTest(test.TestCase):
 
   def testNestedDeviceFunctions(self):
     with ops.Graph().as_default():
-      var_0 = variables.Variable(0)
+      var_0 = variables.VariableV1(0)
       with ops.device(test_device_func_pin_variable_to_cpu):
-        var_1 = variables.Variable(1)
+        var_1 = variables.VariableV1(1)
         with ops.device(lambda op: "/device:GPU:0"):
-          var_2 = variables.Variable(2)
+          var_2 = variables.VariableV1(2)
         with ops.device("/device:GPU:0"):  # Implicit merging device function.
-          var_3 = variables.Variable(3)
+          var_3 = variables.VariableV1(3)
 
     self.assertDeviceEqual(var_0.device, None)
     self.assertDeviceEqual(var_1.device, "/device:CPU:0")
diff --git a/tensorflow/python/framework/subscribe_test.py b/tensorflow/python/framework/subscribe_test.py
index 1d594e4078..cab426844d 100644
--- a/tensorflow/python/framework/subscribe_test.py
+++ b/tensorflow/python/framework/subscribe_test.py
@@ -212,8 +212,8 @@ class SubscribeTest(test_util.TensorFlowTestCase):
 
   def testSubscribeVariable(self):
     """Confirm that variables can be subscribed."""
-    v1 = variables.Variable(0.0)
-    v2 = variables.Variable(4.0)
+    v1 = variables.VariableV1(0.0)
+    v2 = variables.VariableV1(4.0)
     add = math_ops.add(v1, v2)
     assign_v1 = v1.assign(3.0)
 
diff --git a/tensorflow/python/grappler/item_test.py b/tensorflow/python/grappler/item_test.py
index c40de9da0a..d3d96c646c 100644
--- a/tensorflow/python/grappler/item_test.py
+++ b/tensorflow/python/grappler/item_test.py
@@ -110,7 +110,7 @@ class ItemTest(test.TestCase):
   def testColocationContraints(self):
     with ops.Graph().as_default() as g:
       c = constant_op.constant([10])
-      v = variables.Variable([3], dtype=dtypes.int32)
+      v = variables.VariableV1([3], dtype=dtypes.int32)
       i = gen_array_ops.ref_identity(v)
       a = state_ops.assign(i, c)
       train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
index b658edff2d..03b42f6453 100644
--- a/tensorflow/python/grappler/memory_optimizer_test.py
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -39,8 +39,8 @@ class MemoryOptimizerSwapTest(test.TestCase):
 
   def testNoSwapping(self):
     """Make sure the graph is preserved when there is nothing to swap."""
-    a = variables.Variable(10, name='a')
-    b = variables.Variable(20, name='b')
+    a = variables.VariableV1(10, name='a')
+    b = variables.VariableV1(20, name='b')
     c = math_ops.add_n([a, b], name='c')
     d = math_ops.add_n([b, c], name='d')
     train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
@@ -60,8 +60,8 @@ class MemoryOptimizerSwapTest(test.TestCase):
 
   def testSimpleSwap(self):
     """Check that the swap annotations are followed."""
-    a = variables.Variable(10, name='a')
-    b = variables.Variable(20, name='b')
+    a = variables.VariableV1(10, name='a')
+    b = variables.VariableV1(20, name='b')
     c = math_ops.add_n([a, b], name='c')
     d = math_ops.add_n([b, c], name='d')
     train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
@@ -244,7 +244,7 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
         init_op_name=init_op_name,
         train_op_name=train_op_name,
         loss_op_name=loss_op_name)
-    self.assertAllClose(original_loss, memory_optimized_loss, rtol=1e-4)
+    self.assertAllClose(original_loss, memory_optimized_loss, rtol=1e-2)
 
   def _annotated_graph(self):
     graph = ops.Graph()
diff --git a/tensorflow/python/grappler/tf_optimizer_test.py b/tensorflow/python/grappler/tf_optimizer_test.py
index 5a9afe7257..eca0f67982 100644
--- a/tensorflow/python/grappler/tf_optimizer_test.py
+++ b/tensorflow/python/grappler/tf_optimizer_test.py
@@ -57,7 +57,7 @@ class PyWrapOptimizeGraphTest(test.TestCase):
   def testKeepNodes(self):
     g = ops.Graph()
     with g.as_default():
-      a1 = variables.Variable(
+      a1 = variables.VariableV1(
           1.0)  # Must be preserved since it's in the collection 'variables'.
       a2 = constant_op.constant(0, shape=[50, 50], name='keep')
       ops.add_to_collection('a2', a2)  # Explicitly add to collection.
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index cb19a412a2..e98b131ae6 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -1972,7 +1972,9 @@ def make_variable(name,
   if use_resource is None:
     use_resource = True
 
-  v = tf_variables.Variable(
+  # TODO(apassos,rohanj) figure out how to remove collections from here so we
+  # can remove the V1.
+  v = tf_variables.VariableV1(
       initial_value=init_val,
       name=name,
       trainable=trainable,
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 2fe85839d0..c5547b19be 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -1001,14 +1001,14 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
         errors.FailedPreconditionError,
         "Attempting to use uninitialized value Variable"):
       with self.cached_session() as sess:
-        v = variables.Variable([1, 2])
+        v = variables.VariableV1([1, 2])
         sess.run(v[:].assign([1, 2]))
 
   def testTypeError(self):
     init_val = constant_op.constant([1, 2], dtype=dtypes.int32)
     too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8)
     too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64)
-    v = variables.Variable(init_val)
+    v = variables.VariableV1(init_val)
     with self.assertRaises(TypeError):
       v[:].assign(too_small_val)
     with self.assertRaises(TypeError):
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index fc4d2a3809..083de84775 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -130,7 +130,7 @@ class ControlFlowTest(test.TestCase):
 
   def testRefIdentity(self):
     with self.cached_session():
-      v = variables.Variable(7)
+      v = variables.VariableV1(7)
 
       v = control_flow_ops._Identity(v)
       op = state_ops.assign(v, 9)
@@ -142,7 +142,7 @@ class ControlFlowTest(test.TestCase):
 
   def testRefEnter(self):
     with self.cached_session():
-      v = variables.Variable(7)
+      v = variables.VariableV1(7)
 
       enter_v = control_flow_ops._Enter(v, "foo_1", is_constant=True)
       nine = constant_op.constant(9)
@@ -155,7 +155,7 @@ class ControlFlowTest(test.TestCase):
 
   def testRefSwitch(self):
     with self.cached_session():
-      v = variables.Variable(7)
+      v = variables.VariableV1(7)
 
       p = constant_op.constant(True)
       v1 = control_flow_ops._SwitchRefOrTensor(v._ref(), p)  # pylint: disable=protected-access
@@ -796,7 +796,7 @@ class ControlFlowTest(test.TestCase):
 
   def testWhileWithRefs_1(self):
     with self.cached_session() as sess:
-      x = variables.Variable(0)._ref()  # pylint: disable=protected-access
+      x = variables.VariableV1(0)._ref()  # pylint: disable=protected-access
       i = constant_op.constant(0)
       c = lambda i, x: math_ops.less(i, 100)
 
@@ -2317,7 +2317,7 @@ class ControlFlowTest(test.TestCase):
 
   def testWhileWithRefsWithGradients_1(self):
     with self.cached_session() as sess:
-      x = variables.Variable(0.)._ref()  # pylint: disable=protected-access
+      x = variables.VariableV1(0.)._ref()  # pylint: disable=protected-access
       i = constant_op.constant(0)
       c = lambda i, x: math_ops.less(i, 10)
 
@@ -2329,7 +2329,7 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(c, body, [i, x], parallel_iterations=5)
 
-      grad_ys = [variables.Variable(73)._ref()]  # pylint: disable=protected-access
+      grad_ys = [variables.VariableV1(73)._ref()]  # pylint: disable=protected-access
       grad = gradients_impl.gradients([r[1]], [x], grad_ys=grad_ys)
 
       variables.global_variables_initializer().run()
@@ -2779,7 +2779,7 @@ class ControlFlowTest(test.TestCase):
 
   def testWithOpsDependencies(self):
     with self.cached_session() as sess:
-      v = variables.Variable(0.0)
+      v = variables.VariableV1(0.0)
       c = constant_op.constant(10)
 
       # Fetching v directly will result in an uninitialized error
@@ -2802,7 +2802,7 @@ class ControlFlowTest(test.TestCase):
 
   def testWithTensorDependencies(self):
     with self.cached_session():
-      v = variables.Variable(0.0)
+      v = variables.VariableV1(0.0)
       c1 = constant_op.constant(10)
       c2 = constant_op.constant(20)
 
@@ -2828,7 +2828,7 @@ class ControlFlowTest(test.TestCase):
 
   def testWithIndexedSlicesDependencies(self):
     with self.cached_session():
-      v = variables.Variable(
+      v = variables.VariableV1(
           np.array([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]]).astype(np.float32))
       v_at_1 = ops.IndexedSlices(v, constant_op.constant([1]))
       gather_v_at_1 = array_ops.gather(v_at_1.values, v_at_1.indices)
@@ -2851,18 +2851,18 @@ class ControlFlowTest(test.TestCase):
     with ops.Graph().as_default():
       # device set on tensor => same device on dep.
       with ops.device("/job:ps"):
-        vd = variables.Variable([0.0])
+        vd = variables.VariableV1([0.0])
       with_vd_dep = control_flow_ops.with_dependencies([vd.initializer], vd)
       self.assertTrue("/job:ps" in with_vd_dep.device)
 
       # No device set on tensor => no device on dep.
-      vnod = variables.Variable([0.0])
+      vnod = variables.VariableV1([0.0])
       with_vnod_dep = control_flow_ops.with_dependencies([vnod.initializer],
                                                          vnod)
       self.assertDeviceEqual(None, with_vnod_dep.device)
 
       # device set on tensor, default device on graph => default device on dep.
-      vdef = variables.Variable([0.0], name="vdef")
+      vdef = variables.VariableV1([0.0], name="vdef")
       with ops.device("/job:worker/device:GPU:1"):
         with_vdef_dep = control_flow_ops.with_dependencies([vdef.initializer],
                                                            vdef)
@@ -2872,8 +2872,8 @@ class ControlFlowTest(test.TestCase):
 
   def testGroup(self):
     with self.cached_session() as sess:
-      v1 = variables.Variable([0.0])
-      v2 = variables.Variable([1.0])
+      v1 = variables.VariableV1([0.0])
+      v2 = variables.VariableV1([1.0])
 
       # Group init1 and init2 and run.
       init = control_flow_ops.group(v1.initializer, v2.initializer)
@@ -2955,29 +2955,29 @@ class ControlFlowTest(test.TestCase):
     p1 = array_ops.placeholder(dtypes.float32)
     p2 = array_ops.placeholder(dtypes.float32)
     p3 = array_ops.placeholder(dtypes.float32)
-    v1 = variables.Variable(p1, validate_shape=False)
-    v2 = variables.Variable(p2, validate_shape=False)
-    v3 = variables.Variable(p3, validate_shape=False)
+    v1 = variables.VariableV1(p1, validate_shape=False)
+    v2 = variables.VariableV1(p2, validate_shape=False)
+    v3 = variables.VariableV1(p3, validate_shape=False)
     self.assertIs(None, v1.get_shape().ndims)
     s = control_flow_ops.ref_select(index, [v1, v2, v3])
     self.assertIs(None, s.get_shape().ndims)
 
     # All inputs known but different.
-    v1 = variables.Variable([[1, 2]])
-    v2 = variables.Variable([[2], [1]])
+    v1 = variables.VariableV1([[1, 2]])
+    v2 = variables.VariableV1([[2], [1]])
     s = control_flow_ops.ref_select(index, [v1, v2])
     self.assertIs(None, s.get_shape().ndims)
 
     # All inputs known and same.
-    v1 = variables.Variable([[1, 2]])
-    v2 = variables.Variable([[1, 2]])
+    v1 = variables.VariableV1([[1, 2]])
+    v2 = variables.VariableV1([[1, 2]])
     s = control_flow_ops.ref_select(index, [v1, v2])
     self.assertEqual([1, 2], s.get_shape())
 
     # Possibly the same but not guaranteed.
-    v1 = variables.Variable([[1., 2.]])
+    v1 = variables.VariableV1([[1., 2.]])
     p2 = array_ops.placeholder(dtypes.float32, shape=[None, 2])
-    v2 = variables.Variable(p2, validate_shape=False)
+    v2 = variables.VariableV1(p2, validate_shape=False)
     s = control_flow_ops.ref_select(index, [v1, v2])
     self.assertEqual(None, s.get_shape())
 
@@ -3160,11 +3160,11 @@ class TupleTest(test.TestCase):
   def testTensors(self):
     for v1_first in [True, False]:
       with self.cached_session():
-        v1 = variables.Variable([1.0])
+        v1 = variables.VariableV1([1.0])
         add1 = math_ops.add(
             control_flow_ops.with_dependencies([v1.initializer], v1._ref()),  # pylint: disable=protected-access
             2.0)
-        v2 = variables.Variable([10.0])
+        v2 = variables.VariableV1([10.0])
         add2 = math_ops.add(
             control_flow_ops.with_dependencies([v2.initializer], v2._ref()),  # pylint: disable=protected-access
             20.0)
@@ -3190,14 +3190,14 @@ class TupleTest(test.TestCase):
   def testIndexedSlices(self):
     for v1_first in [True, False]:
       with self.cached_session():
-        v1 = variables.Variable(
+        v1 = variables.VariableV1(
             np.array([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]]).astype(
                 np.float32))
         v1_at_1 = ops.IndexedSlices(
             control_flow_ops.with_dependencies([v1.initializer], v1._ref()),  # pylint: disable=protected-access
             constant_op.constant([1]))
 
-        v2 = variables.Variable(
+        v2 = variables.VariableV1(
             np.array([[0.1, 1.1], [10.1, 11.1], [20.1, 21.1]]).astype(
                 np.float32))
         v2_at_1 = ops.IndexedSlices(
@@ -3229,7 +3229,7 @@ class TupleTest(test.TestCase):
 
   def testAcceptTensorsAsControlInputs(self):
     with self.cached_session():
-      var = variables.Variable(0)
+      var = variables.VariableV1(0)
       assign = state_ops.assign(var, 1)
       t, = control_flow_ops.tuple(
           [constant_op.constant(0)], control_inputs=[assign])
diff --git a/tensorflow/python/kernel_tests/dense_update_ops_test.py b/tensorflow/python/kernel_tests/dense_update_ops_test.py
index 06c3271850..120e10314f 100644
--- a/tensorflow/python/kernel_tests/dense_update_ops_test.py
+++ b/tensorflow/python/kernel_tests/dense_update_ops_test.py
@@ -87,7 +87,7 @@ class AssignOpTest(test.TestCase):
   def testAssignNonStrictShapeChecking(self):
     with self.cached_session():
       data = array_ops.fill([1024, 1024], 0)
-      p = variables.Variable([1])
+      p = variables.VariableV1([1])
       a = state_ops.assign(p, data, validate_shape=False)
       a.op.run()
       self.assertAllEqual(p.eval(), data.eval())
@@ -100,14 +100,14 @@ class AssignOpTest(test.TestCase):
 
   def testInitRequiredAssignAdd(self):
     with self.cached_session():
-      p = variables.Variable(array_ops.fill([1024, 1024], 1), dtypes.int32)
+      p = variables.VariableV1(array_ops.fill([1024, 1024], 1), dtypes.int32)
       a = state_ops.assign_add(p, array_ops.fill([1024, 1024], 0))
       with self.assertRaisesOpError("use uninitialized"):
         a.op.run()
 
   def testInitRequiredAssignSub(self):
     with self.cached_session():
-      p = variables.Variable(array_ops.fill([1024, 1024], 1), dtypes.int32)
+      p = variables.VariableV1(array_ops.fill([1024, 1024], 1), dtypes.int32)
       a = state_ops.assign_sub(p, array_ops.fill([1024, 1024], 0))
       with self.assertRaisesOpError("use uninitialized"):
         a.op.run()
diff --git a/tensorflow/python/kernel_tests/identity_op_py_test.py b/tensorflow/python/kernel_tests/identity_op_py_test.py
index 37f9f716f8..88ea10c22a 100644
--- a/tensorflow/python/kernel_tests/identity_op_py_test.py
+++ b/tensorflow/python/kernel_tests/identity_op_py_test.py
@@ -61,7 +61,7 @@ class IdentityOpTest(test.TestCase):
   def testRefIdentityShape(self):
     with self.cached_session():
       shape = [2, 3]
-      tensor = variables.Variable(
+      tensor = variables.VariableV1(
           constant_op.constant(
               [[1, 2, 3], [6, 5, 4]], dtype=dtypes.int32))
       self.assertEquals(shape, tensor.get_shape())
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index f90545f84c..1365d4b240 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -290,7 +290,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.assertEqual(self.evaluate(read), [[2]])
 
   def testUseResource(self):
-    v = variables.Variable(1.0, use_resource=True)
+    v = variables.VariableV1(1.0, use_resource=True)
     self.assertTrue(isinstance(v, resource_variable_ops.ResourceVariable))
 
   def testEagerNoUseResource(self):
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 86e063cb36..4b92309e4d 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -136,7 +136,7 @@ class StatefulScatterNdTest(test.TestCase):
         new = ref.copy()
         np_scatter(new, indices, updates)
         # Scatter via tensorflow
-        ref_var = variables.Variable(ref)
+        ref_var = variables.VariableV1(ref)
         ref_var.initializer.run()
         tf_scatter(ref_var, indices, updates).eval()
 
@@ -258,7 +258,7 @@ class StatefulScatterNdTest(test.TestCase):
       params = np.array([1, 2, 3, 4, 5, 6]).astype(np.float32)
       updates = np.array([-3, -4, -5]).astype(np.float32)
       with self.test_session(use_gpu=False):
-        ref = variables.Variable(params)
+        ref = variables.VariableV1(params)
         ref.initializer.run()
 
         # Indices all in range, no problem.
diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py
index 1a0fa744ae..527b7daf10 100644
--- a/tensorflow/python/kernel_tests/scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_ops_test.py
@@ -178,7 +178,7 @@ class ScatterTest(test.TestCase):
             np_scatter = _TF_OPS_TO_NUMPY[tf_scatter]
           np_scatter(new, indices, updates)
           # Scatter via tensorflow
-          ref = variables.Variable(old)
+          ref = variables.VariableV1(old)
           ref.initializer.run()
           tf_scatter(ref, indices, updates).eval()
           self.assertAllClose(ref.eval(), new)
@@ -294,7 +294,7 @@ class ScatterTest(test.TestCase):
       updates = np.array([-3, -4, -5]).astype(np.float32)
       if not test.is_gpu_available():
         with self.test_session(use_gpu=False):
-          ref = variables.Variable(params)
+          ref = variables.VariableV1(params)
           ref.initializer.run()
 
           # Indices all in range, no problem.
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 401e1ae102..33f464fb90 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -394,10 +394,10 @@ class VariableScopeTest(test.TestCase):
     old = variable_scope._DEFAULT_USE_RESOURCE
     try:
       variable_scope.enable_resource_variables()
-      self.assertTrue(isinstance(variables_lib.Variable(1.0),
+      self.assertTrue(isinstance(variables_lib.VariableV1(1.0),
                                  resource_variable_ops.ResourceVariable))
       variable_scope.disable_resource_variables()
-      self.assertFalse(isinstance(variables_lib.Variable(1.0),
+      self.assertFalse(isinstance(variables_lib.VariableV1(1.0),
                                   resource_variable_ops.ResourceVariable))
     finally:
       variable_scope._DEFAULT_USE_RESOURCE = old
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index 2e7975667c..942ceedc8b 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -43,14 +43,14 @@ class VariablesTestCase(test.TestCase):
 
   def testInitialization(self):
     with self.cached_session():
-      var0 = variables.Variable(0.0)
+      var0 = variables.VariableV1(0.0)
       self.assertEqual("Variable:0", var0.name)
       self.assertEqual("Variable", var0._shared_name)
       self.assertEqual([], var0.get_shape())
       self.assertEqual([], var0.get_shape())
       self.assertEqual([], var0.shape)
 
-      var1 = variables.Variable(1.1)
+      var1 = variables.VariableV1(1.1)
       self.assertEqual("Variable_1:0", var1.name)
       self.assertEqual("Variable_1", var1._shared_name)
       self.assertEqual([], var1.get_shape())
@@ -143,7 +143,7 @@ class VariablesTestCase(test.TestCase):
 
   def testZeroSizeStringAssign(self):
     with self.cached_session() as sess:
-      array = variables.Variable(
+      array = variables.VariableV1(
           initial_value=array_ops.zeros((0,), dtype=dtypes.string),
           name="foo",
           trainable=False,
@@ -192,7 +192,7 @@ class VariablesTestCase(test.TestCase):
         # d get the control dep.
         d = constant_op.constant(2.0)
         # variables do not.
-        var_x = variables.Variable(2.0)
+        var_x = variables.VariableV1(2.0)
       self.assertEqual([c.op], d.op.control_inputs)
       self.assertEqual([], var_x.initializer.control_inputs)
       self.assertEqual([], var_x.value().op.control_inputs)
@@ -280,10 +280,10 @@ class VariablesTestCase(test.TestCase):
 
   def testCollections(self):
     with self.cached_session():
-      var_x = variables.Variable(2.0)
-      var_y = variables.Variable(2.0, trainable=False)
-      var_z = variables.Variable(2.0, trainable=True)
-      var_t = variables.Variable(
+      var_x = variables.VariableV1(2.0)
+      var_y = variables.VariableV1(2.0, trainable=False)
+      var_z = variables.VariableV1(2.0, trainable=True)
+      var_t = variables.VariableV1(
           2.0,
           trainable=True,
           collections=[
@@ -296,9 +296,9 @@ class VariablesTestCase(test.TestCase):
   def testCollectionsWithScope(self):
     with self.cached_session():
       with ops.name_scope("scope_1"):
-        var_x = variables.Variable(2.0)
+        var_x = variables.VariableV1(2.0)
       with ops.name_scope("scope_2"):
-        var_y = variables.Variable(2.0)
+        var_y = variables.VariableV1(2.0)
 
       self.assertEqual([var_x, var_y], variables.global_variables())
       self.assertEqual([var_x], variables.global_variables("scope_1"))
@@ -399,7 +399,7 @@ class VariablesTestCase(test.TestCase):
 
   def testColocation(self):
     with ops.device("/job:ps"):
-      var = variables.Variable(0, name="v")
+      var = variables.VariableV1(0, name="v")
     with ops.device("/job:worker/task:7"):
       assign_op = var.assign(1)
     self.assertDeviceEqual("/job:ps", assign_op.device)
@@ -522,7 +522,7 @@ class VariablesTestCase(test.TestCase):
       self.assertAllClose(np.ones((5, 5), np.float32), var.eval())
 
   def testRepr(self):
-    var = variables.Variable(np.zeros((5, 5), np.float32), name="noop")
+    var = variables.VariableV1(np.zeros((5, 5), np.float32), name="noop")
     self.assertEqual(
         "<tf.Variable 'noop:0' shape=(5, 5) dtype=float32_ref>",
         repr(var))
@@ -556,8 +556,8 @@ class IsInitializedTest(test.TestCase):
 
   def testVariableList(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
-      v = variables.Variable([1, 2], name="v")
-      w = variables.Variable([3, 4], name="w")
+      v = variables.VariableV1([1, 2], name="v")
+      w = variables.VariableV1([3, 4], name="w")
       uninited = variables.report_uninitialized_variables()
       self.assertAllEqual(np.array([b"v", b"w"]), sess.run(uninited))
       sess.run(w.initializer)
@@ -593,8 +593,8 @@ class ObsoleteIsInitializedTest(test.TestCase):
 
   def testVariables(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
-      v = variables.Variable([1, 2])
-      w = variables.Variable([3, 4])
+      v = variables.VariableV1([1, 2])
+      w = variables.VariableV1([3, 4])
       _ = v, w
       inited = variables.assert_variables_initialized()
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
@@ -604,8 +604,8 @@ class ObsoleteIsInitializedTest(test.TestCase):
 
   def testVariableList(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
-      v = variables.Variable([1, 2])
-      w = variables.Variable([3, 4])
+      v = variables.VariableV1([1, 2])
+      w = variables.VariableV1([3, 4])
       inited = variables.assert_variables_initialized([v])
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
         inited.op.run()
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index 4f6e5dc473..3c9b7a01c7 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -273,7 +273,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
   def testVariableRefGradient(self):
     with ops.Graph().as_default():
       init = constant_op.constant(100.0)
-      var = variables.Variable(init)
+      var = variables.VariableV1(init)
       gradient = gradients.gradients(var._ref(), var)
       self.assertIsNotNone(gradient)
 
diff --git a/tensorflow/python/ops/matmul_benchmark.py b/tensorflow/python/ops/matmul_benchmark.py
index 6e5fe74290..138149e63d 100644
--- a/tensorflow/python/ops/matmul_benchmark.py
+++ b/tensorflow/python/ops/matmul_benchmark.py
@@ -49,13 +49,13 @@ def build_graph(device, n, m, k, transpose_a, transpose_b, dtype):
   """
   with ops.device('%s' % device):
     if not transpose_a:
-      x = variables.Variable(random_ops.random_uniform([n, m], dtype=dtype))
+      x = variables.VariableV1(random_ops.random_uniform([n, m], dtype=dtype))
     else:
-      x = variables.Variable(random_ops.random_uniform([m, n], dtype=dtype))
+      x = variables.VariableV1(random_ops.random_uniform([m, n], dtype=dtype))
     if not transpose_b:
-      y = variables.Variable(random_ops.random_uniform([m, k], dtype=dtype))
+      y = variables.VariableV1(random_ops.random_uniform([m, k], dtype=dtype))
     else:
-      y = variables.Variable(random_ops.random_uniform([k, m], dtype=dtype))
+      y = variables.VariableV1(random_ops.random_uniform([k, m], dtype=dtype))
 
     z = math_ops.matmul(x, y, transpose_a=transpose_a, transpose_b=transpose_b)
     return control_flow_ops.group(z)
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 562e1ad6cb..af5c7d4050 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -198,7 +198,7 @@ VariableSynchronization = variables.VariableSynchronization  # pylint: disable=i
 VariableAggregation = variables.VariableAggregation  # pylint: disable=invalid-name
 
 AUTO_REUSE = _ReuseMode.AUTO_REUSE
-tf_export("AUTO_REUSE").export_constant(__name__, "AUTO_REUSE")
+tf_export(v1=["AUTO_REUSE"]).export_constant(__name__, "AUTO_REUSE")
 AUTO_REUSE.__doc__ = """
 When passed in as the value for the `reuse` flag, AUTO_REUSE indicates that
 get_variable() should create the requested variable if it doesn't exist or, if
@@ -908,7 +908,7 @@ class _VariableStore(object):
     if use_resource is None:
       # Set the default value if unspecified.
       use_resource = _DEFAULT_USE_RESOURCE
-    v = variable(
+    v = variables.VariableV1(
         initial_value=init_val,
         name=name,
         trainable=trainable,
@@ -994,7 +994,7 @@ def no_regularizer(_):
 
 
 # TODO(alive): support caching devices and partitioned variables in Eager mode.
-@tf_export("VariableScope")
+@tf_export(v1=["VariableScope"])
 class VariableScope(object):
   """Variable scope object to carry defaults to provide to `get_variable`.
 
@@ -1342,7 +1342,7 @@ def get_variable_scope_store():
   return scope_store
 
 
-@tf_export("get_variable_scope")
+@tf_export(v1=["get_variable_scope"])
 def get_variable_scope():
   """Returns the current variable scope."""
   return get_variable_scope_store().current_scope
@@ -1451,7 +1451,7 @@ class EagerVariableStore(object):
 # The argument list for get_variable must match arguments to get_local_variable.
 # So, if you are updating the arguments, also update arguments to
 # get_local_variable below.
-@tf_export("get_variable")
+@tf_export(v1=["get_variable"])
 def get_variable(name,
                  shape=None,
                  dtype=None,
@@ -1596,7 +1596,7 @@ get_variable.__doc__ = get_variable_or_local_docstring % (
 
 # The argument list for get_local_variable must match arguments to get_variable.
 # So, if you are updating the arguments, also update arguments to get_variable.
-@tf_export("get_local_variable")
+@tf_export(v1=["get_local_variable"])
 def get_local_variable(  # pylint: disable=missing-docstring
     name,
     shape=None,
@@ -1941,7 +1941,7 @@ def _get_unique_variable_scope(prefix):
 # Named like a function for backwards compatibility with the
 # @tf_contextlib.contextmanager version, which was switched to a class to avoid
 # some object creation overhead.
-@tf_export("variable_scope")  # pylint: disable=invalid-name
+@tf_export(v1=["variable_scope"])  # pylint: disable=invalid-name
 class variable_scope(object):
   """A context manager for defining ops that creates variables (layers).
 
@@ -2322,7 +2322,7 @@ class variable_scope(object):
 
 
 # pylint: disable=g-doc-return-or-yield
-@tf_export("variable_op_scope")
+@tf_export(v1=["variable_op_scope"])
 @tf_contextlib.contextmanager
 def variable_op_scope(values,
                       name_or_scope,
@@ -2443,7 +2443,33 @@ def default_variable_creator(next_creator=None, **kwargs):
         expected_shape=expected_shape, import_scope=import_scope)
 
 
+def default_variable_creator_v2(next_creator=None, **kwargs):
+  """Default variable creator."""
+  assert next_creator is None
+  initial_value = kwargs.get("initial_value", None)
+  trainable = kwargs.get("trainable", None)
+  validate_shape = kwargs.get("validate_shape", True)
+  caching_device = kwargs.get("caching_device", None)
+  name = kwargs.get("name", None)
+  variable_def = kwargs.get("variable_def", None)
+  dtype = kwargs.get("dtype", None)
+  import_scope = kwargs.get("import_scope", None)
+  constraint = kwargs.get("constraint", None)
+
+  # Set trainable value based on synchronization value.
+  synchronization = kwargs.get("synchronization", VariableSynchronization.AUTO)
+  trainable = _get_trainable_value(
+      synchronization=synchronization, trainable=trainable)
+
+  return resource_variable_ops.ResourceVariable(
+      initial_value=initial_value, trainable=trainable,
+      validate_shape=validate_shape, caching_device=caching_device,
+      name=name, dtype=dtype, constraint=constraint, variable_def=variable_def,
+      import_scope=import_scope)
+
+
 variables.default_variable_creator = default_variable_creator
+variables.default_variable_creator_v2 = default_variable_creator_v2
 
 
 def _make_getter(captured_getter, captured_previous):
@@ -2452,11 +2478,12 @@ def _make_getter(captured_getter, captured_previous):
 
 
 # TODO(apassos) remove forwarding symbol
-variable = variables.Variable
+variable = variables.VariableV1
 
 
+@tf_export(v1=["variable_creator_scope"])
 @tf_contextlib.contextmanager
-def variable_creator_scope(variable_creator):
+def variable_creator_scope_v1(variable_creator):
   """Scope which defines a variable creation function to be used by variable().
 
   variable_creator is expected to be a function with the following signature:
@@ -2527,3 +2554,73 @@ def variable_creator_scope(variable_creator):
   """
   with ops.get_default_graph()._variable_creator_scope(variable_creator):  # pylint: disable=protected-access
     yield
+
+
+# Note: only the docstrings differ between this and v1.
+@tf_export(v2=["variable_creator_scope"])
+@tf_contextlib.contextmanager
+def variable_creator_scope(variable_creator):
+  """Scope which defines a variable creation function to be used by variable().
+
+  variable_creator is expected to be a function with the following signature:
+
+  ```
+    def variable_creator(next_creator, **kwargs)
+  ```
+
+  The creator is supposed to eventually call the next_creator to create a
+  variable if it does want to create a variable and not call Variable or
+  ResourceVariable directly. This helps make creators composable. A creator may
+  choose to create multiple variables, return already existing variables, or
+  simply register that a variable was created and defer to the next creators in
+  line. Creators can also modify the keyword arguments seen by the next
+  creators.
+
+  Custom getters in the variable scope will eventually resolve down to these
+  custom creators when they do create variables.
+
+  The valid keyword arguments in kwds are:
+      initial_value: A `Tensor`, or Python object convertible to a `Tensor`,
+        which is the initial value for the Variable. The initial value must have
+        a shape specified unless `validate_shape` is set to False. Can also be a
+        callable with no argument that returns the initial value when called. In
+        that case, `dtype` must be specified. (Note that initializer functions
+        from init_ops.py must first be bound to a shape before being used here.)
+      trainable: If `True`, the default, GradientTapes automatically watch
+        uses of this Variable.
+      validate_shape: If `False`, allows the variable to be initialized with a
+        value of unknown shape. If `True`, the default, the shape of
+        `initial_value` must be known.
+      caching_device: Optional device string describing where the Variable
+        should be cached for reading.  Defaults to the Variable's device.
+        If not `None`, caches on another device.  Typical use is to cache
+        on the device where the Ops using the Variable reside, to deduplicate
+        copying through `Switch` and other conditional statements.
+      name: Optional name for the variable. Defaults to `'Variable'` and gets
+        uniquified automatically.
+      dtype: If set, initial_value will be converted to the given type.
+        If `None`, either the datatype will be kept (if `initial_value` is
+        a Tensor), or `convert_to_tensor` will decide.
+      constraint: A constraint function to be applied to the variable after
+        updates by some algorithms.
+      synchronization: Indicates when a distributed a variable will be
+        aggregated. Accepted values are constants defined in the class
+        `tf.VariableSynchronization`. By default the synchronization is set to
+        `AUTO` and the current `DistributionStrategy` chooses
+        when to synchronize. If `synchronization` is set to `ON_READ`,
+        `trainable` must not be set to `True`.
+      aggregation: Indicates how a distributed variable will be aggregated.
+        Accepted values are constants defined in the class
+        `tf.VariableAggregation`.
+
+  This set may grow over time, so it's important the signature of creators is as
+  mentioned above.
+
+  Args:
+    variable_creator: the passed creator
+
+  Yields:
+    A scope in which the creator is active
+  """
+  with ops.get_default_graph()._variable_creator_scope(variable_creator):  # pylint: disable=protected-access
+    yield
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 7a46157739..8da1e9fe56 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -46,6 +46,11 @@ def default_variable_creator(_, **kwds):
   raise NotImplementedError("variable_scope needs to be imported")
 
 
+def default_variable_creator_v2(_, **kwds):
+  del kwds
+  raise NotImplementedError("variable_scope needs to be imported")
+
+
 def _make_getter(captured_getter, captured_previous):
   """To avoid capturing loop variables."""
   def getter(**kwargs):
@@ -101,21 +106,21 @@ class VariableAggregation(enum.Enum):
 class VariableMetaclass(type):
   """Metaclass to allow construction of tf.Variable to be overridden."""
 
-  def _variable_call(cls,
-                     initial_value=None,
-                     trainable=None,
-                     collections=None,
-                     validate_shape=True,
-                     caching_device=None,
-                     name=None,
-                     variable_def=None,
-                     dtype=None,
-                     expected_shape=None,
-                     import_scope=None,
-                     constraint=None,
-                     use_resource=None,
-                     synchronization=VariableSynchronization.AUTO,
-                     aggregation=VariableAggregation.NONE):
+  def _variable_v1_call(cls,
+                        initial_value=None,
+                        trainable=None,
+                        collections=None,
+                        validate_shape=True,
+                        caching_device=None,
+                        name=None,
+                        variable_def=None,
+                        dtype=None,
+                        expected_shape=None,
+                        import_scope=None,
+                        constraint=None,
+                        use_resource=None,
+                        synchronization=VariableSynchronization.AUTO,
+                        aggregation=VariableAggregation.NONE):
     """Call on Variable class. Useful to force the signature."""
     previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
     for getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
@@ -140,14 +145,49 @@ class VariableMetaclass(type):
         synchronization=synchronization,
         aggregation=aggregation)
 
+  def _variable_v2_call(cls,
+                        initial_value=None,
+                        trainable=None,
+                        validate_shape=True,
+                        caching_device=None,
+                        name=None,
+                        variable_def=None,
+                        dtype=None,
+                        import_scope=None,
+                        constraint=None,
+                        synchronization=VariableSynchronization.AUTO,
+                        aggregation=VariableAggregation.NONE):
+    """Call on Variable class. Useful to force the signature."""
+    previous_getter = lambda **kws: default_variable_creator_v2(None, **kws)
+    for getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
+      previous_getter = _make_getter(getter, previous_getter)
+
+    # Reset `aggregation` that is explicitly set as `None` to the enum NONE.
+    if aggregation is None:
+      aggregation = VariableAggregation.NONE
+    return previous_getter(
+        initial_value=initial_value,
+        trainable=trainable,
+        validate_shape=validate_shape,
+        caching_device=caching_device,
+        name=name,
+        variable_def=variable_def,
+        dtype=dtype,
+        import_scope=import_scope,
+        constraint=constraint,
+        synchronization=synchronization,
+        aggregation=aggregation)
+
   def __call__(cls, *args, **kwargs):
-    if cls is Variable:
-      return cls._variable_call(*args, **kwargs)
+    if cls is VariableV1:
+      return cls._variable_v1_call(*args, **kwargs)
+    elif cls is Variable:
+      return cls._variable_v2_call(*args, **kwargs)
     else:
       return super(VariableMetaclass, cls).__call__(*args, **kwargs)
 
 
-@tf_export("Variable")
+@tf_export(v2=["Variable"])
 class Variable(six.with_metaclass(VariableMetaclass,
                                   checkpointable.CheckpointableBase)):
   """See the [Variables Guide](https://tensorflow.org/guide/variables).
@@ -267,16 +307,13 @@ class Variable(six.with_metaclass(VariableMetaclass,
   def __init__(self,
                initial_value=None,
                trainable=True,
-               collections=None,
                validate_shape=True,
                caching_device=None,
                name=None,
                variable_def=None,
                dtype=None,
-               expected_shape=None,
                import_scope=None,
                constraint=None,
-               use_resource=None,
                synchronization=VariableSynchronization.AUTO,
                aggregation=VariableAggregation.NONE):
     """Creates a new variable with value `initial_value`.
@@ -297,11 +334,8 @@ class Variable(six.with_metaclass(VariableMetaclass,
         callable with no argument that returns the initial value when called. In
         that case, `dtype` must be specified. (Note that initializer functions
         from init_ops.py must first be bound to a shape before being used here.)
-      trainable: If `True`, the default, also adds the variable to the graph
-        collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as
-        the default list of variables to use by the `Optimizer` classes.
-      collections: List of graph collections keys. The new variable is added to
-        these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
+      trainable: If `True`, the default, GradientTapes automatically watch uses
+        of this variable.
       validate_shape: If `False`, allows the variable to be initialized with a
         value of unknown shape. If `True`, the default, the shape of
         `initial_value` must be known.
@@ -319,8 +353,6 @@ class Variable(six.with_metaclass(VariableMetaclass,
       dtype: If set, initial_value will be converted to the given type.
         If `None`, either the datatype will be kept (if `initial_value` is
         a Tensor), or `convert_to_tensor` will decide.
-      expected_shape: A TensorShape. If set, initial_value is expected
-        to have this shape.
       import_scope: Optional `string`. Name scope to add to the
         `Variable.` Only used when initializing from protocol buffer.
       constraint: An optional projection function to be applied to the variable
@@ -330,9 +362,6 @@ class Variable(six.with_metaclass(VariableMetaclass,
         variable and return the Tensor for the projected value
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
-      use_resource: if True, a ResourceVariable is created; otherwise an
-       old-style ref-based variable is created. When eager execution is enabled
-       a resource variable is always created.
       synchronization: Indicates when a distributed a variable will be
         aggregated. Accepted values are constants defined in the class
         `tf.VariableSynchronization`. By default the synchronization is set to
@@ -1009,11 +1038,207 @@ class Variable(six.with_metaclass(VariableMetaclass,
     raise NotImplementedError
 
 
+@tf_export(v1=["Variable"])
+class VariableV1(Variable):
+  """See the [Variables Guide](https://tensorflow.org/guide/variables).
+
+  A variable maintains state in the graph across calls to `run()`. You add a
+  variable to the graph by constructing an instance of the class `Variable`.
+
+  The `Variable()` constructor requires an initial value for the variable,
+  which can be a `Tensor` of any type and shape. The initial value defines the
+  type and shape of the variable. After construction, the type and shape of
+  the variable are fixed. The value can be changed using one of the assign
+  methods.
+
+  If you want to change the shape of a variable later you have to use an
+  `assign` Op with `validate_shape=False`.
+
+  Just like any `Tensor`, variables created with `Variable()` can be used as
+  inputs for other Ops in the graph. Additionally, all the operators
+  overloaded for the `Tensor` class are carried over to variables, so you can
+  also add nodes to the graph by just doing arithmetic on variables.
+
+  ```python
+  import tensorflow as tf
+
+  # Create a variable.
+  w = tf.Variable(<initial-value>, name=<optional-name>)
+
+  # Use the variable in the graph like any Tensor.
+  y = tf.matmul(w, ...another variable or tensor...)
+
+  # The overloaded operators are available too.
+  z = tf.sigmoid(w + y)
+
+  # Assign a new value to the variable with `assign()` or a related method.
+  w.assign(w + 1.0)
+  w.assign_add(1.0)
+  ```
+
+  When you launch the graph, variables have to be explicitly initialized before
+  you can run Ops that use their value. You can initialize a variable by
+  running its *initializer op*, restoring the variable from a save file, or
+  simply running an `assign` Op that assigns a value to the variable. In fact,
+  the variable *initializer op* is just an `assign` Op that assigns the
+  variable's initial value to the variable itself.
+
+  ```python
+  # Launch the graph in a session.
+  with tf.Session() as sess:
+      # Run the variable initializer.
+      sess.run(w.initializer)
+      # ...you now can run ops that use the value of 'w'...
+  ```
+
+  The most common initialization pattern is to use the convenience function
+  `global_variables_initializer()` to add an Op to the graph that initializes
+  all the variables. You then run that Op after launching the graph.
+
+  ```python
+  # Add an Op to initialize global variables.
+  init_op = tf.global_variables_initializer()
+
+  # Launch the graph in a session.
+  with tf.Session() as sess:
+      # Run the Op that initializes global variables.
+      sess.run(init_op)
+      # ...you can now run any Op that uses variable values...
+  ```
+
+  If you need to create a variable with an initial value dependent on another
+  variable, use the other variable's `initialized_value()`. This ensures that
+  variables are initialized in the right order.
+
+  All variables are automatically collected in the graph where they are
+  created. By default, the constructor adds the new variable to the graph
+  collection `GraphKeys.GLOBAL_VARIABLES`. The convenience function
+  `global_variables()` returns the contents of that collection.
+
+  When building a machine learning model it is often convenient to distinguish
+  between variables holding the trainable model parameters and other variables
+  such as a `global step` variable used to count training steps. To make this
+  easier, the variable constructor supports a `trainable=<bool>` parameter. If
+  `True`, the new variable is also added to the graph collection
+  `GraphKeys.TRAINABLE_VARIABLES`. The convenience function
+  `trainable_variables()` returns the contents of this collection. The
+  various `Optimizer` classes use this collection as the default list of
+  variables to optimize.
+
+  WARNING: tf.Variable objects by default have a non-intuitive memory model. A
+  Variable is represented internally as a mutable Tensor which can
+  non-deterministically alias other Tensors in a graph. The set of operations
+  which consume a Variable and can lead to aliasing is undetermined and can
+  change across TensorFlow versions. Avoid writing code which relies on the
+  value of a Variable either changing or not changing as other operations
+  happen. For example, using Variable objects or simple functions thereof as
+  predicates in a `tf.cond` is dangerous and error-prone:
+
+  ```
+  v = tf.Variable(True)
+  tf.cond(v, lambda: v.assign(False), my_false_fn)  # Note: this is broken.
+  ```
+
+  Here replacing adding `use_resource=True` when constructing the variable will
+  fix any nondeterminism issues:
+  ```
+  v = tf.Variable(True, use_resource=True)
+  tf.cond(v, lambda: v.assign(False), my_false_fn)
+  ```
+
+  To use the replacement for variables which does
+  not have these issues:
+
+  * Add `use_resource=True` when constructing `tf.Variable`;
+  * Call `tf.get_variable_scope().set_use_resource(True)` inside a
+    `tf.variable_scope` before the `tf.get_variable()` call.
+  """
+
+  def __init__(self,  # pylint: disable=super-init-not-called
+               initial_value=None,
+               trainable=True,
+               collections=None,
+               validate_shape=True,
+               caching_device=None,
+               name=None,
+               variable_def=None,
+               dtype=None,
+               expected_shape=None,
+               import_scope=None,
+               constraint=None,
+               use_resource=None,
+               synchronization=VariableSynchronization.AUTO,
+               aggregation=VariableAggregation.NONE):
+    """Creates a new variable with value `initial_value`.
+
+    The new variable is added to the graph collections listed in `collections`,
+    which defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
+
+    If `trainable` is `True` the variable is also added to the graph collection
+    `GraphKeys.TRAINABLE_VARIABLES`.
+
+    This constructor creates both a `variable` Op and an `assign` Op to set the
+    variable to its initial value.
+
+    Args:
+      initial_value: A `Tensor`, or Python object convertible to a `Tensor`,
+        which is the initial value for the Variable. The initial value must have
+        a shape specified unless `validate_shape` is set to False. Can also be a
+        callable with no argument that returns the initial value when called. In
+        that case, `dtype` must be specified. (Note that initializer functions
+        from init_ops.py must first be bound to a shape before being used here.)
+      trainable: If `True`, the default, also adds the variable to the graph
+        collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as
+        the default list of variables to use by the `Optimizer` classes.
+      collections: List of graph collections keys. The new variable is added to
+        these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
+      validate_shape: If `False`, allows the variable to be initialized with a
+        value of unknown shape. If `True`, the default, the shape of
+        `initial_value` must be known.
+      caching_device: Optional device string describing where the Variable
+        should be cached for reading.  Defaults to the Variable's device.
+        If not `None`, caches on another device.  Typical use is to cache
+        on the device where the Ops using the Variable reside, to deduplicate
+        copying through `Switch` and other conditional statements.
+      name: Optional name for the variable. Defaults to `'Variable'` and gets
+        uniquified automatically.
+      variable_def: `VariableDef` protocol buffer. If not `None`, recreates
+        the Variable object with its contents, referencing the variable's nodes
+        in the graph, which must already exist. The graph is not changed.
+        `variable_def` and the other arguments are mutually exclusive.
+      dtype: If set, initial_value will be converted to the given type.
+        If `None`, either the datatype will be kept (if `initial_value` is
+        a Tensor), or `convert_to_tensor` will decide.
+      expected_shape: A TensorShape. If set, initial_value is expected
+        to have this shape.
+      import_scope: Optional `string`. Name scope to add to the
+        `Variable.` Only used when initializing from protocol buffer.
+      constraint: An optional projection function to be applied to the variable
+        after being updated by an `Optimizer` (e.g. used to implement norm
+        constraints or value constraints for layer weights). The function must
+        take as input the unprojected Tensor representing the value of the
+        variable and return the Tensor for the projected value
+        (which must have the same shape). Constraints are not safe to
+        use when doing asynchronous distributed training.
+      use_resource: whether to use resource variables.
+      synchronization: unused
+      aggregation: unused
+
+    Raises:
+      ValueError: If both `variable_def` and initial_value are specified.
+      ValueError: If the initial value is not specified, or does not have a
+        shape and `validate_shape` is `True`.
+      RuntimeError: If eager execution is enabled.
+    """
+
+  SaveSliceInfo = Variable.SaveSliceInfo
+
+
 # TODO(apassos): do not repeat all comments here
-class RefVariable(Variable):
+class RefVariable(VariableV1):
   """Ref-based implementation of variables."""
 
-  def __init__(self,
+  def __init__(self,  # pylint: disable=super-init-not-called
                initial_value=None,
                trainable=True,
                collections=None,
@@ -1873,7 +2098,7 @@ class RefVariable(Variable):
   def _OverloadAllOperators():  # pylint: disable=invalid-name
     """Register overloads for all operators."""
     for operator in ops.Tensor.OVERLOADABLE_OPERATORS:
-      Variable._OverloadOperator(operator)
+      Variable._OverloadOperator(operator)  # pylint: disable=protected-access
     # For slicing, bind getitem differently than a tensor (use SliceHelperVar
     # instead)
     # pylint: disable=protected-access
@@ -2401,7 +2626,7 @@ class PartitionedVariable(object):
         "assign() has not been implemented for PartitionedVariable.")
 
 
-@tf_export("global_variables")
+@tf_export(v1=["global_variables"])
 def global_variables(scope=None):
   """Returns global variables.
 
@@ -2427,7 +2652,7 @@ def global_variables(scope=None):
   return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES, scope)
 
 
-@tf_export("all_variables")
+@tf_export(v1=["all_variables"])
 @deprecated("2017-03-02", "Please use tf.global_variables instead.")
 def all_variables():
   """See `tf.global_variables`."""
@@ -2452,7 +2677,7 @@ def _all_saveable_objects(scope=None):
           ops.get_collection(ops.GraphKeys.SAVEABLE_OBJECTS, scope))
 
 
-@tf_export("local_variables")
+@tf_export(v1=["local_variables"])
 def local_variables(scope=None):
   """Returns local variables.
 
@@ -2480,7 +2705,7 @@ def local_variables(scope=None):
   return ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES, scope)
 
 
-@tf_export("model_variables")
+@tf_export(v1=["model_variables"])
 def model_variables(scope=None):
   """Returns all variables in the MODEL_VARIABLES collection.
 
@@ -2497,7 +2722,7 @@ def model_variables(scope=None):
   return ops.get_collection(ops.GraphKeys.MODEL_VARIABLES, scope)
 
 
-@tf_export("trainable_variables")
+@tf_export(v1=["trainable_variables"])
 def trainable_variables(scope=None):
   """Returns all variables created with `trainable=True`.
 
@@ -2519,7 +2744,7 @@ def trainable_variables(scope=None):
   return ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES, scope)
 
 
-@tf_export("moving_average_variables")
+@tf_export(v1=["moving_average_variables"])
 def moving_average_variables(scope=None):
   """Returns all variables that maintain their moving averages.
 
@@ -2541,7 +2766,7 @@ def moving_average_variables(scope=None):
   return ops.get_collection(ops.GraphKeys.MOVING_AVERAGE_VARIABLES, scope)
 
 
-@tf_export("initializers.variables", "variables_initializer")
+@tf_export(v1=["initializers.variables", "variables_initializer"])
 def variables_initializer(var_list, name="init"):
   """Returns an Op that initializes a list of variables.
 
@@ -2567,7 +2792,7 @@ def variables_initializer(var_list, name="init"):
   return control_flow_ops.no_op(name=name)
 
 
-@tf_export("initialize_variables")
+@tf_export(v1=["initialize_variables"])
 @tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.variables_initializer` instead.")
 def initialize_variables(var_list, name="init"):
@@ -2575,7 +2800,7 @@ def initialize_variables(var_list, name="init"):
   return variables_initializer(var_list, name=name)
 
 
-@tf_export("initializers.global_variables", "global_variables_initializer")
+@tf_export(v1=["initializers.global_variables", "global_variables_initializer"])
 def global_variables_initializer():
   """Returns an Op that initializes global variables.
 
@@ -2589,7 +2814,7 @@ def global_variables_initializer():
   return variables_initializer(global_variables())
 
 
-@tf_export("initialize_all_variables")
+@tf_export(v1=["initialize_all_variables"])
 @tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.global_variables_initializer` instead.")
 def initialize_all_variables():
@@ -2597,7 +2822,7 @@ def initialize_all_variables():
   return global_variables_initializer()
 
 
-@tf_export("initializers.local_variables", "local_variables_initializer")
+@tf_export(v1=["initializers.local_variables", "local_variables_initializer"])
 def local_variables_initializer():
   """Returns an Op that initializes all local variables.
 
@@ -2611,7 +2836,7 @@ def local_variables_initializer():
   return variables_initializer(local_variables())
 
 
-@tf_export("initialize_local_variables")
+@tf_export(v1=["initialize_local_variables"])
 @tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.local_variables_initializer` instead.")
 def initialize_local_variables():
@@ -2619,7 +2844,7 @@ def initialize_local_variables():
   return local_variables_initializer()
 
 
-@tf_export("is_variable_initialized")
+@tf_export(v1=["is_variable_initialized"])
 @tf_should_use.should_use_result
 def is_variable_initialized(variable):
   """Tests if a variable has been initialized.
@@ -2634,7 +2859,7 @@ def is_variable_initialized(variable):
   return state_ops.is_variable_initialized(variable)
 
 
-@tf_export("assert_variables_initialized")
+@tf_export(v1=["assert_variables_initialized"])
 @tf_should_use.should_use_result
 def assert_variables_initialized(var_list=None):
   """Returns an Op to check if variables are initialized.
@@ -2677,7 +2902,7 @@ def assert_variables_initialized(var_list=None):
       return array_ops.stack(ranks)
 
 
-@tf_export("report_uninitialized_variables")
+@tf_export(v1=["report_uninitialized_variables"])
 @tf_should_use.should_use_result
 def report_uninitialized_variables(var_list=None,
                                    name="report_uninitialized_variables"):
diff --git a/tensorflow/python/saved_model/loader_test.py b/tensorflow/python/saved_model/loader_test.py
index b7e217a35b..924b2e7c06 100644
--- a/tensorflow/python/saved_model/loader_test.py
+++ b/tensorflow/python/saved_model/loader_test.py
@@ -47,8 +47,8 @@ class SavedModelLoaderTest(test.TestCase):
   def setUp(self):
     """Write test SavedModels to a temp directory."""
     with session.Session(graph=ops.Graph()) as sess:
-      x = variables.Variable(5, name="x")
-      y = variables.Variable(11, name="y")
+      x = variables.VariableV1(5, name="x")
+      y = variables.VariableV1(11, name="y")
       z = x + y
       sess.run(variables.global_variables_initializer())
 
@@ -134,8 +134,8 @@ class SavedModelLoaderTest(test.TestCase):
   def test_restore_variables(self):
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
     with self.session(graph=ops.Graph()) as sess:
-      x = variables.Variable(0, name="x")
-      y = variables.Variable(0, name="y")
+      x = variables.VariableV1(0, name="x")
+      y = variables.VariableV1(0, name="y")
       z = x * y
 
       sess.run(variables.global_variables_initializer())
@@ -186,8 +186,10 @@ class SavedModelLoaderTest(test.TestCase):
     """
     path = _get_export_dir("no_variable_saved_model")
     with session.Session(graph=ops.Graph()) as sess:
-      x = variables.Variable(5, name="x", collections=["not_global_variable"])
-      y = variables.Variable(11, name="y", collections=["not_global_variable"])
+      x = variables.VariableV1(
+          5, name="x", collections=["not_global_variable"])
+      y = variables.VariableV1(
+          11, name="y", collections=["not_global_variable"])
       self.assertFalse(variables._all_saveable_objects())
       z = x + y
       sess.run(variables.variables_initializer([x, y]))
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 49d52d3bee..80b75b7ee6 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -60,7 +60,7 @@ class SavedModelTest(test.TestCase):
     return os.path.join(test.get_temp_dir(), label)
 
   def _init_and_validate_variable(self, sess, variable_name, variable_value):
-    v = variables.Variable(variable_value, name=variable_name)
+    v = variables.VariableV1(variable_value, name=variable_name)
     sess.run(variables.global_variables_initializer())
     self.assertEqual(variable_value, v.eval())
 
@@ -458,7 +458,7 @@ class SavedModelTest(test.TestCase):
     # Graph with a single variable added to a collection. SavedModel invoked to:
     # - add with weights.
     with self.session(graph=ops.Graph()) as sess:
-      v = variables.Variable(42, name="v")
+      v = variables.VariableV1(42, name="v")
       ops.add_to_collection("foo_vars", v)
       sess.run(variables.global_variables_initializer())
       self.assertEqual(42, v.eval())
@@ -468,7 +468,7 @@ class SavedModelTest(test.TestCase):
     # SavedModel invoked to:
     # - simply add the model (weights are not updated).
     with self.session(graph=ops.Graph()) as sess:
-      v = variables.Variable(43, name="v")
+      v = variables.VariableV1(43, name="v")
       ops.add_to_collection("bar_vars", v)
       sess.run(variables.global_variables_initializer())
       self.assertEqual(43, v.eval())
@@ -780,13 +780,13 @@ class SavedModelTest(test.TestCase):
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
-      v1 = variables.Variable(1, name="v1")
+      v1 = variables.VariableV1(1, name="v1")
       ops.add_to_collection("v", v1)
-      v2 = variables.Variable(2, name="v2")
+      v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
       # Initialize another variable `v3` to 42.
-      v3 = variables.Variable(42, name="v3")
+      v3 = variables.VariableV1(42, name="v3")
       ops.add_to_collection("v", v3)
 
       # Set up an assignment op to be run as part of the main_op.
@@ -815,13 +815,13 @@ class SavedModelTest(test.TestCase):
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
-      v1 = variables.Variable(1, name="v1")
+      v1 = variables.VariableV1(1, name="v1")
       ops.add_to_collection("v", v1)
-      v2 = variables.Variable(2, name="v2")
+      v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
       # Initialize another variable `v3` to 42.
-      v3 = variables.Variable(42, name="v3", trainable=False, collections=[])
+      v3 = variables.VariableV1(42, name="v3", trainable=False, collections=[])
       ops.add_to_collection("v", v3)
 
       # Set up an assignment op to be run as part of the legacy_init_op.
@@ -860,11 +860,11 @@ class SavedModelTest(test.TestCase):
     g = ops.Graph()
     with self.session(graph=g) as sess:
       # Initialize variable `v1` to 1.
-      v1 = variables.Variable(1, name="v1")
+      v1 = variables.VariableV1(1, name="v1")
       ops.add_to_collection("v", v1)
 
       # Initialize another variable `v2` to 42.
-      v2 = variables.Variable(42, name="v2", trainable=False, collections=[])
+      v2 = variables.VariableV1(42, name="v2", trainable=False, collections=[])
       ops.add_to_collection("v", v2)
 
       # Set up an assignment op to be run as part of the init op.
@@ -889,9 +889,9 @@ class SavedModelTest(test.TestCase):
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
-      v1 = variables.Variable(1, name="v1")
+      v1 = variables.VariableV1(1, name="v1")
       ops.add_to_collection("v", v1)
-      v2 = variables.Variable(2, name="v2")
+      v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
       sess.run(variables.global_variables_initializer())
@@ -918,9 +918,9 @@ class SavedModelTest(test.TestCase):
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
-      v1 = variables.Variable(1, name="v1")
+      v1 = variables.VariableV1(1, name="v1")
       ops.add_to_collection("v", v1)
-      v2 = variables.Variable(2, name="v2")
+      v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
       sess.run(variables.global_variables_initializer())
@@ -947,9 +947,9 @@ class SavedModelTest(test.TestCase):
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
-      v1 = variables.Variable(1, name="v1")
+      v1 = variables.VariableV1(1, name="v1")
       ops.add_to_collection("v", v1)
-      v2 = variables.Variable(2, name="v2")
+      v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
       sess.run(variables.global_variables_initializer())
@@ -1071,13 +1071,13 @@ class SavedModelTest(test.TestCase):
         graph=ops.Graph(),
         config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
       with sess.graph.device("/cpu:0"):
-        v1 = variables.Variable(1, name="v1")
+        v1 = variables.VariableV1(1, name="v1")
       with sess.graph.device("/cpu:1"):
-        v2 = variables.Variable(2, name="v2")
+        v2 = variables.VariableV1(2, name="v2")
 
       # v3 is an unsaved variable derived from v1 and v2.  It is used to
       # exercise the ability to run an init op when restoring a graph.
-      v3 = variables.Variable(1, name="v3", trainable=False, collections=[])
+      v3 = variables.VariableV1(1, name="v3", trainable=False, collections=[])
       assign_v3 = state_ops.assign(v3, math_ops.add(v1, v2))
       init_op = control_flow_ops.group(assign_v3, name="init_op")
 
@@ -1140,7 +1140,7 @@ class SavedModelTest(test.TestCase):
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
-      variables.Variable(1, name="v1")
+      variables.VariableV1(1, name="v1")
       sess.run(variables.global_variables_initializer())
       custom_saver = training.Saver(name="my_saver")
       builder.add_meta_graph_and_variables(sess, ["tag"], saver=custom_saver)
@@ -1162,7 +1162,7 @@ class SavedModelTest(test.TestCase):
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
-      variables.Variable(1, name="v1")
+      variables.VariableV1(1, name="v1")
       sess.run(variables.global_variables_initializer())
       training.Saver(name="my_saver")
       builder.add_meta_graph_and_variables(sess, ["tag"])
@@ -1184,7 +1184,7 @@ class SavedModelTest(test.TestCase):
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
-      variables.Variable(1, name="v1")
+      variables.VariableV1(1, name="v1")
       sess.run(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(sess, ["tag_0"])
 
@@ -1293,8 +1293,8 @@ class SavedModelTest(test.TestCase):
     # Add a graph with two float32 variables and a Complex Op composing them
     # with strip_default_attrs enabled.
     with session.Session(graph=ops.Graph()) as sess:
-      real_num = variables.Variable(1.0, dtype=dtypes.float32, name="real")
-      imag_num = variables.Variable(2.0, dtype=dtypes.float32, name="imag")
+      real_num = variables.VariableV1(1.0, dtype=dtypes.float32, name="real")
+      imag_num = variables.VariableV1(2.0, dtype=dtypes.float32, name="imag")
       math_ops.complex(real_num, imag_num, name="complex")
       sess.run(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(
@@ -1303,8 +1303,8 @@ class SavedModelTest(test.TestCase):
     # Add a graph with the same float32 variables and a Complex Op composing
     # them with strip_default_attrs disabled.
     with session.Session(graph=ops.Graph()) as sess:
-      real_num = variables.Variable(1.0, dtype=dtypes.float32, name="real")
-      imag_num = variables.Variable(2.0, dtype=dtypes.float32, name="imag")
+      real_num = variables.VariableV1(1.0, dtype=dtypes.float32, name="real")
+      imag_num = variables.VariableV1(2.0, dtype=dtypes.float32, name="imag")
       math_ops.complex(real_num, imag_num, name="complex")
       sess.run(variables.global_variables_initializer())
       builder.add_meta_graph(["bar"], strip_default_attrs=False)
@@ -1366,7 +1366,7 @@ class SavedModelTest(test.TestCase):
     # Add a graph with a single variable and a test op with a defaultless
     # float32 attr, "test_attr".
     with session.Session(graph=ops.Graph()) as sess:
-      variables.Variable(1.0, dtype=dtypes.float64, name="var")
+      variables.VariableV1(1.0, dtype=dtypes.float64, name="var")
       test_ops.test_attr(T=dtypes.float32, name="test_attr")
       sess.run(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(sess, ["foo"])
diff --git a/tensorflow/python/tools/freeze_graph_test.py b/tensorflow/python/tools/freeze_graph_test.py
index e38945fabc..5dc14a6961 100644
--- a/tensorflow/python/tools/freeze_graph_test.py
+++ b/tensorflow/python/tools/freeze_graph_test.py
@@ -60,7 +60,7 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
     # We'll create an input graph that has a single variable containing 1.0,
     # and that then multiplies it by 2.
     with ops.Graph().as_default():
-      variable_node = variables.Variable(1.0, name="variable_node")
+      variable_node = variables.VariableV1(1.0, name="variable_node")
       output_node = math_ops.multiply(variable_node, 2.0, name="output_node")
       sess = session.Session()
       init = variables.global_variables_initializer()
@@ -138,7 +138,7 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
       features = parsing_ops.parse_example(examples, feature_configs)
       feature = features[feature_name]
 
-      variable_node = variables.Variable(1.0, name="variable_node")
+      variable_node = variables.VariableV1(1.0, name="variable_node")
       scores = math_ops.multiply(variable_node, feature, name="output_node")
       class_feature = array_ops.fill(array_ops.shape(feature),
                                      "class_%s" % feature_name)
@@ -174,7 +174,7 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
     output_graph_filename = os.path.join(tmp_dir, "output_graph.pb")
 
     with ops.Graph().as_default():
-      variable_node = variables.Variable(1.0, name="variable_node")
+      variable_node = variables.VariableV1(1.0, name="variable_node")
       output_node = math_ops.multiply(variable_node, 2.0, name="output_node")
       sess = session.Session()
       init = variables.global_variables_initializer()
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py
index 56c4043d9d..eff15b24ce 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -247,7 +247,7 @@ def _default_getter(name, shape, dtype, initializer=None,
       def initial_value():
         return initializer(
             shape_object.as_list(), dtype=dtype, partition_info=partition_info)
-    return variables.Variable(
+    return variables.VariableV1(
         initial_value=initial_value,
         name=name,
         dtype=variable_dtype,
diff --git a/tensorflow/python/training/learning_rate_decay_test.py b/tensorflow/python/training/learning_rate_decay_test.py
index 5a9215730e..03a32f6ca0 100644
--- a/tensorflow/python/training/learning_rate_decay_test.py
+++ b/tensorflow/python/training/learning_rate_decay_test.py
@@ -63,7 +63,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
 
   def testVariables(self):
     with self.cached_session():
-      step = variables.Variable(1)
+      step = variables.VariableV1(1)
       assign_1 = step.assign(1)
       assign_2 = step.assign(2)
       assign_100 = step.assign(100)
@@ -121,7 +121,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
 
     # Test that ref types are valid.
     if not context.executing_eagerly():
-      x = variables.Variable(0.0)
+      x = variables.VariableV1(0.0)
       x_ref = x.op.outputs[0]   # float32_ref tensor should be accepted
       boundaries, values = [1.0, 2.0], [1, 2, 3]
       learning_rate_decay.piecewise_constant(x_ref, boundaries, values)
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index 2d7799d66a..c870d99de9 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -69,8 +69,8 @@ class ScaffoldTest(test.TestCase):
   def test_defaults_empty_graph(self):
     with ops.Graph().as_default():
       scaffold = monitored_session.Scaffold()
-      variables.Variable(1, name='my_var')
-      variables.Variable(
+      variables.VariableV1(1, name='my_var')
+      variables.VariableV1(
           2, name='my_local_var', collections=[ops.GraphKeys.LOCAL_VARIABLES])
       scaffold.finalize()
       self.assertTrue(isinstance(scaffold.init_op, ops.Operation))
@@ -105,7 +105,7 @@ class ScaffoldTest(test.TestCase):
 
   def test_caches_values(self):
     with ops.Graph().as_default():
-      variables.Variable([1])
+      variables.VariableV1([1])
       scaffold1 = monitored_session.Scaffold()
       scaffold1.finalize()
       scaffold2 = monitored_session.Scaffold()
@@ -119,7 +119,7 @@ class ScaffoldTest(test.TestCase):
 
   def test_raise_error_if_more_than_one_cached_item(self):
     with ops.Graph().as_default():
-      variables.Variable([1])
+      variables.VariableV1([1])
       ops.add_to_collection(ops.GraphKeys.SAVERS, saver_lib.Saver())
       ops.add_to_collection(ops.GraphKeys.SAVERS, saver_lib.Saver())
       with self.assertRaisesRegexp(RuntimeError, 'More than one item'):
@@ -127,7 +127,7 @@ class ScaffoldTest(test.TestCase):
 
   def test_uses_passed_values(self):
     with ops.Graph().as_default():
-      variables.Variable([1])
+      variables.VariableV1([1])
       saver = saver_lib.Saver()
       scaffold = monitored_session.Scaffold(
           init_op=2,
@@ -148,7 +148,7 @@ class ScaffoldTest(test.TestCase):
 
   def test_graph_is_finalized(self):
     with ops.Graph().as_default():
-      variables.Variable([1])
+      variables.VariableV1([1])
       monitored_session.Scaffold().finalize()
       with self.assertRaisesRegexp(RuntimeError,
                                    'Graph is finalized and cannot be modified'):
@@ -157,7 +157,7 @@ class ScaffoldTest(test.TestCase):
   def test_new_scaffold_from_default_scaffold(self):
     scaffold1 = monitored_session.Scaffold()
     with ops.Graph().as_default():
-      variables.Variable([1])
+      variables.VariableV1([1])
       saver = saver_lib.Saver()
       scaffold2 = monitored_session.Scaffold(
           init_op=2,
@@ -180,7 +180,7 @@ class ScaffoldTest(test.TestCase):
 
   def test_new_scaffold_from_existing_scaffold(self):
     with ops.Graph().as_default():
-      variables.Variable([1])
+      variables.VariableV1([1])
       saver = saver_lib.Saver()
       scaffold1 = monitored_session.Scaffold(
           init_op=2,
@@ -1374,7 +1374,7 @@ class MonitoredSessionTest(test.TestCase):
 
   def test_defaults(self):
     with ops.Graph().as_default():
-      a_var = variables.Variable(0)
+      a_var = variables.VariableV1(0)
       with monitored_session.MonitoredSession() as session:
         self.assertEqual(0, session.run(a_var))
 
@@ -1700,7 +1700,7 @@ class MonitoredSessionTest(test.TestCase):
 
   def test_graph_finalized_during_run_unfinalized_after_exit(self):
     with ops.Graph().as_default() as g:
-      a_var = variables.Variable(0)
+      a_var = variables.VariableV1(0)
       with monitored_session.MonitoredSession() as session:
         self.assertEqual(0, session.run(a_var))
         self.assertTrue(g.finalized)
@@ -1708,7 +1708,7 @@ class MonitoredSessionTest(test.TestCase):
 
   def test_keep_finalized_graph_as_finalized(self):
     with ops.Graph().as_default() as g:
-      a_var = variables.Variable(0)
+      a_var = variables.VariableV1(0)
       monitored_session.Scaffold().finalize()
       with monitored_session.MonitoredSession() as session:
         self.assertEqual(0, session.run(a_var))
@@ -2032,7 +2032,7 @@ class MonitoredSessionTest(test.TestCase):
     with ops.Graph().as_default():
       c = array_ops.placeholder(dtypes.float32)
       v = array_ops.identity(c)
-      graph_state = variables.Variable(0.0)
+      graph_state = variables.VariableV1(0.0)
       graph_side_effect = state_ops.assign_add(graph_state, 0.31)
 
       def step_fn(step_context):
@@ -2088,7 +2088,7 @@ class MonitoredSessionTest(test.TestCase):
       c = array_ops.placeholder(dtypes.float32)
       v = array_ops.identity(c)
       vv = constant_op.constant(3.2)
-      graph_state = variables.Variable(0.0)
+      graph_state = variables.VariableV1(0.0)
       graph_side_effect = state_ops.assign_add(graph_state, 0.31)
 
       class Hook(session_run_hook.SessionRunHook):
@@ -2125,7 +2125,7 @@ class SingularMonitoredSessionTest(test.TestCase):
 
   def test_handles_initialization(self):
     with ops.Graph().as_default():
-      a_var = variables.Variable(0)
+      a_var = variables.VariableV1(0)
       with monitored_session.SingularMonitoredSession() as session:
         # If it's not initialized, following statement raises an error.
         self.assertEqual(0, session.run(a_var))
diff --git a/tensorflow/python/training/quantize_training_test.py b/tensorflow/python/training/quantize_training_test.py
index 9754adea85..6edbf7665f 100644
--- a/tensorflow/python/training/quantize_training_test.py
+++ b/tensorflow/python/training/quantize_training_test.py
@@ -58,7 +58,8 @@ class PywrapQuantizeTrainingTest(test.TestCase):
     g = ops.Graph()
     with session.Session(graph=g) as sess:
       a = constant_op.constant(6.0, shape=[1, 1], name='a')
-      b = variables.Variable(constant_op.constant(7.0, shape=[1, 1]), name='b')
+      b = variables.VariableV1(
+          constant_op.constant(7.0, shape=[1, 1]), name='b')
       c = math_ops.matmul(a, b, name='matmul')
 
       init_op = variables.global_variables_initializer()
diff --git a/tensorflow/python/training/queue_runner_test.py b/tensorflow/python/training/queue_runner_test.py
index 9b9e28af2b..15fe42bbd8 100644
--- a/tensorflow/python/training/queue_runner_test.py
+++ b/tensorflow/python/training/queue_runner_test.py
@@ -44,7 +44,7 @@ class QueueRunnerTest(test.TestCase):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
       zero64 = constant_op.constant(0, dtype=dtypes.int64)
-      var = variables.Variable(zero64)
+      var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
       variables.global_variables_initializer().run()
@@ -64,9 +64,9 @@ class QueueRunnerTest(test.TestCase):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
       zero64 = constant_op.constant(0, dtype=dtypes.int64)
-      var0 = variables.Variable(zero64)
+      var0 = variables.VariableV1(zero64)
       count_up_to_3 = var0.count_up_to(3)
-      var1 = variables.Variable(zero64)
+      var1 = variables.VariableV1(zero64)
       count_up_to_30 = var1.count_up_to(30)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
       qr = queue_runner_impl.QueueRunner(queue, [count_up_to_3, count_up_to_30])
@@ -131,7 +131,7 @@ class QueueRunnerTest(test.TestCase):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
       zero64 = constant_op.constant(0, dtype=dtypes.int64)
-      var = variables.Variable(zero64)
+      var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
       variables.global_variables_initializer().run()
@@ -184,7 +184,7 @@ class QueueRunnerTest(test.TestCase):
     with self.cached_session() as sess:
       with session.Session() as other_sess:
         zero64 = constant_op.constant(0, dtype=dtypes.int64)
-        var = variables.Variable(zero64)
+        var = variables.VariableV1(zero64)
         count_up_to = var.count_up_to(3)
         queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
         variables.global_variables_initializer().run()
@@ -199,7 +199,7 @@ class QueueRunnerTest(test.TestCase):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
       zero64 = constant_op.constant(0, dtype=dtypes.int64)
-      var = variables.Variable(zero64)
+      var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
       variables.global_variables_initializer().run()
@@ -215,7 +215,7 @@ class QueueRunnerTest(test.TestCase):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
       zero64 = constant_op.constant(0, dtype=dtypes.int64)
-      var = variables.Variable(zero64)
+      var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
       variables.global_variables_initializer().run()
@@ -250,7 +250,7 @@ class QueueRunnerTest(test.TestCase):
   def testStartQueueRunners(self):
     # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
-    var = variables.Variable(zero64)
+    var = variables.VariableV1(zero64)
     count_up_to = var.count_up_to(3)
     queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
     init_op = variables.global_variables_initializer()
@@ -267,7 +267,7 @@ class QueueRunnerTest(test.TestCase):
 
   def testStartQueueRunnersRaisesIfNotASession(self):
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
-    var = variables.Variable(zero64)
+    var = variables.VariableV1(zero64)
     count_up_to = var.count_up_to(3)
     queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
     init_op = variables.global_variables_initializer()
@@ -280,7 +280,7 @@ class QueueRunnerTest(test.TestCase):
 
   def testStartQueueRunnersIgnoresMonitoredSession(self):
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
-    var = variables.Variable(zero64)
+    var = variables.VariableV1(zero64)
     count_up_to = var.count_up_to(3)
     queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
     init_op = variables.global_variables_initializer()
@@ -297,7 +297,7 @@ class QueueRunnerTest(test.TestCase):
     graph = ops.Graph()
     with graph.as_default():
       zero64 = constant_op.constant(0, dtype=dtypes.int64)
-      var = variables.Variable(zero64)
+      var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
       init_op = variables.global_variables_initializer()
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 69b1055ebe..49e6e6546d 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -311,8 +311,8 @@ class SaverTest(test.TestCase):
 
     # Build a graph with 2 parameter nodes, and Save and
     # Restore nodes for them.
-    v0 = variables.Variable(10.0, name="v0")
-    v1 = variables.Variable(20.0, name="v1")
+    v0 = variables.VariableV1(10.0, name="v0")
+    v1 = variables.VariableV1(20.0, name="v1")
     v2 = saver_test_utils.CheckpointedOp(name="v2")
     v2_init = v2.insert("k1", 30.0)
     save = saver_module.Saver(
@@ -350,8 +350,8 @@ class SaverTest(test.TestCase):
     # Start a second session.  In that session the parameter nodes
     # have not been initialized either.
     with self.cached_session() as sess:
-      v0 = variables.Variable(-1.0, name="v0")
-      v1 = variables.Variable(-1.0, name="v1")
+      v0 = variables.VariableV1(-1.0, name="v0")
+      v1 = variables.VariableV1(-1.0, name="v1")
       v2 = saver_test_utils.CheckpointedOp(name="v2")
       save = saver_module.Saver({"v0": v0, "v1": v1, "v2": v2.saveable})
 
@@ -370,7 +370,7 @@ class SaverTest(test.TestCase):
       self.assertEqual(30.0, v2.values().eval())
 
   def testFilenameTensor(self):
-    v0 = variables.Variable(0, name="v0")
+    v0 = variables.VariableV1(0, name="v0")
     filename = b"somerandomfilename"
     save = saver_module.Saver({"v0": v0}, filename=filename)
     with self.cached_session() as sess:
@@ -379,7 +379,7 @@ class SaverTest(test.TestCase):
       self.assertEqual(sess.run(tensor), filename)
 
   def testInvalidPath(self):
-    v0 = variables.Variable(0, name="v0")
+    v0 = variables.VariableV1(0, name="v0")
     for ver in (saver_pb2.SaverDef.V1, saver_pb2.SaverDef.V2):
       with self.cached_session() as sess:
         save = saver_module.Saver({"v0": v0}, write_version=ver)
@@ -392,7 +392,7 @@ class SaverTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Build a graph with 1 node, and save and restore for them.
-      v = variables.Variable(np.int64(15), name="v")
+      v = variables.VariableV1(np.int64(15), name="v")
       save = saver_module.Saver({"v": v}, restore_sequentially=True)
       variables.global_variables_initializer().run()
 
@@ -402,7 +402,7 @@ class SaverTest(test.TestCase):
       self.assertEqual(save_path, val)
 
       with self.cached_session() as sess:
-        v = variables.Variable(np.int64(-1), name="v")
+        v = variables.VariableV1(np.int64(-1), name="v")
         save = saver_module.Saver({"v": v})
 
       with self.assertRaisesWithPredicateMatch(
@@ -416,9 +416,9 @@ class SaverTest(test.TestCase):
 
   def testSomeErrors(self):
     with ops_lib.Graph().as_default():
-      v0 = variables.Variable([10.0], name="v0")
-      v1 = variables.Variable([20.0], name="v1")
-      v2 = variables.Variable([20.0], name="v2")
+      v0 = variables.VariableV1([10.0], name="v0")
+      v1 = variables.VariableV1([20.0], name="v1")
+      v2 = variables.VariableV1([20.0], name="v2")
       v2._set_save_slice_info(
           variables.Variable.SaveSliceInfo("v1", [1], [0], [1]))
 
@@ -446,7 +446,7 @@ class SaverTest(test.TestCase):
 
   def testSameName(self):
     with ops_lib.Graph().as_default():
-      v0 = variables.Variable([10.0], name="v0")
+      v0 = variables.VariableV1([10.0], name="v0")
       v2 = saver_test_utils.CheckpointedOp(name="v2")
 
       # Saving one variable under two names raises an error.
@@ -468,8 +468,8 @@ class SaverTest(test.TestCase):
     with self.session(graph=ops_lib.Graph()) as sess:
       # Build a graph with 2 parameter nodes, and Save and
       # Restore nodes for them.
-      v0 = variables.Variable(10.0, name="v0")
-      v1 = variables.Variable(20.0, name="v1")
+      v0 = variables.VariableV1(10.0, name="v0")
+      v1 = variables.VariableV1(20.0, name="v1")
       v2 = saver_test_utils.CheckpointedOp(name="v2")
       v2_init = v2.insert("k1", 30.0)
       save = saver_module.Saver([v0, v1, v2.saveable])
@@ -490,8 +490,8 @@ class SaverTest(test.TestCase):
     # Start a second session.  In that session the variables
     # have not been initialized either.
     with self.session(graph=ops_lib.Graph()) as sess:
-      v0 = variables.Variable(-1.0, name="v0")
-      v1 = variables.Variable(-1.0, name="v1")
+      v0 = variables.VariableV1(-1.0, name="v0")
+      v1 = variables.VariableV1(-1.0, name="v1")
       v2 = saver_test_utils.CheckpointedOp(name="v2")
       save = saver_module.Saver([v0, v1, v2.saveable])
 
@@ -515,8 +515,8 @@ class SaverTest(test.TestCase):
     # Build another graph with 2 nodes, initialized
     # differently, and a Restore node for them.
     with self.session(graph=ops_lib.Graph()) as sess:
-      v0_2 = variables.Variable(1000.0, name="v0")
-      v1_2 = variables.Variable(2000.0, name="v1")
+      v0_2 = variables.VariableV1(1000.0, name="v0")
+      v1_2 = variables.VariableV1(2000.0, name="v1")
       v2_2 = saver_test_utils.CheckpointedOp(name="v2")
       save2 = saver_module.Saver([v0_2, v1_2, v2_2.saveable])
       v2_2.insert("k1000", 3000.0).run()
@@ -574,14 +574,14 @@ class SaverTest(test.TestCase):
     save_path = os.path.join(self.get_temp_dir(), "gpu")
     with session.Session("", graph=ops_lib.Graph()) as sess:
       with sess.graph.device(test.gpu_device_name()):
-        v0_1 = variables.Variable(123.45)
+        v0_1 = variables.VariableV1(123.45)
       save = saver_module.Saver({"v0": v0_1})
       variables.global_variables_initializer().run()
       save.save(sess, save_path)
 
     with session.Session("", graph=ops_lib.Graph()) as sess:
       with sess.graph.device(test.gpu_device_name()):
-        v0_2 = variables.Variable(543.21)
+        v0_2 = variables.VariableV1(543.21)
       save = saver_module.Saver({"v0": v0_2})
       variables.global_variables_initializer().run()
 
@@ -591,22 +591,22 @@ class SaverTest(test.TestCase):
     save_path = os.path.join(self.get_temp_dir(), "gpu")
     with session.Session("", graph=ops_lib.Graph()) as sess:
       with sess.graph.device(test.gpu_device_name()):
-        v0_1 = variables.Variable(123.45)
+        v0_1 = variables.VariableV1(123.45)
       save = saver_module.Saver({"v0": v0_1}, sharded=True, allow_empty=True)
       variables.global_variables_initializer().run()
       save.save(sess, save_path)
 
     with session.Session("", graph=ops_lib.Graph()) as sess:
       with sess.graph.device(test.gpu_device_name()):
-        v0_2 = variables.Variable(543.21)
+        v0_2 = variables.VariableV1(543.21)
       save = saver_module.Saver({"v0": v0_2}, sharded=True, allow_empty=True)
       variables.global_variables_initializer().run()
 
   def testVariables(self):
     save_path = os.path.join(self.get_temp_dir(), "variables")
     with session.Session("", graph=ops_lib.Graph()) as sess:
-      one = variables.Variable(1.0)
-      twos = variables.Variable([2.0, 2.0, 2.0])
+      one = variables.VariableV1(1.0)
+      twos = variables.VariableV1([2.0, 2.0, 2.0])
       v2 = saver_test_utils.CheckpointedOp(name="v2")
       init = variables.global_variables_initializer()
       save = saver_module.Saver()
@@ -615,8 +615,8 @@ class SaverTest(test.TestCase):
       save.save(sess, save_path)
 
     with session.Session("", graph=ops_lib.Graph()) as sess:
-      one = variables.Variable(0.0)
-      twos = variables.Variable([0.0, 0.0, 0.0])
+      one = variables.VariableV1(0.0)
+      twos = variables.VariableV1([0.0, 0.0, 0.0])
       v2 = saver_test_utils.CheckpointedOp(name="v2")
       # Saver with no arg, defaults to 'all variables'.
       save = saver_module.Saver()
@@ -628,14 +628,14 @@ class SaverTest(test.TestCase):
 
   def testVarListShouldBeEmptyInDeferredBuild(self):
     with ops_lib.Graph().as_default():
-      v = variables.Variable(1.0)
+      v = variables.VariableV1(1.0)
       with self.assertRaisesRegexp(ValueError, "defer_build"):
         saver_module.Saver([v], defer_build=True)
 
   def testBuildShouldBeCalledBeforeSaveInCaseOfDeferBuild(self):
     save_path = os.path.join(self.get_temp_dir(), "error_deferred_build")
     with ops_lib.Graph().as_default(), session.Session() as sess:
-      variables.Variable(1.0)
+      variables.VariableV1(1.0)
       saver = saver_module.Saver(defer_build=True)
       with self.assertRaisesRegexp(RuntimeError, "build"):
         saver.save(sess, save_path)
@@ -643,18 +643,18 @@ class SaverTest(test.TestCase):
   def testDeferredBuild(self):
     save_path = os.path.join(self.get_temp_dir(), "deferred_build")
     with session.Session("", graph=ops_lib.Graph()) as sess:
-      one = variables.Variable(1.0)
+      one = variables.VariableV1(1.0)
       save = saver_module.Saver(defer_build=True)
       # if build is not deferred, saver cannot save the `twos`.
-      twos = variables.Variable([2.0, 2.0, 2.0])
+      twos = variables.VariableV1([2.0, 2.0, 2.0])
       init = variables.global_variables_initializer()
       save.build()
       init.run()
       save.save(sess, save_path)
 
     with session.Session("", graph=ops_lib.Graph()) as sess:
-      one = variables.Variable(0.0)
-      twos = variables.Variable([0.0, 0.0, 0.0])
+      one = variables.VariableV1(0.0)
+      twos = variables.VariableV1([0.0, 0.0, 0.0])
       # Saver with no arg, defaults to 'all variables'.
       save = saver_module.Saver()
       save.restore(sess, save_path)
@@ -664,7 +664,7 @@ class SaverTest(test.TestCase):
   def testReshape(self):
     save_path = os.path.join(self.get_temp_dir(), "variables_reshape")
     with session.Session("", graph=ops_lib.Graph()) as sess:
-      var = variables.Variable([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+      var = variables.VariableV1([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
       init = variables.global_variables_initializer()
       save = saver_module.Saver()
       init.run()
@@ -672,7 +672,7 @@ class SaverTest(test.TestCase):
 
     # Error when restoring with default reshape=False
     with session.Session("", graph=ops_lib.Graph()) as sess:
-      var = variables.Variable([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])
+      var = variables.VariableV1([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])
       save = saver_module.Saver()
       with self.assertRaisesRegexp(
           errors_impl.InvalidArgumentError,
@@ -681,7 +681,7 @@ class SaverTest(test.TestCase):
 
     # Restored to new shape with reshape=True
     with session.Session("", graph=ops_lib.Graph()) as sess:
-      var = variables.Variable([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])
+      var = variables.VariableV1([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])
       save = saver_module.Saver(reshape=True)
       save.restore(sess, save_path)
       self.assertAllClose([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], var.eval())
@@ -731,8 +731,8 @@ class SaverTest(test.TestCase):
     for save_path in paths:
       # Build a graph with 2 parameter nodes, and Save and
       # Restore nodes for them.
-      v0 = variables.Variable(10.0, name="v0")
-      v1 = variables.Variable(20.0, name="v1")
+      v0 = variables.VariableV1(10.0, name="v0")
+      v1 = variables.VariableV1(20.0, name="v1")
       save = saver_module.Saver({"v0": v0, "v1": v1}, restore_sequentially=True)
       init_all_op = variables.global_variables_initializer()
 
@@ -770,8 +770,8 @@ class SaverTest(test.TestCase):
 
     # Build a graph with 2 parameter nodes, and Save and
     # Restore nodes for them.
-    v0 = variables.Variable(10.0, name="v0")
-    v1 = variables.Variable(20.0, name="v1")
+    v0 = variables.VariableV1(10.0, name="v0")
+    v1 = variables.VariableV1(20.0, name="v1")
     save = saver_module.Saver({"v0": v0, "v1": v1}, restore_sequentially=True)
     init_all_op = variables.global_variables_initializer()
 
@@ -859,10 +859,10 @@ class SaveRestoreShardedTest(test.TestCase):
         target="",
         config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
       with sess.graph.device("/cpu:0"):
-        v0 = variables.Variable(10, name="v0")
+        v0 = variables.VariableV1(10, name="v0")
         t0 = saver_test_utils.CheckpointedOp(name="t0")
       with sess.graph.device("/cpu:1"):
-        v1 = variables.Variable(20, name="v1")
+        v1 = variables.VariableV1(20, name="v1")
         t1 = saver_test_utils.CheckpointedOp(name="t1")
       save = saver_module.Saver(
           {
@@ -890,7 +890,7 @@ class SaveRestoreShardedTest(test.TestCase):
           target="",
           config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
         with sess.graph.device("/cpu:0"):
-          v0 = variables.Variable(111, name="v0")
+          v0 = variables.VariableV1(111, name="v0")
           t0 = saver_test_utils.CheckpointedOp(name="t0")
         save = saver_module.Saver(
             {
@@ -914,7 +914,7 @@ class SaveRestoreShardedTest(test.TestCase):
           target="",
           config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
         with sess.graph.device("/cpu:0"):
-          v1 = variables.Variable(222)
+          v1 = variables.VariableV1(222)
           t1 = saver_test_utils.CheckpointedOp(name="t1")
         save = saver_module.Saver(
             {
@@ -938,10 +938,10 @@ class SaveRestoreShardedTest(test.TestCase):
         target="",
         config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
       with sess.graph.device("/cpu:0"):
-        v0 = variables.Variable(111, name="v0")
+        v0 = variables.VariableV1(111, name="v0")
         t0 = saver_test_utils.CheckpointedOp(name="t0")
       with sess.graph.device("/cpu:1"):
-        v1 = variables.Variable(222, name="v1")
+        v1 = variables.VariableV1(222, name="v1")
         t1 = saver_test_utils.CheckpointedOp(name="t1")
       save = saver_module.Saver(
           {
@@ -984,7 +984,7 @@ class SaveRestoreShardedTest(test.TestCase):
 
   def testSaverDef(self):
     with self.cached_session():
-      v0 = variables.Variable(123, name="v0")
+      v0 = variables.VariableV1(123, name="v0")
       save = saver_module.Saver({"v0": v0}, sharded=True)
       sd = save.as_saver_def()
       self.assertTrue(sd.sharded)
@@ -1023,7 +1023,7 @@ class SaveRestoreShardedTest(test.TestCase):
           if use_resource:
             vs = [resource_variable_ops.ResourceVariable(rnd, name=var_name)]
           else:
-            vs = [variables.Variable(rnd, name=var_name)]
+            vs = [variables.VariableV1(rnd, name=var_name)]
 
         variables.global_variables_initializer().run()
         if call_saver_with_dict:
@@ -1054,7 +1054,7 @@ class SaveRestoreShardedTest(test.TestCase):
           ]
         else:
           new_vs = [
-              variables.Variable(
+              variables.VariableV1(
                   array_ops.zeros(
                       shape=var_full_shape),  # != original contents.
                   name=var_name)
@@ -1210,7 +1210,7 @@ class MaxToKeepTest(test.TestCase):
     save_dir = self._get_test_dir("max_to_keep_non_sharded")
 
     with self.cached_session() as sess:
-      v = variables.Variable(10.0, name="v")
+      v = variables.VariableV1(10.0, name="v")
       save = saver_module.Saver({"v": v}, max_to_keep=2)
       variables.global_variables_initializer().run()
       self.assertEqual([], save.last_checkpoints)
@@ -1389,9 +1389,9 @@ class MaxToKeepTest(test.TestCase):
         target="",
         config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
       with sess.graph.device("/cpu:0"):
-        v0 = variables.Variable(111, name="v0")
+        v0 = variables.VariableV1(111, name="v0")
       with sess.graph.device("/cpu:1"):
-        v1 = variables.Variable(222, name="v1")
+        v1 = variables.VariableV1(222, name="v1")
       save = saver_module.Saver(
           {
               "v0": v0,
@@ -1448,7 +1448,7 @@ class MaxToKeepTest(test.TestCase):
     save_dir2 = self._get_test_dir("max_to_keep_0")
 
     with self.cached_session() as sess:
-      v = variables.Variable(10.0, name="v")
+      v = variables.VariableV1(10.0, name="v")
       variables.global_variables_initializer().run()
 
       # Test max_to_keep being None.
@@ -1475,7 +1475,7 @@ class MaxToKeepTest(test.TestCase):
     save_dir = self._get_test_dir("no_meta_graph")
 
     with self.cached_session() as sess:
-      v = variables.Variable(10.0, name="v")
+      v = variables.VariableV1(10.0, name="v")
       save = saver_module.Saver({"v": v})
       variables.global_variables_initializer().run()
 
@@ -1632,13 +1632,13 @@ class MetaGraphTest(test.TestCase):
     filename = os.path.join(test_dir, "metafile")
     with self.cached_session():
       # Creates a graph.
-      v0 = variables.Variable(1.0, name="v0")
+      v0 = variables.VariableV1(1.0, name="v0")
       control_flow_ops.cond(
           math_ops.less(v0, 10), lambda: math_ops.add(v0, 1),
           lambda: math_ops.subtract(v0, 1))
       control_flow_ops.while_loop(lambda i: math_ops.less(i, 10),
                                   lambda i: math_ops.add(i, 1), [v0])
-      var = variables.Variable(constant_op.constant(0, dtype=dtypes.int64))
+      var = variables.VariableV1(constant_op.constant(0, dtype=dtypes.int64))
       count_up_to = var.count_up_to(3)
       input_queue = data_flow_ops.FIFOQueue(
           30, dtypes.float32, shared_name="collection_queue")
@@ -1687,7 +1687,7 @@ class MetaGraphTest(test.TestCase):
   def testAddCollectionDefFails(self):
     with self.cached_session():
       # Creates a graph.
-      v0 = variables.Variable(10.0, name="v0")
+      v0 = variables.VariableV1(10.0, name="v0")
       # Creates a saver.
       save = saver_module.Saver({"v0": v0})
       # Generates MetaGraphDef.
@@ -1711,8 +1711,8 @@ class MetaGraphTest(test.TestCase):
     saver1_ckpt = os.path.join(test_dir, "saver1.ckpt")
     with self.session(graph=ops_lib.Graph()) as sess:
       # Creates a graph.
-      v0 = variables.Variable([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], name="v0")
-      v1 = variables.Variable(11.0, name="v1")
+      v0 = variables.VariableV1([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], name="v0")
+      v1 = variables.VariableV1(11.0, name="v1")
       # Creates 2 savers.
       saver0 = saver_module.Saver({"v0": v0}, name="saver0")
       saver1 = saver_module.Saver({"v1": v1}, name="saver1")
@@ -1788,8 +1788,8 @@ class MetaGraphTest(test.TestCase):
     saver1_ckpt = os.path.join(test_dir, "saver1.ckpt")
     with self.session(graph=ops_lib.Graph()) as sess:
       # Creates a graph.
-      v0 = variables.Variable([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], name="v0")
-      v1 = variables.Variable(11.0, name="v1")
+      v0 = variables.VariableV1([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], name="v0")
+      v1 = variables.VariableV1(11.0, name="v1")
 
       # Creates 2 savers.
       saver0 = saver_module.Saver({"v0": v0}, name="saver0")
@@ -1840,7 +1840,7 @@ class MetaGraphTest(test.TestCase):
     filename = os.path.join(test_dir, "metafile")
     with self.session(graph=ops_lib.Graph()):
       # Creates a graph.
-      variables.Variable(10.0, name="v0")
+      variables.VariableV1(10.0, name="v0")
       # Exports the graph as binary format.
       saver_module.export_meta_graph(filename, as_text=False)
     with self.session(graph=ops_lib.Graph()):
@@ -1871,8 +1871,8 @@ class MetaGraphTest(test.TestCase):
     test_dir = self._get_test_dir("slice_saver")
     filename = os.path.join(test_dir, "metafile")
     with self.cached_session():
-      v1 = variables.Variable([20.0], name="v1")
-      v2 = variables.Variable([20.0], name="v2")
+      v1 = variables.VariableV1([20.0], name="v1")
+      v2 = variables.VariableV1([20.0], name="v2")
       v2._set_save_slice_info(
           variables.Variable.SaveSliceInfo("v1", [1], [0], [1]))
 
@@ -1899,7 +1899,7 @@ class MetaGraphTest(test.TestCase):
     # Hidden 1
     images = constant_op.constant(1.2, dtypes.float32, shape=[100, 28])
     with ops_lib.name_scope("hidden1"):
-      weights = variables.Variable(
+      weights = variables.VariableV1(
           random_ops.truncated_normal(
               [28, 128], stddev=1.0 / math.sqrt(float(28))),
           name="weights")
@@ -1907,7 +1907,7 @@ class MetaGraphTest(test.TestCase):
       # the save and restore of control flow context (which doesn't make any
       # sense here from a machine learning perspective).  The typical biases is
       # a simple Variable without the conditions.
-      biases = variables.Variable(
+      biases = variables.VariableV1(
           control_flow_ops.cond(
               math_ops.less(random.random(), 0.5),
               lambda: array_ops.ones([128]), lambda: array_ops.zeros([128])),
@@ -1915,7 +1915,7 @@ class MetaGraphTest(test.TestCase):
       hidden1 = nn_ops.relu(math_ops.matmul(images, weights) + biases)
     # Hidden 2
     with ops_lib.name_scope("hidden2"):
-      weights = variables.Variable(
+      weights = variables.VariableV1(
           random_ops.truncated_normal(
               [128, 32], stddev=1.0 / math.sqrt(float(128))),
           name="weights")
@@ -1933,15 +1933,16 @@ class MetaGraphTest(test.TestCase):
 
       _, biases = control_flow_ops.while_loop(
           loop_cond, loop_body,
-          [constant_op.constant(0), variables.Variable(array_ops.zeros([32]))])
+          [constant_op.constant(0),
+           variables.VariableV1(array_ops.zeros([32]))])
       hidden2 = nn_ops.relu(math_ops.matmul(hidden1, weights) + biases)
     # Linear
     with ops_lib.name_scope("softmax_linear"):
-      weights = variables.Variable(
+      weights = variables.VariableV1(
           random_ops.truncated_normal(
               [32, 10], stddev=1.0 / math.sqrt(float(32))),
           name="weights")
-      biases = variables.Variable(array_ops.zeros([10]), name="biases")
+      biases = variables.VariableV1(array_ops.zeros([10]), name="biases")
       logits = math_ops.matmul(hidden2, weights) + biases
       ops_lib.add_to_collection("logits", logits)
     init_all_op = variables.global_variables_initializer()
@@ -2028,7 +2029,7 @@ class MetaGraphTest(test.TestCase):
 
     # Create while loop using `outer_body_fn`.
     with ops_lib.Graph().as_default():
-      var = variables.Variable(0.0)
+      var = variables.VariableV1(0.0)
       var_name = var.name
       output = graph_fn(var)
       output_name = output.name
@@ -2122,8 +2123,8 @@ class MetaGraphTest(test.TestCase):
   def testStrippedOpListDef(self):
     with self.cached_session():
       # Creates a graph.
-      v0 = variables.Variable(0.0)
-      var = variables.Variable(10.0)
+      v0 = variables.VariableV1(0.0)
+      var = variables.VariableV1(10.0)
       math_ops.add(v0, var)
 
       @function.Defun(dtypes.float32)
@@ -2161,8 +2162,8 @@ class MetaGraphTest(test.TestCase):
     # With strip_default_attrs enabled, attributes "T" (float32) and "Tout"
     # (complex64) in the "Complex" op must be removed.
     with self.cached_session():
-      real_num = variables.Variable(1.0, dtype=dtypes.float32, name="real")
-      imag_num = variables.Variable(2.0, dtype=dtypes.float32, name="imag")
+      real_num = variables.VariableV1(1.0, dtype=dtypes.float32, name="real")
+      imag_num = variables.VariableV1(2.0, dtype=dtypes.float32, name="imag")
       math_ops.complex(real_num, imag_num, name="complex")
 
       save = saver_module.Saver({"real_num": real_num, "imag_num": imag_num})
@@ -2178,8 +2179,8 @@ class MetaGraphTest(test.TestCase):
     # (complex64) in the "Complex" op must *not* be removed, even if they map
     # to their defaults.
     with self.session(graph=ops_lib.Graph()):
-      real_num = variables.Variable(1.0, dtype=dtypes.float32, name="real")
-      imag_num = variables.Variable(2.0, dtype=dtypes.float32, name="imag")
+      real_num = variables.VariableV1(1.0, dtype=dtypes.float32, name="real")
+      imag_num = variables.VariableV1(2.0, dtype=dtypes.float32, name="imag")
       math_ops.complex(real_num, imag_num, name="complex")
 
       save = saver_module.Saver({"real_num": real_num, "imag_num": imag_num})
@@ -2198,9 +2199,9 @@ class MetaGraphTest(test.TestCase):
     image = array_ops.placeholder(dtypes.float32, [None, 784], name="image")
     label = array_ops.placeholder(dtypes.float32, [None, 10], name="label")
     with session.Session() as sess:
-      weights = variables.Variable(
+      weights = variables.VariableV1(
           random_ops.random_uniform([784, 10]), name="weights")
-      bias = variables.Variable(array_ops.zeros([10]), name="bias")
+      bias = variables.VariableV1(array_ops.zeros([10]), name="bias")
       logit = nn_ops.relu(math_ops.matmul(image, weights) + bias, name="logits")
       nn_ops.softmax(logit, name="prediction")
       cost = nn_ops.softmax_cross_entropy_with_logits(labels=label,
@@ -2243,7 +2244,7 @@ class MetaGraphTest(test.TestCase):
       self.assertIsNone(new_saver_1)
 
       # Create a variable in graph_2 under scope "my_scope".
-      variables.Variable(array_ops.zeros([10]), name="my_scope/my_var")
+      variables.VariableV1(array_ops.zeros([10]), name="my_scope/my_var")
       sess.run(variables.global_variables_initializer())
       # Restore the checkpoint into a different scope "subgraph_2".
       new_saver_2 = saver_module.import_meta_graph(
@@ -2268,9 +2269,9 @@ class MetaGraphTest(test.TestCase):
     image = array_ops.placeholder(dtypes.float32, [None, 784], name="image")
     label = array_ops.placeholder(dtypes.float32, [None, 10], name="label")
     with session.Session() as sess:
-      weights = variables.Variable(
+      weights = variables.VariableV1(
           random_ops.random_uniform([784, 10]), name="weights")
-      bias = variables.Variable(array_ops.zeros([10]), name="bias")
+      bias = variables.VariableV1(array_ops.zeros([10]), name="bias")
       logit = nn_ops.relu(math_ops.matmul(image, weights) + bias, name="logits")
       nn_ops.softmax(logit, name="prediction")
       cost = nn_ops.softmax_cross_entropy_with_logits(labels=label,
@@ -2299,9 +2300,9 @@ class MetaGraphTest(test.TestCase):
       with ops_lib.device("/job:ps/replica:0/task:0/device:GPU:0"):
         image = array_ops.placeholder(dtypes.float32, [None, 784], name="image")
         label = array_ops.placeholder(dtypes.float32, [None, 10], name="label")
-        weights = variables.Variable(
+        weights = variables.VariableV1(
             random_ops.random_uniform([784, 10]), name="weights")
-        bias = variables.Variable(array_ops.zeros([10]), name="bias")
+        bias = variables.VariableV1(array_ops.zeros([10]), name="bias")
         logit = nn_ops.relu(math_ops.matmul(image, weights) + bias)
         nn_ops.softmax(logit, name="prediction")
         cost = nn_ops.softmax_cross_entropy_with_logits(labels=label,
@@ -2332,9 +2333,9 @@ class MetaGraphTest(test.TestCase):
       with ops_lib.device("/job:ps/replica:0/task:0/device:GPU:0"):
         image = array_ops.placeholder(dtypes.float32, [None, 784], name="image")
         label = array_ops.placeholder(dtypes.float32, [None, 10], name="label")
-        weights = variables.Variable(
+        weights = variables.VariableV1(
             random_ops.random_uniform([784, 10]), name="weights")
-        bias = variables.Variable(array_ops.zeros([10]), name="bias")
+        bias = variables.VariableV1(array_ops.zeros([10]), name="bias")
         logit = nn_ops.relu(math_ops.matmul(image, weights) + bias)
         nn_ops.softmax(logit, name="prediction")
         cost = nn_ops.softmax_cross_entropy_with_logits(labels=label,
@@ -2385,9 +2386,9 @@ class CheckpointReaderTest(test.TestCase):
 
   def testDebugString(self):
     # Builds a graph.
-    v0 = variables.Variable(
+    v0 = variables.VariableV1(
         [[1, 2, 3], [4, 5, 6]], dtype=dtypes.float32, name="v0")
-    v1 = variables.Variable(
+    v1 = variables.VariableV1(
         [[[1], [2]], [[3], [4]], [[5], [6]]], dtype=dtypes.float32, name="v1")
     init_all_op = variables.global_variables_initializer()
     save = saver_module.Saver(
@@ -2444,7 +2445,8 @@ class WriteGraphTest(test.TestCase):
 
   def testWriteGraph(self):
     test_dir = self._get_test_dir("write_graph_dir")
-    variables.Variable([[1, 2, 3], [4, 5, 6]], dtype=dtypes.float32, name="v0")
+    variables.VariableV1(
+        [[1, 2, 3], [4, 5, 6]], dtype=dtypes.float32, name="v0")
     path = graph_io.write_graph(ops_lib.get_default_graph(),
                                 os.path.join(test_dir, "l1"), "graph.pbtxt")
     truth = os.path.join(test_dir, "l1", "graph.pbtxt")
@@ -2453,7 +2455,8 @@ class WriteGraphTest(test.TestCase):
 
   def testRecursiveCreate(self):
     test_dir = self._get_test_dir("deep_dir")
-    variables.Variable([[1, 2, 3], [4, 5, 6]], dtype=dtypes.float32, name="v0")
+    variables.VariableV1(
+        [[1, 2, 3], [4, 5, 6]], dtype=dtypes.float32, name="v0")
     path = graph_io.write_graph(ops_lib.get_default_graph().as_graph_def(),
                                 os.path.join(test_dir, "l1", "l2", "l3"),
                                 "graph.pbtxt")
@@ -2477,7 +2480,7 @@ class ScopedGraphTest(test.TestCase):
       images = constant_op.constant(
           1.2, dtypes.float32, shape=[100, 28], name="images")
       with ops_lib.name_scope("hidden1"):
-        weights1 = variables.Variable(
+        weights1 = variables.VariableV1(
             random_ops.truncated_normal(
                 [28, 128], stddev=1.0 / math.sqrt(float(28))),
             name="weights")
@@ -2485,7 +2488,7 @@ class ScopedGraphTest(test.TestCase):
         # coverage the save and restore of control flow context (which doesn't
         # make any sense here from a machine learning perspective).  The typical
         # biases is a simple Variable without the conditions.
-        biases1 = variables.Variable(
+        biases1 = variables.VariableV1(
             control_flow_ops.cond(
                 math_ops.less(random.random(), 0.5),
                 lambda: array_ops.ones([128]), lambda: array_ops.zeros([128])),
@@ -2494,7 +2497,7 @@ class ScopedGraphTest(test.TestCase):
 
       # Hidden 2
       with ops_lib.name_scope("hidden2"):
-        weights2 = variables.Variable(
+        weights2 = variables.VariableV1(
             random_ops.truncated_normal(
                 [128, 32], stddev=1.0 / math.sqrt(float(128))),
             name="weights")
@@ -2511,16 +2514,16 @@ class ScopedGraphTest(test.TestCase):
           return it + 1, biases2
 
         _, biases2 = control_flow_ops.while_loop(loop_cond, loop_body, [
-            constant_op.constant(0), variables.Variable(array_ops.zeros([32]))
+            constant_op.constant(0), variables.VariableV1(array_ops.zeros([32]))
         ])
         hidden2 = nn_ops.relu(math_ops.matmul(hidden1, weights2) + biases2)
       # Linear
       with ops_lib.name_scope("softmax_linear"):
-        weights3 = variables.Variable(
+        weights3 = variables.VariableV1(
             random_ops.truncated_normal(
                 [32, 10], stddev=1.0 / math.sqrt(float(32))),
             name="weights")
-        biases3 = variables.Variable(array_ops.zeros([10]), name="biases")
+        biases3 = variables.VariableV1(array_ops.zeros([10]), name="biases")
         logits = math_ops.matmul(hidden2, weights3) + biases3
         ops_lib.add_to_collection("logits", logits)
 
@@ -2566,7 +2569,7 @@ class ScopedGraphTest(test.TestCase):
     with graph.as_default():
       # Hidden 2
       with ops_lib.name_scope("hidden2"):
-        weights = variables.Variable(
+        weights = variables.VariableV1(
             random_ops.truncated_normal(
                 [128, 32], stddev=1.0 / math.sqrt(float(128))),
             name="weights")
@@ -2583,16 +2586,16 @@ class ScopedGraphTest(test.TestCase):
           return it + 1, biases
 
         _, biases = control_flow_ops.while_loop(loop_cond, loop_body, [
-            constant_op.constant(0), variables.Variable(array_ops.zeros([32]))
+            constant_op.constant(0), variables.VariableV1(array_ops.zeros([32]))
         ])
         hidden2 = nn_ops.relu(math_ops.matmul(hidden1, weights) + biases)
       # Linear
       with ops_lib.name_scope("softmax_linear"):
-        weights = variables.Variable(
+        weights = variables.VariableV1(
             random_ops.truncated_normal(
                 [32, 10], stddev=1.0 / math.sqrt(float(32))),
             name="weights")
-        biases = variables.Variable(array_ops.zeros([10]), name="biases")
+        biases = variables.VariableV1(array_ops.zeros([10]), name="biases")
         logits = math_ops.matmul(hidden2, weights) + biases
         ops_lib.add_to_collection("logits", logits)
 
@@ -2629,9 +2632,9 @@ class ScopedGraphTest(test.TestCase):
       with ops_lib.name_scope("hidden1"):
         images = constant_op.constant(
             1.0, dtypes.float32, shape=[3, 2], name="images")
-        weights1 = variables.Variable(
+        weights1 = variables.VariableV1(
             [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], name="weights")
-        biases1 = variables.Variable([0.1] * 3, name="biases")
+        biases1 = variables.VariableV1([0.1] * 3, name="biases")
         nn_ops.relu(math_ops.matmul(images, weights1) + biases1, name="relu")
 
     # Run the graph and save scoped checkpoint.
@@ -2685,9 +2688,9 @@ class ScopedGraphTest(test.TestCase):
       with ops_lib.name_scope("hidden1"):
         images = constant_op.constant(
             1.0, dtypes.float32, shape=[3, 2], name="images")
-        weights1 = variables.Variable(
+        weights1 = variables.VariableV1(
             [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], name="weights")
-        biases1 = variables.Variable([0.1] * 3, name="biases")
+        biases1 = variables.VariableV1([0.1] * 3, name="biases")
         nn_ops.relu(math_ops.matmul(images, weights1) + biases1, name="relu")
 
     # Run the graph and save scoped checkpoint.
@@ -2720,12 +2723,12 @@ class ScopedGraphTest(test.TestCase):
     graph = ops_lib.Graph()
     with graph.as_default():
       with ops_lib.name_scope("hidden1"):
-        variable1 = variables.Variable([1.0], name="variable1")
+        variable1 = variables.VariableV1([1.0], name="variable1")
         saver1 = saver_module.Saver(var_list=[variable1])
         graph.add_to_collection(ops_lib.GraphKeys.SAVERS, saver1)
 
       with ops_lib.name_scope("hidden2"):
-        variable2 = variables.Variable([2.0], name="variable2")
+        variable2 = variables.VariableV1([2.0], name="variable2")
       saver2 = saver_module.Saver(var_list=[variable2], name="hidden2/")
       graph.add_to_collection(ops_lib.GraphKeys.SAVERS, saver2)
 
@@ -2978,7 +2981,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
 
     with ops_lib.Graph().as_default() as g:
-      a = variables.Variable(1., name="a")
+      a = variables.VariableV1(1., name="a")
       a_saver = saver_module.Saver([a])
 
       with self.session(graph=g) as sess:
@@ -2986,7 +2989,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
         save_path = a_saver.save(sess=sess, save_path=checkpoint_prefix)
 
     with ops_lib.Graph().as_default() as g:
-      a = variables.Variable([1.], name="a")
+      a = variables.VariableV1([1.], name="a")
       a_saver = saver_module.Saver([a])
       with self.session(graph=g) as sess:
         with self.assertRaisesRegexp(
diff --git a/tensorflow/python/training/server_lib_same_variables_no_clear_test.py b/tensorflow/python/training/server_lib_same_variables_no_clear_test.py
index c7e84e9ba1..5aa7f45c2b 100644
--- a/tensorflow/python/training/server_lib_same_variables_no_clear_test.py
+++ b/tensorflow/python/training/server_lib_same_variables_no_clear_test.py
@@ -37,8 +37,8 @@ class SameVariablesNoClearTest(test.TestCase):
     server = server_lib.Server.create_local_server()
 
     with session.Session(server.target) as sess_1:
-      v0 = variables.Variable([[2, 1]], name="v0")
-      v1 = variables.Variable([[1], [2]], name="v1")
+      v0 = variables.VariableV1([[2, 1]], name="v0")
+      v1 = variables.VariableV1([[1], [2]], name="v1")
       v2 = math_ops.matmul(v0, v1)
       sess_1.run([v0.initializer, v1.initializer])
       self.assertAllEqual([[4]], sess_1.run(v2))
diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py
index 063044f0d0..cf995707fc 100644
--- a/tensorflow/python/training/server_lib_test.py
+++ b/tensorflow/python/training/server_lib_test.py
@@ -76,9 +76,9 @@ class GrpcServerTest(test.TestCase):
   def testResetFails(self):
     # Creates variable with container name.
     with ops.container("test0"):
-      v0 = variables.Variable(1.0, name="v0")
+      v0 = variables.VariableV1(1.0, name="v0")
     # Creates variable with default container.
-    v1 = variables.Variable(2.0, name="v1")
+    v1 = variables.VariableV1(2.0, name="v1")
     # Verifies resetting the non-existent target returns error.
     with self.assertRaises(errors_impl.NotFoundError):
       session.Session.reset("nonexistent", ["test0"])
@@ -234,8 +234,8 @@ class GrpcServerTest(test.TestCase):
           [0.], dtype=dtypes.float32))
       self.assertIsNotNone(input_queue)
 
-      var = variables.Variable(1., dtype=dtypes.float32, trainable=False,
-                               name="var")
+      var = variables.VariableV1(1., dtype=dtypes.float32, trainable=False,
+                                 name="var")
 
       sess.run(variables.global_variables_initializer())
       queue_runner_impl.start_queue_runners(sess)
@@ -245,7 +245,7 @@ class GrpcServerTest(test.TestCase):
     server = self._cached_server
 
     init_value = array_ops.placeholder(dtypes.int32)
-    v = variables.Variable(init_value, validate_shape=False, name="v")
+    v = variables.VariableV1(init_value, validate_shape=False, name="v")
 
     sharing_config = config_pb2.ConfigProto(isolate_session_state=False)
     sharing_sess_0 = session.Session(server.target, config=sharing_config)
@@ -302,7 +302,7 @@ class GrpcServerTest(test.TestCase):
     isolate_config = config_pb2.ConfigProto(isolate_session_state=True)
 
     with ops.Graph().as_default():
-      w_vector = variables.Variable([1, 2, 3], name="w")
+      w_vector = variables.VariableV1([1, 2, 3], name="w")
       with session.Session(server.target, config=sharing_config) as sess:
         with self.assertRaises(errors_impl.FailedPreconditionError):
           sess.run(w_vector)
@@ -310,20 +310,20 @@ class GrpcServerTest(test.TestCase):
         self.assertAllEqual([1, 2, 3], sess.run(w_vector))
 
     with ops.Graph().as_default():
-      w_vector = variables.Variable([4, 5, 6], name="w")
+      w_vector = variables.VariableV1([4, 5, 6], name="w")
       with session.Session(server.target, config=sharing_config) as sess:
         self.assertAllEqual([1, 2, 3], sess.run(w_vector))
         sess.run(w_vector.initializer)
         self.assertAllEqual([4, 5, 6], sess.run(w_vector))
 
     with ops.Graph().as_default():
-      w_scalar = variables.Variable(86, name="w")
+      w_scalar = variables.VariableV1(86, name="w")
       with session.Session(server.target, config=sharing_config) as sess:
         with self.assertRaises(errors_impl.InvalidArgumentError):
           sess.run(w_scalar.initializer)
 
     with ops.Graph().as_default():
-      w_scalar = variables.Variable(37, name="w")
+      w_scalar = variables.VariableV1(37, name="w")
       with session.Session(server.target, config=isolate_config) as sess:
         with self.assertRaises(errors_impl.FailedPreconditionError):
           sess.run(w_scalar)
diff --git a/tensorflow/python/training/session_manager_test.py b/tensorflow/python/training/session_manager_test.py
index f1d18f7704..2b5c3b01de 100644
--- a/tensorflow/python/training/session_manager_test.py
+++ b/tensorflow/python/training/session_manager_test.py
@@ -40,7 +40,7 @@ class SessionManagerTest(test.TestCase):
 
   def testPrepareSessionSucceeds(self):
     with ops.Graph().as_default():
-      v = variables.Variable([1.0, 2.0, 3.0], name="v")
+      v = variables.VariableV1([1.0, 2.0, 3.0], name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.report_uninitialized_variables())
       sess = sm.prepare_session(
@@ -50,7 +50,7 @@ class SessionManagerTest(test.TestCase):
   def testPrepareSessionSucceedsWithInitFeedDict(self):
     with ops.Graph().as_default():
       p = array_ops.placeholder(dtypes.float32, shape=(3,))
-      v = variables.Variable(p, name="v")
+      v = variables.VariableV1(p, name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.report_uninitialized_variables())
       sess = sm.prepare_session(
@@ -61,7 +61,7 @@ class SessionManagerTest(test.TestCase):
 
   def testPrepareSessionSucceedsWithInitFn(self):
     with ops.Graph().as_default():
-      v = variables.Variable([125], name="v")
+      v = variables.VariableV1([125], name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.report_uninitialized_variables())
       sess = sm.prepare_session(
@@ -79,7 +79,7 @@ class SessionManagerTest(test.TestCase):
     gfile.MakeDirs(checkpoint_dir)
 
     with ops.Graph().as_default():
-      v = variables.Variable([1.0, 2.0, 3.0], name="v")
+      v = variables.VariableV1([1.0, 2.0, 3.0], name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.report_uninitialized_variables())
       saver = saver_lib.Saver({"v": v})
@@ -97,7 +97,7 @@ class SessionManagerTest(test.TestCase):
       # Renames the checkpoint directory.
       os.rename(checkpoint_dir, checkpoint_dir2)
       gfile.MakeDirs(checkpoint_dir)
-      v = variables.Variable([6.0, 7.0, 8.0], name="v")
+      v = variables.VariableV1([6.0, 7.0, 8.0], name="v")
       with self.cached_session():
         self.assertEqual(False, variables.is_variable_initialized(v).eval())
       session_manager.SessionManager(
@@ -134,7 +134,7 @@ class SessionManagerTest(test.TestCase):
                                checkpoint_filename_with_path=None):
     # Create a new Graph and SessionManager and recover from a checkpoint.
     with ops.Graph().as_default():
-      v = variables.Variable(2, name="v")
+      v = variables.VariableV1(2, name="v")
       with session_lib.Session():
         self.assertEqual(False, variables.is_variable_initialized(v).eval())
       sm2 = session_manager.SessionManager(
@@ -162,7 +162,7 @@ class SessionManagerTest(test.TestCase):
     gfile.MakeDirs(checkpoint_dir)
 
     with ops.Graph().as_default():
-      v = variables.Variable(1, name="v")
+      v = variables.VariableV1(1, name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.report_uninitialized_variables())
       saver = saver_lib.Saver({"v": v})
@@ -186,7 +186,7 @@ class SessionManagerTest(test.TestCase):
 
   def testWaitForSessionReturnsNoneAfterTimeout(self):
     with ops.Graph().as_default():
-      variables.Variable(1, name="v")
+      variables.VariableV1(1, name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.report_uninitialized_variables(),
           recovery_wait_secs=1)
@@ -217,7 +217,7 @@ class SessionManagerTest(test.TestCase):
     gfile.MakeDirs(checkpoint_dir)
 
     with ops.Graph().as_default():
-      v = variables.Variable(1, name="v")
+      v = variables.VariableV1(1, name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.report_uninitialized_variables())
       saver = saver_lib.Saver({"v": v})
@@ -230,8 +230,8 @@ class SessionManagerTest(test.TestCase):
                  os.path.join(checkpoint_dir, "recover_session_checkpoint"))
     # Create a new Graph and SessionManager and recover.
     with ops.Graph().as_default():
-      v = variables.Variable(2, name="v")
-      w = variables.Variable(
+      v = variables.VariableV1(2, name="v")
+      w = variables.VariableV1(
           v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -275,7 +275,7 @@ class SessionManagerTest(test.TestCase):
     gfile.MakeDirs(checkpoint_dir)
 
     with ops.Graph().as_default():
-      v = variables.Variable(1, name="v")
+      v = variables.VariableV1(1, name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.report_uninitialized_variables())
       saver = saver_lib.Saver({"v": v})
@@ -288,8 +288,8 @@ class SessionManagerTest(test.TestCase):
                  os.path.join(checkpoint_dir, "recover_session_checkpoint"))
     # Create a new Graph and SessionManager and recover.
     with ops.Graph().as_default():
-      v = variables.Variable(2, name="v")
-      w = variables.Variable(
+      v = variables.VariableV1(2, name="v")
+      w = variables.VariableV1(
           v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -321,7 +321,7 @@ class SessionManagerTest(test.TestCase):
     # local_init_op exactly once, regardless of whether the session was
     # successfully recovered.
     with ops.Graph().as_default():
-      w = variables.Variable(
+      w = variables.VariableV1(
           1,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -356,8 +356,8 @@ class SessionManagerTest(test.TestCase):
 
     # Create a new Graph and SessionManager and recover.
     with ops.Graph().as_default():
-      v = variables.Variable(2, name="v")
-      w = variables.Variable(
+      v = variables.VariableV1(2, name="v")
+      w = variables.VariableV1(
           1,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -389,8 +389,8 @@ class SessionManagerTest(test.TestCase):
   def testWaitForSessionLocalInit(self):
     server = server_lib.Server.create_local_server()
     with ops.Graph().as_default() as graph:
-      v = variables.Variable(1, name="v")
-      w = variables.Variable(
+      v = variables.VariableV1(1, name="v")
+      w = variables.VariableV1(
           v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -420,8 +420,8 @@ class SessionManagerTest(test.TestCase):
 
   def testWaitForSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
     with ops.Graph().as_default() as graph:
-      v = variables.Variable(1, name="v")
-      w = variables.Variable(
+      v = variables.VariableV1(1, name="v")
+      w = variables.VariableV1(
           v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -439,8 +439,8 @@ class SessionManagerTest(test.TestCase):
 
   def testWaitForSessionInsufficientReadyForLocalInitCheck(self):
     with ops.Graph().as_default() as graph:
-      v = variables.Variable(1, name="v")
-      w = variables.Variable(
+      v = variables.VariableV1(1, name="v")
+      w = variables.VariableV1(
           v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -456,13 +456,13 @@ class SessionManagerTest(test.TestCase):
 
   def testPrepareSessionWithReadyForLocalInitOp(self):
     with ops.Graph().as_default():
-      v = variables.Variable(1, name="v")
-      w = variables.Variable(
+      v = variables.VariableV1(1, name="v")
+      w = variables.VariableV1(
           v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
           name="w")
-      x = variables.Variable(
+      x = variables.VariableV1(
           3 * v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -495,25 +495,25 @@ class SessionManagerTest(test.TestCase):
 
   def testPrepareSessionWithPartialInitOp(self):
     with ops.Graph().as_default():
-      v = variables.Variable(1, name="v")
-      w = variables.Variable(
+      v = variables.VariableV1(1, name="v")
+      w = variables.VariableV1(
           v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
           name="w")
-      x = variables.Variable(
+      x = variables.VariableV1(
           3 * v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
           name="x")
       # TODO(b/70206927): Use ResourceVariables once they are handled properly.
-      v_res = variables.Variable(1, name="v_res")
-      w_res = variables.Variable(
+      v_res = variables.VariableV1(1, name="v_res")
+      w_res = variables.VariableV1(
           v_res,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
           name="w_res")
-      x_res = variables.Variable(
+      x_res = variables.VariableV1(
           3 * v_res,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -565,7 +565,7 @@ class SessionManagerTest(test.TestCase):
     # cyclic dependencies.
     with ops.Graph().as_default():
       i = control_flow_ops.while_loop(lambda i: i < 1, lambda i: i + 1, [0])
-      v = variables.Variable(array_ops.identity(i), name="v")
+      v = variables.VariableV1(array_ops.identity(i), name="v")
       with self.cached_session():
         self.assertEqual(False, variables.is_variable_initialized(v).eval())
       sm = session_manager.SessionManager(
@@ -579,8 +579,8 @@ class SessionManagerTest(test.TestCase):
 
   def testPrepareSessionDidNotInitLocalVariable(self):
     with ops.Graph().as_default():
-      v = variables.Variable(1, name="v")
-      w = variables.Variable(
+      v = variables.VariableV1(1, name="v")
+      w = variables.VariableV1(
           v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -596,8 +596,8 @@ class SessionManagerTest(test.TestCase):
 
   def testPrepareSessionDidNotInitLocalVariableList(self):
     with ops.Graph().as_default():
-      v = variables.Variable(1, name="v")
-      w = variables.Variable(
+      v = variables.VariableV1(1, name="v")
+      w = variables.VariableV1(
           v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -613,8 +613,8 @@ class SessionManagerTest(test.TestCase):
 
   def testPrepareSessionWithReadyNotReadyForLocal(self):
     with ops.Graph().as_default():
-      v = variables.Variable(1, name="v")
-      w = variables.Variable(
+      v = variables.VariableV1(1, name="v")
+      w = variables.VariableV1(
           v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -634,8 +634,8 @@ class SessionManagerTest(test.TestCase):
 
   def testPrepareSessionWithInsufficientReadyForLocalInitCheck(self):
     with ops.Graph().as_default():
-      v = variables.Variable(1, name="v")
-      w = variables.Variable(
+      v = variables.VariableV1(1, name="v")
+      w = variables.VariableV1(
           v,
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -656,7 +656,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
 
   def testPrepareSessionSucceeds(self):
     with ops.Graph().as_default():
-      v = variables.Variable([1.0, 2.0, 3.0], name="v")
+      v = variables.VariableV1([1.0, 2.0, 3.0], name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.assert_variables_initialized())
       sess = sm.prepare_session(
@@ -666,7 +666,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
   def testPrepareSessionSucceedsWithInitFeedDict(self):
     with ops.Graph().as_default():
       p = array_ops.placeholder(dtypes.float32, shape=(3,))
-      v = variables.Variable(p, name="v")
+      v = variables.VariableV1(p, name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.assert_variables_initialized())
       sess = sm.prepare_session(
@@ -677,7 +677,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
 
   def testPrepareSessionSucceedsWithInitFn(self):
     with ops.Graph().as_default():
-      v = variables.Variable([125], name="v")
+      v = variables.VariableV1([125], name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.assert_variables_initialized())
       sess = sm.prepare_session(
@@ -695,7 +695,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
     gfile.MakeDirs(checkpoint_dir)
 
     with ops.Graph().as_default():
-      v = variables.Variable([1.0, 2.0, 3.0], name="v")
+      v = variables.VariableV1([1.0, 2.0, 3.0], name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.assert_variables_initialized())
       saver = saver_lib.Saver({"v": v})
@@ -713,7 +713,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
       # Renames the checkpoint directory.
       os.rename(checkpoint_dir, checkpoint_dir2)
       gfile.MakeDirs(checkpoint_dir)
-      v = variables.Variable([6.0, 7.0, 8.0], name="v")
+      v = variables.VariableV1([6.0, 7.0, 8.0], name="v")
       with self.cached_session():
         self.assertEqual(False, variables.is_variable_initialized(v).eval())
       session_manager.SessionManager(
@@ -755,7 +755,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
     gfile.MakeDirs(checkpoint_dir)
 
     with ops.Graph().as_default():
-      v = variables.Variable(1, name="v")
+      v = variables.VariableV1(1, name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.assert_variables_initialized())
       saver = saver_lib.Saver({"v": v})
@@ -768,7 +768,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
                  os.path.join(checkpoint_dir, "recover_session_checkpoint"))
     # Create a new Graph and SessionManager and recover.
     with ops.Graph().as_default():
-      v = variables.Variable(2, name="v")
+      v = variables.VariableV1(2, name="v")
       with self.cached_session():
         self.assertEqual(False, variables.is_variable_initialized(v).eval())
       sm2 = session_manager.SessionManager(
@@ -785,7 +785,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
 
   def testWaitForSessionReturnsNoneAfterTimeout(self):
     with ops.Graph().as_default():
-      variables.Variable(1, name="v")
+      variables.VariableV1(1, name="v")
       sm = session_manager.SessionManager(
           ready_op=variables.assert_variables_initialized(),
           recovery_wait_secs=1)
diff --git a/tensorflow/python/training/supervisor_test.py b/tensorflow/python/training/supervisor_test.py
index caf6eba3e0..7cd99d8680 100644
--- a/tensorflow/python/training/supervisor_test.py
+++ b/tensorflow/python/training/supervisor_test.py
@@ -423,7 +423,7 @@ class SupervisorTest(test.TestCase):
   def testLogdirButExplicitlyNoSummaryWriter(self):
     logdir = self._test_dir("explicit_no_summary_writer")
     with ops.Graph().as_default():
-      variables.Variable([1.0], name="foo")
+      variables.VariableV1([1.0], name="foo")
       summary.scalar("c1", constant_op.constant(1))
       summary.scalar("c2", constant_op.constant(2))
       summary.scalar("c3", constant_op.constant(3))
@@ -491,7 +491,7 @@ class SupervisorTest(test.TestCase):
 
   def testNoLogdirSucceeds(self):
     with ops.Graph().as_default():
-      variables.Variable([1.0, 2.0, 3.0])
+      variables.VariableV1([1.0, 2.0, 3.0])
       sv = supervisor.Supervisor(logdir="", summary_op=None)
       sess = sv.prepare_or_wait_for_session("")
       sess.close()
@@ -499,7 +499,7 @@ class SupervisorTest(test.TestCase):
 
   def testUseSessionManager(self):
     with ops.Graph().as_default():
-      variables.Variable([1.0, 2.0, 3.0])
+      variables.VariableV1([1.0, 2.0, 3.0])
       sm = session_manager_lib.SessionManager()
       # Pass in session_manager. The additional init_op is ignored.
       sv = supervisor.Supervisor(logdir="", session_manager=sm)
@@ -508,7 +508,7 @@ class SupervisorTest(test.TestCase):
   def testInitOp(self):
     logdir = self._test_dir("default_init_op")
     with ops.Graph().as_default():
-      v = variables.Variable([1.0, 2.0, 3.0])
+      v = variables.VariableV1([1.0, 2.0, 3.0])
       sv = supervisor.Supervisor(logdir=logdir)
       sess = sv.prepare_or_wait_for_session("")
       self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
@@ -517,7 +517,7 @@ class SupervisorTest(test.TestCase):
   def testInitFn(self):
     logdir = self._test_dir("default_init_op")
     with ops.Graph().as_default():
-      v = variables.Variable([1.0, 2.0, 3.0])
+      v = variables.VariableV1([1.0, 2.0, 3.0])
 
       def _init_fn(sess):
         sess.run(v.initializer)
@@ -531,7 +531,7 @@ class SupervisorTest(test.TestCase):
     logdir = self._test_dir("feed_dict_init_op")
     with ops.Graph().as_default():
       p = array_ops.placeholder(dtypes.float32, shape=(3,))
-      v = variables.Variable(p, name="v")
+      v = variables.VariableV1(p, name="v")
       sv = supervisor.Supervisor(
           logdir=logdir,
           init_op=variables.global_variables_initializer(),
@@ -550,10 +550,10 @@ class SupervisorTest(test.TestCase):
       g = ops.Graph()
       with g.as_default():
         with ops.device("/job:local"):
-          v = variables.Variable(
+          v = variables.VariableV1(
               1, name="default_ready_for_local_init_op_v_" + str(uid))
           vadd = v.assign_add(1)
-          w = variables.Variable(
+          w = variables.VariableV1(
               v,
               trainable=False,
               collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -590,7 +590,7 @@ class SupervisorTest(test.TestCase):
 
     # Create a checkpoint.
     with ops.Graph().as_default():
-      v = variables.Variable(
+      v = variables.VariableV1(
           10.0, name="ready_for_local_init_op_restore_v_" + str(uid))
       summary.scalar("ready_for_local_init_op_restore_v_" + str(uid), v)
       sv = supervisor.Supervisor(logdir=logdir)
@@ -607,10 +607,10 @@ class SupervisorTest(test.TestCase):
       g = ops.Graph()
       with g.as_default():
         with ops.device("/job:local"):
-          v = variables.Variable(
+          v = variables.VariableV1(
               1.0, name="ready_for_local_init_op_restore_v_" + str(uid))
           vadd = v.assign_add(1)
-          w = variables.Variable(
+          w = variables.VariableV1(
               v,
               trainable=False,
               collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -642,13 +642,13 @@ class SupervisorTest(test.TestCase):
     logdir = self._test_dir("default_local_init_op")
     with ops.Graph().as_default():
       # A local variable.
-      v = variables.Variable(
+      v = variables.VariableV1(
           [1.0, 2.0, 3.0],
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES])
 
       # An entity which is initialized through a TABLE_INITIALIZER.
-      w = variables.Variable([4, 5, 6], trainable=False, collections=[])
+      w = variables.VariableV1([4, 5, 6], trainable=False, collections=[])
       ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, w.initializer)
 
       # This shouldn't add a variable to the VARIABLES collection responsible
@@ -668,7 +668,7 @@ class SupervisorTest(test.TestCase):
     with ops.Graph().as_default():
       with ops.device("/job:localhost"):
         # A local variable.
-        v = variables.Variable(
+        v = variables.VariableV1(
             [1.0, 2.0, 3.0],
             trainable=False,
             collections=[ops.GraphKeys.LOCAL_VARIABLES])
@@ -687,8 +687,8 @@ class SupervisorTest(test.TestCase):
     server = server_lib.Server.create_local_server()
     logdir = self._test_dir("default_init_op_fails")
     with ops.Graph().as_default():
-      v = variables.Variable([1.0, 2.0, 3.0], name="v")
-      variables.Variable([4.0, 5.0, 6.0], name="w")
+      v = variables.VariableV1([1.0, 2.0, 3.0], name="v")
+      variables.VariableV1([4.0, 5.0, 6.0], name="w")
       # w will not be initialized.
       sv = supervisor.Supervisor(logdir=logdir, init_op=v.initializer)
       with self.assertRaisesRegexp(RuntimeError,
@@ -699,11 +699,11 @@ class SupervisorTest(test.TestCase):
     server = server_lib.Server.create_local_server()
     logdir = self._test_dir("default_init_op_fails_for_local_variable")
     with ops.Graph().as_default():
-      v = variables.Variable(
+      v = variables.VariableV1(
           [1.0, 2.0, 3.0],
           name="v",
           collections=[ops.GraphKeys.LOCAL_VARIABLES])
-      variables.Variable(
+      variables.VariableV1(
           [1.0, 2.0, 3.0],
           name="w",
           collections=[ops.GraphKeys.LOCAL_VARIABLES])
@@ -716,17 +716,17 @@ class SupervisorTest(test.TestCase):
   def testSetupFail(self):
     logdir = self._test_dir("setup_fail")
     with ops.Graph().as_default():
-      variables.Variable([1.0, 2.0, 3.0], name="v")
+      variables.VariableV1([1.0, 2.0, 3.0], name="v")
       with self.assertRaisesRegexp(ValueError, "must have their device set"):
         supervisor.Supervisor(logdir=logdir, is_chief=False)
     with ops.Graph().as_default(), ops.device("/job:ps"):
-      variables.Variable([1.0, 2.0, 3.0], name="v")
+      variables.VariableV1([1.0, 2.0, 3.0], name="v")
       supervisor.Supervisor(logdir=logdir, is_chief=False)
 
   def testDefaultGlobalStep(self):
     logdir = self._test_dir("default_global_step")
     with ops.Graph().as_default():
-      variables.Variable(287, name="global_step")
+      variables.VariableV1(287, name="global_step")
       sv = supervisor.Supervisor(logdir=logdir)
       sess = sv.prepare_or_wait_for_session("")
       self.assertEquals(287, sess.run(sv.global_step))
@@ -735,7 +735,7 @@ class SupervisorTest(test.TestCase):
   def testRestoreFromMetaGraph(self):
     logdir = self._test_dir("restore_from_meta_graph")
     with ops.Graph().as_default():
-      variables.Variable(1, name="v0")
+      variables.VariableV1(1, name="v0")
       sv = supervisor.Supervisor(logdir=logdir)
       sess = sv.prepare_or_wait_for_session("")
       filename = sv.saver.save(sess, sv.save_path)
@@ -757,7 +757,7 @@ class SupervisorTest(test.TestCase):
     logdir = self._test_dir("standard_services_without_global_step")
     # Create a checkpoint.
     with ops.Graph().as_default():
-      v = variables.Variable([1.0], name="foo")
+      v = variables.VariableV1([1.0], name="foo")
       summary.scalar("v", v[0])
       sv = supervisor.Supervisor(logdir=logdir)
       meta_graph_def = meta_graph.create_meta_graph_def(
@@ -796,7 +796,7 @@ class SupervisorTest(test.TestCase):
     self.assertRaises(StopIteration, lambda: next(rr))
     # There should be a checkpoint file with the variable "foo"
     with ops.Graph().as_default(), self.cached_session() as sess:
-      v = variables.Variable([10.10], name="foo")
+      v = variables.VariableV1([10.10], name="foo")
       sav = saver_lib.Saver([v])
       sav.restore(sess, save_path)
       self.assertEqual(1.0, v.eval()[0])
@@ -807,7 +807,7 @@ class SupervisorTest(test.TestCase):
     logdir = self._test_dir("standard_services_with_global_step")
     # Create a checkpoint.
     with ops.Graph().as_default():
-      v = variables.Variable([123], name="global_step")
+      v = variables.VariableV1([123], name="global_step")
       sv = supervisor.Supervisor(logdir=logdir)
       meta_graph_def = meta_graph.create_meta_graph_def(
           saver_def=sv.saver.saver_def)
@@ -860,7 +860,7 @@ class SupervisorTest(test.TestCase):
     self.assertRaises(StopIteration, lambda: next(rr))
     # There should be a checkpoint file with the variable "foo"
     with ops.Graph().as_default(), self.cached_session() as sess:
-      v = variables.Variable([-12], name="global_step")
+      v = variables.VariableV1([-12], name="global_step")
       sav = saver_lib.Saver([v])
       sav.restore(sess, save_path)
       self.assertEqual(123, v.eval()[0])
diff --git a/tensorflow/python/training/sync_replicas_optimizer_test.py b/tensorflow/python/training/sync_replicas_optimizer_test.py
index fff17402e2..1ef8756ef6 100644
--- a/tensorflow/python/training/sync_replicas_optimizer_test.py
+++ b/tensorflow/python/training/sync_replicas_optimizer_test.py
@@ -40,11 +40,12 @@ def get_workers(num_workers, replicas_to_aggregate, workers):
     is_chief = (worker_id == 0)
     with graph.as_default():
       with ops.device("/job:ps/task:0"):
-        global_step = variables.Variable(0, name="global_step", trainable=False)
-        var_0 = variables.Variable(0.0, name="v0")
+        global_step = variables.VariableV1(
+            0, name="global_step", trainable=False)
+        var_0 = variables.VariableV1(0.0, name="v0")
       with ops.device("/job:ps/task:1"):
-        var_1 = variables.Variable(1.0, name="v1")
-        var_sparse = variables.Variable([[3.0], [4.0]], name="v_sparse")
+        var_1 = variables.VariableV1(1.0, name="v1")
+        var_sparse = variables.VariableV1([[3.0], [4.0]], name="v_sparse")
 
       with ops.device("/job:worker/task:" + str(worker_id)):
         grads_0 = constant_op.constant(0.1 + worker_id * 0.2)
@@ -272,8 +273,8 @@ class SyncReplicasOptimizerHookTest(test.TestCase):
         replicas_to_aggregate=1,
         total_num_replicas=1)
     hook = opt.make_session_run_hook(True)
-    v = variables.Variable([0.])
-    global_step = variables.Variable(0, name="global_step", trainable=False)
+    v = variables.VariableV1([0.])
+    global_step = variables.VariableV1(0, name="global_step", trainable=False)
     opt.minimize(v, global_step=global_step)
     hook.begin()
 
@@ -282,8 +283,8 @@ class SyncReplicasOptimizerHookTest(test.TestCase):
         opt=adam.AdamOptimizer(0.01),
         replicas_to_aggregate=1,
         total_num_replicas=1)
-    v = variables.Variable([0.], name="fetch_variable_test")
-    global_step = variables.Variable(0, name="global_step", trainable=False)
+    v = variables.VariableV1([0.], name="fetch_variable_test")
+    global_step = variables.VariableV1(0, name="global_step", trainable=False)
     opt.minimize(v, global_step=global_step)
     opt_variables = opt.variables()
     beta1_power, beta2_power = opt._opt._get_beta_accumulators()
diff --git a/tensorflow/python/training/training_ops_test.py b/tensorflow/python/training/training_ops_test.py
index d131a11067..f410ceaaff 100644
--- a/tensorflow/python/training/training_ops_test.py
+++ b/tensorflow/python/training/training_ops_test.py
@@ -51,7 +51,7 @@ class TrainingOpsTest(TensorFlowTestCase):
   def _testTypes(self, x, alpha, delta, use_gpu=None):
     self.setUp()
     with self.test_session(use_gpu=use_gpu):
-      var = variables.Variable(x)
+      var = variables.VariableV1(x)
       variables.global_variables_initializer().run()
       self.assertAllCloseAccordingToType(x, var.eval())
       apply_sgd = training_ops.apply_gradient_descent(var, alpha, delta)
@@ -70,8 +70,8 @@ class TrainingOpsTest(TensorFlowTestCase):
   def _testTypesForAdagrad(self, x, y, lr, grad, use_gpu=None):
     self.setUp()
     with self.test_session(use_gpu=use_gpu):
-      var = variables.Variable(x)
-      accum = variables.Variable(y)
+      var = variables.VariableV1(x)
+      accum = variables.VariableV1(y)
       variables.global_variables_initializer().run()
 
       self.assertAllCloseAccordingToType(x, var.eval())
@@ -94,9 +94,9 @@ class TrainingOpsTest(TensorFlowTestCase):
                         lr_power=-0.5):
     self.setUp()
     with self.test_session(use_gpu=use_gpu):
-      var = variables.Variable(x)
-      accum = variables.Variable(y)
-      linear = variables.Variable(z)
+      var = variables.VariableV1(x)
+      accum = variables.VariableV1(y)
+      linear = variables.VariableV1(z)
       variables.global_variables_initializer().run()
 
       self.assertAllCloseAccordingToType(x, var.eval())
@@ -148,8 +148,8 @@ class TrainingOpsTest(TensorFlowTestCase):
   def _testTypesForSparseAdagrad(self, x, y, lr, grad, indices):
     self.setUp()
     with self.test_session(use_gpu=False):
-      var = variables.Variable(x)
-      accum = variables.Variable(y)
+      var = variables.VariableV1(x)
+      accum = variables.VariableV1(y)
       variables.global_variables_initializer().run()
 
       self.assertAllCloseAccordingToType(x, var.eval())
@@ -178,9 +178,9 @@ class TrainingOpsTest(TensorFlowTestCase):
                               lr_power=-0.5):
     self.setUp()
     with self.test_session(use_gpu=False):
-      var = variables.Variable(x)
-      accum = variables.Variable(y)
-      linear = variables.Variable(z)
+      var = variables.VariableV1(x)
+      accum = variables.VariableV1(y)
+      linear = variables.VariableV1(z)
       variables.global_variables_initializer().run()
 
       self.assertAllCloseAccordingToType(x, var.eval())
@@ -257,9 +257,9 @@ class TrainingOpsTest(TensorFlowTestCase):
   def _testTypesForAdam(self, var, m, v, grad, use_gpu):
     self.setUp()
     with self.test_session(use_gpu=use_gpu):
-      var_t = variables.Variable(var)
-      m_t = variables.Variable(m)
-      v_t = variables.Variable(v)
+      var_t = variables.VariableV1(var)
+      m_t = variables.VariableV1(m)
+      v_t = variables.VariableV1(v)
 
       t = 1
       beta1 = np.array(0.9, dtype=var.dtype)
@@ -270,8 +270,8 @@ class TrainingOpsTest(TensorFlowTestCase):
       epsilon = np.array(1e-8, dtype=var.dtype)
       beta1_t = constant_op.constant(beta1, self._toType(var.dtype), [])
       beta2_t = constant_op.constant(beta2, self._toType(var.dtype), [])
-      beta1_power_t = variables.Variable(beta1_power)
-      beta2_power_t = variables.Variable(beta2_power)
+      beta1_power_t = variables.VariableV1(beta1_power)
+      beta2_power_t = variables.VariableV1(beta2_power)
       lr_t = constant_op.constant(lr, self._toType(var.dtype), [])
       epsilon_t = constant_op.constant(epsilon, self._toType(var.dtype), [])
       variables.global_variables_initializer().run()
diff --git a/tensorflow/python/training/training_util_test.py b/tensorflow/python/training/training_util_test.py
index 6cc177e0e8..ba64e785ac 100644
--- a/tensorflow/python/training/training_util_test.py
+++ b/tensorflow/python/training/training_util_test.py
@@ -49,7 +49,7 @@ class GlobalStepTest(test.TestCase):
   def test_invalid_shape(self):
     with ops.Graph().as_default() as g:
       self.assertIsNone(training_util.get_global_step())
-      variables.Variable(
+      variables.VariableV1(
           [0],
           trainable=False,
           dtype=dtypes.int32,
@@ -73,7 +73,7 @@ class GlobalStepTest(test.TestCase):
   def test_get_global_step(self):
     with ops.Graph().as_default() as g:
       self.assertIsNone(training_util.get_global_step())
-      variables.Variable(
+      variables.VariableV1(
           0,
           trainable=False,
           dtype=dtypes.int32,
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
index 05698b03ee..af7fc9d4ef 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
@@ -1,5 +1,6 @@
 path: "tensorflow.Variable"
 tf_class {
+  is_instance: "<class \'tensorflow.python.ops.variables.VariableV1\'>"
   is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 503e145a91..509ceff9df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -2220,6 +2220,10 @@ tf_module {
     name: "variable_axis_size_partitioner"
     argspec: "args=[\'max_shard_bytes\', \'axis\', \'bytes_per_string_element\', \'max_shards\'], varargs=None, keywords=None, defaults=[\'0\', \'16\', \'None\'], "
   }
+  member_method {
+    name: "variable_creator_scope"
+    argspec: "args=[\'variable_creator\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "variable_op_scope"
     argspec: "args=[\'values\', \'name_or_scope\', \'default_name\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'reuse\', \'dtype\', \'use_resource\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable-scope.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable-scope.pbtxt
deleted file mode 100644
index c13eb7b8bb..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-variable-scope.pbtxt
+++ /dev/null
@@ -1,105 +0,0 @@
-path: "tensorflow.VariableScope"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.variable_scope.VariableScope\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "caching_device"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "constraint"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "custom_getter"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "initializer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "original_name_scope"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "partitioner"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reuse"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "use_resource"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'reuse\', \'name\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'name_scope\', \'dtype\', \'use_resource\', \'constraint\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\', \'None\', \'None\', \'None\', \'\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_collection"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_variable"
-    argspec: "args=[\'self\', \'var_store\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'reuse\', \'trainable\', \'collections\', \'caching_device\', \'partitioner\', \'validate_shape\', \'use_resource\', \'custom_getter\', \'constraint\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
-  }
-  member_method {
-    name: "global_variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "local_variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reuse_variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_caching_device"
-    argspec: "args=[\'self\', \'caching_device\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_custom_getter"
-    argspec: "args=[\'self\', \'custom_getter\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_dtype"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_initializer"
-    argspec: "args=[\'self\', \'initializer\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_partitioner"
-    argspec: "args=[\'self\', \'partitioner\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_regularizer"
-    argspec: "args=[\'self\', \'regularizer\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_use_resource"
-    argspec: "args=[\'self\', \'use_resource\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "trainable_variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable.-save-slice-info.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.-save-slice-info.pbtxt
deleted file mode 100644
index ac3ccd468b..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-variable.-save-slice-info.pbtxt
+++ /dev/null
@@ -1,17 +0,0 @@
-path: "tensorflow.Variable.SaveSliceInfo"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.variables.SaveSliceInfo\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "spec"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'full_name\', \'full_shape\', \'var_offset\', \'var_shape\', \'save_slice_info_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "to_proto"
-    argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
deleted file mode 100644
index 05698b03ee..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
+++ /dev/null
@@ -1,130 +0,0 @@
-path: "tensorflow.Variable"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "SaveSliceInfo"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "constraint"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "device"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "initial_value"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "initializer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'initial_value\', \'trainable\', \'collections\', \'validate_shape\', \'caching_device\', \'name\', \'variable_def\', \'dtype\', \'expected_shape\', \'import_scope\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
-  }
-  member_method {
-    name: "assign"
-    argspec: "args=[\'self\', \'value\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
-  }
-  member_method {
-    name: "assign_add"
-    argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
-  }
-  member_method {
-    name: "assign_sub"
-    argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
-  }
-  member_method {
-    name: "count_up_to"
-    argspec: "args=[\'self\', \'limit\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "eval"
-    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "from_proto"
-    argspec: "args=[\'variable_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_shape"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "initialized_value"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "load"
-    argspec: "args=[\'self\', \'value\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "read_value"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "scatter_add"
-    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "scatter_nd_add"
-    argspec: "args=[\'self\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "scatter_nd_sub"
-    argspec: "args=[\'self\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "scatter_nd_update"
-    argspec: "args=[\'self\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "scatter_sub"
-    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "scatter_update"
-    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "set_shape"
-    argspec: "args=[\'self\', \'shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "to_proto"
-    argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "value"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
index d499c67d89..e3c63fe737 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
@@ -48,10 +48,6 @@ tf_module {
     name: "zeros"
     mtype: "<type \'type\'>"
   }
-  member_method {
-    name: "global_variables"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "he_normal"
     argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -68,12 +64,4 @@ tf_module {
     name: "lecun_uniform"
     argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "local_variables"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'init\'], "
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 96212f5528..d2dc8bc85f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -1,9 +1,5 @@
 path: "tensorflow"
 tf_module {
-  member {
-    name: "AUTO_REUSE"
-    mtype: "<enum \'_ReuseMode\'>"
-  }
   member {
     name: "AggregationMethod"
     mtype: "<type \'type\'>"
@@ -232,18 +228,10 @@ tf_module {
     name: "VarLenFeature"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "Variable"
-    mtype: "<class \'tensorflow.python.ops.variables.VariableMetaclass\'>"
-  }
   member {
     name: "VariableAggregation"
     mtype: "<class \'enum.EnumMeta\'>"
   }
-  member {
-    name: "VariableScope"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "VariableSynchronization"
     mtype: "<class \'enum.EnumMeta\'>"
@@ -552,10 +540,6 @@ tf_module {
     name: "user_ops"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "variable_scope"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "variance_scaling_initializer"
     mtype: "<type \'type\'>"
@@ -616,10 +600,6 @@ tf_module {
     name: "add_to_collections"
     argspec: "args=[\'names\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "all_variables"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "angle"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -732,10 +712,6 @@ tf_module {
     name: "assert_type"
     argspec: "args=[\'tensor\', \'tf_type\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "assert_variables_initialized"
-    argspec: "args=[\'var_list\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "atan"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1136,10 +1112,6 @@ tf_module {
     name: "get_default_session"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_local_variable"
-    argspec: "args=[\'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'collections\', \'caching_device\', \'partitioner\', \'validate_shape\', \'use_resource\', \'custom_getter\', \'constraint\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
-  }
   member_method {
     name: "get_seed"
     argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
@@ -1152,26 +1124,10 @@ tf_module {
     name: "get_session_tensor"
     argspec: "args=[\'handle\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "get_variable"
-    argspec: "args=[\'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'collections\', \'caching_device\', \'partitioner\', \'validate_shape\', \'use_resource\', \'custom_getter\', \'constraint\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
-  }
-  member_method {
-    name: "get_variable_scope"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "global_norm"
     argspec: "args=[\'t_list\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "global_variables"
-    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "global_variables_initializer"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "gradients"
     argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\', \'None\'], "
@@ -1248,18 +1204,6 @@ tf_module {
     name: "initialize_all_tables"
     argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], "
   }
-  member_method {
-    name: "initialize_all_variables"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "initialize_local_variables"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "initialize_variables"
-    argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'init\'], "
-  }
   member_method {
     name: "invert_permutation"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1288,10 +1232,6 @@ tf_module {
     name: "is_strictly_increasing"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "is_variable_initialized"
-    argspec: "args=[\'variable\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "lbeta"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1328,14 +1268,6 @@ tf_module {
     name: "load_op_library"
     argspec: "args=[\'library_filename\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "local_variables"
-    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "local_variables_initializer"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "log"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1448,14 +1380,6 @@ tf_module {
     name: "mod"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "model_variables"
-    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "moving_average_variables"
-    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "multinomial"
     argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -1656,10 +1580,6 @@ tf_module {
     name: "register_tensor_conversion_function"
     argspec: "args=[\'base_type\', \'conversion_func\', \'priority\'], varargs=None, keywords=None, defaults=[\'100\'], "
   }
-  member_method {
-    name: "report_uninitialized_variables"
-    argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'report_uninitialized_variables\'], "
-  }
   member_method {
     name: "required_space_to_batch_paddings"
     argspec: "args=[\'input_shape\', \'block_shape\', \'base_paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -2068,10 +1988,6 @@ tf_module {
     name: "trace"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "trainable_variables"
-    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "transpose"
     argspec: "args=[\'a\', \'perm\', \'name\', \'conjugate\'], varargs=None, keywords=None, defaults=[\'None\', \'transpose\', \'False\'], "
@@ -2140,14 +2056,6 @@ tf_module {
     name: "variable_axis_size_partitioner"
     argspec: "args=[\'max_shard_bytes\', \'axis\', \'bytes_per_string_element\', \'max_shards\'], varargs=None, keywords=None, defaults=[\'0\', \'16\', \'None\'], "
   }
-  member_method {
-    name: "variable_op_scope"
-    argspec: "args=[\'values\', \'name_or_scope\', \'default_name\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'reuse\', \'dtype\', \'use_resource\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables_initializer"
-    argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'init\'], "
-  }
   member_method {
     name: "verify_tensor_all_finite"
     argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.variable_scope.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.variable_scope.pbtxt
deleted file mode 100644
index e62dec93e6..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.variable_scope.pbtxt
+++ /dev/null
@@ -1,9 +0,0 @@
-path: "tensorflow.variable_scope"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.variable_scope.variable_scope\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name_or_scope\', \'default_name\', \'values\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'reuse\', \'dtype\', \'use_resource\', \'constraint\', \'auxiliary_name_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
-  }
-}
-- 
GitLab


From f172c52ac74ae6db228119b90785add81648372e Mon Sep 17 00:00:00 2001
From: avijit-nervana <avijit.chakraborty@intel.com>
Date: Thu, 27 Sep 2018 12:57:24 -0700
Subject: [PATCH 093/570] Fixed the broken unit tests

---
 configure.py                    |  2 +-
 tensorflow/workspace.bzl        | 16 ++++++++--------
 third_party/mkl/build_defs.bzl  |  2 +-
 third_party/ngraph/ngraph.BUILD |  4 ++--
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/configure.py b/configure.py
index cc6a654a61..f71caa1994 100644
--- a/configure.py
+++ b/configure.py
@@ -1631,7 +1631,7 @@ def main():
     config_info_line('monolithic', 'Config for mostly static monolithic build.')
     config_info_line('gdr', 'Build with GDR support.')
     config_info_line('verbs', 'Build with libverbs support.')
-    config_info_line('ngraph', 'Build with Intel ngraph support.')
+    config_info_line('ngraph', 'Build with Intel nGraph support.')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index e5a0a0b2b7..6966783efd 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -862,11 +862,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "ngraph",
         urls = [
-            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.8.0.tar.gz",
-            "https://github.com/NervanaSystems/ngraph/archive/v0.8.0.tar.gz",
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.8.1.tar.gz",
+            "https://github.com/NervanaSystems/ngraph/archive/v0.8.1.tar.gz",
         ],
-        sha256 = "a8cf3ef2d0e6d31b54eb33f6a9e795f562195ce5c2a857e729ca9c35241cc45c",
-        strip_prefix = "ngraph-0.8.0",
+        sha256 = "bf9dcc88e5c66021e3aac80491a231711211540d613bf9b6bd28db3f5bb86b62",
+        strip_prefix = "ngraph-0.8.1",
         build_file = clean_dep("//third_party/ngraph:ngraph.BUILD"),
     )
 
@@ -884,11 +884,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "ngraph_tf",
         urls = [
-            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.6.0.tar.gz",
-            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.6.0.tar.gz",
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.6.1.tar.gz",
+            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.6.1.tar.gz",
         ],
-        sha256 = "1f49391c02bef24872e9f85591e60e0e7eef12a337db71390444118049fe451f",
-        strip_prefix = "ngraph-tf-0.6.0",
+        sha256 = "402f84c748c113780a60f35f39aab118435285543aee4900d712b76fbf8a21ee",
+        strip_prefix = "ngraph-tf-0.6.1",
         build_file = clean_dep("//third_party/ngraph:ngraph_tf.BUILD"),
     )
 
diff --git a/third_party/mkl/build_defs.bzl b/third_party/mkl/build_defs.bzl
index bb798e715a..10c2d90c84 100644
--- a/third_party/mkl/build_defs.bzl
+++ b/third_party/mkl/build_defs.bzl
@@ -92,7 +92,7 @@ def if_enable_mkl(if_true, if_false = []):
       A select evaluating to either if_true or if_false as appropriate.
     """
     return select({
-        "//third_party/mkl:enable_mkl": if_true,
+        str(Label("//third_party/mkl:enable_mkl")): if_true,
         "//conditions:default": if_false,
     })
 
diff --git a/third_party/ngraph/ngraph.BUILD b/third_party/ngraph/ngraph.BUILD
index 71b2187011..6602a480af 100644
--- a/third_party/ngraph/ngraph.BUILD
+++ b/third_party/ngraph/ngraph.BUILD
@@ -110,7 +110,7 @@ cc_library(
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
         '-D SHARED_LIB_EXT=\\".so\\"',
-        '-D NGRAPH_VERSION=\\"0.8.0\\"',
+        '-D NGRAPH_VERSION=\\"0.8.1\\"',
         "-D NGRAPH_DEX_ONLY",
     ],
     visibility = ["//visibility:public"],
@@ -144,7 +144,7 @@ cc_library(
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
         '-D SHARED_LIB_EXT=\\".so\\"',
-        '-D NGRAPH_VERSION=\\"0.8.0\\"',
+        '-D NGRAPH_VERSION=\\"0.8.1\\"',
     ],
     visibility = ["//visibility:public"],
     alwayslink = 1,
-- 
GitLab


From 5220e565b7cc32a5f757896c76c7d57c33bcd323 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Thu, 27 Sep 2018 14:01:27 -0700
Subject: [PATCH 094/570] Don't use tensorflow::Edge after freeing it

Even with this bug we were accidentally doing the right thing (so the test case
doesn't actually fail without the fix): deleting an Edge sets its input and
output indices to kControlSlot-1 so we'd normally expect to fail when there is a
control edge out of the TF cluster (because a control edge would be recognized
as a data edge).  But AddEdge(x, -1, y, -1) seems to do the right thing for both
control and data edges.

PiperOrigin-RevId: 214831204
---
 tensorflow/compiler/jit/BUILD                 |   2 +
 tensorflow/compiler/jit/build_xla_ops_pass.cc |  11 +-
 .../compiler/jit/build_xla_ops_pass_test.cc   | 112 ++++++++++++++++++
 3 files changed, 116 insertions(+), 9 deletions(-)
 create mode 100644 tensorflow/compiler/jit/build_xla_ops_pass_test.cc

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 4e184729ef..5bf4af1014 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -478,6 +478,7 @@ tf_cc_test(
     name = "compilation_passes_test",
     size = "small",
     srcs = [
+        "build_xla_ops_pass_test.cc",
         "encapsulate_subgraphs_pass_test.cc",
         "encapsulate_xla_computations_pass_test.cc",
         "mark_for_compilation_pass_test.cc",
@@ -486,6 +487,7 @@ tf_cc_test(
     deps = [
         ":common",
         ":compilation_passes",
+        ":node_matchers",
         ":xla_cluster_util",
         ":xla_gpu_device",
         "//tensorflow/cc:cc_ops",
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc
index 13a518d0e8..9e3fd93cda 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc
@@ -112,16 +112,9 @@ static void MoveOutgoingEdges(Graph* g, Node* old_node, Node* new_node) {
   std::vector<const Edge*> out_edges(old_node->out_edges().begin(),
                                      old_node->out_edges().end());
   for (const Edge* edge : out_edges) {
-    Node* dst = edge->dst();
-    int src_output = edge->src_output();
-    int dst_input = edge->dst_input();
+    // TODO(sanjoy): This does not update NodeDef inputs.
+    g->AddEdge(new_node, edge->src_output(), edge->dst(), edge->dst_input());
     g->RemoveEdge(edge);
-
-    if (edge->IsControlEdge()) {
-      g->AddControlEdge(new_node, dst);
-    } else {
-      g->AddEdge(new_node, src_output, dst, dst_input);
-    }
   }
 }
 
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
new file mode 100644
index 0000000000..b7cb4506b9
--- /dev/null
+++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
@@ -0,0 +1,112 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/build_xla_ops_pass.h"
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/resource_variable_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
+#include "tensorflow/compiler/jit/node_matchers.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+using ::tensorflow::testing::FindNodeByName;
+using ::tensorflow::testing::matchers::CtrlDeps;
+using ::tensorflow::testing::matchers::NodeWith;
+using ::tensorflow::testing::matchers::Op;
+
+Status BuildXlaOps(const Scope& s, std::unique_ptr<Graph>* result) {
+  auto graph = absl::make_unique<Graph>(OpRegistry::Global());
+  TF_RETURN_IF_ERROR(s.ToGraph(graph.get()));
+
+  // Assign all nodes to the CPU device.
+  static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0";
+  for (Node* n : graph->nodes()) {
+    if (n->assigned_device_name().empty()) {
+      n->set_assigned_device_name(kCpuDevice);
+    }
+  }
+
+  GraphOptimizationPassOptions opt_options;
+  opt_options.graph = &graph;
+  BuildXlaOpsPass pass;
+  TF_RETURN_IF_ERROR(pass.Run(opt_options));
+  *result = std::move(graph);
+  return Status::OK();
+}
+
+Status MakeXlaCompiledKernel(Graph* graph, const string& callee_name,
+                             const string& node_name, Node** result) {
+  NodeDef call_node;
+  call_node.set_name(node_name);
+  call_node.set_op(callee_name);
+  AddNodeAttr(kXlaCompiledKernelAttr, true, &call_node);
+  AddNodeAttr(kXlaNumConstantArgsAttr, 0, &call_node);
+  AddNodeAttr(kXlaNumResourceArgsAttr, 0, &call_node);
+  Status s;
+  *result = graph->AddNode(call_node, &s);
+  return s;
+}
+
+Node* MakeWrite(const Scope& scope, const string& id) {
+  Output var_handle =
+      ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({}));
+  Output value_to_write =
+      ops::Const(scope.WithOpName("ValueToAssign" + id), 1.0f);
+  ops::AssignVariableOp assign_op(scope.WithOpName("Assignee" + id), var_handle,
+                                  value_to_write);
+  return assign_op.operation.node();
+}
+
+FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) {
+  FunctionDefLibrary flib_def;
+  FunctionDef func = FunctionDefHelper::Create(
+      /*function_name=*/name, /*in_def=*/{}, /*out_def=*/{"out: float"},
+      /*attr_def*/
+      {}, /*node_def=*/{FunctionDefHelper::Const("one", 1.0f)},
+      /*ret_def=*/{{"out", "out:output:0"}});
+  *flib_def.add_function() = std::move(func);
+  return flib_def;
+}
+
+TEST(BuildXlaOps, ControlDepsPreserved) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  FunctionDefLibrary flib_def =
+      CreateFunctionDefLibWithConstFunction("cluster_0");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+  Node* call;
+  TF_ASSERT_OK(MakeXlaCompiledKernel(root.graph(), "cluster_0", "C", &call));
+  Node* write_op = MakeWrite(root, "write");
+  root.graph()->AddControlEdge(call, write_op);
+
+  std::unique_ptr<Graph> graph;
+  TF_ASSERT_OK(BuildXlaOps(root, &graph));
+
+  Node* write_op_new = FindNodeByName(graph.get(), write_op->name());
+  ASSERT_NE(write_op_new, nullptr);
+  EXPECT_THAT(write_op_new, NodeWith(CtrlDeps(NodeWith(Op("_XlaRun")))));
+}
+
+}  // namespace
+}  // namespace tensorflow
-- 
GitLab


From 2fb9377a5ec610b8eff853fd1d2d53eabf711eda Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Thu, 27 Sep 2018 14:03:52 -0700
Subject: [PATCH 095/570] Enable worker heartbeat polling for all available
 workers.

PiperOrigin-RevId: 214831772
---
 .../contrib/tpu/python/tpu/session_support.py | 52 ++++++++++++-------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/session_support.py b/tensorflow/contrib/tpu/python/tpu/session_support.py
index 3e91e2df32..24b9bd136b 100644
--- a/tensorflow/contrib/tpu/python/tpu/session_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/session_support.py
@@ -41,6 +41,25 @@ class CoordinatorShutdownException(Exception):
   pass
 
 
+def _make_heartbeat_op(session, device, request_ph):
+  """Return a heartbeat op or None if heartbeats are not supported by device."""
+  try:
+    with ops.device(device):
+      heartbeat_op = tpu_ops.worker_heartbeat(request_ph)
+      request = event_pb2.WorkerHeartbeatRequest()
+      options = config_pb2.RunOptions(timeout_in_ms=5000)
+      session.run(
+          heartbeat_op,
+          feed_dict={request_ph: request.SerializeToString()},
+          options=options)
+      return heartbeat_op
+  except errors.InvalidArgumentError as _:
+    return None
+  except errors.DeadlineExceededError as _:
+    logging.warning('Timeout connecting to %s when testing heartbeat', device)
+    return None
+
+
 class WorkerHeartbeatManager(object):
   """Manages the status/heartbeat monitor for a set of workers."""
 
@@ -72,30 +91,27 @@ class WorkerHeartbeatManager(object):
         name='worker_heartbeat_request', dtype=dtypes.string)
 
     heartbeat_ops = []
+    kept_devices = []
     for device in devices:
-      with ops.device(device):
-        heartbeat_ops.append(tpu_ops.worker_heartbeat(request_placeholder))
+      heartbeat_op = _make_heartbeat_op(session, device, request_placeholder)
+      if heartbeat_op is not None:
+        kept_devices.append(device)
+        heartbeat_ops.append(heartbeat_op)
+      else:
+        logging.warning('Heartbeat support not available for %s', device)
 
-    return WorkerHeartbeatManager(session, devices, heartbeat_ops,
+    return WorkerHeartbeatManager(session, kept_devices, heartbeat_ops,
                                   request_placeholder)
 
-  def heartbeat_supported(self):
-    """Returns True if heartbeat operations are supported on all workers."""
-    try:
-      # Send ping to verify worker has heartbeat support.
-      self.ping()
-      return True
-    except errors.InvalidArgumentError as _:
-      return False
+  def num_workers(self):
+    return len(self._devices)
 
   def configure(self, message):
     """Configure heartbeat manager for all devices.
 
     Args:
       message: `event_pb2.WorkerHeartbeatRequest`
-
     Returns: `None`
-
     """
     logging.info('Configuring worker heartbeat: %s',
                  text_format.MessageToString(message))
@@ -184,7 +200,6 @@ class WatchdogManager(threading.Thread):
     """Initialize a watchdog manager.
 
     Args:
-
       session: Session connected to worker devices.  A cloned session and graph
         will be created for managing worker pings.
       devices: Set of devices to monitor.  If none, all workers will be
@@ -277,16 +292,14 @@ class GracefulShutdownHook(session_run_hook.SessionRunHook):
           target=training_session.sess_str, graph=self._graph)
       self._workers = WorkerHeartbeatManager.from_devices(
           self._session, all_worker_devices(self._session))
-      self._heartbeat_supported = self._workers.heartbeat_supported()
+      self._heartbeat_supported = self._workers.num_workers() > 0
       if self._heartbeat_supported:
         self._workers.configure(
             event_pb2.WorkerHeartbeatRequest(
                 shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
       else:
         logging.warn(
-            'Worker heartbeats not supported by all workers.  No failure '
-            'handling will be enabled.'
-        )
+            'No workers support hearbeats. Failure handling will be disabled.')
 
   def saver(self):
     if self._saver:
@@ -303,8 +316,7 @@ class GracefulShutdownHook(session_run_hook.SessionRunHook):
       logging.error(
           'Multiple savers in the SAVERS collection.  On-demand checkpointing '
           'will be disabled. Pass an explicit `saver` to the constructor to '
-          'override this behavior.'
-      )
+          'override this behavior.')
       return None
 
     return savers[0]
-- 
GitLab


From cc83067469bc30bba55932c587f31ef68f15792f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 14:04:06 -0700
Subject: [PATCH 096/570] Migrate a few conv kernels to use new kernel
 signatures.

PiperOrigin-RevId: 214831837
---
 tensorflow/contrib/lite/kernels/conv.cc       | 70 +++++++++----------
 .../kernels/internal/optimized/cblas_conv.h   | 54 ++++++++------
 .../internal/optimized/multithreaded_conv.h   | 60 +++++++++-------
 3 files changed, 100 insertions(+), 84 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index 101b4fc961..dbcadbee14 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -86,6 +86,18 @@ struct OpData {
   bool run_multithreaded_kernel;
 };
 
+inline PaddingType RuntimePaddingType(TfLitePadding padding) {
+  switch (padding) {
+    case TfLitePadding::kTfLitePaddingSame:
+      return PaddingType::kSame;
+    case TfLitePadding::kTfLitePaddingValid:
+      return PaddingType::kValid;
+    case TfLitePadding::kTfLitePaddingUnknown:
+    default:
+      return PaddingType::kNone;
+  }
+}
+
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // This is a builtin op, so we don't use the contents in 'buffer', if any.
   // Instead, we allocate a new object to use as scratch space for im2col, and
@@ -487,18 +499,18 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
   } else {
     effective_kernel_type = kernel_type;
   }
+  ConvParams op_params;
+  op_params.padding_type = RuntimePaddingType(params->padding);
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
   switch (effective_kernel_type) {
     case kReference: {
-      ConvParams op_params;
-      op_params.padding_type = PaddingType::kSame;
-      op_params.padding_values.width = data->padding.width;
-      op_params.padding_values.height = data->padding.height;
-      op_params.stride_width = params->stride_width;
-      op_params.stride_height = params->stride_height;
-      op_params.dilation_width_factor = params->dilation_width_factor;
-      op_params.dilation_height_factor = params->dilation_height_factor;
-      op_params.float_activation_min = output_activation_min;
-      op_params.float_activation_max = output_activation_max;
       reference_ops::Conv(op_params, GetTensorShape(input),
                           GetTensorData<float>(input), GetTensorShape(filter),
                           GetTensorData<float>(filter), GetTensorShape(bias),
@@ -508,16 +520,6 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
       break;
     }
     case kGenericOptimized: {
-      ConvParams op_params;
-      op_params.padding_type = PaddingType::kSame;
-      op_params.padding_values.width = data->padding.width;
-      op_params.padding_values.height = data->padding.height;
-      op_params.stride_width = params->stride_width;
-      op_params.stride_height = params->stride_height;
-      op_params.dilation_width_factor = params->dilation_width_factor;
-      op_params.dilation_height_factor = params->dilation_height_factor;
-      op_params.float_activation_min = output_activation_min;
-      op_params.float_activation_max = output_activation_max;
       optimized_ops::Conv(op_params, GetTensorShape(input),
                           GetTensorData<float>(input), GetTensorShape(filter),
                           GetTensorData<float>(filter), GetTensorShape(bias),
@@ -534,25 +536,21 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
         filter_data = GetTensorData<float>(filter);
       }
       multithreaded_ops::Conv(
-          *eigen_support::GetThreadPoolDevice(context),
-          GetTensorData<float>(input), GetTensorDims(input), filter_data,
-          GetTensorDims(filter), GetTensorData<float>(bias),
-          GetTensorDims(bias), params->stride_width, params->stride_height,
-          data->padding.width, data->padding.height, params->padding,
-          output_activation_min, output_activation_max,
-          GetTensorData<float>(output), GetTensorDims(output),
-          GetTensorData<float>(im2col), GetTensorDims(im2col));
+          *eigen_support::GetThreadPoolDevice(context), op_params,
+          GetTensorShape(input), GetTensorData<float>(input),
+          GetTensorShape(filter), filter_data, GetTensorShape(bias),
+          GetTensorData<float>(bias), GetTensorShape(output),
+          GetTensorData<float>(output), GetTensorShape(im2col),
+          GetTensorData<float>(im2col));
       break;
     }
     case kCblasOptimized: {
-      cblas_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
-                      GetTensorData<float>(filter), GetTensorDims(filter),
-                      GetTensorData<float>(bias), GetTensorDims(bias),
-                      params->stride_width, params->stride_height,
-                      data->padding.width, data->padding.height,
-                      output_activation_min, output_activation_max,
-                      GetTensorData<float>(output), GetTensorDims(output),
-                      GetTensorData<float>(im2col), GetTensorDims(im2col));
+      cblas_ops::Conv(op_params, GetTensorShape(input),
+                      GetTensorData<float>(input), GetTensorShape(filter),
+                      GetTensorData<float>(filter), GetTensorShape(bias),
+                      GetTensorData<float>(bias), GetTensorShape(output),
+                      GetTensorData<float>(output), GetTensorShape(im2col),
+                      GetTensorData<float>(im2col));
       break;
     }
   }
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
index 40d42bbae9..2d96da65c3 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
@@ -31,20 +31,29 @@ limitations under the License.
 namespace tflite {
 namespace cblas_ops {
 
-inline void Conv(const float* input_data, const Dims<4>& input_dims,
-                 const float* filter_data, const Dims<4>& filter_dims,
-                 const float* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, float output_activation_min,
-                 float output_activation_max, float* output_data,
-                 const Dims<4>& output_dims, float* im2col_data,
-                 const Dims<4>& im2col_dims) {
+inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
+                 const float* input_data, const RuntimeShape& filter_shape,
+                 const float* filter_data, const RuntimeShape& bias_shape,
+                 const float* bias_data, const RuntimeShape& output_shape,
+                 float* output_data, const RuntimeShape& im2col_shape,
+                 float* im2col_data) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
   gemmlowp::ScopedProfilingLabel label("Conv/cblas");
 
   const float* gemm_input_data = nullptr;
-  const Dims<4>* gemm_input_dims = nullptr;
-  const int filter_width = ArraySize(filter_dims, 1);
-  const int filter_height = ArraySize(filter_dims, 2);
+  const RuntimeShape* gemm_input_shape = nullptr;
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
   const bool need_im2col = stride_width != 1 || stride_height != 1 ||
                            filter_width != 1 || filter_height != 1;
   if (need_im2col) {
@@ -55,18 +64,17 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims,
     op_params.padding_values.height = pad_height;
     op_params.stride_width = stride_width;
     op_params.stride_height = stride_height;
-    op_params.dilation_width_factor = 1;
-    op_params.dilation_height_factor = 1;
+    op_params.dilation_width_factor = dilation_width_factor;
+    op_params.dilation_height_factor = dilation_height_factor;
     optimized_ops::Im2col(op_params, filter_height, filter_width, 0,
-                          DimsToShape(input_dims), input_data,
-                          DimsToShape(im2col_dims), im2col_data);
+                          input_shape, input_data, im2col_shape, im2col_data);
 
     gemm_input_data = im2col_data;
-    gemm_input_dims = &im2col_dims;
+    gemm_input_shape = &im2col_shape;
   } else {
     TFLITE_DCHECK(!im2col_data);
     gemm_input_data = input_data;
-    gemm_input_dims = &input_dims;
+    gemm_input_shape = &input_shape;
   }
 
   // The following code computes matrix multiplication c = a * transponse(b)
@@ -78,10 +86,10 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims,
   const float* a = gemm_input_data;
   const float* b = filter_data;
   float* c = output_data;
-  int m = gemm_input_dims->sizes[1] * gemm_input_dims->sizes[2] *
-          gemm_input_dims->sizes[3];
-  int n = output_dims.sizes[0];
-  int k = gemm_input_dims->sizes[0];
+  const int gemm_input_dims = gemm_input_shape->DimensionsCount();
+  int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1);
+  int n = output_shape.Dims(3);
+  int k = gemm_input_shape->Dims(gemm_input_dims - 1);
   // The stride of matrix a, b and c respectively.
   int stride_a = k;
   int stride_b = k;
@@ -91,8 +99,8 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims,
               stride_a, b, stride_b, 0.0f, c, stride_c);
 
   optimized_ops::AddBiasAndEvalActivationFunction(
-      output_activation_min, output_activation_max, DimsToShape(bias_dims),
-      bias_data, DimsToShape(output_dims), output_data);
+      output_activation_min, output_activation_max, bias_shape, bias_data,
+      output_shape, output_data);
 }
 
 }  // namespace cblas_ops
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
index b5d001cc9e..4139cf4eba 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
@@ -69,13 +69,13 @@ struct MatMulConvFunctor {
 template <class T>
 class EigenTensorConvFunctor {
  private:
-  Eigen::PaddingType TfLitePadding2EigenPadding(TfLitePadding padding) {
+  Eigen::PaddingType RuntimePadding2EigenPadding(PaddingType padding) {
     switch (padding) {
-      case kTfLitePaddingValid:
+      case PaddingType::kValid:
         return Eigen::PADDING_VALID;
-      case kTfLitePaddingSame:
+      case PaddingType::kSame:
         return Eigen::PADDING_SAME;
-      case kTfLitePaddingUnknown:
+      case PaddingType::kNone:
         assert(false);  // should never get here.
         return Eigen::PADDING_VALID;
     }
@@ -89,7 +89,7 @@ class EigenTensorConvFunctor {
                   int input_width, int input_depth, const T* filter_data,
                   int filter_height, int filter_width, int filter_count,
                   int stride_rows, int stride_cols, int pad_width,
-                  int pad_height, TfLitePadding padding, T* output_data,
+                  int pad_height, PaddingType padding, T* output_data,
                   int output_height, int output_width) {
     const bool is_1x1_kernel = (filter_height == 1 && filter_width == 1 &&
                                 stride_rows == 1 && stride_cols == 1);
@@ -127,28 +127,38 @@ class EigenTensorConvFunctor {
                               input_depth, filter_count);
       output.device(device) =
           Eigen::SpatialConvolution(input, filter, stride_cols, stride_rows,
-                                    TfLitePadding2EigenPadding(padding));
+                                    RuntimePadding2EigenPadding(padding));
     }
   }
 };
 
-inline void Conv(const Eigen::ThreadPoolDevice& device, const float* input_data,
-                 const Dims<4>& input_dims, const float* filter_data,
-                 const Dims<4>& filter_dims, const float* bias_data,
-                 const Dims<4>& bias_dims, int stride_width, int stride_height,
-                 int pad_width, int pad_height, TfLitePadding padding,
-                 float output_activation_min, float output_activation_max,
-                 float* output_data, const Dims<4>& output_dims,
-                 float* im2col_data, const Dims<4>& im2col_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
-  const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
-  const int input_height = ArraySize(input_dims, 2);
-  const int input_width = ArraySize(input_dims, 1);
-  const int filter_height = ArraySize(filter_dims, 2);
-  const int filter_width = ArraySize(filter_dims, 1);
-  const int output_height = ArraySize(output_dims, 2);
-  const int output_width = ArraySize(output_dims, 1);
+inline void Conv(const Eigen::ThreadPoolDevice& device,
+                 const ConvParams& params, const RuntimeShape& input_shape,
+                 const float* input_data, const RuntimeShape& filter_shape,
+                 const float* filter_data, const RuntimeShape& bias_shape,
+                 const float* bias_data, const RuntimeShape& output_shape,
+                 float* output_data, const RuntimeShape& im2col_shape,
+                 float* im2col_data) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const PaddingType padding = params.padding_type;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
   EigenTensorConvFunctor<float> conv_functor;
   conv_functor(device, input_data, im2col_data, batches, input_height,
                input_width, input_depth, filter_data, filter_height,
@@ -157,8 +167,8 @@ inline void Conv(const Eigen::ThreadPoolDevice& device, const float* input_data,
                output_width);
 
   optimized_ops::AddBiasAndEvalActivationFunction(
-      output_activation_min, output_activation_max, DimsToShape(bias_dims),
-      bias_data, DimsToShape(output_dims), output_data);
+      output_activation_min, output_activation_max, bias_shape, bias_data,
+      output_shape, output_data);
 }
 
 }  // namespace multithreaded_ops
-- 
GitLab


From d0397c3314600da0c9cdc300ae87483331d54298 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Thu, 27 Sep 2018 14:25:18 -0700
Subject: [PATCH 097/570] Rename TFLite Eager delegate -> Flex delegate

PiperOrigin-RevId: 214835588
---
 .../lite/delegates/{eager => flex}/BUILD      |  0
 .../delegates/{eager => flex}/buffer_map.cc   |  8 ++--
 .../delegates/{eager => flex}/buffer_map.h    | 12 ++---
 .../{eager => flex}/buffer_map_test.cc        |  6 +--
 .../delegates/{eager => flex}/delegate.cc     | 34 +++++++-------
 .../lite/delegates/{eager => flex}/delegate.h | 26 +++++-----
 .../{eager => flex}/delegate_data.cc          |  6 +--
 .../delegates/{eager => flex}/delegate_data.h | 16 +++----
 .../{eager => flex}/delegate_data_test.cc     |  6 +--
 .../{eager => flex}/delegate_test.cc          | 14 +++---
 .../lite/delegates/{eager => flex}/kernel.cc  | 30 ++++++------
 .../lite/delegates/{eager => flex}/kernel.h   | 12 ++---
 .../delegates/{eager => flex}/kernel_test.cc  | 16 +++----
 .../delegates/{eager => flex}/test_util.cc    | 47 +++++++++----------
 .../delegates/{eager => flex}/test_util.h     | 20 ++++----
 .../lite/delegates/{eager => flex}/util.cc    |  6 +--
 .../lite/delegates/{eager => flex}/util.h     | 10 ++--
 .../delegates/{eager => flex}/util_test.cc    |  6 +--
 tensorflow/contrib/lite/kernels/register.cc   |  8 ++--
 tensorflow/contrib/lite/model.cc              |  4 +-
 tensorflow/contrib/lite/python/convert.py     |  6 +--
 tensorflow/contrib/lite/python/lite_test.py   |  2 +-
 tensorflow/contrib/lite/testing/BUILD         |  2 +-
 .../contrib/lite/testing/generate_examples.py |  2 +-
 .../contrib/lite/testing/tflite_diff_flags.h  |  4 +-
 .../contrib/lite/testing/tflite_diff_util.h   |  2 +-
 .../contrib/lite/testing/tflite_driver.cc     |  6 +--
 .../contrib/lite/testing/tflite_driver.h      |  4 +-
 tensorflow/contrib/lite/toco/args.h           |  4 +-
 .../contrib/lite/toco/import_tensorflow.cc    |  4 +-
 .../contrib/lite/toco/import_tensorflow.h     |  2 +-
 tensorflow/contrib/lite/toco/tflite/export.cc | 20 ++++----
 tensorflow/contrib/lite/toco/tflite/export.h  |  4 +-
 .../contrib/lite/toco/tflite/export_test.cc   |  2 +-
 .../contrib/lite/toco/tflite/operator.cc      | 26 +++++-----
 .../contrib/lite/toco/tflite/operator.h       |  6 +--
 .../contrib/lite/toco/toco_cmdline_flags.cc   | 24 +++++-----
 tensorflow/contrib/lite/toco/toco_flags.proto | 16 +++----
 tensorflow/contrib/lite/toco/toco_tooling.cc  |  8 ++--
 tensorflow/contrib/lite/tools/benchmark/BUILD |  8 ++--
 .../tools/benchmark/benchmark_tflite_model.cc |  6 +--
 .../tools/benchmark/benchmark_tflite_model.h  |  4 +-
 tensorflow/contrib/lite/util.cc               |  6 +--
 tensorflow/contrib/lite/util.h                |  8 ++--
 tensorflow/contrib/lite/util_test.cc          | 16 +++----
 45 files changed, 239 insertions(+), 240 deletions(-)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/BUILD (100%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/buffer_map.cc (95%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/buffer_map.h (86%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/buffer_map_test.cc (98%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/delegate.cc (76%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/delegate.h (64%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/delegate_data.cc (94%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/delegate_data.h (78%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/delegate_data_test.cc (93%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/delegate_test.cc (95%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/kernel.cc (91%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/kernel.h (79%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/kernel_test.cc (94%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/test_util.cc (76%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/test_util.h (90%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/util.cc (96%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/util.h (89%)
 rename tensorflow/contrib/lite/delegates/{eager => flex}/util_test.cc (97%)

diff --git a/tensorflow/contrib/lite/delegates/eager/BUILD b/tensorflow/contrib/lite/delegates/flex/BUILD
similarity index 100%
rename from tensorflow/contrib/lite/delegates/eager/BUILD
rename to tensorflow/contrib/lite/delegates/flex/BUILD
diff --git a/tensorflow/contrib/lite/delegates/eager/buffer_map.cc b/tensorflow/contrib/lite/delegates/flex/buffer_map.cc
similarity index 95%
rename from tensorflow/contrib/lite/delegates/eager/buffer_map.cc
rename to tensorflow/contrib/lite/delegates/flex/buffer_map.cc
index e5a19c3997..63e39196d9 100644
--- a/tensorflow/contrib/lite/delegates/eager/buffer_map.cc
+++ b/tensorflow/contrib/lite/delegates/flex/buffer_map.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/eager/buffer_map.h"
+#include "tensorflow/contrib/lite/delegates/flex/buffer_map.h"
 
 #include "tensorflow/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/delegates/eager/util.h"
+#include "tensorflow/contrib/lite/delegates/flex/util.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/log_memory.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 namespace {
 // A tensor buffer that is allocated, deallocated and populated by TF Lite.
 class TfLiteTensorBuffer : public tensorflow::TensorBuffer {
@@ -107,5 +107,5 @@ void BufferMap::SetFromTensorFlow(int tensor_index, tensorflow::Tensor tensor) {
   id_to_tensor_[tensor_index] = std::move(tensor);
 }
 
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/buffer_map.h b/tensorflow/contrib/lite/delegates/flex/buffer_map.h
similarity index 86%
rename from tensorflow/contrib/lite/delegates/eager/buffer_map.h
rename to tensorflow/contrib/lite/delegates/flex/buffer_map.h
index aaaa045840..4ce886568a 100644
--- a/tensorflow/contrib/lite/delegates/eager/buffer_map.h
+++ b/tensorflow/contrib/lite/delegates/flex/buffer_map.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_BUFFER_MAP_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_BUFFER_MAP_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_BUFFER_MAP_H_
+#define TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_BUFFER_MAP_H_
 
 #include <map>
 
@@ -21,12 +21,12 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 
 // Maps a TF Lite tensor index into a TensorFlow tensor.
 //
 // The TF Lite interpreter assigns integer indices to each of its tensors, but
-// the Eager delegate deals in terms of TensorFlow tensors. This class maps
+// the Flex delegate deals in terms of TensorFlow tensors. This class maps
 // from indices to tensors and allows the creation of new tensors to be
 // associated with a given index.
 class BufferMap {
@@ -55,7 +55,7 @@ class BufferMap {
   std::map<int, tensorflow::Tensor> id_to_tensor_;
 };
 
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_BUFFER_MAP_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_BUFFER_MAP_H_
diff --git a/tensorflow/contrib/lite/delegates/eager/buffer_map_test.cc b/tensorflow/contrib/lite/delegates/flex/buffer_map_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/delegates/eager/buffer_map_test.cc
rename to tensorflow/contrib/lite/delegates/flex/buffer_map_test.cc
index a046943e56..bb80e25e80 100644
--- a/tensorflow/contrib/lite/delegates/eager/buffer_map_test.cc
+++ b/tensorflow/contrib/lite/delegates/flex/buffer_map_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/eager/buffer_map.h"
+#include "tensorflow/contrib/lite/delegates/flex/buffer_map.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/util.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 namespace {
 
 using ::testing::ElementsAre;
@@ -164,7 +164,7 @@ TEST(BufferMapTest, TensorFlowOverwritesTfLite) {
 }
 
 }  // namespace
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
 
 int main(int argc, char** argv) {
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate.cc b/tensorflow/contrib/lite/delegates/flex/delegate.cc
similarity index 76%
rename from tensorflow/contrib/lite/delegates/eager/delegate.cc
rename to tensorflow/contrib/lite/delegates/flex/delegate.cc
index 45fc158157..ba065a8ff5 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate.cc
+++ b/tensorflow/contrib/lite/delegates/flex/delegate.cc
@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
+#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
 
 #include <vector>
 
 #include "tensorflow/contrib/lite/context_util.h"
-#include "tensorflow/contrib/lite/delegates/eager/buffer_map.h"
-#include "tensorflow/contrib/lite/delegates/eager/kernel.h"
-#include "tensorflow/contrib/lite/delegates/eager/util.h"
+#include "tensorflow/contrib/lite/delegates/flex/buffer_map.h"
+#include "tensorflow/contrib/lite/delegates/flex/kernel.h"
+#include "tensorflow/contrib/lite/delegates/flex/util.h"
 #include "tensorflow/contrib/lite/util.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 namespace delegate {
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) {
@@ -32,7 +32,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) {
   TfLiteIntArray* plan;
   TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
 
-  // Add all custom ops starting with "Eager" to list of supported nodes.
+  // Add all custom ops starting with "Flex" to list of supported nodes.
   std::vector<int> supported_nodes;
   for (int node_index : TfLiteIntArrayView(plan)) {
     TfLiteNode* node;
@@ -40,7 +40,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) {
     TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
         context, node_index, &node, &registration));
 
-    if (IsEagerOp(registration->custom_name)) {
+    if (IsFlexOp(registration->custom_name)) {
       supported_nodes.push_back(node_index);
     }
   }
@@ -81,28 +81,28 @@ TfLiteStatus CopyFromBufferHandle(TfLiteContext* context,
 }
 
 }  // namespace delegate
-}  // namespace eager
+}  // namespace flex
 
-std::unique_ptr<EagerDelegate> EagerDelegate::Create() {
-  std::unique_ptr<eager::DelegateData> delegate_data;
-  if (!eager::DelegateData::Create(&delegate_data).ok()) {
+std::unique_ptr<FlexDelegate> FlexDelegate::Create() {
+  std::unique_ptr<flex::DelegateData> delegate_data;
+  if (!flex::DelegateData::Create(&delegate_data).ok()) {
     fprintf(stderr, "Unable to initialize TensorFlow context.\n");
     return nullptr;
   }
 
-  return std::unique_ptr<EagerDelegate>(
-      new EagerDelegate(std::move(delegate_data)));
+  return std::unique_ptr<FlexDelegate>(
+      new FlexDelegate(std::move(delegate_data)));
 }
 
-EagerDelegate::EagerDelegate(std::unique_ptr<eager::DelegateData> delegate_data)
+FlexDelegate::FlexDelegate(std::unique_ptr<flex::DelegateData> delegate_data)
     : TfLiteDelegate{
           /*data_=*/delegate_data.get(),
-          /*nullptr,*/ &eager::delegate::Prepare,
-          /*CopyFromBufferHandle=*/&eager::delegate::CopyFromBufferHandle,
+          /*nullptr,*/ &flex::delegate::Prepare,
+          /*CopyFromBufferHandle=*/&flex::delegate::CopyFromBufferHandle,
           /*CopyToBufferHandle=*/nullptr,
           /*FreeBufferHandle=*/nullptr},
       delegate_data_(std::move(delegate_data)) {}
 
-EagerDelegate::~EagerDelegate() {}
+FlexDelegate::~FlexDelegate() {}
 
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate.h b/tensorflow/contrib/lite/delegates/flex/delegate.h
similarity index 64%
rename from tensorflow/contrib/lite/delegates/eager/delegate.h
rename to tensorflow/contrib/lite/delegates/flex/delegate.h
index 70f3c15af4..1017780dc7 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate.h
+++ b/tensorflow/contrib/lite/delegates/flex/delegate.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_DELEGATE_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_DELEGATE_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_DELEGATE_H_
+#define TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_DELEGATE_H_
 
 #include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/delegates/eager/delegate_data.h"
+#include "tensorflow/contrib/lite/delegates/flex/delegate_data.h"
 
 namespace tflite {
 
@@ -24,12 +24,12 @@ namespace tflite {
 // Delegate that can be used to extract parts of a graph that are designed to be
 // executed by TensorFlow's runtime via Eager.
 //
-// The interpreter must be constructed after the EagerDelegate and destructed
-// before the EagerDelegate. This delegate may be used with multiple
+// The interpreter must be constructed after the FlexDelegate and destructed
+// before the FlexDelegate. This delegate may be used with multiple
 // interpreters, but it is *not* thread-safe.
 //
 // Usage:
-//   auto delegate = EagerDelegate::Create();
+//   auto delegate = FlexDelegate::Create();
 //   ... build interpreter ...
 //
 //   if (delegate) {
@@ -39,21 +39,21 @@ namespace tflite {
 //   ... run inference ...
 //   ... destroy interpreter ...
 //   ... destroy delegate ...
-class EagerDelegate : public TfLiteDelegate {
+class FlexDelegate : public TfLiteDelegate {
  public:
   // Creates a delegate that supports TF ops.
   //
-  // If the underyling TF Eager context creation fails, returns null.
-  static std::unique_ptr<EagerDelegate> Create();
+  // If the underyling TF Flex context creation fails, returns null.
+  static std::unique_ptr<FlexDelegate> Create();
 
-  ~EagerDelegate();
+  ~FlexDelegate();
 
  private:
-  explicit EagerDelegate(std::unique_ptr<eager::DelegateData> delegate_data);
+  explicit FlexDelegate(std::unique_ptr<flex::DelegateData> delegate_data);
 
-  std::unique_ptr<eager::DelegateData> delegate_data_;
+  std::unique_ptr<flex::DelegateData> delegate_data_;
 };
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_DELEGATE_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_DELEGATE_H_
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_data.cc b/tensorflow/contrib/lite/delegates/flex/delegate_data.cc
similarity index 94%
rename from tensorflow/contrib/lite/delegates/eager/delegate_data.cc
rename to tensorflow/contrib/lite/delegates/flex/delegate_data.cc
index 0fd5c976f8..8f985f770c 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate_data.cc
+++ b/tensorflow/contrib/lite/delegates/flex/delegate_data.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/eager/delegate_data.h"
+#include "tensorflow/contrib/lite/delegates/flex/delegate_data.h"
 
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 tensorflow::Status DelegateData::Create(std::unique_ptr<DelegateData>* data) {
   std::vector<tensorflow::Device*> devices;
 
@@ -43,5 +43,5 @@ DelegateData::DelegateData(tensorflow::EagerContext* eager_context)
 
 DelegateData::~DelegateData() {}
 
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_data.h b/tensorflow/contrib/lite/delegates/flex/delegate_data.h
similarity index 78%
rename from tensorflow/contrib/lite/delegates/eager/delegate_data.h
rename to tensorflow/contrib/lite/delegates/flex/delegate_data.h
index 772d26f44e..8d75f0b0ef 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate_data.h
+++ b/tensorflow/contrib/lite/delegates/flex/delegate_data.h
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_DELEGATE_DATA_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_DELEGATE_DATA_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_DELEGATE_DATA_H_
+#define TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_DELEGATE_DATA_H_
 
-#include "tensorflow/contrib/lite/delegates/eager/buffer_map.h"
+#include "tensorflow/contrib/lite/delegates/flex/buffer_map.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 
-// Data kept by the Eager delegate for the lifetime of an Interpreter.
+// Data kept by the Flex delegate for the lifetime of an Interpreter.
 class DelegateData {
  public:
   // Create a new DelegateData, initialized with a newly-created EagerContext.
@@ -29,7 +29,7 @@ class DelegateData {
 
   ~DelegateData();
 
-  // The EagerContext that is required for execution of Eager Ops.
+  // The EagerContext that is required for execution of Flex Ops.
   tensorflow::EagerContext* GetEagerContext() { return eager_context_.get(); }
 
   // Map from TF Lite tensor index to TensorFlow tensor for a given context.
@@ -46,7 +46,7 @@ class DelegateData {
   std::unordered_map<const TfLiteContext*, BufferMap> buffer_map_;
 };
 
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_DELEGATE_DATA_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_DELEGATE_DATA_H_
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_data_test.cc b/tensorflow/contrib/lite/delegates/flex/delegate_data_test.cc
similarity index 93%
rename from tensorflow/contrib/lite/delegates/eager/delegate_data_test.cc
rename to tensorflow/contrib/lite/delegates/flex/delegate_data_test.cc
index def063309f..30b10f435a 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate_data_test.cc
+++ b/tensorflow/contrib/lite/delegates/flex/delegate_data_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/eager/delegate_data.h"
+#include "tensorflow/contrib/lite/delegates/flex/delegate_data.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/testing/util.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 namespace {
 
 TEST(DelegateDataTest, Basic) {
@@ -39,7 +39,7 @@ TEST(DelegateDataTest, Basic) {
 }
 
 }  // namespace
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
 
 int main(int argc, char** argv) {
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_test.cc b/tensorflow/contrib/lite/delegates/flex/delegate_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/delegates/eager/delegate_test.cc
rename to tensorflow/contrib/lite/delegates/flex/delegate_test.cc
index 43ec5d53b8..1813952cef 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate_test.cc
+++ b/tensorflow/contrib/lite/delegates/flex/delegate_test.cc
@@ -12,23 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
+#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/delegates/eager/test_util.h"
+#include "tensorflow/contrib/lite/delegates/flex/test_util.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 namespace {
 
 using ::testing::ContainsRegex;
 using ::testing::ElementsAre;
 
-class DelegateTest : public testing::EagerModelTest {
+class DelegateTest : public testing::FlexModelTest {
  public:
   DelegateTest() {
-    delegate_ = EagerDelegate::Create();
+    delegate_ = FlexDelegate::Create();
     interpreter_.reset(new Interpreter(&error_reporter_));
   }
 
@@ -46,7 +46,7 @@ class DelegateTest : public testing::EagerModelTest {
   }
 
  private:
-  std::unique_ptr<EagerDelegate> delegate_;
+  std::unique_ptr<FlexDelegate> delegate_;
 };
 
 TEST_F(DelegateTest, FullGraph) {
@@ -236,7 +236,7 @@ TEST_F(DelegateTest, MultipleInterpretersSameDelegate) {
 }
 
 }  // namespace
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
 
 int main(int argc, char** argv) {
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel.cc b/tensorflow/contrib/lite/delegates/flex/kernel.cc
similarity index 91%
rename from tensorflow/contrib/lite/delegates/eager/kernel.cc
rename to tensorflow/contrib/lite/delegates/flex/kernel.cc
index 48a2f56baf..e4f1aea990 100644
--- a/tensorflow/contrib/lite/delegates/eager/kernel.cc
+++ b/tensorflow/contrib/lite/delegates/flex/kernel.cc
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/eager/kernel.h"
+#include "tensorflow/contrib/lite/delegates/flex/kernel.h"
 
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
 #include "tensorflow/contrib/lite/builtin_ops.h"
 #include "tensorflow/contrib/lite/c/c_api_internal.h"
 #include "tensorflow/contrib/lite/context_util.h"
-#include "tensorflow/contrib/lite/delegates/eager/delegate_data.h"
-#include "tensorflow/contrib/lite/delegates/eager/util.h"
+#include "tensorflow/contrib/lite/delegates/flex/delegate_data.h"
+#include "tensorflow/contrib/lite/delegates/flex/util.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include "tensorflow/contrib/lite/string.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
@@ -28,10 +28,10 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 
-// Note: this is part of TF Lite's Eager delegation code which is to be
+// Note: this is part of TF Lite's Flex delegation code which is to be
 // completed soon.
 
-// This is the TF Lite op that is created by the eager delegate to handle
+// This is the TF Lite op that is created by the flex delegate to handle
 // execution of a supported subgraph. The usual flow is that the delegate
 // informs the interpreter of supported nodes in a graph, and each supported
 // subgraph is replaced with one instance of this kernel.
@@ -46,7 +46,7 @@ limitations under the License.
 // corresponding TensorFlow/Eager Op.
 
 namespace tflite {
-namespace eager {
+namespace flex {
 namespace kernel {
 
 // Controls the lifetime of tensor handles in a vector.
@@ -72,11 +72,11 @@ class VectorOfHandles {
 
 // Executes the TensorFlow op given by 'op_name', with the attributes specified
 // in 'nodedef'. Inputs and outputs are given as indices into the 'buffer_map'.
-tensorflow::Status ExecuteEagerOp(tensorflow::EagerContext* eager_context,
-                                  BufferMap* buffer_map, const string& op_name,
-                                  const tensorflow::NodeDef& nodedef,
-                                  const std::vector<int>& inputs,
-                                  const std::vector<int>& outputs) {
+tensorflow::Status ExecuteFlexOp(tensorflow::EagerContext* eager_context,
+                                 BufferMap* buffer_map, const string& op_name,
+                                 const tensorflow::NodeDef& nodedef,
+                                 const std::vector<int>& inputs,
+                                 const std::vector<int>& outputs) {
   const tensorflow::AttrTypeMap* attr_types;
   TF_RETURN_WITH_CONTEXT_IF_ERROR(
       tensorflow::AttrTypeMapForOp(op_name.c_str(), &attr_types),
@@ -258,13 +258,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // Execute the TensorFlow Ops sequentially.
   for (const auto& node_data : op_data->nodes) {
     if (node_data.nodedef.op().empty()) {
-      context->ReportError(context, "Invalid NodeDef in Eager op '%s'",
+      context->ReportError(context, "Invalid NodeDef in Flex op '%s'",
                            node_data.name.c_str());
       return kTfLiteError;
     }
     auto status =
-        ExecuteEagerOp(eager_context, buffer_map, node_data.name,
-                       node_data.nodedef, node_data.inputs, node_data.outputs);
+        ExecuteFlexOp(eager_context, buffer_map, node_data.name,
+                      node_data.nodedef, node_data.inputs, node_data.outputs);
     TF_LITE_ENSURE_OK(context, ConvertStatus(context, status));
   }
 
@@ -295,5 +295,5 @@ TfLiteRegistration GetKernel() {
   return registration;
 }
 
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel.h b/tensorflow/contrib/lite/delegates/flex/kernel.h
similarity index 79%
rename from tensorflow/contrib/lite/delegates/eager/kernel.h
rename to tensorflow/contrib/lite/delegates/flex/kernel.h
index 2478abccaa..ac9313a37b 100644
--- a/tensorflow/contrib/lite/delegates/eager/kernel.h
+++ b/tensorflow/contrib/lite/delegates/flex/kernel.h
@@ -12,23 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_KERNEL_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_KERNEL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_KERNEL_H_
+#define TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_KERNEL_H_
 
 #include "tensorflow/contrib/lite/c/c_api_internal.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 
 // Return the registration object used to initialize and execute ops that will
 // be delegated to TensorFlow's Eager runtime. This TF Lite op is created by
-// the eager delegate to handle execution of a supported subgraph. The usual
+// the flex delegate to handle execution of a supported subgraph. The usual
 // flow is that the delegate informs the interpreter of supported nodes in a
 // graph, and each supported subgraph is replaced with one instance of this
 // kernel.
 TfLiteRegistration GetKernel();
 
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_KERNEL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_KERNEL_H_
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel_test.cc b/tensorflow/contrib/lite/delegates/flex/kernel_test.cc
similarity index 94%
rename from tensorflow/contrib/lite/delegates/eager/kernel_test.cc
rename to tensorflow/contrib/lite/delegates/flex/kernel_test.cc
index 66f2226626..94a6f8b61a 100644
--- a/tensorflow/contrib/lite/delegates/eager/kernel_test.cc
+++ b/tensorflow/contrib/lite/delegates/flex/kernel_test.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/eager/kernel.h"
+#include "tensorflow/contrib/lite/delegates/flex/kernel.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/delegates/eager/delegate_data.h"
-#include "tensorflow/contrib/lite/delegates/eager/test_util.h"
+#include "tensorflow/contrib/lite/delegates/flex/delegate_data.h"
+#include "tensorflow/contrib/lite/delegates/flex/test_util.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 namespace {
 
 using ::testing::ContainsRegex;
@@ -31,12 +31,12 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
   TfLiteIntArray* size_and_nodes =
       ConvertVectorToTfLiteIntArray(supported_nodes);
   TF_LITE_ENSURE_STATUS(context->ReplaceSubgraphsWithDelegateKernels(
-      context, eager::GetKernel(), size_and_nodes, delegate));
+      context, flex::GetKernel(), size_and_nodes, delegate));
   TfLiteIntArrayFree(size_and_nodes);
   return kTfLiteOk;
 }
 
-class KernelTest : public testing::EagerModelTest {
+class KernelTest : public testing::FlexModelTest {
  public:
   KernelTest() {
     CHECK(DelegateData::Create(&delegate_data_).ok());
@@ -167,7 +167,7 @@ TEST_F(KernelTest, WrongSetOfNodes) {
 
   ASSERT_FALSE(Invoke());
   ASSERT_THAT(error_reporter().error_messages(),
-              ContainsRegex("Invalid NodeDef in Eager op"));
+              ContainsRegex("Invalid NodeDef in Flex op"));
 }
 
 TEST_F(KernelTest, MixedGraph) {
@@ -220,7 +220,7 @@ TEST_F(KernelTest, SplitGraph) {
 }
 
 }  // namespace
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
 
 int main(int argc, char** argv) {
diff --git a/tensorflow/contrib/lite/delegates/eager/test_util.cc b/tensorflow/contrib/lite/delegates/flex/test_util.cc
similarity index 76%
rename from tensorflow/contrib/lite/delegates/eager/test_util.cc
rename to tensorflow/contrib/lite/delegates/flex/test_util.cc
index d47be761fb..69c336a01a 100644
--- a/tensorflow/contrib/lite/delegates/eager/test_util.cc
+++ b/tensorflow/contrib/lite/delegates/flex/test_util.cc
@@ -13,25 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/delegates/eager/test_util.h"
+#include "tensorflow/contrib/lite/delegates/flex/test_util.h"
 
 #include "absl/memory/memory.h"
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
 #include "tensorflow/contrib/lite/string.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 namespace testing {
 
-bool EagerModelTest::Invoke() { return interpreter_->Invoke() == kTfLiteOk; }
+bool FlexModelTest::Invoke() { return interpreter_->Invoke() == kTfLiteOk; }
 
-void EagerModelTest::SetShape(int tensor_index,
-                              const std::vector<int>& values) {
+void FlexModelTest::SetShape(int tensor_index, const std::vector<int>& values) {
   ASSERT_EQ(interpreter_->ResizeInputTensor(tensor_index, values), kTfLiteOk);
   ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
 }
 
-std::vector<int> EagerModelTest::GetShape(int tensor_index) {
+std::vector<int> FlexModelTest::GetShape(int tensor_index) {
   std::vector<int> result;
   auto* dims = interpreter_->tensor(tensor_index)->dims;
   result.reserve(dims->size);
@@ -41,13 +40,13 @@ std::vector<int> EagerModelTest::GetShape(int tensor_index) {
   return result;
 }
 
-TfLiteType EagerModelTest::GetType(int tensor_index) {
+TfLiteType FlexModelTest::GetType(int tensor_index) {
   return interpreter_->tensor(tensor_index)->type;
 }
 
-void EagerModelTest::AddTensors(int num_tensors, const std::vector<int>& inputs,
-                                const std::vector<int>& outputs,
-                                TfLiteType type, const std::vector<int>& dims) {
+void FlexModelTest::AddTensors(int num_tensors, const std::vector<int>& inputs,
+                               const std::vector<int>& outputs, TfLiteType type,
+                               const std::vector<int>& dims) {
   interpreter_->AddTensors(num_tensors);
   for (int i = 0; i < num_tensors; ++i) {
     TfLiteQuantizationParams quant;
@@ -66,8 +65,8 @@ void EagerModelTest::AddTensors(int num_tensors, const std::vector<int>& inputs,
   CHECK_EQ(interpreter_->SetOutputs(outputs), kTfLiteOk);
 }
 
-void EagerModelTest::AddTfLiteMulOp(const std::vector<int>& inputs,
-                                    const std::vector<int>& outputs) {
+void FlexModelTest::AddTfLiteMulOp(const std::vector<int>& inputs,
+                                   const std::vector<int>& outputs) {
   static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
   reg.builtin_code = BuiltinOperator_MUL;
   reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
@@ -90,8 +89,8 @@ void EagerModelTest::AddTfLiteMulOp(const std::vector<int>& inputs,
            kTfLiteOk);
 }
 
-void EagerModelTest::AddTfOp(TfOpType op, const std::vector<int>& inputs,
-                             const std::vector<int>& outputs) {
+void FlexModelTest::AddTfOp(TfOpType op, const std::vector<int>& inputs,
+                            const std::vector<int>& outputs) {
   auto attr = [](const string& key, const string& value) {
     return " attr{ key: '" + key + "' value {" + value + "}}";
   };
@@ -107,28 +106,28 @@ void EagerModelTest::AddTfOp(TfOpType op, const std::vector<int>& inputs,
   if (op == kUnpack) {
     string attributes =
         type_attribute + attr("num", "i: 2") + attr("axis", "i: 0");
-    AddTfOp("EagerUnpack", "Unpack", attributes, inputs, outputs);
+    AddTfOp("FlexUnpack", "Unpack", attributes, inputs, outputs);
   } else if (op == kIdentity) {
     string attributes = type_attribute;
-    AddTfOp("EagerIdentity", "Identity", attributes, inputs, outputs);
+    AddTfOp("FlexIdentity", "Identity", attributes, inputs, outputs);
   } else if (op == kAdd) {
     string attributes = type_attribute;
-    AddTfOp("EagerAdd", "Add", attributes, inputs, outputs);
+    AddTfOp("FlexAdd", "Add", attributes, inputs, outputs);
   } else if (op == kMul) {
     string attributes = type_attribute;
-    AddTfOp("EagerMul", "Mul", attributes, inputs, outputs);
+    AddTfOp("FlexMul", "Mul", attributes, inputs, outputs);
   } else if (op == kNonExistent) {
     AddTfOp("NonExistentOp", "NonExistentOp", "", inputs, outputs);
   } else if (op == kIncompatibleNodeDef) {
     // "Cast" op is created without attributes - making it incompatible.
-    AddTfOp("EagerCast", "Cast", "", inputs, outputs);
+    AddTfOp("FlexCast", "Cast", "", inputs, outputs);
   }
 }
 
-void EagerModelTest::AddTfOp(const char* tflite_name, const string& tf_name,
-                             const string& nodedef_str,
-                             const std::vector<int>& inputs,
-                             const std::vector<int>& outputs) {
+void FlexModelTest::AddTfOp(const char* tflite_name, const string& tf_name,
+                            const string& nodedef_str,
+                            const std::vector<int>& inputs,
+                            const std::vector<int>& outputs) {
   static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
   reg.builtin_code = BuiltinOperator_CUSTOM;
   reg.custom_name = tflite_name;
@@ -154,5 +153,5 @@ void EagerModelTest::AddTfOp(const char* tflite_name, const string& tf_name,
 }
 
 }  // namespace testing
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/test_util.h b/tensorflow/contrib/lite/delegates/flex/test_util.h
similarity index 90%
rename from tensorflow/contrib/lite/delegates/eager/test_util.h
rename to tensorflow/contrib/lite/delegates/flex/test_util.h
index 816db41931..a8c81b90a3 100644
--- a/tensorflow/contrib/lite/delegates/eager/test_util.h
+++ b/tensorflow/contrib/lite/delegates/flex/test_util.h
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_TEST_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_TEST_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_TEST_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_TEST_UTIL_H_
 
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 namespace testing {
 
 enum TfOpType {
@@ -35,12 +35,12 @@ enum TfOpType {
 };
 
 // This class creates models with TF and TFLite ops. In order to use this class
-// to test the Eager delegate, implement a function that calls
+// to test the Flex delegate, implement a function that calls
 // interpreter->ModifyGraphWithDelegate.
-class EagerModelTest : public ::testing::Test {
+class FlexModelTest : public ::testing::Test {
  public:
-  EagerModelTest() {}
-  ~EagerModelTest() {}
+  FlexModelTest() {}
+  ~FlexModelTest() {}
 
   bool Invoke();
 
@@ -104,7 +104,7 @@ class EagerModelTest : public ::testing::Test {
 
  private:
   // Helper method to add a TensorFlow op. tflite_names needs to start with
-  // "Eager" in order to work with the Eager delegate.
+  // "Flex" in order to work with the Flex delegate.
   void AddTfOp(const char* tflite_name, const string& tf_name,
                const string& nodedef_str, const std::vector<int>& inputs,
                const std::vector<int>& outputs);
@@ -113,7 +113,7 @@ class EagerModelTest : public ::testing::Test {
 };
 
 }  // namespace testing
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_TEST_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_TEST_UTIL_H_
diff --git a/tensorflow/contrib/lite/delegates/eager/util.cc b/tensorflow/contrib/lite/delegates/flex/util.cc
similarity index 96%
rename from tensorflow/contrib/lite/delegates/eager/util.cc
rename to tensorflow/contrib/lite/delegates/flex/util.cc
index 051246bf86..829bc388bf 100644
--- a/tensorflow/contrib/lite/delegates/eager/util.cc
+++ b/tensorflow/contrib/lite/delegates/flex/util.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/eager/util.h"
+#include "tensorflow/contrib/lite/delegates/flex/util.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 
 TfLiteStatus ConvertStatus(TfLiteContext* context,
                            const tensorflow::Status& status) {
@@ -100,5 +100,5 @@ TfLiteType GetTensorFlowLiteType(TF_DataType type) {
   }
 }
 
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/util.h b/tensorflow/contrib/lite/delegates/flex/util.h
similarity index 89%
rename from tensorflow/contrib/lite/delegates/eager/util.h
rename to tensorflow/contrib/lite/delegates/flex/util.h
index 930cb99cb9..7f910e7316 100644
--- a/tensorflow/contrib/lite/delegates/eager/util.h
+++ b/tensorflow/contrib/lite/delegates/flex/util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_UTIL_H_
 
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/contrib/lite/c/c_api_internal.h"
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 
 // Converts a tensorflow:Status into a TfLiteStatus. If the original status
 // represented an error, reports it using the given 'context'.
@@ -41,7 +41,7 @@ TF_DataType GetTensorFlowDataType(TfLiteType type);
 // Returns the TfLiteType that corresponds to the given TF C API Data type.
 TfLiteType GetTensorFlowLiteType(TF_DataType);
 
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_UTIL_H_
diff --git a/tensorflow/contrib/lite/delegates/eager/util_test.cc b/tensorflow/contrib/lite/delegates/flex/util_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/delegates/eager/util_test.cc
rename to tensorflow/contrib/lite/delegates/flex/util_test.cc
index aebc91149c..5f049e7b0a 100644
--- a/tensorflow/contrib/lite/delegates/eager/util_test.cc
+++ b/tensorflow/contrib/lite/delegates/flex/util_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/eager/util.h"
+#include "tensorflow/contrib/lite/delegates/flex/util.h"
 
 #include <cstdarg>
 
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/testing/util.h"
 
 namespace tflite {
-namespace eager {
+namespace flex {
 namespace {
 
 using tensorflow::DT_FLOAT;
@@ -132,7 +132,7 @@ TEST(UtilTest, TypeConversionsFromTensorFlow) {
 }
 
 }  // namespace
-}  // namespace eager
+}  // namespace flex
 }  // namespace tflite
 
 int main(int argc, char** argv) {
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 2f4b663a28..9402105fa7 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -125,7 +125,7 @@ TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
   context->ReportError(
       context,
       "Regular TensorFlow ops are not supported by this interpreter. Make sure "
-      "you invoke the Eager delegate before inference.");
+      "you invoke the Flex delegate before inference.");
   return kTfLiteError;
 }
 
@@ -136,13 +136,13 @@ const TfLiteRegistration* BuiltinOpResolver::FindOp(tflite::BuiltinOperator op,
 
 const TfLiteRegistration* BuiltinOpResolver::FindOp(const char* op,
                                                     int version) const {
-  // Return the NULL Op for all ops whose name start with "Eager", allowing
+  // Return the NULL Op for all ops whose name start with "Flex", allowing
   // the interpreter to delegate their execution.
-  if (IsEagerOp(op)) {
+  if (IsFlexOp(op)) {
     static TfLiteRegistration null_op{
         nullptr, nullptr, &UnsupportedTensorFlowOp,
         nullptr, nullptr, BuiltinOperator_CUSTOM,
-        "Eager", 1};
+        "Flex",  1};
     return &null_op;
   }
   return MutableOpResolver::FindOp(op, version);
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index ea2817beec..eff6181a61 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/nnapi_delegate.h"
 #endif
 #if defined(TFLITE_EXTENDED)
-#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
+#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
 #endif
 #include "tensorflow/contrib/lite/version.h"
 
@@ -451,7 +451,7 @@ TfLiteStatus InterpreterBuilder::operator()(
   (**interpreter).SetVariables(std::move(variables));
 
 #if defined(TFLITE_EXTENDED)
-  if (auto delegate = EagerDelegate::Create()) {
+  if (auto delegate = FlexDelegate::Create()) {
     (**interpreter)
         .ModifyGraphWithDelegate(std::move(delegate),
                                  /*allow_dynamic_tensors=*/true);
diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index 627be8f44f..73a420c47b 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -241,10 +241,10 @@ def build_toco_convert_protos(input_tensors,
     toco.dump_graphviz_dir = dump_graphviz_dir
   toco.dump_graphviz_include_video = dump_graphviz_video
   if converter_mode == ConverterMode.TOCO_EXTENDED:
-    toco.allow_eager_ops = True
+    toco.allow_flex_ops = True
   elif converter_mode == ConverterMode.TOCO_EXTENDED_ALL:
-    toco.allow_eager_ops = True
-    toco.force_eager_ops = True
+    toco.allow_flex_ops = True
+    toco.force_flex_ops = True
 
   model = _model_flags_pb2.ModelFlags()
   model.change_concat_input_ranges = change_concat_input_ranges
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index 33f8fc1e8c..7b0df01d1d 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -432,7 +432,7 @@ class FromSessionTest(test_util.TensorFlowTestCase):
       interpreter.allocate_tensors()
     self.assertIn(
         'Regular TensorFlow ops are not supported by this interpreter. Make '
-        'sure you invoke the Eager delegate before inference.',
+        'sure you invoke the Flex delegate before inference.',
         str(error.exception))
 
   def testFloatTocoConverter(self):
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 55ef1172b2..f0bfec2338 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -164,7 +164,7 @@ cc_library(
         ":test_runner",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/delegates/eager:delegate",
+        "//tensorflow/contrib/lite/delegates/flex:delegate",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
     ],
 )
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 014c80b5ef..53bd88d087 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -343,7 +343,7 @@ def toco_convert(graph_def_str, input_tensors, output_tensors,
       opts = ("--input_arrays={0} --output_arrays={1}".format(
           ",".join(input_arrays), ",".join(output_tensors)))
     elif FLAGS.run_with_extended:
-      opts += " --allow_eager_ops --force_eager_ops"
+      opts += " --allow_flex_ops --force_flex_ops"
     cmd = ("%s --input_file=%s --output_file=%s %s > %s 2>&1" %
            (bin_path, graphdef_file.name, output_file.name, opts,
             stdout_file.name))
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_flags.h b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
index 3874bc31d7..ad889a2f19 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_flags.h
+++ b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
@@ -57,7 +57,7 @@ DiffOptions ParseTfliteDiffFlags(int* argc, char** argv) {
                        "[optional] Number of full runs in each pass."),
       tensorflow::Flag("delegate", &values.delegate,
                        "[optional] Delegate to use for executing ops. Must be "
-                       "`{\"\", EAGER}`"),
+                       "`{\"\", FLEX}`"),
   };
 
   bool no_inputs = *argc == 1;
@@ -70,7 +70,7 @@ DiffOptions ParseTfliteDiffFlags(int* argc, char** argv) {
              values.input_layer_shape.empty() || values.output_layer.empty()) {
     fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
     return {};
-  } else if (!(values.delegate == "" || values.delegate == "EAGER")) {
+  } else if (!(values.delegate == "" || values.delegate == "FLEX")) {
     fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
     return {};
   }
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_util.h b/tensorflow/contrib/lite/testing/tflite_diff_util.h
index f67992139f..28b14bd143 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_util.h
+++ b/tensorflow/contrib/lite/testing/tflite_diff_util.h
@@ -45,7 +45,7 @@ struct DiffOptions {
   // second pass does multiple inferences back to back.
   int num_runs_per_pass;
   // Path to the delegate library to be loaded in order to execute ops. Must be
-  // `{"", EAGER}`.
+  // `{"", FLEX}`.
   string delegate;
 };
 
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index 17aa8cb293..ef49e6f8bc 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include <iostream>
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
-#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
+#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
 #include "tensorflow/contrib/lite/testing/split.h"
 
 namespace tflite {
@@ -138,8 +138,8 @@ class TfLiteDriver::Expectation {
 
 TfLiteDriver::TfLiteDriver(bool use_nnapi, const string& delegate_name)
     : use_nnapi_(use_nnapi) {
-  if (delegate_name == "EAGER") {
-    delegate_ = EagerDelegate::Create();
+  if (delegate_name == "FLEX") {
+    delegate_ = FlexDelegate::Create();
   }
 }
 
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.h b/tensorflow/contrib/lite/testing/tflite_driver.h
index aed35f877d..dc2a4e5877 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.h
+++ b/tensorflow/contrib/lite/testing/tflite_driver.h
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <map>
 
-#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
+#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
@@ -53,7 +53,7 @@ class TfLiteDriver : public TestRunner {
 
   class Expectation;
 
-  std::unique_ptr<EagerDelegate> delegate_;
+  std::unique_ptr<FlexDelegate> delegate_;
   bool use_nnapi_ = false;
   std::unique_ptr<FlatBufferModel> model_;
   std::unique_ptr<Interpreter> interpreter_;
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index f14dbc258b..2699ac76e1 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -248,9 +248,9 @@ struct ParsedTocoFlags {
   Arg<int64> dedupe_array_min_size_bytes = Arg<int64>(64);
   Arg<bool> split_tflite_lstm_inputs = Arg<bool>(true);
   // WARNING: Experimental interface, subject to change
-  Arg<bool> allow_eager_ops = Arg<bool>(false);
+  Arg<bool> allow_flex_ops = Arg<bool>(false);
   // WARNING: Experimental interface, subject to change
-  Arg<bool> force_eager_ops = Arg<bool>(false);
+  Arg<bool> force_flex_ops = Arg<bool>(false);
 };
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index e02d000e7e..5eaf6e27fc 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -2123,9 +2123,9 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
   Model* model = new Model;
   internal::ConverterMapType converter_map;
 
-  // This is used for the TFLite "Full Eager Mode" conversion. All the ops are
+  // This is used for the TFLite "Full Flex Mode" conversion. All the ops are
   // imported as `TensorFlowUnsupportedOperator`, and later all these ops are
-  // converted to TFLite Eager ops.
+  // converted to TFLite Flex ops.
   if (!tf_import_flags.import_all_ops_as_unsupported) {
     converter_map = internal::GetTensorFlowNodeConverterMap();
   }
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.h b/tensorflow/contrib/lite/toco/import_tensorflow.h
index 7db23f2d44..c5ff96956a 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.h
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.h
@@ -30,7 +30,7 @@ struct TensorFlowImportFlags {
 
   // Do not recognize any op and import all ops as
   // `TensorFlowUnsupportedOperator`. This is used to populated with the
-  // `force_eager_ops` flag.
+  // `force_flex_ops` flag.
   bool import_all_ops_as_unsupported = false;
 };
 
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index 9f60942f47..0c9fac249c 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -50,16 +50,16 @@ namespace {
 details::OperatorKey GetOperatorKey(
     const ::toco::Operator& op,
     const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
-    bool allow_eager_ops) {
+    bool allow_flex_ops) {
   string custom_code;
   if (op.type == OperatorType::kUnsupported) {
     const TensorFlowUnsupportedOperator& unsupported_op =
         static_cast<const TensorFlowUnsupportedOperator&>(op);
 
-    // TODO(b/113715895): When `allow_eager_ops` is on, for now there's no way
+    // TODO(b/113715895): When `allow_flex_ops` is on, for now there's no way
     // to populate a regular custom op. We need to find a way to fix this.
-    if (allow_eager_ops) {
-      custom_code = string(::tflite::kEagerCustomCodePrefix) +
+    if (allow_flex_ops) {
+      custom_code = string(::tflite::kFlexCustomCodePrefix) +
                     unsupported_op.tensorflow_op;
     } else {
       custom_code = unsupported_op.tensorflow_op;
@@ -101,11 +101,11 @@ void LoadTensorsMap(const Model& model, TensorsMap* tensors_map) {
 void LoadOperatorsMap(
     const Model& model, OperatorsMap* operators_map,
     const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
-    bool allow_eager_ops) {
+    bool allow_flex_ops) {
   // First find a list of unique operator types.
   std::set<OperatorKey> keys;
   for (const auto& op : model.operators) {
-    keys.insert(GetOperatorKey(*op, ops_by_type, allow_eager_ops));
+    keys.insert(GetOperatorKey(*op, ops_by_type, allow_flex_ops));
   }
   // Now assign indices to them and fill in the map.
   int index = 0;
@@ -216,7 +216,7 @@ Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
 
   for (const auto& op : model.operators) {
     const details::OperatorKey operator_key =
-        GetOperatorKey(*op, ops_by_type, params.allow_eager_ops);
+        GetOperatorKey(*op, ops_by_type, params.allow_flex_ops);
     int op_index = operators_map.at(operator_key);
     int op_version = operator_key.version;
 
@@ -281,7 +281,7 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
     }
 
     int op_index = operators_map.at(
-        GetOperatorKey(*op, ops_by_type, params.allow_eager_ops));
+        GetOperatorKey(*op, ops_by_type, params.allow_flex_ops));
 
     auto tflite_op_it = ops_by_type.find(op->type);
     BaseOperator* tflite_op = tflite_op_it == ops_by_type.end()
@@ -334,7 +334,7 @@ Offset<Vector<Offset<Buffer>>> ExportBuffers(
 
 void Export(const Model& model, string* output_file_contents,
             const ExportParams& params) {
-  const auto ops_by_type = BuildOperatorByTypeMap(params.allow_eager_ops);
+  const auto ops_by_type = BuildOperatorByTypeMap(params.allow_flex_ops);
   Export(model, output_file_contents, params, ops_by_type);
 }
 
@@ -349,7 +349,7 @@ void Export(
 
   details::OperatorsMap operators_map;
   details::LoadOperatorsMap(model, &operators_map, ops_by_type,
-                            params.allow_eager_ops);
+                            params.allow_flex_ops);
 
   std::vector<const Array*> buffers_to_write;
   Array empty_array;
diff --git a/tensorflow/contrib/lite/toco/tflite/export.h b/tensorflow/contrib/lite/toco/tflite/export.h
index b070a38768..29d6de4049 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.h
+++ b/tensorflow/contrib/lite/toco/tflite/export.h
@@ -26,7 +26,7 @@ namespace tflite {
 // The parameters for exporting a TFLite model.
 struct ExportParams {
   bool allow_custom_ops = false;
-  bool allow_eager_ops = false;
+  bool allow_flex_ops = false;
   bool quantize_weights = false;
 };
 
@@ -121,7 +121,7 @@ void LoadTensorsMap(const Model& model, TensorsMap* tensors_map);
 void LoadOperatorsMap(
     const Model& model, OperatorsMap* operators_map,
     const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
-    bool allow_eager_ops);
+    bool allow_flex_ops);
 
 }  // namespace details
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/toco/tflite/export_test.cc b/tensorflow/contrib/lite/toco/tflite/export_test.cc
index 8d4d197c46..93882a91a7 100644
--- a/tensorflow/contrib/lite/toco/tflite/export_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export_test.cc
@@ -105,7 +105,7 @@ TEST_F(ExportTest, LoadOperatorsMap) {
 
   details::OperatorsMap operators;
   const auto ops_by_type = BuildOperatorByTypeMap();
-  // TODO(ycling): Add a test for allow_eager_ops.
+  // TODO(ycling): Add a test for allow_flex_ops.
   details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
   EXPECT_EQ(0, operators[details::OperatorKey(OperatorType::kAdd, "", 1)]);
   EXPECT_EQ(1, operators[details::OperatorKey(OperatorType::kConv, "", 1)]);
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index ca2a6a19b3..9addbb81e7 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -1160,8 +1160,8 @@ class Unpack : public BuiltinOperator<UnpackOperator, ::tflite::UnpackOptions,
 class TensorFlowUnsupported : public BaseOperator {
  public:
   TensorFlowUnsupported(const string& name, OperatorType type,
-                        bool allow_eager_ops)
-      : BaseOperator(name, type), allow_eager_ops_(allow_eager_ops) {}
+                        bool allow_flex_ops)
+      : BaseOperator(name, type), allow_flex_ops_(allow_flex_ops) {}
 
   Options Serialize(const Operator& op,
                     flatbuffers::FlatBufferBuilder* builder) const override {
@@ -1177,9 +1177,9 @@ class TensorFlowUnsupported : public BaseOperator {
   std::unique_ptr<Operator> Deserialize(
       const BuiltinOptions* builtin_options,
       const CustomOptions* custom_options) const override {
-    // Deserializing Eager ops doesn't work now.
+    // Deserializing Flex ops doesn't work now.
     // TODO(ycling): Revisit and decide if we should fix the flow for importing
-    // TFLite models with Eager ops.
+    // TFLite models with Flex ops.
     auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
     if (custom_options) {
       auto flexbuffer_map =
@@ -1200,13 +1200,13 @@ class TensorFlowUnsupported : public BaseOperator {
       return std::unique_ptr<flexbuffers::Builder>();
     }
 
-    if (allow_eager_ops_) {
+    if (allow_flex_ops_) {
       fbb->Vector([&]() {
         fbb->String(node_def.op());
         fbb->String(op.tensorflow_node_def);
       });
       fbb->Finish();
-      LOG(INFO) << "Writing eager op: " << node_def.op();
+      LOG(INFO) << "Writing flex op: " << node_def.op();
       return std::unique_ptr<flexbuffers::Builder>(fbb.release());
     }
 
@@ -1316,13 +1316,13 @@ class TensorFlowUnsupported : public BaseOperator {
   }
 
  private:
-  const bool allow_eager_ops_;
+  const bool allow_flex_ops_;
 };
 
 namespace {
 // Build a vector containing all the known operators.
 std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
-    bool allow_eager_ops = false) {
+    bool allow_flex_ops = false) {
   std::vector<std::unique_ptr<BaseOperator>> ops;
   using tensorflow::MakeUnique;
   // Builtin Operators.
@@ -1434,7 +1434,7 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   ops.push_back(MakeUnique<CTCBeamSearchDecoder>(
       "CTC_BEAM_SEARCH_DECODER", OperatorType::kCTCBeamSearchDecoder));
   ops.push_back(MakeUnique<TensorFlowUnsupported>(
-      "TENSORFLOW_UNSUPPORTED", OperatorType::kUnsupported, allow_eager_ops));
+      "TENSORFLOW_UNSUPPORTED", OperatorType::kUnsupported, allow_flex_ops));
 
   // There operators are supported by Toco, but not by TF Lite, and has no
   // attributes.
@@ -1512,11 +1512,11 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
 }  // namespace
 
 std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap(
-    bool allow_eager_ops) {
+    bool allow_flex_ops) {
   std::map<OperatorType, std::unique_ptr<BaseOperator>> result;
 
   std::vector<std::unique_ptr<BaseOperator>> ops =
-      BuildOperatorList(allow_eager_ops);
+      BuildOperatorList(allow_flex_ops);
   for (auto& op : ops) {
     result[op->type()] = std::move(op);
   }
@@ -1525,11 +1525,11 @@ std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap(
 }
 
 std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap(
-    bool allow_eager_ops) {
+    bool allow_flex_ops) {
   std::map<string, std::unique_ptr<BaseOperator>> result;
 
   std::vector<std::unique_ptr<BaseOperator>> ops =
-      BuildOperatorList(allow_eager_ops);
+      BuildOperatorList(allow_flex_ops);
   for (auto& op : ops) {
     result[op->name()] = std::move(op);
   }
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.h b/tensorflow/contrib/lite/toco/tflite/operator.h
index 702fb28ea6..13d9f6c49a 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/operator.h
@@ -26,15 +26,15 @@ namespace tflite {
 class BaseOperator;
 
 // Return a map contained all know TF Lite Operators, keyed by their names.
-// TODO(ycling): The pattern to propagate parameters (e.g. allow_eager_ops)
+// TODO(ycling): The pattern to propagate parameters (e.g. allow_flex_ops)
 // is ugly here. Consider refactoring.
 std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap(
-    bool allow_eager_ops = false);
+    bool allow_flex_ops = false);
 
 // Return a map contained all know TF Lite Operators, keyed by the type of
 // their tf.mini counterparts.
 std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap(
-    bool allow_eager_ops = false);
+    bool allow_flex_ops = false);
 
 // These are the flatbuffer types for custom and builtin options.
 using CustomOptions = flatbuffers::Vector<uint8_t>;
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index b6aebc0470..cff79776bc 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -167,11 +167,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
            "converted float model. Model size will be reduced and there will "
            "be latency improvements (at the cost of accuracy)."),
       // WARNING: Experimental interface, subject to change
-      Flag("allow_eager_ops", parsed_flags.allow_eager_ops.bind(),
-           parsed_flags.allow_eager_ops.default_value(), ""),
+      Flag("allow_flex_ops", parsed_flags.allow_flex_ops.bind(),
+           parsed_flags.allow_flex_ops.default_value(), ""),
       // WARNING: Experimental interface, subject to change
-      Flag("force_eager_ops", parsed_flags.force_eager_ops.bind(),
-           parsed_flags.force_eager_ops.default_value(), "")};
+      Flag("force_flex_ops", parsed_flags.force_flex_ops.bind(),
+           parsed_flags.force_flex_ops.default_value(), "")};
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
   if (asked_for_help) {
@@ -266,15 +266,15 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
   READ_TOCO_FLAG(split_tflite_lstm_inputs, FlagRequirement::kNone);
   READ_TOCO_FLAG(quantize_weights, FlagRequirement::kNone);
   READ_TOCO_FLAG(post_training_quantize, FlagRequirement::kNone);
-  READ_TOCO_FLAG(allow_eager_ops, FlagRequirement::kNone);
-  READ_TOCO_FLAG(force_eager_ops, FlagRequirement::kNone);
+  READ_TOCO_FLAG(allow_flex_ops, FlagRequirement::kNone);
+  READ_TOCO_FLAG(force_flex_ops, FlagRequirement::kNone);
 
-  if (parsed_toco_flags.force_eager_ops.value() &&
-      !parsed_toco_flags.allow_eager_ops.value()) {
-    // TODO(ycling): Consider to enforce `allow_eager_ops` when
-    // `force_eager_ops` is true.
-    LOG(WARNING) << "--force_eager_ops should always be used with "
-                    "--allow_eager_ops.";
+  if (parsed_toco_flags.force_flex_ops.value() &&
+      !parsed_toco_flags.allow_flex_ops.value()) {
+    // TODO(ycling): Consider to enforce `allow_flex_ops` when
+    // `force_flex_ops` is true.
+    LOG(WARNING) << "--force_flex_ops should always be used with "
+                    "--allow_flex_ops.";
   }
 
   // Deprecated flag handling.
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
index 53d60fed05..ca3e64485e 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -190,16 +190,16 @@ message TocoFlags {
   // (at the cost of accuracy).
   optional bool post_training_quantize = 26 [default = false];
 
-  // When enabled, unsupported ops will be converted to TFLite Eager ops.
+  // When enabled, unsupported ops will be converted to TFLite Flex ops.
   // TODO(ycling): Consider to rename the following 2 flags and don't call it
-  // "Eager".
-  // `allow_eager_ops` should always be used with `allow_custom_ops`.
+  // "Flex".
+  // `allow_flex_ops` should always be used with `allow_custom_ops`.
   // WARNING: Experimental interface, subject to change
-  optional bool allow_eager_ops = 27 [default = false];
+  optional bool allow_flex_ops = 27 [default = false];
 
-  // When enabled, all TensorFlow ops will be converted to TFLite Eager
-  // ops directly. This will force `allow_eager_ops` to true.
-  // `force_eager_ops` should always be used with `allow_eager_ops`.
+  // When enabled, all TensorFlow ops will be converted to TFLite Flex
+  // ops directly. This will force `allow_flex_ops` to true.
+  // `force_flex_ops` should always be used with `allow_flex_ops`.
   // WARNING: Experimental interface, subject to change
-  optional bool force_eager_ops = 28 [default = false];
+  optional bool force_flex_ops = 28 [default = false];
 }
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index a08b02485f..106494f354 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -198,7 +198,7 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
               : (toco_flags.output_format() != TENSORFLOW_GRAPHDEF);
 
       tf_import_flags.import_all_ops_as_unsupported =
-          toco_flags.force_eager_ops();
+          toco_flags.force_flex_ops();
 
       model = ImportTensorFlowGraphDef(model_flags, tf_import_flags,
                                        input_file_contents);
@@ -409,9 +409,9 @@ void Export(const TocoFlags& toco_flags, const Model& model,
     case TFLITE: {
       toco::tflite::ExportParams params;
 
-      // Always allow custom ops when eager ops are allowed.
-      if (toco_flags.force_eager_ops() || toco_flags.allow_eager_ops()) {
-        params.allow_eager_ops = true;
+      // Always allow custom ops when flex ops are allowed.
+      if (toco_flags.force_flex_ops() || toco_flags.allow_flex_ops()) {
+        params.allow_flex_ops = true;
         params.allow_custom_ops = true;
       } else if (allow_custom_ops) {
         params.allow_custom_ops = true;
diff --git a/tensorflow/contrib/lite/tools/benchmark/BUILD b/tensorflow/contrib/lite/tools/benchmark/BUILD
index dc97d22401..bc18d40313 100644
--- a/tensorflow/contrib/lite/tools/benchmark/BUILD
+++ b/tensorflow/contrib/lite/tools/benchmark/BUILD
@@ -36,7 +36,7 @@ cc_binary(
 )
 
 cc_binary(
-    name = "benchmark_model_plus_eager",
+    name = "benchmark_model_plus_flex",
     srcs = [
         "benchmark_main.cc",
     ],
@@ -49,7 +49,7 @@ cc_binary(
         "//conditions:default": [],
     }),
     deps = [
-        ":benchmark_tflite_model_plus_eager_lib",
+        ":benchmark_tflite_model_plus_flex_lib",
         ":logging",
     ],
 )
@@ -111,7 +111,7 @@ cc_library(
 )
 
 cc_library(
-    name = "benchmark_tflite_model_plus_eager_lib",
+    name = "benchmark_tflite_model_plus_flex_lib",
     srcs = [
         "benchmark_tflite_model.cc",
         "logging.h",
@@ -123,7 +123,7 @@ cc_library(
         ":logging",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/delegates/eager:delegate",
+        "//tensorflow/contrib/lite/delegates/flex:delegate",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
         "//tensorflow/contrib/lite/profiling:profile_summarizer",
     ],
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
index ef4f0fa80d..d989ee720d 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include <vector>
 
 #ifdef TFLITE_EXTENDED
-#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
+#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
 #endif  // TFLITE_EXTENDED
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
@@ -306,8 +306,8 @@ void BenchmarkTfLiteModel::Init() {
   interpreter->UseNNAPI(use_nnapi);
 
 #ifdef TFLITE_EXTENDED
-  TFLITE_LOG(INFO) << "Instantiating Eager Delegate";
-  delegate_ = EagerDelegate::Create();
+  TFLITE_LOG(INFO) << "Instantiating Flex Delegate";
+  delegate_ = FlexDelegate::Create();
   if (delegate_) {
     interpreter->ModifyGraphWithDelegate(delegate_.get(),
                                          /*allow_dynamic_tensors=*/true);
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
index 8541512bc8..9343824b4a 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include <vector>
 
 #ifdef TFLITE_EXTENDED
-#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
+#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
 #endif  // TFLITE_EXTENDED
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/profiling/profile_summarizer.h"
@@ -74,7 +74,7 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
 
  private:
 #ifdef TFLITE_EXTENDED
-  std::unique_ptr<EagerDelegate> delegate_;
+  std::unique_ptr<FlexDelegate> delegate_;
 #endif  // TFLITE_EXTENDED
   std::unique_ptr<tflite::FlatBufferModel> model;
   std::unique_ptr<tflite::Interpreter> interpreter;
diff --git a/tensorflow/contrib/lite/util.cc b/tensorflow/contrib/lite/util.cc
index 7950653da9..6aa35b5227 100644
--- a/tensorflow/contrib/lite/util.cc
+++ b/tensorflow/contrib/lite/util.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 namespace tflite {
 
-bool IsEagerOp(const char* custom_name) {
-  return custom_name && strncmp(custom_name, kEagerCustomCodePrefix,
-                                strlen(kEagerCustomCodePrefix)) == 0;
+bool IsFlexOp(const char* custom_name) {
+  return custom_name && strncmp(custom_name, kFlexCustomCodePrefix,
+                                strlen(kFlexCustomCodePrefix)) == 0;
 }
 
 TfLiteIntArray* ConvertVectorToTfLiteIntArray(const std::vector<int>& input) {
diff --git a/tensorflow/contrib/lite/util.h b/tensorflow/contrib/lite/util.h
index 6d81f844f8..31292a6f81 100644
--- a/tensorflow/contrib/lite/util.h
+++ b/tensorflow/contrib/lite/util.h
@@ -26,15 +26,15 @@ limitations under the License.
 
 namespace tflite {
 
-// The prefix of Eager op custom code.
+// The prefix of Flex op custom code.
 // This will be matched agains the `custom_code` field in `OperatorCode`
 // Flatbuffer Table.
 // WARNING: This is an experimental API and subject to change.
-constexpr char kEagerCustomCodePrefix[] = "Eager";
+constexpr char kFlexCustomCodePrefix[] = "Flex";
 
 // Checks whether the prefix of the custom name indicates the operation is an
-// Eager operation.
-bool IsEagerOp(const char* custom_name);
+// Flex operation.
+bool IsFlexOp(const char* custom_name);
 
 // Converts a `std::vector` to a `TfLiteIntArray`. The caller takes ownership
 // of the returned pointer.
diff --git a/tensorflow/contrib/lite/util_test.cc b/tensorflow/contrib/lite/util_test.cc
index c5c1709f1d..25f3aded71 100644
--- a/tensorflow/contrib/lite/util_test.cc
+++ b/tensorflow/contrib/lite/util_test.cc
@@ -41,14 +41,14 @@ TEST(ConvertVectorToTfLiteIntArray, TestWithEmptyVector) {
   TfLiteIntArrayFree(output);
 }
 
-TEST(UtilTest, IsEagerOp) {
-  EXPECT_TRUE(IsEagerOp("Eager"));
-  EXPECT_TRUE(IsEagerOp("EagerOp"));
-  EXPECT_FALSE(IsEagerOp("eager"));
-  EXPECT_FALSE(IsEagerOp("Eage"));
-  EXPECT_FALSE(IsEagerOp("OpEager"));
-  EXPECT_FALSE(IsEagerOp(nullptr));
-  EXPECT_FALSE(IsEagerOp(""));
+TEST(UtilTest, IsFlexOp) {
+  EXPECT_TRUE(IsFlexOp("Flex"));
+  EXPECT_TRUE(IsFlexOp("FlexOp"));
+  EXPECT_FALSE(IsFlexOp("flex"));
+  EXPECT_FALSE(IsFlexOp("Fle"));
+  EXPECT_FALSE(IsFlexOp("OpFlex"));
+  EXPECT_FALSE(IsFlexOp(nullptr));
+  EXPECT_FALSE(IsFlexOp(""));
 }
 
 }  // namespace
-- 
GitLab


From 8276ef6088ecedd4a5f62a8eacd35a075a43746c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 15:07:12 -0700
Subject: [PATCH 098/570] Updates Interpreter to be initialized with a
 MappedByteBuffer for backward compatibility.

PiperOrigin-RevId: 214843130
---
 .../java/org/tensorflow/lite/Interpreter.java     | 15 +++++++++++++++
 .../java/org/tensorflow/lite/InterpreterTest.java |  4 ++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index eacfa0c827..5cc6e754f3 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -17,6 +17,7 @@ package org.tensorflow.lite;
 
 import java.io.File;
 import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
 import java.util.HashMap;
 import java.util.Map;
 import org.checkerframework.checker.nullness.qual.NonNull;
@@ -148,6 +149,20 @@ public final class Interpreter implements AutoCloseable {
     this(byteBuffer, new Options().setNumThreads(numThreads));
   }
 
+  /**
+   * Initializes a {@code Interpreter} with a {@code MappedByteBuffer} to the model file.
+   *
+   * <p>The {@code MappedByteBuffer} should remain unchanged after the construction of a {@code
+   * Interpreter}.
+   *
+   * @deprecated Prefer using the {@link #Interpreter(ByteBuffer,Options)} constructor. This method
+   *     will be removed in a future release.
+   */
+  @Deprecated
+  public Interpreter(@NonNull MappedByteBuffer mappedByteBuffer) {
+    this(mappedByteBuffer, /* options= */ null);
+  }
+
   /**
    * Initializes a {@code Interpreter} with a {@code ByteBuffer} of a model file and a set of custom
    * {@link #Options}.
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
index fdd5063156..a98fca0132 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -71,7 +71,7 @@ public final class InterpreterTest {
     Path path = MODEL_FILE.toPath();
     FileChannel fileChannel =
         (FileChannel) Files.newByteChannel(path, EnumSet.of(StandardOpenOption.READ));
-    MappedByteBuffer mappedByteBuffer =
+    ByteBuffer mappedByteBuffer =
         fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileChannel.size());
     Interpreter interpreter = new Interpreter(mappedByteBuffer);
     float[] oneD = {1.23f, 6.54f, 7.81f};
@@ -118,7 +118,7 @@ public final class InterpreterTest {
     byteBuffer.order(ByteOrder.nativeOrder());
     fileChannel.read(byteBuffer);
     try {
-      Interpreter interpreter = new Interpreter(byteBuffer);
+      new Interpreter(byteBuffer);
       fail();
     } catch (IllegalArgumentException e) {
       assertThat(e)
-- 
GitLab


From 17320a0543de32715159a732be065a55a3d990db Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Thu, 27 Sep 2018 15:28:50 -0700
Subject: [PATCH 099/570] Fix heartbeat probing.

PiperOrigin-RevId: 214846488
---
 .../contrib/tpu/python/tpu/session_support.py | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/session_support.py b/tensorflow/contrib/tpu/python/tpu/session_support.py
index 24b9bd136b..05264f5a46 100644
--- a/tensorflow/contrib/tpu/python/tpu/session_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/session_support.py
@@ -44,21 +44,25 @@ class CoordinatorShutdownException(Exception):
 def _make_heartbeat_op(session, device, request_ph):
   """Return a heartbeat op or None if heartbeats are not supported by device."""
   try:
-    with ops.device(device):
-      heartbeat_op = tpu_ops.worker_heartbeat(request_ph)
-      request = event_pb2.WorkerHeartbeatRequest()
-      options = config_pb2.RunOptions(timeout_in_ms=5000)
-      session.run(
-          heartbeat_op,
-          feed_dict={request_ph: request.SerializeToString()},
-          options=options)
-      return heartbeat_op
+    # Test if we can connect in a isolated graph + session
+    with ops.Graph().as_default():
+      with session_lib.Session(target=session.sess_str) as temp_session:
+        with ops.device(device):
+          heartbeat_op = tpu_ops.worker_heartbeat('')
+          options = config_pb2.RunOptions(timeout_in_ms=5000)
+          temp_session.run(heartbeat_op, options=options)
   except errors.InvalidArgumentError as _:
+    logging.warning('Error running heartbeat on %s', device)
     return None
   except errors.DeadlineExceededError as _:
     logging.warning('Timeout connecting to %s when testing heartbeat', device)
     return None
 
+  # If we successfully connected and pinged the worker, go ahead and construct
+  # the operation.
+  with ops.device(device):
+    return tpu_ops.worker_heartbeat(request_ph)
+
 
 class WorkerHeartbeatManager(object):
   """Manages the status/heartbeat monitor for a set of workers."""
@@ -171,7 +175,7 @@ class WorkerHeartbeatManager(object):
 def all_worker_devices(session):
   """Return a list of devices for each worker in the system."""
   devices = session.list_devices()
-  return [device.name for device in devices if 'CPU' in device.name]
+  return [device.name for device in devices if ':CPU:' in device.name]
 
 
 class WatchdogManager(threading.Thread):
-- 
GitLab


From a3291ab1f2cb9ea2c4e4b3b9b26ad1a1866dfc50 Mon Sep 17 00:00:00 2001
From: Scott Zhu <scottzhu@google.com>
Date: Thu, 27 Sep 2018 15:32:00 -0700
Subject: [PATCH 100/570] Update function registration with both inference
 function and forward/backward function pair.

PiperOrigin-RevId: 214847027
---
 tensorflow/python/eager/function.py      | 21 ++++++++++++--
 tensorflow/python/eager/function_test.py | 37 +++++++++++++++---------
 2 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index b28befeb62..dd3e1a3723 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -1328,8 +1328,25 @@ def register(func, *args, **kwargs):
                      "Got type: %s" % type(func))
   concrete_func = func.get_concrete_function(*args, **kwargs)
   graph = ops.get_default_graph()
-  concrete_func._inference_function.add_to_graph(graph)   # pylint: disable=protected-access
-  # TODO(scottzhu): support concrete_func._backward_graph_function in future.
+
+  # There are two situations for the actual call of a defun:
+  # 1. If none of the input args are resource variables or watch by any tape,
+  #   it will run the _inference_function of concrete_func for forward pass, and
+  #   the gradient will be generated by standard mechanism.
+  # 2. Otherwise, defun will create two functions, one for forward pass, and the
+  #   backward pass will be created via tape.
+  # When registering the function, we put both cases into graph.
+  # pylint: disable=protected-access
+  concrete_func._inference_function.add_to_graph(graph)
+
+  if concrete_func._backward_graph_function is None:
+    concrete_func._construct_backprop_function()
+  forward_function = concrete_func._forward_function
+  backward_function = concrete_func._backward_graph_function._inference_function
+  forward_function.add_to_graph(graph)
+  backward_function.add_to_graph(graph)
+  # pylint: enable=protected-access
+
   return concrete_func
 
 
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 59faf967c5..34a2648e26 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -1669,12 +1669,23 @@ class FunctionTest(test.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 2)
+        self.assertEqual(len(graph._functions), 6)
+        # two sets of functions, each of them are (inference, forward, backward)
         functions = list(graph._functions.values())
-        pre_register_matmul_func_name = functions[0].definition.signature.name
-        self.assertRegexpMatches(pre_register_matmul_func_name, '.*matmul.*')
-        pre_register_add_func_name = functions[1].definition.signature.name
-        self.assertRegexpMatches(pre_register_add_func_name, '.*add.*')
+        captured_function_names = [
+            f.definition.signature.name for f in functions
+        ]
+        expected_func_name_regex = [
+            '.*inference.*matmul.*',
+            '.*forward.*matmul.*',
+            '.*inference.*backward.*matmul.*',
+            '.*inference.*add.*',
+            '.*forward.*add.*',
+            '.*inference.*backward.*add.*',
+        ]
+        for i in range(len(functions)):
+          self.assertRegexpMatches(captured_function_names[i],
+                                   expected_func_name_regex[i])
 
         sq = defun_matmul(t, t)
         double = add(t, t)
@@ -1682,12 +1693,11 @@ class FunctionTest(test.TestCase):
         self.assertAllEqual(double.eval().reshape(-1), [2, 4, 6, 8])
         # Make sure the pre registered function is used, and no other function
         # is added.
-        self.assertEqual(len(graph._functions), 2)
+        self.assertEqual(len(graph._functions), 6)
         functions = list(graph._functions.values())
-        called_func_name = functions[0].definition.signature.name
-        self.assertEqual(pre_register_matmul_func_name, called_func_name)
-        called_func_name = functions[1].definition.signature.name
-        self.assertEqual(pre_register_add_func_name, called_func_name)
+        for i in range(len(functions)):
+          self.assertEquals(captured_function_names[i],
+                            functions[i].definition.signature.name)
 
   def testRegisterFunctionWithInputSignature(self):
     def matmul(x, y):
@@ -1705,7 +1715,7 @@ class FunctionTest(test.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 1)
+        self.assertEqual(len(graph._functions), 3)
 
         # Test input param shape mismatch
         t2 = constant_op.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
@@ -1728,7 +1738,7 @@ class FunctionTest(test.TestCase):
         graph = ops.get_default_graph()
         # Only one function is registered since the input param are in same type
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 1)
+        self.assertEqual(len(graph._functions), 3)
 
   def testCallingFunctionWithDifferentVariables(self):
 
@@ -1767,7 +1777,8 @@ class FunctionTest(test.TestCase):
                                  'be Tensors;.*'):
       graph_function('Not a Tensor.')
 
-  def testSwapImplementationWithGrapplerPlugin(self):
+  # TODO(scottzhu): Revive the test once the grappler plugin is updated.
+  def disabled_testSwapImplementationWithGrapplerPlugin(self):
     rewrites = rewriter_config_pb2.RewriterConfig()
     # function_optimizer has to be turn off, otherwise it will delete the
     # registered function if it does not get called.
-- 
GitLab


From bdab0b3c111bbe1c9656fa2228f1a4d28df5a7bf Mon Sep 17 00:00:00 2001
From: Mingsheng Hong <hongm@google.com>
Date: Thu, 27 Sep 2018 15:32:38 -0700
Subject: [PATCH 101/570] Added an experimental API for user to set an internal
 error status.

See
https://github.com/apple/swift/pull/19588/files#diff-923cd5ac82727b31d446c23641b3d749
for an example usage.

Also removed an experimental API that's no longer needed.

PiperOrigin-RevId: 214847132
---
 tensorflow/c/c_api_experimental.cc | 34 +++---------------------------
 tensorflow/c/c_api_experimental.h  |  6 ++----
 2 files changed, 5 insertions(+), 35 deletions(-)

diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index f316e4ba67..d4b78138e9 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -8738,35 +8738,7 @@ void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) {
   TF_DeleteStatus(status);
 }
 
-TFE_TensorHandle* TFE_RunConstOp(TFE_Context* ctx) {
-  // Intentionally LOG into INFO below for ease of debugging.
-  VLOG(1) << "TFE_RunConstOp called";
-
-  auto* status = TF_NewStatus();
-  auto* op = TFE_NewOp(ctx, "Const", status);
-  CheckOk(status);
-  TFE_OpSetAttrType(op, "dtype", TF_FLOAT);
-
-  auto* tensor =
-      TF_AllocateTensor(TF_FLOAT, /*shape.data()*/ nullptr, /*shape.size()*/ 0,
-                        TF_DataTypeSize(TF_FLOAT) * 1);
-  auto* ptr = reinterpret_cast<char*>(TF_TensorData(tensor));
-  *reinterpret_cast<float*>(ptr) = 17.0;
-
-  TFE_OpSetAttrTensor(op, "value", tensor, status);
-  CheckOk(status);
-  TF_DeleteTensor(tensor);
-  VLOG(1) << "New op created";
-
-  TFE_TensorHandle* retval;
-  int num_retvals = 1;
-  TFE_Execute(op, &retval, &num_retvals, status);
-  CheckOk(status);
-  CHECK_EQ(num_retvals, 1);
-  VLOG(1) << "Op executed";
-
-  TFE_DeleteOp(op);
-  TF_DeleteStatus(status);
-
-  return retval;
+TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status,
+                                                      const char* errMsg) {
+  status->status = tensorflow::errors::Internal(errMsg);
 }
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index 950ad9aeed..d98d532e32 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -180,10 +180,8 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueVariantTensor(
 TF_CAPI_EXPORT extern void TFE_TensorHandlePrintDebugString(
     TFE_TensorHandle* handle);
 
-// Returns a const scalar tensor.
-// Caller owns both the input and the output tensor handles.
-// TODO: Remove this API with hard-coded tensor computation.
-TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_RunConstOp(TFE_Context* ctx);
+TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status,
+                                                      const char* errMsg);
 
 #ifdef __cplusplus
 } /* end extern "C" */
-- 
GitLab


From 8f85a9de475f0acf0abef4fabc12943e2e487bf7 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Thu, 27 Sep 2018 15:37:49 -0700
Subject: [PATCH 102/570] Do not specify dilation rate to depthwise conv2d.

PiperOrigin-RevId: 214848057
---
 tensorflow/contrib/quantize/python/fold_batch_norms.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index e5790a6e13..7575b1b6cd 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -418,10 +418,11 @@ def _CloneWithNewOperands(layer_op, input_tensor, weight_tensor,
         transpose_b=layer_op.get_attr('transpose_b'),
         name=new_layer_name)
   elif layer_op.type == 'DepthwiseConv2dNative':
+    # We don't copy dilation rate because we reuse the input SpaceToBatch
+    # and create our own BatchToSpace operation below.
     conv = nn.depthwise_conv2d(
         input_tensor,
         weight_tensor,
-        rate=layer_op.get_attr('dilations'),
         strides=layer_op.get_attr('strides'),
         padding=layer_op.get_attr('padding'),
         name=new_layer_name)
-- 
GitLab


From bfec3d54fed955a4b145220e64c48b94fbb04ae7 Mon Sep 17 00:00:00 2001
From: Yuanzhong Xu <yuanzx@google.com>
Date: Thu, 27 Sep 2018 15:38:48 -0700
Subject: [PATCH 103/570] [XLA] Use a result cache to speed up
 InstructionFusion::CanFuseOnAllPaths()

PiperOrigin-RevId: 214848216
---
 .../xla/service/instruction_fusion.cc         | 29 ++++++++++++++-----
 .../compiler/xla/service/instruction_fusion.h | 11 +++++--
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 3fdc2cee9a..e884122fcb 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -188,13 +188,20 @@ bool InstructionFusion::EffectivelyAtMostUnary(HloInstruction* hlo) {
 
 bool InstructionFusion::CanFuseOnAllPaths(
     HloInstruction* producer, HloInstruction* consumer,
-    const HloInstructionSet& do_not_duplicate) {
+    const HloInstructionSet& do_not_fuse,
+    tensorflow::gtl::FlatMap<std::pair<HloInstruction*, HloInstruction*>, bool>*
+        result_cache) {
   if (consumer == producer) {
     return true;
   }
   if (!consumer->IsFusible()) {
     return false;
   }
+  auto cache_it = result_cache->find(std::make_pair(producer, consumer));
+  if (cache_it != result_cache->end()) {
+    return cache_it->second;
+  }
+  bool result = true;
   for (int64 i = 0, e = consumer->operand_count(); i < e; ++i) {
     auto* consumer_operand = consumer->mutable_operand(i);
     // If the operand is not on a path to the producer, it doesn't matter
@@ -202,20 +209,23 @@ bool InstructionFusion::CanFuseOnAllPaths(
     if (!reachability_->IsReachable(producer, consumer_operand)) {
       continue;
     }
-    if (do_not_duplicate.count(consumer_operand) > 0 ||
-        !ShouldFuse(consumer, i)) {
-      return false;
+    if (do_not_fuse.count(consumer_operand) > 0 || !ShouldFuse(consumer, i)) {
+      result = false;
+      break;
     }
     // The producer is reachable from consumer_operand which means we need
     // to be able to fuse consumer_operand into consumer in order for
     // producer to be fusible into consumer on all paths.
     // Perform the recursive step: make sure producer can be fused into
     // consumer_operand on all paths.
-    if (!CanFuseOnAllPaths(producer, consumer_operand, do_not_duplicate)) {
-      return false;
+    if (!CanFuseOnAllPaths(producer, consumer_operand, do_not_fuse,
+                           result_cache)) {
+      result = false;
+      break;
     }
   }
-  return true;
+  result_cache->emplace(std::make_pair(producer, consumer), result);
+  return result;
 }
 
 InstructionFusion::HloInstructionSet
@@ -231,6 +241,8 @@ InstructionFusion::ComputeGloballyUnfusible(
   // fusing operations that require duplication later depending on
   // is_expensive_().
   HloInstructionSet do_not_duplicate;
+  tensorflow::gtl::FlatMap<std::pair<HloInstruction*, HloInstruction*>, bool>
+      can_fuse_on_all_paths_result_cache;
   for (HloInstruction* consumer : post_order) {
     for (HloInstruction* producer : consumer->operands()) {
       if (do_not_duplicate.count(producer) > 0) {
@@ -286,7 +298,8 @@ InstructionFusion::ComputeGloballyUnfusible(
       // A will be not allowed to be fused into B, as it cannot be fused via
       // all paths.
       if (producer->IsFusible() &&
-          CanFuseOnAllPaths(producer, consumer, do_not_duplicate)) {
+          CanFuseOnAllPaths(producer, consumer, do_not_duplicate,
+                            &can_fuse_on_all_paths_result_cache)) {
         continue;
       }
       do_not_duplicate.insert(producer);
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h
index 7e1196fb7f..c1ec3b18a1 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/instruction_fusion.h
@@ -151,8 +151,15 @@ class InstructionFusion : public HloModulePass {
 
   // Whether or not we can fuse producer into consumer on all paths
   // from the producer to the consumer where nodes are HLOs and edges are uses.
-  bool CanFuseOnAllPaths(HloInstruction* producer, HloInstruction* consumer,
-                         const HloInstructionSet& do_not_fuse);
+  //
+  // A map from <producer, consumer> to a bool is required as the result cache
+  // to store and query the results of calls to this function, in order to avoid
+  // repeated computations.
+  bool CanFuseOnAllPaths(
+      HloInstruction* producer, HloInstruction* consumer,
+      const HloInstructionSet& do_not_fuse,
+      tensorflow::gtl::FlatMap<std::pair<HloInstruction*, HloInstruction*>,
+                               bool>* result_cache);
 
   // Computes the set of nodes that we do not want to fuse into any of their
   // consumers based on a global analysis of the HLO graph.
-- 
GitLab


From b56164c72b8f123bfc675f930111af8801fe034f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 15:49:53 -0700
Subject: [PATCH 104/570] Automated rollback of commit
 425e96f3ae4eb338268e3738260f9d79e4bdd893. Revert #20539.

PiperOrigin-RevId: 214849875
---
 tensorflow/contrib/layers/python/layers/embedding_ops.py | 8 +++-----
 tensorflow/python/feature_column/feature_column_v2.py    | 8 +++-----
 tensorflow/python/ops/embedding_ops.py                   | 8 +++-----
 3 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index 17ee8c0733..60e1d85ea9 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -112,11 +112,9 @@ def safe_embedding_lookup_sparse(embedding_weights,
   dtype = sparse_weights.dtype if sparse_weights is not None else None
   if isinstance(embedding_weights, variables.PartitionedVariable):
     embedding_weights = list(embedding_weights)
-  if not isinstance(embedding_weights[0],
-                    resource_variable_ops.ResourceVariable):
-    embedding_weights = [
-        ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
-    ]
+  embedding_weights = [
+      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
+  ]
 
   contrib_tensor_util.assert_same_float_dtype(embedding_weights +
                                               [sparse_weights])
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index b62c16ea5a..289f6d0d14 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -3447,11 +3447,9 @@ def _safe_embedding_lookup_sparse(embedding_weights,
     raise ValueError('Missing embedding_weights %s.' % embedding_weights)
 
   dtype = sparse_weights.dtype if sparse_weights is not None else None
-  if not isinstance(embedding_weights[0],
-                    resource_variable_ops.ResourceVariable):
-    embedding_weights = [
-        ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
-    ]
+  embedding_weights = [
+      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
+  ]
 
   with ops.name_scope(name, 'embedding_lookup',
                       embedding_weights + [sparse_ids,
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 60d73a1693..6263041b8d 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -550,11 +550,9 @@ def safe_embedding_lookup_sparse(embedding_weights,
     raise ValueError('Missing embedding_weights %s.' % embedding_weights)
 
   dtype = sparse_weights.dtype if sparse_weights is not None else None
-  if not isinstance(embedding_weights[0],
-                    resource_variable_ops.ResourceVariable):
-    embedding_weights = [
-        ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
-    ]
+  embedding_weights = [
+      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
+  ]
 
   with ops.name_scope(name, 'embedding_lookup',
                       embedding_weights + [sparse_ids,
-- 
GitLab


From b8c86c3bbd8271ed968087f24e7fb704103bc733 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 15:50:41 -0700
Subject: [PATCH 105/570] Support saving/restoring of string tensors with
 lengths greater than 2^32.

PiperOrigin-RevId: 214849978
---
 tensorflow/core/util/tensor_bundle/BUILD      |   1 +
 .../core/util/tensor_bundle/tensor_bundle.cc  |  52 +++++++++-----
 .../util/tensor_bundle/tensor_bundle_test.cc  |  64 +++++++++++++++++-
 .../testdata/old_string_tensors/README        |   3 +
 .../foo.data-00000-of-00001                   | Bin 0 -> 1080 bytes
 .../testdata/old_string_tensors/foo.index     | Bin 0 -> 211 bytes
 6 files changed, 100 insertions(+), 20 deletions(-)
 create mode 100644 tensorflow/core/util/tensor_bundle/testdata/old_string_tensors/README
 create mode 100644 tensorflow/core/util/tensor_bundle/testdata/old_string_tensors/foo.data-00000-of-00001
 create mode 100644 tensorflow/core/util/tensor_bundle/testdata/old_string_tensors/foo.index

diff --git a/tensorflow/core/util/tensor_bundle/BUILD b/tensorflow/core/util/tensor_bundle/BUILD
index 648358606c..4d4db86df2 100644
--- a/tensorflow/core/util/tensor_bundle/BUILD
+++ b/tensorflow/core/util/tensor_bundle/BUILD
@@ -64,6 +64,7 @@ cc_library(
 tf_cc_test(
     name = "tensor_bundle_test",
     srcs = ["tensor_bundle_test.cc"],
+    data = glob(["testdata/**"]),
     deps = [
         ":tensor_bundle",
         "//tensorflow/core:framework",
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index ea8a259d1a..2dcb57a1f9 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -64,27 +64,36 @@ namespace {
 // Reads "num_elements" string elements from file[offset, offset+size) into the
 // length-N "destination".  Discards the original content of "destination".
 //
-// Checksums the string lengths (as restored uint32, not varint32 bytes) and
-// string bytes, and stores it into "actual_crc32c".
+// Checksums the string lengths (as restored uint32 or uint64, not varint64
+// bytes) and string bytes, and stores it into "actual_crc32c".
 Status ReadStringTensor(io::InputBuffer* buffered_file, size_t num_elements,
                         size_t offset, size_t size, string* destination,
                         uint32* actual_crc32c) {
   if (size == 0) return Status::OK();
   CHECK_GT(size, 0);
 
-  // Reads "num_elements" varint32's from "buffered_file".
+  // Reads "num_elements" varint64's from "buffered_file".
   TF_RETURN_IF_ERROR(buffered_file->Seek(offset));
-  std::vector<uint32> string_lengths(num_elements);
+  std::vector<uint64> string_lengths(num_elements);
   for (size_t i = 0; i < num_elements; ++i) {
-    TF_RETURN_IF_ERROR(buffered_file->ReadVarint32(&string_lengths[i]));
+    TF_RETURN_IF_ERROR(buffered_file->ReadVarint64(&string_lengths[i]));
+    if (string_lengths[i] <= UINT32_MAX) {
+      // We need to do this because older checkpoints only used uint32s and we
+      // should still support them.
+      const uint32 elem_size_uint32 = static_cast<uint32>(string_lengths[i]);
+      *actual_crc32c = crc32c::Extend(
+          *actual_crc32c, reinterpret_cast<const char*>(&elem_size_uint32),
+          sizeof(uint32));
+    } else {
+      *actual_crc32c = crc32c::Extend(
+          *actual_crc32c, reinterpret_cast<const char*>(&string_lengths[i]),
+          sizeof(uint64));
+    }
   }
   if (offset + size < buffered_file->Tell()) {
     return errors::DataLoss("String lengths longer than expected offset ",
                             offset + size);
   }
-  *actual_crc32c =
-      crc32c::Value(reinterpret_cast<const char*>(string_lengths.data()),
-                    sizeof(uint32) * num_elements);
 
   // Reads the length-checksum.
   uint32 length_checksum = 0;
@@ -104,7 +113,7 @@ Status ReadStringTensor(io::InputBuffer* buffered_file, size_t num_elements,
 
   // Reads the actual string bytes.
   for (size_t i = 0; i < num_elements; ++i) {
-    const uint32 string_length = string_lengths[i];
+    const uint64 string_length = string_lengths[i];
     string* buffer = &destination[i];
 
     buffer->resize(string_length);
@@ -218,8 +227,8 @@ Status WriteTensor(const Tensor& val, FileOutputBuffer* out,
 Status WriteStringTensor(const Tensor& val, FileOutputBuffer* out,
                          size_t* bytes_written, uint32* crc32c) {
   // On-disk format:
-  //   [varint32 len0]..[varint32 lenL][4 byte cksum on lengths][string bytes]
-  // Var "crc32c" checksums the string lengths (as uint32, not varint32 bytes),
+  //   [varint64 len0]..[varint64 lenL][4 byte cksum on lengths][string bytes]
+  // Var "crc32c" checksums the string lengths (as uint64, not varint64 bytes),
   // the length-checksum, and all the string bytes.
   DCHECK_EQ(val.dtype(), DT_STRING);
   const string* strings = GetStringBackingBuffer(val);
@@ -230,12 +239,21 @@ Status WriteStringTensor(const Tensor& val, FileOutputBuffer* out,
   *crc32c = 0;
   for (int64 i = 0; i < val.NumElements(); ++i) {
     const string* elem = &strings[i];
-    DCHECK_EQ(elem->size(), static_cast<uint32>(elem->size()));
-    const uint32 elem_size = static_cast<uint32>(elem->size());
-
-    core::PutVarint32(&lengths, elem_size);
-    *crc32c = crc32c::Extend(*crc32c, reinterpret_cast<const char*>(&elem_size),
-                             sizeof(uint32));
+    DCHECK_EQ(elem->size(), static_cast<uint64>(elem->size()));
+    const uint64 elem_size = static_cast<uint64>(elem->size());
+
+    core::PutVarint64(&lengths, elem_size);
+    if (elem_size <= UINT32_MAX) {
+      // We need to do this because older checkpoints only used uint32s and we
+      // should still support them.
+      const uint32 elem_size_uint32 = static_cast<uint32>(elem_size);
+      *crc32c = crc32c::Extend(*crc32c,
+                               reinterpret_cast<const char*>(&elem_size_uint32),
+                               sizeof(uint32));
+    } else {
+      *crc32c = crc32c::Extend(
+          *crc32c, reinterpret_cast<const char*>(&elem_size), sizeof(uint64));
+    }
   }
   TF_RETURN_IF_ERROR(out->Append(lengths));
   *bytes_written = lengths.size();
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
index 59c42baa06..9567e4750b 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
@@ -39,6 +39,11 @@ string Prefix(const string& prefix) {
   return strings::StrCat(testing::TmpDir(), "/", prefix);
 }
 
+string TestdataPrefix(const string& prefix) {
+  return strings::StrCat(testing::TensorFlowSrcRoot(),
+                         "/core/util/tensor_bundle/testdata/", prefix);
+}
+
 template <typename T>
 Tensor Constant(T v, TensorShape shape) {
   Tensor ret(DataTypeToEnum<T>::value, shape);
@@ -458,7 +463,26 @@ TEST(TensorBundleTest, NonStandardShapes) {
   TestNonStandardShapes<qint8>();
 }
 
+TEST(TensorBundleTest, StringTensorsOldFormat) {
+  // Test string tensor bundle made with previous version of code that use
+  // varint32s to store string lengths (we now use varint64s).
+  BundleReader reader(Env::Default(), TestdataPrefix("old_string_tensors/foo"));
+  TF_ASSERT_OK(reader.status());
+  EXPECT_EQ(AllTensorKeys(&reader),
+            std::vector<string>({"floats", "scalar", "string_tensor", "strs"}));
+
+  Expect<string>(&reader, "string_tensor", Tensor(DT_STRING, TensorShape({1})));
+  Expect<string>(&reader, "scalar", test::AsTensor<string>({"hello"}));
+  Expect<string>(
+      &reader, "strs",
+      test::AsTensor<string>({"hello", "", "x01", string(1 << 10, 'c')}));
+  Expect<float>(&reader, "floats", Constant_2x3<float>(16.18));
+}
+
 TEST(TensorBundleTest, StringTensors) {
+  constexpr size_t kLongLength = static_cast<size_t>(UINT32_MAX) + 1;
+  Tensor long_string_tensor(DT_STRING, TensorShape({1}));
+
   {
     BundleWriter writer(Env::Default(), Prefix("foo"));
     TF_EXPECT_OK(writer.Add("string_tensor",
@@ -467,6 +491,12 @@ TEST(TensorBundleTest, StringTensors) {
     TF_EXPECT_OK(writer.Add(
         "strs",
         test::AsTensor<string>({"hello", "", "x01", string(1 << 25, 'c')})));
+
+    // Requires a 64-bit length.
+    string* backing_string = long_string_tensor.flat<string>().data();
+    backing_string->assign(kLongLength, 'd');
+    TF_EXPECT_OK(writer.Add("long_scalar", long_string_tensor));
+
     // Mixes in some floats.
     TF_EXPECT_OK(writer.Add("floats", Constant_2x3<float>(16.18)));
     TF_ASSERT_OK(writer.Finish());
@@ -474,9 +504,9 @@ TEST(TensorBundleTest, StringTensors) {
   {
     BundleReader reader(Env::Default(), Prefix("foo"));
     TF_ASSERT_OK(reader.status());
-    EXPECT_EQ(
-        AllTensorKeys(&reader),
-        std::vector<string>({"floats", "scalar", "string_tensor", "strs"}));
+    EXPECT_EQ(AllTensorKeys(&reader),
+              std::vector<string>({"floats", "long_scalar", "scalar",
+                                   "string_tensor", "strs"}));
 
     Expect<string>(&reader, "string_tensor",
                    Tensor(DT_STRING, TensorShape({1})));
@@ -484,7 +514,35 @@ TEST(TensorBundleTest, StringTensors) {
     Expect<string>(
         &reader, "strs",
         test::AsTensor<string>({"hello", "", "x01", string(1 << 25, 'c')}));
+
     Expect<float>(&reader, "floats", Constant_2x3<float>(16.18));
+
+    // We don't use the Expect function so we can re-use the
+    // `long_string_tensor` buffer for reading out long_scalar to keep memory
+    // usage reasonable.
+    EXPECT_TRUE(reader.Contains("long_scalar"));
+    DataType dtype;
+    TensorShape shape;
+    TF_ASSERT_OK(reader.LookupDtypeAndShape("long_scalar", &dtype, &shape));
+    EXPECT_EQ(DT_STRING, dtype);
+    EXPECT_EQ(TensorShape({1}), shape);
+
+    // Zero-out the string so that we can be sure the new one is read in.
+    string* backing_string = long_string_tensor.flat<string>().data();
+    backing_string->assign("");
+
+    // Read long_scalar and check it contains kLongLength 'd's.
+    TF_ASSERT_OK(reader.Lookup("long_scalar", &long_string_tensor));
+    ASSERT_EQ(backing_string, long_string_tensor.flat<string>().data());
+    EXPECT_EQ(kLongLength, backing_string->length());
+    for (char c : *backing_string) {
+      // Not using ASSERT_EQ('d', c) because this way is twice as fast due to
+      // compiler optimizations.
+      if (c != 'd') {
+        FAIL() << "long_scalar is not full of 'd's as expected.";
+        break;
+      }
+    }
   }
 }
 
diff --git a/tensorflow/core/util/tensor_bundle/testdata/old_string_tensors/README b/tensorflow/core/util/tensor_bundle/testdata/old_string_tensors/README
new file mode 100644
index 0000000000..428d3ef79e
--- /dev/null
+++ b/tensorflow/core/util/tensor_bundle/testdata/old_string_tensors/README
@@ -0,0 +1,3 @@
+This tensor bundle was generated from cl/214343133, before string tensor
+lengths were written as varint64s. This is here to check backwards
+compatibility between the new code and old checkpoints.
diff --git a/tensorflow/core/util/tensor_bundle/testdata/old_string_tensors/foo.data-00000-of-00001 b/tensorflow/core/util/tensor_bundle/testdata/old_string_tensors/foo.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..23b488e5feaefa970927bfd93c4a989fb494fae9
GIT binary patch
literal 1080
zcmZQrRxN17dh^&E=Zw^xoP1UW<_3;AL9;J|c@+kR$)jL21V%$(#D>6<f<{Md7y$UV
BqVNC!

literal 0
HcmV?d00001

diff --git a/tensorflow/core/util/tensor_bundle/testdata/old_string_tensors/foo.index b/tensorflow/core/util/tensor_bundle/testdata/old_string_tensors/foo.index
new file mode 100644
index 0000000000000000000000000000000000000000..a22a69e6e1de790801fa87b8cf677a73ee287b9c
GIT binary patch
literal 211
zcmZQzVB=tvV&Y(AU=vHr$xkdP=3o@!5MttB5@O<DR#?EHAz|t=t0R(uO|UpQF(<Kz
zgI$P4h>3$yfmMUc^v#0<nv6XBB}JKe>G370dByo~c@0+6z3J@xm>GqNp;}lJ_%$YS
zm<D}g`@_J%z`zJ1F74aw1){;61D_UdU|?h_VW@)(ZD6iRX=db@z{m#@!jHp$H*~9%
Hy59x>Q*9-v

literal 0
HcmV?d00001

-- 
GitLab


From ece50dd9992ac17e3094c7f6d1914febd7a036b5 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 27 Sep 2018 16:05:51 -0700
Subject: [PATCH 106/570] [tf.data Introducing tf.data.Dataset.reduce() which
 reduces elements of a (finite) dataset to a single element.

PiperOrigin-RevId: 214852364
---
 .../base_api/api_def_ReduceDataset.pbtxt      |  26 ++++
 .../data/group_by_reducer_dataset_op.cc       |   4 +-
 .../data/group_by_window_dataset_op.cc        |   4 +-
 tensorflow/core/kernels/data/iterator_ops.cc  | 111 ++++++++++++++++
 .../core/kernels/data/scan_dataset_op.cc      |   4 +-
 tensorflow/core/ops/dataset_ops.cc            |  13 ++
 tensorflow/python/data/kernel_tests/BUILD     |  18 +++
 .../kernel_tests/reduce_dataset_op_test.py    | 124 ++++++++++++++++++
 tensorflow/python/data/ops/dataset_ops.py     | 120 +++++++++++++++++
 .../golden/v1/tensorflow.data.-dataset.pbtxt  |   4 +
 ...ow.data.-fixed-length-record-dataset.pbtxt |   4 +
 .../tensorflow.data.-t-f-record-dataset.pbtxt |   4 +
 .../tensorflow.data.-text-line-dataset.pbtxt  |   4 +
 .../golden/v2/tensorflow.data.-dataset.pbtxt  |   4 +
 ...ow.data.-fixed-length-record-dataset.pbtxt |   4 +
 .../tensorflow.data.-t-f-record-dataset.pbtxt |   4 +
 .../tensorflow.data.-text-line-dataset.pbtxt  |   4 +
 17 files changed, 447 insertions(+), 9 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ReduceDataset.pbtxt
 create mode 100644 tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py

diff --git a/tensorflow/core/api_def/base_api/api_def_ReduceDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReduceDataset.pbtxt
new file mode 100644
index 0000000000..08414b3e68
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReduceDataset.pbtxt
@@ -0,0 +1,26 @@
+op {
+  visibility: HIDDEN
+  graph_op_name: "ReduceDataset"
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the input dataset.
+END
+  }
+  in_arg {
+    name: "initial_state"
+    description: <<END
+A nested structure of tensors, representing the initial state of the
+transformation.
+END
+  }
+  attr {
+    name: "f"
+    description: <<END
+A function that maps `(old_state, input_element)` to `new_state`. It must take
+two arguments and return a nested structures of tensors. The structure of
+`new_state` must match the structure of `initial_state`.
+END
+  }
+  summary: "Reduces the input dataset to a singleton using a reduce function."
+}
diff --git a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc b/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
index d6ee42a7c6..e7244ee208 100644
--- a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
@@ -30,8 +30,7 @@ namespace {
 class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit GroupByReducerDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
+      : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("key_func", &key_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("init_func", &init_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("reduce_func", &reduce_func_));
@@ -421,7 +420,6 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
     const std::vector<PartialTensorShape> output_shapes_;
   };
 
-  const int graph_def_version_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList key_func_;
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
index 8b417bb1c2..14aefe5d54 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
@@ -31,8 +31,7 @@ namespace {
 class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit GroupByWindowDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
+      : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("key_func", &key_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("reduce_func", &reduce_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("window_size_func", &window_size_func_));
@@ -507,7 +506,6 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
     const std::vector<PartialTensorShape> output_shapes_;
   };
 
-  const int graph_def_version_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList key_func_;
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index c0bc507ec0..7a833668ac 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -659,6 +659,115 @@ class ToSingleElementOp : public AsyncOpKernel {
   BackgroundWorker background_worker_;
 };
 
+class ReduceDatasetOp : public AsyncOpKernel {
+ public:
+  explicit ReduceDatasetOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        background_worker_(
+            ctx->env(),
+            strings::StrCat("reduce_thread_", SanitizeThreadSuffix(name()))) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &reduce_func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
+                                     &use_inter_op_parallelism_));
+  }
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    // The call to `iterator->GetNext()` may block and depend on an
+    // inter-op thread pool thread, so we issue the call from the
+    // owned thread pool.
+    background_worker_.Schedule([this, ctx, done]() {
+      DatasetBase* dataset;
+      OP_REQUIRES_OK_ASYNC(
+          ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
+      OpInputList inputs;
+      OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("initial_state", &inputs),
+                           done);
+      std::vector<Tensor> state(inputs.begin(), inputs.end());
+
+      std::unique_ptr<CapturedFunction> captured_func;
+      OP_REQUIRES_OK_ASYNC(
+          ctx,
+          CapturedFunction::Create(reduce_func_, ctx, "other_arguments",
+                                   use_inter_op_parallelism_, &captured_func),
+          done);
+
+      IteratorContext iter_ctx(ctx);
+      OP_REQUIRES_OK_ASYNC(ctx, captured_func->Instantiate(&iter_ctx), done);
+
+      std::unique_ptr<IteratorBase> iterator;
+      OP_REQUIRES_OK_ASYNC(
+          ctx, dataset->MakeIterator(&iter_ctx, "ReduceIterator", &iterator),
+          done);
+
+      // NOTE(jsimsa): We must destroy the iterator before calling `done()`, to
+      // avoid destruction races.
+      IteratorBase* raw_iterator = iterator.release();
+      auto cleanup = gtl::MakeCleanup([raw_iterator, done] {
+        delete raw_iterator;
+        done();
+      });
+
+      // Iterate through the input dataset.
+      Status status;
+      while (true) {
+        std::vector<Tensor> next_input_element;
+        bool end_of_input;
+        status = raw_iterator->GetNext(&iter_ctx, &next_input_element,
+                                       &end_of_input);
+        if (!status.ok() || end_of_input) {
+          break;
+        }
+
+        // Run the reduce function to update the current state.
+        std::vector<Tensor> args;
+        args.reserve(state.size() + next_input_element.size());
+        std::copy(state.begin(), state.end(), std::back_inserter(args));
+        std::copy(next_input_element.begin(), next_input_element.end(),
+                  std::back_inserter(args));
+
+        std::vector<Tensor> reduce_func_output;
+        status =
+            captured_func->Run(&iter_ctx, std::move(args), &reduce_func_output);
+        if (!status.ok()) {
+          break;
+        }
+        std::swap(reduce_func_output, state);
+      }
+
+      if (!status.ok()) {
+        ctx->SetStatus(status);
+        return;
+      }
+      for (int i = 0; i < state.size(); ++i) {
+        OP_REQUIRES_ASYNC(
+            ctx, state[i].dtype() == output_types_[i],
+            errors::InvalidArgument(
+                "The result does not match the expected type for component ", i,
+                ". Expected: ", DataTypeString(output_types_[i]),
+                ". Actual: ", DataTypeString(state[i].dtype()), "."),
+            done);
+        OP_REQUIRES_ASYNC(
+            ctx, output_shapes_[i].IsCompatibleWith(state[i].shape()),
+            errors::InvalidArgument(
+                "The result does not match the expected shape for component ",
+                i, ". Expected: ", output_shapes_[i].DebugString(),
+                ". Actual: ", state[i].shape().DebugString(), "."),
+            done);
+        ctx->set_output(i, state[i]);
+      }
+    });
+  }
+
+ private:
+  NameAttrList reduce_func_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  bool use_inter_op_parallelism_;
+  BackgroundWorker background_worker_;
+};
+
 class OneShotIteratorOp : public AsyncOpKernel {
  public:
   explicit OneShotIteratorOp(OpKernelConstruction* ctx)
@@ -1146,6 +1255,8 @@ REGISTER_KERNEL_BUILDER(Name("AnonymousIterator").Device(DEVICE_GPU),
                         AnonymousIteratorHandleOp);
 REGISTER_KERNEL_BUILDER(Name("DatasetToSingleElement").Device(DEVICE_CPU),
                         ToSingleElementOp);
+REGISTER_KERNEL_BUILDER(Name("ReduceDataset").Device(DEVICE_CPU),
+                        ReduceDatasetOp);
 REGISTER_KERNEL_BUILDER(Name("OneShotIterator").Device(DEVICE_CPU),
                         OneShotIteratorOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/scan_dataset_op.cc b/tensorflow/core/kernels/data/scan_dataset_op.cc
index dbe31f37b8..2a911aa368 100644
--- a/tensorflow/core/kernels/data/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/scan_dataset_op.cc
@@ -32,8 +32,7 @@ namespace {
 class ScanDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit ScanDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
+      : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("Tstate", &state_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
@@ -258,7 +257,6 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
     const std::vector<PartialTensorShape> output_shapes_;
   };
 
-  const int graph_def_version_;
   DataTypeVector state_types_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 1ada623cf5..71f4cc3c4c 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -756,6 +756,19 @@ REGISTER_OP("DatasetToSingleElement")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(IteratorGetNextShapeFn);
 
+REGISTER_OP("ReduceDataset")
+    .Input("input_dataset: variant")
+    .Input("initial_state: Tstate")
+    .Input("other_arguments: Targuments")
+    .Output("components: output_types")
+    .Attr("f: func")
+    .Attr("Tstate: list(type) >= 1")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("use_inter_op_parallelism: bool = true")
+    .SetShapeFn(IteratorGetNextShapeFn);
+
 REGISTER_OP("IteratorToStringHandle")
     .Input("resource_handle: resource")
     .Output("string_handle: string")
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index fdcbfc3684..5f9818566f 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -404,6 +404,24 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "reduce_dataset_op_test",
+    size = "small",
+    srcs = ["reduce_dataset_op_test.py"],
+    additional_deps = [
+        ":test_base",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 tf_py_test(
     name = "sequence_dataset_op_test",
     size = "small",
diff --git a/tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py b/tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py
new file mode 100644
index 0000000000..11e07300b9
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py
@@ -0,0 +1,124 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  def testSum(self):
+    for i in range(10):
+      ds = dataset_ops.Dataset.range(1, i + 1)
+      result = ds.reduce(np.int64(0), lambda x, y: x + y)
+      with self.cached_session() as sess:
+        self.assertEqual(((i + 1) * i) // 2, sess.run(result))
+
+  def testSumTuple(self):
+
+    def reduce_fn(state, value):
+      v1, v2 = value
+      return state + v1 + v2
+
+    for i in range(10):
+      ds = dataset_ops.Dataset.range(1, i + 1)
+      ds = dataset_ops.Dataset.zip((ds, ds))
+      result = ds.reduce(np.int64(0), reduce_fn)
+      with self.cached_session() as sess:
+        self.assertEqual(((i + 1) * i), sess.run(result))
+
+  def testSumAndCount(self):
+
+    def reduce_fn(state, value):
+      s, c = state
+      return s + value, c + 1
+
+    for i in range(10):
+      ds = dataset_ops.Dataset.range(1, i + 1)
+      result = ds.reduce((np.int64(0), np.int64(0)), reduce_fn)
+      with self.cached_session() as sess:
+        s, c = sess.run(result)
+        self.assertEqual(((i + 1) * i) // 2, s)
+        self.assertEqual(i, c)
+
+  def testSquareUsingPlaceholder(self):
+    delta = array_ops.placeholder(dtype=dtypes.int64)
+
+    def reduce_fn(state, _):
+      return state + delta
+
+    for i in range(10):
+      ds = dataset_ops.Dataset.range(1, i + 1)
+      result = ds.reduce(np.int64(0), reduce_fn)
+      with self.cached_session() as sess:
+        square = sess.run(result, feed_dict={delta: i})
+        self.assertEqual(i * i, square)
+
+  def testSparse(self):
+
+    def reduce_fn(_, value):
+      return value
+
+    def make_sparse_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
+    for i in range(10):
+      ds = dataset_ops.Dataset.from_tensors(make_sparse_fn(i+1))
+      result = ds.reduce(make_sparse_fn(0), reduce_fn)
+      with self.cached_session() as sess:
+        self.assertSparseValuesEqual(make_sparse_fn(i+1), sess.run(result))
+
+  def testNested(self):
+
+    def reduce_fn(state, value):
+      state["dense"] += value["dense"]
+      state["sparse"] = value["sparse"]
+      return state
+
+    def make_sparse_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
+    def map_fn(i):
+      return {"dense": math_ops.cast(i, dtype=dtypes.int64),
+              "sparse": make_sparse_fn(math_ops.cast(i, dtype=dtypes.int64))}
+
+    for i in range(10):
+      ds = dataset_ops.Dataset.range(1, i + 1).map(map_fn)
+      result = ds.reduce(map_fn(0), reduce_fn)
+      with self.cached_session() as sess:
+        result = sess.run(result)
+        self.assertEqual(((i + 1) * i) // 2, result["dense"])
+        self.assertSparseValuesEqual(make_sparse_fn(i), result["sparse"])
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index ac87a451b1..6bba72a8e9 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -1205,6 +1205,126 @@ class Dataset(object):
       shift = size
     return WindowDataset(self, size, shift, stride, drop_remainder)
 
+  def reduce(self, initial_state, reduce_func):
+    """Reduces the input dataset to a single element.
+
+    The transformation calls `reduce_func` successively on every element of
+    the input dataset until the dataset is exhausted, aggregating information in
+    its internal state. The `initial_state` argument is used for the initial
+    state and the final state is returned as the result.
+
+    For example:
+    - `tf.data.Dataset.range(5).reduce(np.int64(0), lambda x, _: x + 1)`
+      produces `5`
+    - `tf.data.Dataset.range(5).reduce(np.int64(0), lambda x, y: x + y)`
+      produces `10`
+
+    Args:
+      initial_state: A nested structure of tensors, representing the initial
+        state of the transformation.
+      reduce_func: A function that maps `(old_state, input_element)` to
+        `new_state`. It must take two arguments and return a nested structure
+        of tensors. The structure of `new_state` must match the structure of
+        `initial_state`.
+
+    Returns:
+      A nested structure of `tf.Tensor` objects, corresponding to the final
+      state of the transformation.
+
+    """
+
+    with ops.name_scope("initial_state"):
+      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
+      # values to tensors.
+      initial_state = nest.pack_sequence_as(initial_state, [
+          sparse_tensor_lib.SparseTensor.from_value(t)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(
+              t, name="component_%d" % i)
+          for i, t in enumerate(nest.flatten(initial_state))
+      ])
+
+    # Compute initial values for the state classes, shapes and types based on
+    # the initial state.
+    state_classes = sparse.get_classes(initial_state)
+    state_shapes = nest.pack_sequence_as(
+        initial_state, [t.get_shape() for t in nest.flatten(initial_state)])
+    state_types = nest.pack_sequence_as(
+        initial_state, [t.dtype for t in nest.flatten(initial_state)])
+
+    # Iteratively rerun the reduce function until reaching a fixed point on
+    # `self._state_shapes`.
+    need_to_rerun = True
+    while need_to_rerun:
+
+      wrapped_func = StructuredFunctionWrapper(
+          reduce_func,
+          "reduce()",
+          input_classes=(state_classes, self.output_classes),
+          input_shapes=(state_shapes, self.output_shapes),
+          input_types=(state_types, self.output_types),
+          add_to_graph=False)
+
+      # Extract and validate class information from the returned values.
+      output_classes = wrapped_func.output_classes
+      for new_state_class, state_class in zip(
+          nest.flatten(output_classes), nest.flatten(state_classes)):
+        if not issubclass(new_state_class, state_class):
+          raise TypeError(
+              "The element classes for the new state must match the initial "
+              "state. Expected %s; got %s." % (state_classes,
+                                               wrapped_func.output_classes))
+
+      # Extract and validate type information from the returned values.
+      output_types = wrapped_func.output_types
+      for new_state_type, state_type in zip(
+          nest.flatten(output_types), nest.flatten(state_types)):
+        if new_state_type != state_type:
+          raise TypeError(
+              "The element types for the new state must match the initial "
+              "state. Expected %s; got %s." % (state_types,
+                                               wrapped_func.output_types))
+
+      # Extract shape information from the returned values.
+      output_shapes = wrapped_func.output_shapes
+      flat_state_shapes = nest.flatten(state_shapes)
+      flat_new_state_shapes = nest.flatten(output_shapes)
+      weakened_state_shapes = [
+          original.most_specific_compatible_shape(new)
+          for original, new in zip(flat_state_shapes, flat_new_state_shapes)
+      ]
+
+      need_to_rerun = False
+      for original_shape, weakened_shape in zip(flat_state_shapes,
+                                                weakened_state_shapes):
+        if original_shape.ndims is not None and (
+            weakened_shape.ndims is None or
+            original_shape.as_list() != weakened_shape.as_list()):
+          need_to_rerun = True
+          break
+
+      if need_to_rerun:
+        state_shapes = nest.pack_sequence_as(state_shapes,
+                                             weakened_state_shapes)
+
+    reduce_func = wrapped_func.function
+    reduce_func.add_to_graph(ops.get_default_graph())
+
+    return sparse.deserialize_sparse_tensors(
+        nest.pack_sequence_as(
+            output_types,
+            gen_dataset_ops.reduce_dataset(
+                self._as_variant_tensor(),  # pylint: disable=protected-access
+                nest.flatten(sparse.serialize_sparse_tensors(initial_state)),
+                reduce_func.captured_inputs,
+                f=reduce_func,
+                output_shapes=nest.flatten(
+                    sparse.as_dense_shapes(output_shapes, output_classes)),
+                output_types=nest.flatten(
+                    sparse.as_dense_types(output_types, output_classes)))),
+        output_types,
+        output_shapes,
+        output_classes)
+
 
 class DatasetSource(Dataset):
   """Abstract class representing a dataset with no inputs."""
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
index c3ba2dba57..825afb622f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "range"
     argspec: "args=[], varargs=args, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
index 3541671bee..cdad5f6360 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "range"
     argspec: "args=[], varargs=args, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
index b113c18ee0..df41bff1b5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "range"
     argspec: "args=[], varargs=args, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
index 7210bf5db4..028bcc2ce9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "range"
     argspec: "args=[], varargs=args, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
index c3ba2dba57..825afb622f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "range"
     argspec: "args=[], varargs=args, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
index 3541671bee..cdad5f6360 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "range"
     argspec: "args=[], varargs=args, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
index b113c18ee0..df41bff1b5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "range"
     argspec: "args=[], varargs=args, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
index 7210bf5db4..028bcc2ce9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "range"
     argspec: "args=[], varargs=args, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
-- 
GitLab


From d8a370274d6ab8c68edcce66849b4e96aed2fa0d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 16:10:08 -0700
Subject: [PATCH 107/570] Optimize ParseNodeNameAsStringPiece and related
 functions, since they are the most costly functions in Grappler.

PiperOrigin-RevId: 214853009
---
 .../core/grappler/optimizers/data/BUILD       |   1 +
 .../optimizers/data/function_utils.cc         |   1 +
 tensorflow/core/grappler/utils.cc             |  39 -------
 tensorflow/core/grappler/utils.h              | 110 +++++++++++++-----
 tensorflow/core/grappler/utils_test.cc        |  19 +++
 5 files changed, 102 insertions(+), 68 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index d198a2a591..81c1bddf67 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -94,6 +94,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
 )
 
diff --git a/tensorflow/core/grappler/optimizers/data/function_utils.cc b/tensorflow/core/grappler/optimizers/data/function_utils.cc
index e3f6d8e1ea..311df15bc2 100644
--- a/tensorflow/core/grappler/optimizers/data/function_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/function_utils.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/lib/strings/scanner.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index db6e4e6852..5867d01324 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -156,45 +156,6 @@ bool IsControlInput(const string& name) {
   return !name.empty() && name[0] == '^';
 }
 
-string NodeName(const string& name) {
-  int position;
-  return ParseNodeName(name, &position);
-}
-
-int NodePosition(const string& name) {
-  int position;
-  ParseNodeNameAsStringPiece(name, &position);
-  return position;
-}
-
-int NodePositionIfSameNode(const string& input_name, const string& node_name) {
-  const bool is_ctrl = input_name[0] == '^';
-  auto input_it = is_ctrl ? input_name.begin() + 1 : input_name.begin();
-  auto node_it = node_name.begin();
-  if (node_name.empty() ||
-      std::distance(input_it, input_name.end()) < node_name.size()) {
-    return -2;
-  }
-  while (node_it != node_name.end()) {
-    if (*input_it++ != *node_it++) {
-      return -2;
-    }
-  }
-  if (input_it == input_name.end()) {
-    return is_ctrl ? -1 : 0;
-  } else if (*input_it++ == ':') {
-    StringPiece remaining(&(*input_it),
-                          std::distance(input_it, input_name.end()));
-    int position;
-    if (!strings::safe_strto32(remaining, &position)) {
-      return -2;
-    }
-    return is_ctrl ? -1 : position;
-  } else {
-    return -2;
-  }
-}
-
 string AddPrefixToNodeName(const string& name, const string& prefix,
                            const string& delimiter) {
   if (!name.empty()) {
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 296ee1678e..95126d470c 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
-#include "tensorflow/core/lib/strings/scanner.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -102,40 +101,92 @@ bool IsControlInput(const string& name);
 // True iff 'name1' and 'name2' refer to the same input.
 bool IsSameInput(const string& name1, const string& name2);
 
+// Returns the trailing position number (or zero if no number is present) if
+// NodeName(input_name) is equal to node_name. Returns -1 for control inputs.
+// Returns -2 if NodeName(input_name) is not equal to node_name.
+// Note: This function is used very heavily, and this hand-optimized
+// version is 3-4x faster than the version using Scanner, which it replaced.
+// This is worth the reduction in readability.
+inline int NodePositionIfSameNode(const string& input_name,
+                                  const string& node_name) {
+  if (input_name.empty()) return -2;
+  const bool is_ctrl = input_name[0] == '^';
+  auto input_it = is_ctrl ? input_name.begin() + 1 : input_name.begin();
+  auto node_it = node_name.begin();
+  if (node_name.empty() ||
+      std::distance(input_it, input_name.end()) < node_name.size()) {
+    return -2;
+  }
+  while (node_it != node_name.end()) {
+    if (*input_it++ != *node_it++) {
+      return -2;
+    }
+  }
+  if (input_it == input_name.end()) {
+    return is_ctrl ? -1 : 0;
+  } else if (*input_it++ == ':') {
+    StringPiece remaining(&(*input_it),
+                          std::distance(input_it, input_name.end()));
+    int position;
+    if (!strings::safe_strto32(remaining, &position)) {
+      return -2;
+    }
+    return is_ctrl ? -1 : position;
+  } else {
+    return -2;
+  }
+}
+
 // Return the node name corresponding to 'name' if name is valid, or the empty
 // string otherwise.
-string NodeName(const string& name);
+inline StringPiece NodeNameAsStringPiece(const string& name) {
+  static const string empty;
+  if (name.empty()) return StringPiece(empty);
+  const auto begin_it = name[0] == '^' ? name.begin() + 1 : name.begin();
+  auto end_it = begin_it;
+  while (end_it != name.end() && *end_it != ':') {
+    ++end_it;
+  }
+  if (end_it != name.end() && *end_it != ':') {
+    return StringPiece(empty);
+  }
+  return StringPiece(&(*begin_it), std::distance(begin_it, end_it));
+}
 
-// Get the trailing position number ":{digits}" (if any) of a node name.
-// Returns -1 for control inputs.
-int NodePosition(const string& name);
+// Return the node name corresponding to 'name' if name is valid, or the empty
+// string otherwise.
+inline string NodeName(const string& name) {
+  return string(NodeNameAsStringPiece(name));
+}
 
+// Returns the node name and position in a single call.
 inline StringPiece ParseNodeNameAsStringPiece(const string& name,
                                               int* position) {
-  // Strip the prefix '^' (if any), and strip the trailing ":{digits} (if any)
-  // to get a node name.
-  strings::Scanner scan(name);
-  scan.ZeroOrOneLiteral("^")
-      .RestartCapture()
-      .One(strings::Scanner::LETTER_DIGIT_DOT_UNDERSCORE)
-      .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
-  StringPiece capture;
-  StringPiece remaining;
-  if (scan.Peek(':') != ':' || !scan.GetResult(&remaining, &capture)) {
+  static const string empty;
+  if (name.empty()) {
     *position = 0;
-    static const string empty;
     return StringPiece(empty);
-  } else {
-    if (name[0] == '^') {
-      *position = -1;
-    } else if (remaining.empty()) {
-      *position = 0;
-    } else {
-      // Skip the first ':' character.
-      CHECK(strings::safe_strto32(remaining.substr(1), position));
+  }
+  const bool is_ctrl = name[0] == '^';
+  const auto begin_it = is_ctrl ? name.begin() + 1 : name.begin();
+  *position = is_ctrl ? -1 : 0;
+  auto end_it = begin_it;
+  while (end_it != name.end() && *end_it != ':') {
+    ++end_it;
+  }
+  const StringPiece node_name(&(*begin_it), std::distance(begin_it, end_it));
+  if (end_it != name.end()) {
+    if (*end_it != ':') {
+      return StringPiece(empty);
+    } else if (!is_ctrl) {
+      ++end_it;
+      StringPiece remaining(&(*end_it), std::distance(end_it, name.end()));
+      if (!strings::safe_strto32(remaining, position)) {
+        return StringPiece(empty);
+      }
     }
-    return capture;
   }
+  return node_name;
 }
 
 // Returns the node name and position in a single call.
@@ -143,10 +194,11 @@ inline string ParseNodeName(const string& name, int* position) {
   return string(ParseNodeNameAsStringPiece(name, position));
 }
 
-// Returns NodePosition(input_name) if NodeName(input_name) == node_name.
-// Otherwise returns -2;
-// REQUIRES: inputs_name.size() > 0 && node_name.size() > 0.
-int NodePositionIfSameNode(const string& input_name, const string& node_name);
+inline int NodePosition(const string& name) {
+  int position;
+  ParseNodeNameAsStringPiece(name, &position);
+  return position;
+}
 
 // Add a prefix to a node name with a custom delimiter.
 string AddPrefixToNodeName(const string& name, const string& prefix,
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index 6b787a6910..9b6c1f690b 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -371,6 +371,25 @@ BM_NodePositionIfSameNode("^foo/bar/baz", "foo/bar/baz", Match_Ctrl);
 BM_NodePositionIfSameNode("blah", "foo/bar/baz", NoMatch_0);
 BM_NodePositionIfSameNode("foo/bar/baz/gnu", "foo/bar/baz", NoMatch_end);
 
+#define BM_ParseNodeNameAsStringPiece(I, NAME)                               \
+  static void BM_ParseNodeNameAsStringPiece_##NAME(int iters) {              \
+    string input = I;                                                        \
+    for (int i = 0; i < iters; ++i) {                                        \
+      int position;                                                          \
+      const StringPiece name = ParseNodeNameAsStringPiece(input, &position); \
+      CHECK_GE(position, -1);                                                \
+      CHECK(!name.empty());                                                  \
+    }                                                                        \
+  }                                                                          \
+  BENCHMARK(BM_ParseNodeNameAsStringPiece_##NAME)
+
+BM_ParseNodeNameAsStringPiece("foo", foo);
+BM_ParseNodeNameAsStringPiece("foo/bar/baz", foo_bar_baz);
+BM_ParseNodeNameAsStringPiece("^foo/bar/baz", foo_bar_baz_ctrl);
+BM_ParseNodeNameAsStringPiece("foo:123", foo123);
+BM_ParseNodeNameAsStringPiece("foo/bar/baz:123", foo_bar_baz_123);
+BM_ParseNodeNameAsStringPiece("^foo/bar/baz:123", foo_bar_baz_123_ctrl);
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
-- 
GitLab


From f41573b7956871b4142c97eb85ddf163ad641976 Mon Sep 17 00:00:00 2001
From: Revan Sopher <rsopher@google.com>
Date: Thu, 27 Sep 2018 16:16:20 -0700
Subject: [PATCH 108/570] Automated rollback of commit
 750466c6e6624d279de7f9a43accd682d487509c

PiperOrigin-RevId: 214853846
---
 tensorflow/core/BUILD                         |  16 --
 .../core/common_runtime/direct_session.cc     |  49 +---
 .../core/common_runtime/direct_session.h      |   3 -
 .../common_runtime/direct_session_test.cc     |  28 --
 tensorflow/core/framework/run_handler.cc      | 248 ------------------
 tensorflow/core/framework/run_handler.h       |  95 -------
 tensorflow/core/framework/run_handler_util.cc |  57 ----
 tensorflow/core/framework/run_handler_util.h  |  43 ---
 .../core/framework/run_handler_util_test.cc   |  93 -------
 tensorflow/core/protobuf/config.proto         |   5 -
 ...ensorflow.-run-options.-experimental.pbtxt |   6 -
 .../golden/v1/tensorflow.-run-options.pbtxt   |   6 -
 ...ensorflow.-run-options.-experimental.pbtxt |   6 -
 .../golden/v2/tensorflow.-run-options.pbtxt   |   6 -
 14 files changed, 6 insertions(+), 655 deletions(-)
 delete mode 100644 tensorflow/core/framework/run_handler.cc
 delete mode 100644 tensorflow/core/framework/run_handler.h
 delete mode 100644 tensorflow/core/framework/run_handler_util.cc
 delete mode 100644 tensorflow/core/framework/run_handler_util.h
 delete mode 100644 tensorflow/core/framework/run_handler_util_test.cc

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 01e2e9f62b..ca247dc56b 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2484,8 +2484,6 @@ FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [
     "framework/op_segment.h",
     "framework/rendezvous.h",  # only needed for tests
     "framework/resource_var.h",
-    "framework/run_handler.h",
-    "framework/run_handler_util.h",
     "framework/tensor_reference.h",
     "framework/tracking_allocator.h",  # only needed for tests
     "framework/unique_tensor_references.h",
@@ -2972,7 +2970,6 @@ tf_cuda_library(
         ":core_cpu_internal",
         ":device_tracer",
         ":framework",
-        ":framework_internal",
         ":graph",
         ":lib",
         ":lib_internal",
@@ -4120,19 +4117,6 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
-    name = "framework_run_handler_util_test",
-    size = "small",
-    srcs = ["framework/run_handler_util_test.cc"],
-    linkstatic = tf_kernel_tests_linkstatic(),
-    deps = [
-        ":framework_internal",
-        ":lib",
-        ":test",
-        ":test_main",
-    ],
-)
-
 tf_cuda_cc_test(
     name = "common_runtime_direct_session_test",
     size = "small",
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 458e133b68..841181f8c3 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -40,7 +40,6 @@ limitations under the License.
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/framework/run_handler.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -245,21 +244,6 @@ void DirectSession::SchedClosure(thread::ThreadPool* pool,
 #endif  // __ANDROID__
 }
 
-static RunHandlerPool* GetOrCreateRunHandlerPool(
-    const SessionOptions& options) {
-  static RunHandlerPool* pool =
-      new RunHandlerPool(NumInterOpThreadsFromSessionOptions(options));
-  return pool;
-}
-
-bool DirectSession::ShouldUseRunHandlerPool() const {
-  if (options_.config.session_inter_op_thread_pool_size() > 0 ||
-      options_.config.use_per_session_threads()) {
-    return false;
-  }
-  return true;
-}
-
 DirectSession::DirectSession(const SessionOptions& options,
                              const DeviceMgr* device_mgr,
                              DirectSessionFactory* const factory)
@@ -598,37 +582,16 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
     }
   }
 
-  std::unique_ptr<RunHandler> handler;
-  if (ShouldUseRunHandlerPool() &&
-      run_options.experimental().use_run_handler_pool()) {
-    // Non-null only when a global inter-op pool is used.
-    VLOG(1) << "Using RunHandler to scheduler inter-op closures.";
-    handler = GetOrCreateRunHandlerPool(options_)->Get();
-  }
-  auto* handler_ptr = handler.get();
-
-  Executor::Args::Runner default_runner = nullptr;
-
-  if (pool == nullptr) {
-    default_runner = [](Executor::Args::Closure c) { c(); };
-  } else if (handler_ptr != nullptr) {
-    default_runner = [handler_ptr](Executor::Args::Closure c) {
-      handler_ptr->ScheduleInterOpClosure(std::move(c));
-    };
-  } else {
-    default_runner = [this, pool](Executor::Args::Closure c) {
-      SchedClosure(pool, std::move(c));
-    };
-  }
-
+  Executor::Args::Runner default_runner = [this,
+                                           pool](Executor::Args::Closure c) {
+    SchedClosure(pool, std::move(c));
+  };
   for (const auto& item : executors_and_keys->items) {
-    // TODO(azaks): support partial run.
-    // TODO(azaks): if the device picks its own threadpool, we need to assign
+    // TODO(zhengxq): support partial run.
+    // TODO(zhengxq): if the device picks its own threadpool, we need to assign
     //     less threads to the main compute pool by default.
     thread::ThreadPool* device_thread_pool =
         item.device->tensorflow_device_thread_pool();
-    // TODO(crk): Investigate usage of RunHandlerPool when using device specific
-    // thread pool(s).
     if (!device_thread_pool) {
       args.runner = default_runner;
     } else {
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 3a168bbe3f..4a6a921ea7 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -247,9 +247,6 @@ class DirectSession : public Session {
                                    ExecutorsAndKeys* executors_and_keys,
                                    RunMetadata* run_metadata);
 
-  // Returns whether inter-op execution uses a global pool.
-  bool ShouldUseRunHandlerPool() const;
-
   ::tensorflow::Status ExtendLocked(const GraphDef& graph)
       EXCLUSIVE_LOCKS_REQUIRED(graph_state_lock_);
 
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index e3e431f800..65e816c202 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -625,34 +625,6 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetworkWithOpts_Callable) {
   EXPECT_EQ(run_metadata.step_stats().dev_stats_size(), 2);
 }
 
-TEST_F(DirectSessionMinusAXTest, UseRunHandlerPool) {
-  Initialize({3, 2, -1, 0});
-  auto session = CreateSession();
-  ASSERT_TRUE(session != nullptr);
-  TF_ASSERT_OK(session->Create(def_));
-  std::vector<std::pair<string, Tensor>> inputs;
-
-  // Request two targets: one fetch output and one non-fetched output.
-  std::vector<string> output_names = {y_ + ":0"};
-  std::vector<string> target_nodes = {y_neg_};
-  std::vector<Tensor> outputs;
-
-  // Prepares RunOptions and RunMetadata
-  RunOptions run_options;
-  run_options.mutable_experimental()->set_use_run_handler_pool(true);
-
-  Status s = session->Run(run_options, inputs, output_names, target_nodes,
-                          &outputs, nullptr);
-  TF_ASSERT_OK(s);
-
-  ASSERT_EQ(1, outputs.size());
-  // The first output should be initialized and have the correct
-  // output.
-  auto mat = outputs[0].matrix<float>();
-  ASSERT_TRUE(outputs[0].IsInitialized());
-  EXPECT_FLOAT_EQ(5.0, mat(0, 0));
-}
-
 TEST(DirectSessionTest, KeepsStateAcrossRunsOfSession) {
   GraphDef def;
   Graph g(OpRegistry::Global());
diff --git a/tensorflow/core/framework/run_handler.cc b/tensorflow/core/framework/run_handler.cc
deleted file mode 100644
index 9c6490a603..0000000000
--- a/tensorflow/core/framework/run_handler.cc
+++ /dev/null
@@ -1,248 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#define EIGEN_USE_THREADS
-
-#include "tensorflow/core/framework/run_handler.h"
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/run_handler_util.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/util/ptr_util.h"
-
-namespace tensorflow {
-
-// Contains the concrete implementation of the RunHandler.
-// Externally visible RunHandler class simply forwards the work to this one.
-class RunHandler::Impl {
- public:
-  explicit Impl(RunHandlerPool::Impl* pool_impl) : pool_impl_(pool_impl) {
-    Reset();
-  }
-
-  ~Impl() {}
-
-  void set_inter_op_scheduling_range(std::uint_fast32_t start,
-                                     std::uint_fast32_t limit) {
-    inter_op_scheduling_range_.store(EncodePartition(start, limit),
-                                     std::memory_order_release);
-  }
-
-  std::uint_fast32_t inter_op_scheduling_range() const {
-    return inter_op_scheduling_range_.load(std::memory_order_acquire);
-  }
-
-  // Stores now time (in microseconds) since unix epoch when the handler is
-  // requested via RunHandlerPool::Get().
-  uint64 start_time_us() const { return start_time_us_; }
-
-  void ScheduleInterOpClosure(std::function<void()> fn);
-
-  void Reset();
-
-  RunHandlerPool::Impl* pool_impl() { return pool_impl_; }
-
- private:
-  // Encoding/decoding logic for storing [start, limit) into a single
-  // uint_fast32_t int. We assume that pool_num_threads < (1 << 16).
-  const int kMaxPartitionBits = 16;
-  const int kMaxThreads = 1 << kMaxPartitionBits;
-
-  std::uint_fast32_t EncodePartition(std::uint_fast32_t start,
-                                     std::uint_fast32_t limit) {
-    return (start << kMaxPartitionBits) | limit;
-  }
-
-  void DecodePartition(std::uint_fast32_t val, std::uint_fast32_t* start,
-                       std::uint_fast32_t* limit) {
-    *limit = val & (kMaxThreads - 1);
-    val >>= kMaxPartitionBits;
-    *start = val;
-  }
-
-  std::atomic_uint_fast32_t inter_op_scheduling_range_;
-  RunHandlerPool::Impl* pool_impl_;  // NOT OWNED.
-  uint64 start_time_us_;
-};
-
-// Contains shared state across all run handlers present in the pool. Also
-// responsible for pool management decisions.
-// This class is thread safe.
-class RunHandlerPool::Impl {
- public:
-  // Maximum number of handlers pre-created during pool construction time. The
-  // number has been chosen expecting each handler might at least want 1
-  // inter-op thread for execution (during compute intensive workloads like
-  // inference).
-  static const int kMaxHandlers = 128;
-
-  explicit Impl(int num_inter_op_threads)
-      : inter_op_thread_pool_(new thread::ThreadPool(
-            Env::Default(), ThreadOptions(), "inter_op", num_inter_op_threads)),
-        iterations_(0) {
-    VLOG(1) << "Creating a RunHandlerPool with max handlers: " << kMaxHandlers;
-    for (int i = 0; i < kMaxHandlers; ++i) {
-      handlers_.emplace_back(new RunHandler::Impl(this));
-      free_handlers_.push_back(handlers_.back().get());
-    }
-  }
-
-  ~Impl() {
-    // Sanity check that all handlers have been returned back to the pool before
-    // destruction.
-    DCHECK_EQ(handlers_.size(), kMaxHandlers);
-    DCHECK_EQ(free_handlers_.size(), handlers_.size());
-    DCHECK_EQ(sorted_active_handlers_.size(), 0);
-  }
-
-  thread::ThreadPool* inter_op_thread_pool() const {
-    return inter_op_thread_pool_.get();
-  }
-
-  std::unique_ptr<RunHandler> Get() LOCKS_EXCLUDED(mu_) {
-    mutex_lock l(mu_);
-    while (free_handlers_.empty()) {
-      one_handler_free_.wait(l);
-    }
-    // Remove the last entry from free_handlers_ and add to the end of
-    // sorted_active_handlers_.
-    auto* handler_impl = free_handlers_.back();
-    handler_impl->Reset();
-    // Sortedness isn't violated if we simply add at the end of the list, since
-    // handlers are expected to be obtained in increasing order of time.
-    sorted_active_handlers_.push_back(handler_impl);
-    DCHECK_LE(sorted_active_handlers_.size(), kMaxHandlers);
-    free_handlers_.pop_back();
-
-    RecomputePoolStatsLocked();
-    return WrapUnique<RunHandler>(new RunHandler(handler_impl));
-  }
-
-  void ReleaseHandler(RunHandler::Impl* handler) LOCKS_EXCLUDED(mu_) {
-    {
-      mutex_lock l(mu_);
-      DCHECK_GT(sorted_active_handlers_.size(), 0);
-
-      uint64 now = tensorflow::Env::Default()->NowMicros();
-      double elapsed = (now - handler->start_time_us()) / 1000.0;
-      time_hist_.Add(elapsed);
-
-      // Erase from and update sorted_active_handlers_. Add it to the end of
-      // free_handlers_.
-      auto iter = std::find(sorted_active_handlers_.begin(),
-                            sorted_active_handlers_.end(), handler);
-      DCHECK(iter != sorted_active_handlers_.end())
-          << "Unexpected handler: " << handler
-          << " is being requested for release";
-
-      // Remove this handler from this list and add it to the list of free
-      // handlers.
-      sorted_active_handlers_.erase(iter);
-      free_handlers_.push_back(handler);
-      DCHECK_LE(free_handlers_.size(), kMaxHandlers);
-
-      RecomputePoolStatsLocked();
-    }
-    one_handler_free_.notify_one();
-  }
-
- private:
-  void RecomputePoolStatsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  // Thread safe part.
-  const std::unique_ptr<thread::ThreadPool> inter_op_thread_pool_;
-
-  // Thread compatible part used only by lock under RunHandlerPool.
-  // Handlers are sorted by start time.
-  std::vector<RunHandler::Impl*> sorted_active_handlers_ GUARDED_BY(mu_);
-  std::vector<RunHandler::Impl*> free_handlers_ GUARDED_BY(mu_);
-  std::vector<std::unique_ptr<RunHandler::Impl>> handlers_ GUARDED_BY(mu_);
-  // Histogram of elapsed runtime of every handler (in ms).
-  histogram::Histogram time_hist_ GUARDED_BY(mu_);
-  std::vector<std::uint_fast32_t> inter_op_start_ GUARDED_BY(mu_);
-  std::vector<std::uint_fast32_t> inter_op_limit_ GUARDED_BY(mu_);
-  int64 iterations_ GUARDED_BY(mu_);
-  condition_variable one_handler_free_;
-  mutex mu_;
-};
-
-void RunHandlerPool::Impl::RecomputePoolStatsLocked() {
-  int num_active_requests = sorted_active_handlers_.size();
-  if (num_active_requests == 0) return;
-
-  int num_threads = inter_op_thread_pool_->NumThreads();
-
-  inter_op_start_.resize(num_active_requests);
-  inter_op_limit_.resize(num_active_requests);
-
-  const int kMinThreadsPerRequest = 3;
-  ComputeInterOpSchedulingRanges(num_active_requests, num_threads,
-                                 kMinThreadsPerRequest, &inter_op_start_,
-                                 &inter_op_limit_);
-
-  for (int i = 0; i < num_active_requests; ++i) {
-    sorted_active_handlers_[i]->set_inter_op_scheduling_range(
-        inter_op_start_[i], inter_op_limit_[i]);
-  }
-
-  if (iterations_++ % 5000 == 0 && VLOG_IS_ON(1)) {
-    VLOG(1) << "Printing time histogram: " << time_hist_.ToString();
-    VLOG(1) << "Active session runs: " << num_active_requests;
-    uint64 now = tensorflow::Env::Default()->NowMicros();
-    string ranges_str = "";
-    string times_str = "";
-    for (int i = 0; i < num_active_requests; ++i) {
-      if (i > 0) {
-        times_str += " ";
-        ranges_str += " ";
-      }
-
-      times_str += strings::StrCat(
-          (now - sorted_active_handlers_[i]->start_time_us()) / 1000.0, " ms.");
-      ranges_str += strings::StrCat("[", inter_op_start_[i], ", ",
-                                    inter_op_limit_[i], ")");
-    }
-    VLOG(1) << "Elapsed times are: " << times_str;
-    VLOG(1) << "Ranges are: " << ranges_str;
-  }
-}
-
-void RunHandler::Impl::ScheduleInterOpClosure(std::function<void()> fn) {
-  std::uint_fast32_t start = 0, limit = 0;
-  DecodePartition(inter_op_scheduling_range(), &start, &limit);
-  pool_impl_->inter_op_thread_pool()->Schedule(std::move(fn));
-}
-
-void RunHandler::Impl::Reset() {
-  set_inter_op_scheduling_range(
-      0, pool_impl_->inter_op_thread_pool()->NumThreads());
-  start_time_us_ = tensorflow::Env::Default()->NowMicros();
-}
-
-RunHandlerPool::RunHandlerPool(int num_inter_op_threads)
-    : impl_(new Impl(num_inter_op_threads)) {}
-
-RunHandlerPool::~RunHandlerPool() {}
-
-std::unique_ptr<RunHandler> RunHandlerPool::Get() { return impl_->Get(); }
-
-RunHandler::RunHandler(Impl* impl) : impl_(impl) {}
-
-void RunHandler::ScheduleInterOpClosure(std::function<void()> fn) {
-  impl_->ScheduleInterOpClosure(std::move(fn));
-}
-
-RunHandler::~RunHandler() { impl_->pool_impl()->ReleaseHandler(impl_); }
-}  // namespace tensorflow
diff --git a/tensorflow/core/framework/run_handler.h b/tensorflow/core/framework/run_handler.h
deleted file mode 100644
index 72fa6301b4..0000000000
--- a/tensorflow/core/framework/run_handler.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_H_
-#define TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_H_
-
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/histogram/histogram.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/protobuf/config.pb.h"
-
-namespace tensorflow {
-
-class RunHandler;
-
-// RunHandlerPool is a fixed size pool of pre-allocated RunHandlers
-// that can be used for tracking inter-op work for a given Session::Run().
-// RunHandler(s) in the pool are initially 'inactive'. A RunHandler becomes
-// 'active' when its unique_ptr is returned by Get() and is being used by a
-// client. It becomes 'inactive' once more when its unique_ptr gets destroyed.
-//
-// Expected usage:
-//
-// * Create a single RunHandlerPool (say run_handler_pool_).
-//
-// * When a Session::Run() is invoked, obtain a handler by:
-// auto handler = run_handler_pool_->Get();
-//
-// * Use handler for scheduling all inter-op work by:
-// handler->ScheduleInterOpClosure(closure);
-//
-// This class is thread safe.
-class RunHandlerPool {
- public:
-  explicit RunHandlerPool(int num_inter_op_threads);
-  ~RunHandlerPool();
-
-  // Returns an inactive RunHandler from the pool.
-  //
-  // RunHandlers in RunHandlerPool are initially 'inactive'.
-  // A RunHandler becomes 'active' when its unique_ptr its returned by Get()
-  // and is being used by a client.  It becomes 'inactive' once more when the
-  // unique_ptr is destroyed.
-  //
-  // Will block unless there is an inactive handler.
-  std::unique_ptr<RunHandler> Get();
-
- private:
-  class Impl;
-  friend class RunHandler;
-
-  std::unique_ptr<Impl> impl_;
-};
-
-// RunHandler can be used to schedule inter-op closures to run on a global pool
-// shared across all Session::Run(s).
-//
-// It can only be created via RunHandlerPool::Get().
-//
-// This class can be used instead of directly scheduling closures on a global
-// pool since it maintains a global view across all sessions and optimizes pool
-// scheduling to improve (median and tail) latency.
-//
-// This class is thread safe.
-class RunHandler {
- public:
-  void ScheduleInterOpClosure(std::function<void()> fn);
-
-  ~RunHandler();
-
- private:
-  class Impl;
-  friend class RunHandlerPool::Impl;
-
-  explicit RunHandler(Impl* impl);
-
-  Impl* impl_;  // NOT OWNED.
-};
-
-}  // end namespace tensorflow.
-
-#endif  // TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_H_
diff --git a/tensorflow/core/framework/run_handler_util.cc b/tensorflow/core/framework/run_handler_util.cc
deleted file mode 100644
index 3087998c69..0000000000
--- a/tensorflow/core/framework/run_handler_util.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/run_handler_util.h"
-
-#include <algorithm>
-#include <cmath>
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-
-void ComputeInterOpSchedulingRanges(int num_active_requests, int num_threads,
-                                    int min_threads_per_request,
-                                    std::vector<std::uint_fast32_t>* start_vec,
-                                    std::vector<std::uint_fast32_t>* end_vec) {
-  // Each request is expected to have weight W[i] = num_active_requests - i.
-  // Therefore, total_weight = sum of all request weights.
-  float total_weight = 0.5f * num_active_requests * (num_active_requests + 1);
-  float demand_factor = static_cast<float>(num_threads) / total_weight;
-  float last_cumulative_weight = 0.0;
-  min_threads_per_request = std::max(1, min_threads_per_request);
-  for (int i = 0; i != num_active_requests; i++) {
-    float cumulative_weight =
-        static_cast<float>(i + 1) *
-        (num_active_requests - static_cast<float>(i) * 0.5f);
-    float weight = cumulative_weight - last_cumulative_weight;
-    // Quantize thread_demand by rounding up, and also satisfying
-    // `min_threads_per_request` constraint.
-    // Note: We subtract a small epsilon (0.00001) to prevent ceil(..) from
-    // rounding weights like 4.0 to 5.
-    int demand =
-        std::max(min_threads_per_request,
-                 static_cast<int>(ceil(weight * demand_factor - 0.00001f)));
-    // For the quantized range [start, end); compute the floor of real start,
-    // and expand downwards from there with length `demand` and adjust for
-    // boundary conditions.
-    int start = last_cumulative_weight * demand_factor;
-    int end = std::min(num_threads, start + demand);
-    start = std::max(0, std::min(start, end - demand));
-    start_vec->at(i) = start;
-    end_vec->at(i) = end;
-    last_cumulative_weight = cumulative_weight;
-  }
-}
-}  // namespace tensorflow
diff --git a/tensorflow/core/framework/run_handler_util.h b/tensorflow/core/framework/run_handler_util.h
deleted file mode 100644
index c0c36aeccb..0000000000
--- a/tensorflow/core/framework/run_handler_util.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_
-#define TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_
-
-#include <cstdint>
-#include <vector>
-
-namespace tensorflow {
-
-// Assign thread ranges to requests.
-// Requests are numbered 0...num_active_requests-1, and
-// threads are numbered 0...num_threads-1.
-// On return, the range start_vec->at(i)...end_vec->at(i)-1
-// indicates the subrange of the threads available to request i.
-// The ranges given to different requests may overlap.
-// Lower numbered requests will tend to be assigned more threads.
-// Thus, a client might associate older requests with lower
-// array indices so they receive access to more threads.
-// However, the routine ensures that each request is given access
-// to at least min(min_threads_per_request, num_threads)  threads.
-// Every thread will be assigned to at least one request range,
-// assuming there is at least one request.
-void ComputeInterOpSchedulingRanges(int num_active_requests, int num_threads,
-                                    int min_threads_per_request,
-                                    std::vector<std::uint_fast32_t>* start_vec,
-                                    std::vector<std::uint_fast32_t>* end_vec);
-
-}  // end namespace tensorflow
-#endif  // TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_
diff --git a/tensorflow/core/framework/run_handler_util_test.cc b/tensorflow/core/framework/run_handler_util_test.cc
deleted file mode 100644
index a1928c132b..0000000000
--- a/tensorflow/core/framework/run_handler_util_test.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/run_handler_util.h"
-
-#include <vector>
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/test.h"
-namespace tensorflow {
-namespace {
-
-void VerifyFunction(int num_active_requests, int num_threads,
-                    int min_threads_per_request, bool print_stats = false) {
-  if (print_stats) {
-    LOG(INFO) << "Test case# num_active_requests: " << num_active_requests
-              << " num_threads: " << num_threads
-              << " min_threads: " << min_threads_per_request;
-  }
-  std::vector<std::uint_fast32_t> start(num_active_requests);
-  std::vector<std::uint_fast32_t> end(num_active_requests);
-
-  ComputeInterOpSchedulingRanges(num_active_requests, num_threads,
-                                 min_threads_per_request, &start, &end);
-  string range_str = "";
-  for (int i = 0; i < num_active_requests; ++i) {
-    if (i > 0) range_str += " ";
-    range_str += strings::StrCat("[", start[i], ", ", end[i], ")");
-
-    ASSERT_GE(start[i], 0) << range_str;
-    ASSERT_LE(end[i], num_threads) << range_str;
-    if (i > 0) {
-      // Due to linearly decreasing demand, #threads(i - 1) >= #threads(i)
-      ASSERT_GE(end[i - 1] - start[i - 1], end[i] - start[i]) << range_str;
-      // No missing threads.
-      ASSERT_GE(end[i - 1], start[i]) << range_str;
-    }
-    // Each interval is at least of size 'min_threads_per_request'.
-    ASSERT_GE((end[i] - start[i]), min_threads_per_request) << range_str;
-    // Verify that assigned (quantized) threads is not overly estimated
-    // from real demand, when the demand is high (>=
-    // min_threads_per_request).
-    float entry_weight = num_active_requests - i;
-    float total_weight = 0.5f * num_active_requests * (num_active_requests + 1);
-    float thread_demand = (entry_weight * num_threads) / total_weight;
-    if (thread_demand > min_threads_per_request) {
-      // We expect some over-estimation of threads due to quantization,
-      // but we hope it's not more than 1 extra thread.
-      ASSERT_NEAR(end[i] - start[i], thread_demand, 1.0)
-          << "Ranges: " << range_str << " thread_demand: " << thread_demand
-          << " i: " << i;
-    }
-  }
-  ASSERT_EQ(end[num_active_requests - 1], num_threads);
-  ASSERT_EQ(start[0], 0);
-  if (print_stats) {
-    LOG(INFO) << "Assigned ranges: " << range_str;
-  }
-}
-
-TEST(RunHandlerUtilTest, TestComputeInterOpSchedulingRanges) {
-  const int kMinThreadsPerRequestBound = 12;
-  const int kMaxActiveRequests = 128;
-  const int kMaxThreads = 128;
-
-  for (int min_threads_per_request = 1;
-       min_threads_per_request <= kMinThreadsPerRequestBound;
-       ++min_threads_per_request) {
-    for (int num_active_requests = 1; num_active_requests <= kMaxActiveRequests;
-         ++num_active_requests) {
-      for (int num_threads = min_threads_per_request;
-           num_threads <= kMaxThreads; ++num_threads) {
-        VerifyFunction(num_active_requests, num_threads,
-                       min_threads_per_request);
-      }
-    }
-  }
-}
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 104ab039cb..85cd02350a 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -453,11 +453,6 @@ message RunOptions {
     // same group_key value (in a distributed computation where tasks
     // run disjoint graphs).
     int64 collective_graph_key = 1;
-    // If true, then operations (using the inter-op pool) across all
-    // session::run() calls will be centrally scheduled, optimizing for (median
-    // and tail) latency.
-    // Consider using this option for CPU-bound workloads like inference.
-    bool use_run_handler_pool = 2;
   };
 
   Experimental experimental = 8;
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
index 47b5b56faf..537e73aa89 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
@@ -8,11 +8,5 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_INT64
     }
-    field {
-      name: "use_run_handler_pool"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
index c0c2e7b9f8..cec04a2bf0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
@@ -55,12 +55,6 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_INT64
       }
-      field {
-        name: "use_run_handler_pool"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
     }
     enum_type {
       name: "TraceLevel"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
index 47b5b56faf..537e73aa89 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
@@ -8,11 +8,5 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_INT64
     }
-    field {
-      name: "use_run_handler_pool"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
index c0c2e7b9f8..cec04a2bf0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
@@ -55,12 +55,6 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_INT64
       }
-      field {
-        name: "use_run_handler_pool"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
     }
     enum_type {
       name: "TraceLevel"
-- 
GitLab


From 5f67bf69d3f53d1cd3bb86ebeeb03ea2bba5911b Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Thu, 27 Sep 2018 16:16:26 -0700
Subject: [PATCH 109/570] Support nested variants in CopyHostToDevice and
 CopyDeviceToHost.

PiperOrigin-RevId: 214853860
---
 tensorflow/core/common_runtime/copy_tensor.cc | 82 +++++++++++--------
 tensorflow/python/kernel_tests/BUILD          |  4 +-
 .../python/kernel_tests/list_ops_test.py      | 26 ++++++
 3 files changed, 75 insertions(+), 37 deletions(-)

diff --git a/tensorflow/core/common_runtime/copy_tensor.cc b/tensorflow/core/common_runtime/copy_tensor.cc
index d800a86199..6e2eb66b94 100644
--- a/tensorflow/core/common_runtime/copy_tensor.cc
+++ b/tensorflow/core/common_runtime/copy_tensor.cc
@@ -61,26 +61,33 @@ void CopyHostToDevice(const Tensor* input, Allocator* cpu_allocator,
       status_cb->Unref();
     };
     auto copier = std::bind(
-        [dst, recv_dev_context, out_allocator, status_cb](
-            StatusCallback wrapped_done_,
-            // Begin unbound arguments
-            const Tensor& from, Tensor* to) {
-          if (!DMAHelper::CanUseDMA(&from)) {
-            Status err = errors::InvalidArgument(
-                "During Variant Host->Device Copy: "
-                "non-DMA-copy attempted of tensor type: ",
-                DataTypeString(from.dtype()));
-            status_cb->UpdateStatus(err);
-            return err;
-          }
-          if (status_cb->ok()) {
+        [dst, recv_dev_context, out_allocator, status_cb, cpu_allocator,
+         edge_name](StatusCallback wrapped_done_,
+                    // Begin unbound arguments
+                    const Tensor& from, Tensor* to) {
+          if (from.dtype() == DT_VARIANT) {
             status_cb->Ref();
-            *to = Tensor(out_allocator, from.dtype(), from.shape());
-            recv_dev_context->CopyCPUTensorToDevice(&from, dst, to,
-                                                    wrapped_done_);
+            CopyHostToDevice(&from, cpu_allocator, out_allocator, edge_name,
+                             dst, to, recv_dev_context, wrapped_done_);
             return Status::OK();
           } else {
-            return status_cb->status();
+            if (!DMAHelper::CanUseDMA(&from)) {
+              Status err = errors::InvalidArgument(
+                  "During Variant Host->Device Copy: "
+                  "non-DMA-copy attempted of tensor type: ",
+                  DataTypeString(from.dtype()));
+              status_cb->UpdateStatus(err);
+              return err;
+            }
+            if (status_cb->ok()) {
+              status_cb->Ref();
+              *to = Tensor(out_allocator, from.dtype(), from.shape());
+              recv_dev_context->CopyCPUTensorToDevice(&from, dst, to,
+                                                      wrapped_done_);
+              return Status::OK();
+            } else {
+              return status_cb->status();
+            }
           }
         },
         std::move(wrapped_done), std::placeholders::_1, std::placeholders::_2);
@@ -119,26 +126,33 @@ void CopyDeviceToHost(const Tensor* input, Allocator* cpu_allocator,
       status_cb->Unref();
     };
     auto copier = std::bind(
-        [edge_name, src, send_dev_context, out_allocator, status_cb](
-            StatusCallback wrapped_done_,
-            // Begin unbound arguments
-            const Tensor& from, Tensor* to) {
-          if (!DMAHelper::CanUseDMA(&from)) {
-            Status err = errors::InvalidArgument(
-                "During Variant Device->Host Copy: "
-                "non-DMA-copy attempted of tensor type: ",
-                DataTypeString(from.dtype()));
-            status_cb->UpdateStatus(err);
-            return err;
-          }
-          if (status_cb->ok()) {
+        [edge_name, src, send_dev_context, out_allocator, status_cb,
+         cpu_allocator](StatusCallback wrapped_done_,
+                        // Begin unbound arguments
+                        const Tensor& from, Tensor* to) {
+          if (from.dtype() == DT_VARIANT) {
             status_cb->Ref();
-            *to = Tensor(out_allocator, from.dtype(), from.shape());
-            send_dev_context->CopyDeviceTensorToCPU(&from, edge_name, src, to,
-                                                    wrapped_done_);
+            CopyDeviceToHost(&from, cpu_allocator, out_allocator, edge_name,
+                             src, to, send_dev_context, wrapped_done_);
             return Status::OK();
           } else {
-            return status_cb->status();
+            if (!DMAHelper::CanUseDMA(&from)) {
+              Status err = errors::InvalidArgument(
+                  "During Variant Device->Host Copy: "
+                  "non-DMA-copy attempted of tensor type: ",
+                  DataTypeString(from.dtype()));
+              status_cb->UpdateStatus(err);
+              return err;
+            }
+            if (status_cb->ok()) {
+              status_cb->Ref();
+              *to = Tensor(out_allocator, from.dtype(), from.shape());
+              send_dev_context->CopyDeviceTensorToCPU(&from, edge_name, src, to,
+                                                      wrapped_done_);
+              return Status::OK();
+            } else {
+              return status_cb->status();
+            }
           }
         },
         std::move(wrapped_done), std::placeholders::_1, std::placeholders::_2);
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index c2e36e5e19..280c18ec00 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -3257,8 +3257,7 @@ tf_py_test(
     tags = ["no_gpu"],  # TODO(b/111656070)
 )
 
-# TODO(b/116053459): Replace with cuda_py_test.
-tf_py_test(
+cuda_py_test(
     name = "while_v2_test",
     size = "medium",
     srcs = ["while_v2_test.py"],
@@ -3278,5 +3277,4 @@ tf_py_test(
         "//tensorflow/python:while_v2",
     ],
     grpc_enabled = True,
-    tags = ["no_gpu"],  # TODO(b/116053459)
 )
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 0f5607712b..ae413edaec 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -170,6 +170,32 @@ class ListOpsTest(test_util.TensorFlowTestCase):
             list_ops.tensor_list_pop_back(
                 l_cpu, element_dtype=dtypes.float32)[1]), 2.0)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testCPUGPUCopyNested(self):
+    if not context.num_gpus():
+      return
+    t = constant_op.constant([1.0, 2.0])
+    child_l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    l = list_ops.empty_tensor_list(
+        element_shape=constant_op.constant([], dtype=dtypes.int32),
+        element_dtype=dtypes.variant)
+    l = list_ops.tensor_list_push_back(l, child_l)
+    with context.device("gpu:0"):
+      l_gpu = array_ops.identity(l)
+      _, child_l_gpu = list_ops.tensor_list_pop_back(
+          l_gpu, element_dtype=dtypes.variant)
+      self.assertAllEqual(
+          self.evaluate(
+              list_ops.tensor_list_pop_back(
+                  child_l_gpu, element_dtype=dtypes.float32)[1]), 2.0)
+    l_cpu = array_ops.identity(l_gpu)
+    _, child_l_cpu = list_ops.tensor_list_pop_back(
+        l_cpu, element_dtype=dtypes.variant)
+    self.assertAllEqual(
+        self.evaluate(
+            list_ops.tensor_list_pop_back(
+                child_l_cpu, element_dtype=dtypes.float32)[1]), 2.0)
+
   def testGraphStack(self):
     with self.cached_session():
       tl = list_ops.empty_tensor_list(
-- 
GitLab


From 2330933ddd0b29ad206e351c9120e621cdaf6312 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Thu, 27 Sep 2018 16:19:09 -0700
Subject: [PATCH 110/570] Rename TFLite Extended -> TFLite Flex

PiperOrigin-RevId: 214854303
---
 tensorflow/contrib/lite/build_def.bzl                  |  4 ++--
 tensorflow/contrib/lite/model.cc                       |  4 ++--
 tensorflow/contrib/lite/python/convert.py              |  8 ++++----
 tensorflow/contrib/lite/python/lite_test.py            |  4 ++--
 tensorflow/contrib/lite/testing/generate_examples.py   | 10 +++++-----
 .../testing/model_coverage/model_coverage_lib_test.py  |  2 +-
 tensorflow/contrib/lite/tools/benchmark/BUILD          |  4 ++--
 .../lite/tools/benchmark/benchmark_tflite_model.cc     |  8 ++++----
 .../lite/tools/benchmark/benchmark_tflite_model.h      |  8 ++++----
 9 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 7f5c6bdc2f..7ef26de69f 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -301,7 +301,7 @@ def generated_test_conversion_modes():
     """Returns a list of conversion modes."""
 
     # TODO(nupurgarg): Add "pb2lite" when it's in open source. b/113614050.
-    return ["toco-extended", ""]
+    return ["toco-flex", ""]
 
 def generated_test_models_all():
     """Generates a list of all tests with the different converters.
@@ -335,7 +335,7 @@ def gen_zip_test(name, test_name, conversion_mode, **kwargs):
         # TODO(nupurgarg): Comment in when pb2lite is in open source. b/113614050.
         # if conversion_mode == "pb2lite":
         #     toco = "//tensorflow/contrib/lite/experimental/pb2lite:pb2lite"
-        flags = "--ignore_toco_errors --run_with_extended"
+        flags = "--ignore_toco_errors --run_with_flex"
         kwargs["tags"].append("skip_already_failing")
         kwargs["tags"].append("no_oss")
         kwargs["tags"].append("notap")
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index eff6181a61..d50c345194 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #ifndef TFLITE_MCU
 #include "tensorflow/contrib/lite/nnapi_delegate.h"
 #endif
-#if defined(TFLITE_EXTENDED)
+#if defined(TFLITE_FLEX)
 #include "tensorflow/contrib/lite/delegates/flex/delegate.h"
 #endif
 #include "tensorflow/contrib/lite/version.h"
@@ -450,7 +450,7 @@ TfLiteStatus InterpreterBuilder::operator()(
   }
   (**interpreter).SetVariables(std::move(variables));
 
-#if defined(TFLITE_EXTENDED)
+#if defined(TFLITE_FLEX)
   if (auto delegate = FlexDelegate::Create()) {
     (**interpreter)
         .ModifyGraphWithDelegate(std::move(delegate),
diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index 73a420c47b..613a1530f7 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -67,12 +67,12 @@ class ConverterMode(enum.Enum):
   # Convert model using TOCO such that only unsupported operations are
   # represented as TensorFlow ops.
   # WARNING: Experimental interface, subject to change.
-  TOCO_EXTENDED = "TOCO_EXTENDED"
+  TOCO_FLEX = "TOCO_FLEX"
 
   # Convert model using TOCO such that all operations are represented as
   # TensorFlow ops.
   # WARNING: Experimental interface, subject to change.
-  TOCO_EXTENDED_ALL = "TOCO_EXTENDED_ALL"
+  TOCO_FLEX_ALL = "TOCO_FLEX_ALL"
 
   def __str__(self):
     return self.value
@@ -240,9 +240,9 @@ def build_toco_convert_protos(input_tensors,
   if dump_graphviz_dir:
     toco.dump_graphviz_dir = dump_graphviz_dir
   toco.dump_graphviz_include_video = dump_graphviz_video
-  if converter_mode == ConverterMode.TOCO_EXTENDED:
+  if converter_mode == ConverterMode.TOCO_FLEX:
     toco.allow_flex_ops = True
-  elif converter_mode == ConverterMode.TOCO_EXTENDED_ALL:
+  elif converter_mode == ConverterMode.TOCO_FLEX_ALL:
     toco.allow_flex_ops = True
     toco.force_flex_ops = True
 
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index 7b0df01d1d..d243a494f6 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -412,7 +412,7 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     # Ensure that the quantized weights tflite model is smaller.
     self.assertTrue(len(quantized_tflite) < len(float_tflite))
 
-  def testExtendedMode(self):
+  def testFlexMode(self):
     in_tensor = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32)
     out_tensor = in_tensor + in_tensor
@@ -421,7 +421,7 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     # Convert model and ensure model is not None.
     converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
                                                   [out_tensor])
-    converter.converter_mode = lite.ConverterMode.TOCO_EXTENDED_ALL
+    converter.converter_mode = lite.ConverterMode.TOCO_FLEX_ALL
     tflite_model = converter.convert()
     self.assertTrue(tflite_model)
 
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 53bd88d087..18036fac6f 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -81,9 +81,9 @@ parser.add_argument(
     action="store_true",
     help="Include intermediate graphdefs in the output zip files.")
 parser.add_argument(
-    "--run_with_extended",
+    "--run_with_flex",
     action="store_true",
-    help="Whether the TFLite Extended converter is being used.")
+    help="Whether the TFLite Flex converter is being used.")
 
 RANDOM_SEED = 342
 TEST_INPUT_DEPTH = 3
@@ -339,10 +339,10 @@ def toco_convert(graph_def_str, input_tensors, output_tensors,
     graphdef_file.flush()
 
     # TODO(aselle): Switch this to subprocess at some point.
-    if "pb2lite" in bin_path and FLAGS.run_with_extended:
+    if "pb2lite" in bin_path and FLAGS.run_with_flex:
       opts = ("--input_arrays={0} --output_arrays={1}".format(
           ",".join(input_arrays), ",".join(output_tensors)))
-    elif FLAGS.run_with_extended:
+    elif FLAGS.run_with_flex:
       opts += " --allow_flex_ops --force_flex_ops"
     cmd = ("%s --input_file=%s --output_file=%s %s > %s 2>&1" %
            (bin_path, graphdef_file.name, output_file.name, opts,
@@ -3333,7 +3333,7 @@ def main(unused_args):
   # list of valid conversion modes is defined in
   # generated_test_conversion_modes() in build_def.bzl.
   test_function = ("make_%s_tests" % (out.replace(".zip", "").replace(
-      "pb2lite", "").replace("toco-extended", "").rstrip("_")))
+      "pb2lite", "").replace("toco-flex", "").rstrip("_")))
   if test_function not in globals():
     raise RuntimeError("Can't find a test function to create %r. Tried %r" %
                        (out, test_function))
diff --git a/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib_test.py b/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib_test.py
index 5f3355e734..1498f86c6f 100644
--- a/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib_test.py
+++ b/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib_test.py
@@ -123,7 +123,7 @@ class EvaluateKerasModel(test.TestCase):
     keras_file = self._saveKerasModel(model)
 
     model_coverage.test_keras_model(
-        keras_file, converter_mode=lite.ConverterMode.TOCO_EXTENDED)
+        keras_file, converter_mode=lite.ConverterMode.TOCO_FLEX)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/lite/tools/benchmark/BUILD b/tensorflow/contrib/lite/tools/benchmark/BUILD
index bc18d40313..502e181139 100644
--- a/tensorflow/contrib/lite/tools/benchmark/BUILD
+++ b/tensorflow/contrib/lite/tools/benchmark/BUILD
@@ -40,7 +40,7 @@ cc_binary(
     srcs = [
         "benchmark_main.cc",
     ],
-    copts = common_copts + ["-DTFLITE_EXTENDED"],
+    copts = common_copts + ["-DTFLITE_FLEX"],
     linkopts = tflite_linkopts() + select({
         "//tensorflow:android": [
             "-pie",  # Android 5.0 and later supports only PIE
@@ -117,7 +117,7 @@ cc_library(
         "logging.h",
     ],
     hdrs = ["benchmark_tflite_model.h"],
-    copts = common_copts + ["-DTFLITE_EXTENDED"],
+    copts = common_copts + ["-DTFLITE_FLEX"],
     deps = [
         ":benchmark_model_lib",
         ":logging",
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
index d989ee720d..463d5993f4 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -23,9 +23,9 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
-#ifdef TFLITE_EXTENDED
+#ifdef TFLITE_FLEX
 #include "tensorflow/contrib/lite/delegates/flex/delegate.h"
-#endif  // TFLITE_EXTENDED
+#endif  // TFLITE_FLEX
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/op_resolver.h"
@@ -305,14 +305,14 @@ void BenchmarkTfLiteModel::Init() {
 
   interpreter->UseNNAPI(use_nnapi);
 
-#ifdef TFLITE_EXTENDED
+#ifdef TFLITE_FLEX
   TFLITE_LOG(INFO) << "Instantiating Flex Delegate";
   delegate_ = FlexDelegate::Create();
   if (delegate_) {
     interpreter->ModifyGraphWithDelegate(delegate_.get(),
                                          /*allow_dynamic_tensors=*/true);
   }
-#endif  // TFLITE_EXTENDED
+#endif  // TFLITE_FLEX
 
   auto interpreter_inputs = interpreter->inputs();
 
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
index 9343824b4a..b091e18a29 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
@@ -20,9 +20,9 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#ifdef TFLITE_EXTENDED
+#ifdef TFLITE_FLEX
 #include "tensorflow/contrib/lite/delegates/flex/delegate.h"
-#endif  // TFLITE_EXTENDED
+#endif  // TFLITE_FLEX
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/profiling/profile_summarizer.h"
 #include "tensorflow/contrib/lite/tools/benchmark/benchmark_model.h"
@@ -73,9 +73,9 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   void PrepareInputsAndOutputs() override;
 
  private:
-#ifdef TFLITE_EXTENDED
+#ifdef TFLITE_FLEX
   std::unique_ptr<FlexDelegate> delegate_;
-#endif  // TFLITE_EXTENDED
+#endif  // TFLITE_FLEX
   std::unique_ptr<tflite::FlatBufferModel> model;
   std::unique_ptr<tflite::Interpreter> interpreter;
   std::vector<InputLayerInfo> inputs;
-- 
GitLab


From 0a9ee95ed9c26bef58e9daadcb6935807d90fcd3 Mon Sep 17 00:00:00 2001
From: Yanan Cao <ycao@google.com>
Date: Thu, 27 Sep 2018 17:04:17 -0700
Subject: [PATCH 111/570] Disable summary ops from lower-level xla.compile API
 rather than xla.estimator_model_fn

PiperOrigin-RevId: 214860981
---
 tensorflow/contrib/compiler/xla.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/compiler/xla.py b/tensorflow/contrib/compiler/xla.py
index 1e30525159..873b03580d 100644
--- a/tensorflow/contrib/compiler/xla.py
+++ b/tensorflow/contrib/compiler/xla.py
@@ -293,7 +293,8 @@ def _compile_internal(computation, inputs=None):
     saved_use_resource = vscope.use_resource
     vscope.set_use_resource(True)
 
-    outputs = computation(*computation_inputs)
+    with _disable_summary_context():
+      outputs = computation(*computation_inputs)
 
     # Restore variable scope after computation.
     vscope.set_use_resource(saved_use_resource)
@@ -371,13 +372,13 @@ def _disable_summary_context():
   Yields:
     None.
   """
-  origional_skip_summary_func = summary_op_util.skip_summary
+  original_skip_summary_func = summary_op_util.skip_summary
   summary_op_util.skip_summary = lambda: True
 
   try:
     yield
   finally:
-    summary_op_util.skip_summary = origional_skip_summary_func
+    summary_op_util.skip_summary = original_skip_summary_func
 
 
 class _CapturedObject(object):
@@ -436,8 +437,7 @@ class _ModelFnWrapper(object):
     if mode == model_fn_lib.ModeKeys.TRAIN:
       train_step, captured_scaffold_fn = self._make_train_step(
           features, labels, params)
-      with _disable_summary_context():
-        (loss,) = compile(train_step)
+      (loss,) = compile(train_step)
       return model_fn_lib.EstimatorSpec(
           mode=mode,
           loss=loss,
@@ -446,8 +446,7 @@ class _ModelFnWrapper(object):
     elif mode == model_fn_lib.ModeKeys.EVAL:
       eval_step, captured_eval_metric_fn, captured_scaffold_fn = (
           self._make_eval_step(features, labels, params))
-      with _disable_summary_context():
-        outputs = compile(eval_step)
+      outputs = compile(eval_step)
       loss = outputs[0]
 
       # Calculate eval_metric_ops if eval_metric_fn is set and captured.
-- 
GitLab


From 9b8390e7cd664d8fad9dd3f7172a56135585b481 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Thu, 27 Sep 2018 17:04:19 -0700
Subject: [PATCH 112/570] Remove testing non-core APIs from
 api_compatibility_test.

Some APIs are moving out of core TF repo. These APIs will have their own
API compat tests. Adding flag --only_test_core_api=true which will not
check for changes to non-core APIs.

PiperOrigin-RevId: 214860984
---
 tensorflow/tools/api/tests/BUILD              |  1 +
 .../tools/api/tests/api_compatibility_test.py | 39 +++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 4efa4a9651..3cbea41dca 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -19,6 +19,7 @@ py_test(
         "api_compatibility_test.py",
         "//tensorflow:tf_python_api_gen_v2",
     ],
+    args = ["--only_test_core_api=true"],
     data = [
         "//tensorflow/tools/api/golden:api_golden_v1",
         "//tensorflow/tools/api/golden:api_golden_v2",
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index d06c7f2d49..6487a6267e 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -56,6 +56,14 @@ _UPDATE_GOLDENS_HELP = """
      have to be authorized by TensorFlow leads.
 """
 
+# DEFINE_boolean, only_test_core_api, default False:
+_ONLY_TEST_CORE_API_HELP = """
+    Some TF APIs are being moved outside of the tensorflow/ directory. There is
+    no garuntee which versions of these APIs will be present when running this
+    test. Therefore, do not error out on API changes in non-core TF code
+    if this flag is set.
+"""
+
 # DEFINE_boolean, verbose_diffs, default True:
 _VERBOSE_DIFFS_HELP = """
      If set to true, print line by line diffs on all libraries. If set to
@@ -67,6 +75,8 @@ _API_GOLDEN_FOLDER_V2 = 'tensorflow/tools/api/golden/v2'
 _TEST_README_FILE = 'tensorflow/tools/api/tests/README.txt'
 _UPDATE_WARNING_FILE = 'tensorflow/tools/api/tests/API_UPDATE_WARNING.txt'
 
+_NON_CORE_PACKAGES = ['estimator']
+
 
 def _KeyToFilePath(key, api_version):
   """From a given key, construct a filepath.
@@ -111,6 +121,19 @@ def _VerifyNoSubclassOfMessageVisitor(path, parent, unused_children):
         'They are not yet supported by the API tools.' % path)
 
 
+def _FilterNonCoreGoldenFiles(golden_file_list):
+  """Filter out non-core API pbtxt files."""
+  filtered_file_list = []
+  filtered_package_prefixes = [
+      'tensorflow.%s.' % p for p in _NON_CORE_PACKAGES]
+  for f in golden_file_list:
+    if any([f.rsplit('/')[-1].startswith(pre)
+            for pre in filtered_package_prefixes]):
+      continue
+    filtered_file_list.append(f)
+  return filtered_file_list
+
+
 class ApiCompatibilityTest(test.TestCase):
 
   def __init__(self, *args, **kwargs):
@@ -233,6 +256,9 @@ class ApiCompatibilityTest(test.TestCase):
       return
     visitor = public_api.PublicAPIVisitor(_VerifyNoSubclassOfMessageVisitor)
     visitor.do_not_descend_map['tf'].append('contrib')
+    if FLAGS.only_test_core_api:
+      visitor.do_not_descend_map['tf'].extend(
+          _NON_CORE_PACKAGES)
     traverse.traverse(tf_v2.compat.v1, visitor)
 
   def testNoSubclassOfMessageV2(self):
@@ -240,6 +266,9 @@ class ApiCompatibilityTest(test.TestCase):
       return
     visitor = public_api.PublicAPIVisitor(_VerifyNoSubclassOfMessageVisitor)
     visitor.do_not_descend_map['tf'].append('contrib')
+    if FLAGS.only_test_core_api:
+      visitor.do_not_descend_map['tf'].extend(
+          _NON_CORE_PACKAGES)
     traverse.traverse(tf_v2, visitor)
 
   def _checkBackwardsCompatibility(
@@ -252,6 +281,9 @@ class ApiCompatibilityTest(test.TestCase):
     public_api_visitor.do_not_descend_map['tf'].append('contrib')
     public_api_visitor.do_not_descend_map['tf.GPUOptions'] = [
         'Experimental']
+    if FLAGS.only_test_core_api:
+      public_api_visitor.do_not_descend_map['tf'].extend(
+          _NON_CORE_PACKAGES)
     if additional_private_map:
       public_api_visitor.private_map.update(additional_private_map)
 
@@ -260,6 +292,8 @@ class ApiCompatibilityTest(test.TestCase):
 
     # Read all golden files.
     golden_file_list = file_io.get_matching_files(golden_file_pattern)
+    if FLAGS.only_test_core_api:
+      golden_file_list = _FilterNonCoreGoldenFiles(golden_file_list)
 
     def _ReadFileToProto(filename):
       """Read a filename, create a protobuf from its contents."""
@@ -325,6 +359,11 @@ if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument(
       '--update_goldens', type=bool, default=False, help=_UPDATE_GOLDENS_HELP)
+  # TODO(mikecase): Create Estimator's own API compatibility test or
+  # a more general API compatibility test for use for TF components.
+  parser.add_argument(
+      '--only_test_core_api', type=bool, default=False,
+      help=_ONLY_TEST_CORE_API_HELP)
   parser.add_argument(
       '--verbose_diffs', type=bool, default=True, help=_VERBOSE_DIFFS_HELP)
   FLAGS, unparsed = parser.parse_known_args()
-- 
GitLab


From 7fbc44d63b25eddfc384922809426319728f949c Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 27 Sep 2018 17:19:17 -0700
Subject: [PATCH 113/570] [Java]: Release 1.11.0

PiperOrigin-RevId: 214862838
---
 tensorflow/java/maven/libtensorflow/pom.xml              | 2 +-
 tensorflow/java/maven/libtensorflow_jni/pom.xml          | 2 +-
 tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml      | 2 +-
 tensorflow/java/maven/pom.xml                            | 2 +-
 tensorflow/java/maven/proto/pom.xml                      | 2 +-
 tensorflow/java/maven/spark-tensorflow-connector/pom.xml | 2 +-
 tensorflow/java/maven/tensorflow-hadoop/pom.xml          | 2 +-
 tensorflow/java/maven/tensorflow/pom.xml                 | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 9fc6969c20..6b3e305e5d 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.11.0-rc2</version>
+    <version>1.11.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index 68712082e1..f130515934 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.11.0-rc2</version>
+    <version>1.11.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index f031173c99..67ecc2d597 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.11.0-rc2</version>
+    <version>1.11.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 2cac27990e..8ba859da01 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.11.0-rc2</version>
+  <version>1.11.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 8a93091276..dcd654d713 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.11.0-rc2</version>
+    <version>1.11.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/spark-tensorflow-connector/pom.xml b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
index 014bd8d212..45214f834c 100644
--- a/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
+++ b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
@@ -6,7 +6,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>spark-tensorflow-connector_2.11</artifactId>
     <packaging>jar</packaging>
-    <version>1.11.0-rc2</version>
+    <version>1.11.0</version>
     <name>spark-tensorflow-connector</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord connector for Apache Spark DataFrames</description>
diff --git a/tensorflow/java/maven/tensorflow-hadoop/pom.xml b/tensorflow/java/maven/tensorflow-hadoop/pom.xml
index d07c5fcd98..a8669ee72b 100644
--- a/tensorflow/java/maven/tensorflow-hadoop/pom.xml
+++ b/tensorflow/java/maven/tensorflow-hadoop/pom.xml
@@ -5,7 +5,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>tensorflow-hadoop</artifactId>
     <packaging>jar</packaging>
-    <version>1.11.0-rc2</version>
+    <version>1.11.0</version>
     <name>tensorflow-hadoop</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop</description>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index af0c68a4ed..67d628ba11 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.11.0-rc2</version>
+    <version>1.11.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
-- 
GitLab


From f7e5a4e5f1de355cbbe70215f08d962e027cd0dc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 17:20:54 -0700
Subject: [PATCH 114/570] Update ops-related pbtxt files.

PiperOrigin-RevId: 214863042
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 53 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 53 +++++++++++++++++++
 2 files changed, 106 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 7625524674..32ce31cf23 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -44855,6 +44855,59 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ReduceDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "ReduceJoin"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 83af07431c..02a7f8d717 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -22868,6 +22868,59 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ReduceDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "ReduceJoin"
   input_arg {
-- 
GitLab


From c1f557705143f69988ec272f2cf659c7d525974c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 17:45:56 -0700
Subject: [PATCH 115/570] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 214866490

---
 tensorflow/go/op/wrappers.go | 508 +++++++++++++++++------------------
 1 file changed, 254 insertions(+), 254 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 96df1eee30..2f297d5161 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -26837,6 +26837,260 @@ func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	return op.Output(0)
 }
 
+// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
+type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
+
+// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LearnedUnigramCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// SerializeSparseAttr is an optional argument to SerializeSparse.
+type SerializeSparseAttr func(optionalAttr)
+
+// SerializeSparseOutType sets the optional out_type attribute to value.
+//
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SerializeSparse",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
+
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
+//
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shapes"] = value
+	}
+}
+
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+//
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["min_after_dequeue"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that randomizes the order of elements.
+//
+// Arguments:
+//	component_types: The type of each component in a value.
+//
+// Returns The handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomShuffleQueueV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Draw bounding boxes on a batch of images.
+//
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of the each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example, if an image is 100 x 200 pixels (height x width) and the bounding
+// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
+//
+// Parts of the bounding box may fall outside the image.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
+//
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DrawBoundingBoxes",
+		Input: []tf.Input{
+			images, boxes,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Gets the next output from the given iterator.
 //
 // This operation is a synchronous version IteratorGetNext. It should only be used
@@ -30988,260 +31242,6 @@ func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths
 	return op.Output(0)
 }
 
-// SerializeSparseAttr is an optional argument to SerializeSparse.
-type SerializeSparseAttr func(optionalAttr)
-
-// SerializeSparseOutType sets the optional out_type attribute to value.
-//
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
-//
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SerializeSparse",
-		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
-
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
-
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
-//
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that randomizes the order of elements.
-//
-// Arguments:
-//	component_types: The type of each component in a value.
-//
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Draw bounding boxes on a batch of images.
-//
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example, if an image is 100 x 200 pixels (height x width) and the bounding
-// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
-//
-// Parts of the bounding box may fall outside the image.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
-//
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
-		Input: []tf.Input{
-			images, boxes,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
-type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
-
-// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LearnedUnigramCandidateSampler",
-		Input: []tf.Input{
-			true_classes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
 // Computes gradients for the scaled exponential linear (Selu) operation.
 //
 // Arguments:
-- 
GitLab


From 4bab3e375b7fffbc8878313089a2bd680952aced Mon Sep 17 00:00:00 2001
From: Sourabh Bajaj <sourabhbajaj@google.com>
Date: Thu, 27 Sep 2018 17:54:44 -0700
Subject: [PATCH 116/570] Change test size as it has been timing out
 consistently

PiperOrigin-RevId: 214867453
---
 tensorflow/contrib/distribute/python/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 7eead6e472..e329b964c4 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -453,7 +453,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "estimator_training_test",
-    size = "large",
+    size = "enormous",
     srcs = ["estimator_training_test.py"],
     additional_deps = [
         ":combinations",
-- 
GitLab


From 96f3428e33e18477661b8d8cf78f2db457c8881b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 18:43:55 -0700
Subject: [PATCH 117/570] Let feature columns correctly handle rank-1 sparse
 tensors from an empty batch.

reshape can't determine the size of the last dimension when reshaping
shape (0) to (0, 1).

PiperOrigin-RevId: 214872677
---
 .../python/feature_column/feature_column.py      |  2 +-
 .../python/feature_column/feature_column_test.py | 12 ++++++++++++
 .../python/feature_column/feature_column_v2.py   |  2 +-
 .../feature_column/feature_column_v2_test.py     | 16 ++++++++++++++++
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 226e273660..618e70f3a5 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -2318,7 +2318,7 @@ class _LazyBuilder(object):
       # Input_tensor must have rank 1.
       if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
         return sparse_ops.sparse_reshape(
-            input_tensor, [array_ops.shape(input_tensor)[0], -1])
+            input_tensor, [array_ops.shape(input_tensor)[0], 1])
       else:
         return array_ops.expand_dims(input_tensor, -1)
 
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index abb79efa68..1ae510250c 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -169,6 +169,18 @@ class LazyColumnTest(test.TestCase):
         TypeError, '"key" must be either a "str" or "_FeatureColumn".'):
       builder.get(NotAFeatureColumn())
 
+  def test_expand_dim_rank_1_sparse_tensor_empty_batch(self):
+    # empty 1-D sparse tensor:
+    builder = _LazyBuilder(features={'a': sparse_tensor.SparseTensor(
+        indices=np.reshape(np.array([], dtype=np.int64), (0, 1)),
+        dense_shape=[0],
+        values=np.array([]))})
+    with self.cached_session():
+      spv = builder.get('a').eval()
+      self.assertAllEqual(np.array([0, 1], dtype=np.int64), spv.dense_shape)
+      self.assertAllEqual(
+          np.reshape(np.array([], dtype=np.int64), (0, 2)), spv.indices)
+
 
 class NumericColumnTest(test.TestCase):
 
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index 289f6d0d14..538641c251 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -2341,7 +2341,7 @@ class FeatureTransformationCache(object):
       # Input_tensor must have rank 1.
       if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
         return sparse_ops.sparse_reshape(
-            input_tensor, [array_ops.shape(input_tensor)[0], -1])
+            input_tensor, [array_ops.shape(input_tensor)[0], 1])
       else:
         return array_ops.expand_dims(input_tensor, -1)
 
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index 58168e0f9e..2970431167 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -177,6 +177,22 @@ class LazyColumnTest(test.TestCase):
         TypeError, '"key" must be either a "str" or "FeatureColumn".'):
       transformation_cache.get(NotAFeatureColumn(), None)
 
+  def test_expand_dim_rank_1_sparse_tensor_empty_batch(self):
+    # empty 1-D sparse tensor:
+    transformation_cache = FeatureTransformationCache(
+        features={
+            'a':
+                sparse_tensor.SparseTensor(
+                    indices=np.reshape(np.array([], dtype=np.int64), (0, 1)),
+                    dense_shape=[0],
+                    values=np.array([]))
+        })
+    with self.cached_session():
+      spv = transformation_cache.get('a', None).eval()
+      self.assertAllEqual(np.array([0, 1], dtype=np.int64), spv.dense_shape)
+      self.assertAllEqual(
+          np.reshape(np.array([], dtype=np.int64), (0, 2)), spv.indices)
+
 
 class NumericColumnTest(test.TestCase):
 
-- 
GitLab


From 70f071f7afb2deffddbd9937d7a76b1e1c0b2b75 Mon Sep 17 00:00:00 2001
From: Revan Sopher <rsopher@google.com>
Date: Thu, 27 Sep 2018 19:20:59 -0700
Subject: [PATCH 118/570] Fix failing test.

PiperOrigin-RevId: 214875840
---
 .../estimator_batch/dnn_tree_combined_estimator_test.py       | 3 ++-
 .../contrib/boosted_trees/estimator_batch/estimator_test.py   | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
index 04baa329a0..6b6fe9663a 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
@@ -188,7 +188,8 @@ class CoreDNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
 
     # Train for a few steps.
     est.train(input_fn=_train_input_fn, steps=1000)
-    # 10 steps for dnn, 3  for 1 tree of depth 3 + 1 after the tree finished
+    # 10 steps for dnn + 3 for 1 tree of depth 3 + 1 after the tree finished
+    # + 1 for resource variables.
     self._assert_checkpoint(est.model_dir, global_step=15)
     res = est.evaluate(input_fn=_eval_input_fn, steps=1)
     self.assertLess(0.5, res["auc"])
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index c155128c0e..d7b14e00ba 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -238,8 +238,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
         output_leaf_index=False)
 
     classifier.fit(input_fn=_train_input_fn, steps=15)
-    # When no override of global steps, 5 steps were used.
-    self._assert_checkpoint(classifier.model_dir, global_step=5)
+    # When no override of global steps, 6 steps were used.
+    self._assert_checkpoint(classifier.model_dir, global_step=6)
 
   def testOverridesGlobalSteps(self):
     learner_config = learner_pb2.LearnerConfig()
-- 
GitLab


From acb13e448786838feb500973f51279dc90eeab50 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Thu, 27 Sep 2018 20:01:16 -0700
Subject: [PATCH 119/570] Fix visibility

PiperOrigin-RevId: 214878220
---
 tensorflow/tools/docs/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index b218e900bf..2a858b4fd6 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -37,6 +37,7 @@ py_library(
     name = "doc_controls",
     srcs = ["doc_controls.py"],
     srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
 )
 
 py_test(
-- 
GitLab


From a309e136dcfdd13dc8e8eb7570b6c5945bb6f967 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 20:02:51 -0700
Subject: [PATCH 120/570] Keras Lambda - enhancements to output_shape
 computation

PiperOrigin-RevId: 214878428
---
 tensorflow/python/keras/layers/core.py      | 51 +++++++++++++++------
 tensorflow/python/keras/layers/core_test.py | 45 ++++++++++++++++++
 2 files changed, 82 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index 4032202986..efa21955e6 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -671,22 +671,34 @@ class Lambda(Layer):
     if mask is not None:
       self.supports_masking = True
     self.mask = mask
-    if output_shape is None:
-      self._output_shape = None
-    elif isinstance(output_shape, (tuple, list)):
-      self._output_shape = tuple(output_shape)
-    else:
-      if not callable(output_shape):
-        raise TypeError('In Lambda, `output_shape` '
-                        'must be a list, a tuple, or a function.')
-      self._output_shape = output_shape
+    if (output_shape is not None and not isinstance(output_shape,
+                                                    (tuple, list)) and
+        not callable(output_shape)):
+      raise TypeError('In Lambda, `output_shape` '
+                      'must be a list, a tuple, or a function.')
+    # Convert a list representing a single shape into a tuple.
+    if (isinstance(output_shape, list) and isinstance(output_shape[0],
+                                                      (int, type(None)))):
+      output_shape = tuple(output_shape)
+    self._output_shape = output_shape
 
   @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if self._output_shape is None:
       if context.executing_eagerly():
-        raise NotImplementedError
-      x = K.placeholder(shape=input_shape)
+        # Make use of existing autocomputation for Eager mode but provide
+        # Lambda-specific error message.
+        try:
+          return super(Lambda, self).compute_output_shape(input_shape)
+        except NotImplementedError:
+          raise NotImplementedError('We could not automatically infer '
+                                    'the static shape of the Lambda\'s output.'
+                                    ' Please specify the `output_shape` for'
+                                    ' this Lambda.')
+      if isinstance(input_shape, list):
+        x = [K.placeholder(shape=shape) for shape in input_shape]
+      else:
+        x = K.placeholder(shape=input_shape)
       x = self.call(x)
       if isinstance(x, list):
         return [tensor_shape.TensorShape(K.int_shape(x_elem)) for x_elem in x]
@@ -697,16 +709,27 @@ class Lambda(Layer):
         num_samples = input_shape[0][0]
       else:
         num_samples = input_shape[0] if input_shape else None
-      return tensor_shape.TensorShape((num_samples,) +
-                                      tuple(self._output_shape))
+      # List here represents multiple outputs.
+      if isinstance(self._output_shape, list):
+        return [
+            tensor_shape.TensorShape((num_samples,) + tuple(single_shape))
+            for single_shape in self._output_shape
+        ]
+      return tensor_shape.TensorShape((num_samples,) + self._output_shape)
     else:
       shape = self._output_shape(input_shape)
       if not isinstance(shape, (list, tuple)):
         raise ValueError(
             '`output_shape` function must return a tuple or a list of tuples.')
+      # List here can represent multiple outputs or single output.
       if isinstance(shape, list):
-        if isinstance(shape[0], int) or shape[0] is None:
+        # Convert list representing single output into a tuple.
+        if isinstance(shape[0], (int, type(None))):
           shape = tuple(shape)
+        else:
+          return [
+              tensor_shape.TensorShape(single_shape) for single_shape in shape
+          ]
       return tensor_shape.TensorShape(shape)
 
   def call(self, inputs, mask=None):
diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py
index 1df1d575b1..f0fea1f65c 100644
--- a/tensorflow/python/keras/layers/core_test.py
+++ b/tensorflow/python/keras/layers/core_test.py
@@ -252,6 +252,51 @@ class CoreLayersTest(test.TestCase):
       l(keras.backend.variable(np.ones((1, 1))))
       self.assertEqual('lambda', l.get_config()['output_shape_type'])
 
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_lambda_output_shape_autocalculate_multiple_inputs(self):
+
+    def lambda_fn(x):
+      return math_ops.matmul(x[0], x[1])
+
+    l = keras.layers.Lambda(lambda_fn)
+    output_shape = l.compute_output_shape([(10, 10), (10, 20)])
+    self.assertAllEqual((10, 20), output_shape)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_lambda_output_shape_list_multiple_outputs(self):
+
+    def lambda_fn(x):
+      return x
+
+    l = keras.layers.Lambda(lambda_fn, output_shape=[(10,), (20,)])
+    output_shape = l.compute_output_shape([(10, 10), (10, 20)])
+    self.assertAllEqual([(10, 10), (10, 20)], output_shape)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_lambda_output_shape_tuple_with_none(self):
+
+    def lambda_fn(x):
+      return x
+
+    l = keras.layers.Lambda(lambda_fn, output_shape=(None, 10))
+    output_shape = l.compute_output_shape((5, 10, 20))
+    # Dimension(None) != Dimension(None), so check
+    # str representations for equality.
+    self.assertAllEqual(('5', '?', '10'), tuple([str(s) for s in output_shape]))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_lambda_output_shape_function_multiple_outputs(self):
+
+    def lambda_fn(x):
+      return x
+
+    def output_shape_fn(input_shape):
+      return input_shape
+
+    l = keras.layers.Lambda(lambda_fn, output_shape=output_shape_fn)
+    output_shape = l.compute_output_shape([(10, 10), (10, 20)])
+    self.assertAllEqual([(10, 10), (10, 20)], output_shape)
+
   def test_lambda_config_serialization(self):
     with self.cached_session():
       # test serialization with output_shape and output_shape_type
-- 
GitLab


From d377fdee3a5e266ac330a6742c15ece8e7ed8aa0 Mon Sep 17 00:00:00 2001
From: Daryl Ng <darylng@google.com>
Date: Thu, 27 Sep 2018 20:10:31 -0700
Subject: [PATCH 121/570] Adding to tpu_lib depenencies to
 optimization_parameters_py, tpu_embedding_configuration_py, and
 tpu_embedding_output_layout_py.

PiperOrigin-RevId: 214879168
---
 tensorflow/contrib/tpu/BUILD | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index ac38612603..e9aa037634 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -252,7 +252,10 @@ py_library(
         ":tpu_py",
         "//tensorflow/contrib/cluster_resolver:tpu_cluster_resolver_py",
         "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
+        "//tensorflow/contrib/tpu/proto:optimization_parameters_proto_py",
         "//tensorflow/contrib/tpu/proto:topology_proto_py",
+        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_py",
+        "//tensorflow/contrib/tpu/proto:tpu_embedding_output_layout_proto_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
-- 
GitLab


From 986193d79e00f1780fb3278ed890a72f7285f66e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 20:14:30 -0700
Subject: [PATCH 122/570] Move obsolete kernel code to legacy files.

PiperOrigin-RevId: 214879388
---
 .../internal/optimized/depthwiseconv_float.h  |   74 --
 .../internal/optimized/depthwiseconv_uint8.h  |  102 --
 .../internal/optimized/legacy_optimized_ops.h |  941 ++++++++++++++-
 .../internal/optimized/optimized_ops.h        |  798 ------------
 .../internal/reference/depthwiseconv_float.h  |   75 --
 .../internal/reference/depthwiseconv_uint8.h  |  103 --
 .../internal/reference/fully_connected.h      |  134 ---
 .../internal/reference/legacy_reference_ops.h | 1067 ++++++++++++++++-
 .../internal/reference/reference_ops.h        |  762 ------------
 .../lite/kernels/internal/reference/softmax.h |   23 -
 10 files changed, 2001 insertions(+), 2078 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
index 114575a96a..d8dd7bba89 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -1092,80 +1092,6 @@ inline void DepthwiseConv(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
-                          const float* filter_data, const Dims<4>& filter_dims,
-                          const float* bias_data, const Dims<4>& bias_dims,
-                          int stride_width, int stride_height,
-                          int dilation_width_factor, int dilation_height_factor,
-                          int pad_width, int pad_height, int depth_multiplier,
-                          float output_activation_min,
-                          float output_activation_max, float* output_data,
-                          const Dims<4>& output_dims) {
-  tflite::DepthwiseParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-  op_params.dilation_width_factor = dilation_width_factor;
-  op_params.dilation_height_factor = dilation_height_factor;
-  op_params.depth_multiplier = depth_multiplier;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  DepthwiseConv(op_params, DimsToShape(input_dims), input_data,
-                DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
-                bias_data, DimsToShape(output_dims), output_data);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
-                          const float* filter_data, const Dims<4>& filter_dims,
-                          const float* bias_data, const Dims<4>& bias_dims,
-                          int stride_width, int stride_height, int pad_width,
-                          int pad_height, int depth_multiplier,
-                          float output_activation_min,
-                          float output_activation_max, float* output_data,
-                          const Dims<4>& output_dims) {
-  DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
-                bias_dims, stride_width, stride_height, 1, 1, pad_width,
-                pad_height, depth_multiplier, output_activation_min,
-                output_activation_max, output_data, output_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
-                   const float* filter_data, const Dims<4>& filter_dims,
-                   const float* bias_data, const Dims<4>& bias_dims,
-                   int stride_width, int stride_height, int pad_width,
-                   int pad_height, int depth_multiplier, float* output_data,
-                   const Dims<4>& output_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
-                bias_dims, stride_width, stride_height, pad_width, pad_height,
-                depth_multiplier, output_activation_min, output_activation_max,
-                output_data, output_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
-                   const float* filter_data, const Dims<4>& filter_dims,
-                   const float* bias_data, const Dims<4>& bias_dims, int stride,
-                   int pad_width, int pad_height, int depth_multiplier,
-                   float* output_data, const Dims<4>& output_dims) {
-  DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
-                    bias_dims, stride, stride, pad_width, pad_height,
-                    depth_multiplier, output_data, output_dims);
-}
-
 }  // namespace optimized_ops
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index a70545599b..803eff292a 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -24,9 +24,6 @@ limitations under the License.
 namespace tflite {
 namespace optimized_ops {
 
-// TODO(b/80418076): Move to legacy ops file, along with invocations.
-static constexpr int kDepthwiseReverseShift = -1;
-
 // Implementation of quantized DepthwiseConv
 
 template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
@@ -1996,105 +1993,6 @@ inline void DepthwiseConv(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
-                          int32 input_offset, const uint8* filter_data,
-                          const Dims<4>& filter_dims, int32 filter_offset,
-                          const int32* bias_data, const Dims<4>& bias_dims,
-                          int stride_width, int stride_height,
-                          int dilation_width_factor, int dilation_height_factor,
-                          int pad_width, int pad_height, int depth_multiplier,
-                          int32 output_offset, int32 output_multiplier,
-                          int output_shift, int32 output_activation_min,
-                          int32 output_activation_max, uint8* output_data,
-                          const Dims<4>& output_dims) {
-  tflite::DepthwiseParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-  op_params.dilation_width_factor = dilation_width_factor;
-  op_params.dilation_height_factor = dilation_height_factor;
-  op_params.depth_multiplier = depth_multiplier;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kDepthwiseReverseShift * output_shift;
-
-  DepthwiseConv(op_params, DimsToShape(input_dims), input_data,
-                DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
-                bias_data, DimsToShape(output_dims), output_data);
-}
-
-inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
-                          int32 input_offset, const uint8* filter_data,
-                          const Dims<4>& filter_dims, int32 filter_offset,
-                          const int32* bias_data, const Dims<4>& bias_dims,
-                          int stride_width, int stride_height, int pad_width,
-                          int pad_height, int depth_multiplier,
-                          int32 output_offset, int32 output_multiplier,
-                          int output_shift, int32 output_activation_min,
-                          int32 output_activation_max, uint8* output_data,
-                          const Dims<4>& output_dims) {
-  DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
-                filter_offset, bias_data, bias_dims, stride_width,
-                stride_height, 1, 1, pad_width, pad_height, depth_multiplier,
-                output_offset, output_multiplier, output_shift,
-                output_activation_min, output_activation_max, output_data,
-                output_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy, for compatibility with old checked-in code.
-template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
-                   int32 input_offset, const uint8* filter_data,
-                   const Dims<4>& filter_dims, int32 filter_offset,
-                   const int32* bias_data, const Dims<4>& bias_dims,
-                   int stride_width, int stride_height, int pad_width,
-                   int pad_height, int depth_multiplier, int32 output_offset,
-                   int32 output_multiplier, int output_shift,
-                   int32 output_activation_min, int32 output_activation_max,
-                   uint8* output_data, const Dims<4>& output_dims) {
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
-                filter_offset, bias_data, bias_dims, stride_width,
-                stride_height, pad_width, pad_height, depth_multiplier,
-                output_offset, output_multiplier, output_shift,
-                output_activation_min, output_activation_max, output_data,
-                output_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy, for compatibility with old checked-in code.
-template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
-                   int32 input_offset, const uint8* filter_data,
-                   const Dims<4>& filter_dims, int32 filter_offset,
-                   const int32* bias_data, const Dims<4>& bias_dims, int stride,
-                   int pad_width, int pad_height, int depth_multiplier,
-                   int32 output_offset, int32 output_multiplier,
-                   int output_shift, int32 output_activation_min,
-                   int32 output_activation_max, uint8* output_data,
-                   const Dims<4>& output_dims) {
-  DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data,
-                    filter_dims, filter_offset, bias_data, bias_dims, stride,
-                    stride, pad_width, pad_height, depth_multiplier,
-                    output_offset, output_multiplier, output_shift,
-                    output_activation_min, output_activation_max, output_data,
-                    output_dims);
-}
-
 }  // namespace optimized_ops
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
index b6151c40b3..4218be20a4 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
@@ -19,6 +19,8 @@ limitations under the License.
 #include <sys/types.h>
 
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
@@ -28,9 +30,857 @@ namespace optimized_ops {
 
 // Unoptimized reference ops:
 using reference_ops::ArgMax;
+using reference_ops::ArgMinMax;
+using reference_ops::Broadcast4DSlowGreater;
+using reference_ops::Broadcast4DSlowGreaterEqual;
+using reference_ops::Broadcast4DSlowGreaterEqualWithScaling;
+using reference_ops::Broadcast4DSlowGreaterWithScaling;
+using reference_ops::Broadcast4DSlowLess;
+using reference_ops::Broadcast4DSlowLessEqual;
+using reference_ops::Broadcast4DSlowLessEqualWithScaling;
+using reference_ops::Broadcast4DSlowLessWithScaling;
+using reference_ops::BroadcastAdd4DSlow;
+using reference_ops::BroadcastGreater;
+using reference_ops::BroadcastGreaterEqual;
+using reference_ops::BroadcastLess;
+using reference_ops::BroadcastLessEqual;
+using reference_ops::BroadcastMul4DSlow;
+using reference_ops::BroadcastSub4DSlow;
+using reference_ops::Concatenation;
+using reference_ops::ConcatenationWithScaling;
+using reference_ops::DepthConcatenation;
+using reference_ops::Dequantize;
+using reference_ops::Div;
+using reference_ops::FakeQuant;
+using reference_ops::Gather;
+using reference_ops::Greater;
+using reference_ops::GreaterEqual;
+using reference_ops::GreaterEqualWithScaling;
+using reference_ops::GreaterWithScaling;
+using reference_ops::Less;
+using reference_ops::LessEqual;
+using reference_ops::LessEqualWithScaling;
+using reference_ops::LessWithScaling;
+using reference_ops::Mean;
+using reference_ops::RankOneSelect;
 using reference_ops::Relu1;
 using reference_ops::Relu6;
+using reference_ops::ReluX;
+using reference_ops::Select;
 using reference_ops::SpaceToBatchND;
+using reference_ops::Split;
+using reference_ops::StridedSlice;
+using reference_ops::TensorFlowSplit;
+using reference_ops::Transpose;
+
+static constexpr int kDepthwiseReverseShift = -1;
+
+template <typename Scalar, int N>
+VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) {
+  const int size = FlatSize(dims);
+  return VectorMap<Scalar>(data, size, 1);
+}
+
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data,
+                                                const Dims<N>& dims) {
+  const int rows = dims.sizes[0];
+  int cols = 1;
+  for (int d = 1; d < N; d++) {
+    cols *= dims.sizes[d];
+  }
+  return MatrixMap<Scalar>(data, rows, cols);
+}
+
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data,
+                                               const Dims<N>& dims) {
+  const int cols = dims.sizes[N - 1];
+  int rows = 1;
+  for (int d = 0; d < N - 1; d++) {
+    rows *= dims.sizes[d];
+  }
+  return MatrixMap<Scalar>(data, rows, cols);
+}
+
+template <typename Scalar, int N>
+ArrayMap<Scalar> MapAsArrayWithFirstDimAsRows(Scalar* data,
+                                              const Dims<N>& dims) {
+  const int rows = dims.sizes[0];
+  int cols = 1;
+  for (int d = 1; d < N; d++) {
+    cols *= dims.sizes[d];
+  }
+  return ArrayMap<Scalar>(data, rows, cols);
+}
+
+// TODO(b/62193649): this function is only needed as long
+// as we have the --variable_batch hack.
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
+                                                   const Dims<N>& dims,
+                                                   int rows) {
+  const int flatsize = FlatSize(dims);
+  TFLITE_DCHECK((flatsize % rows) == 0);
+  const int cols = flatsize / rows;
+  return MatrixMap<Scalar>(data, rows, cols);
+}
+
+inline bool AreSameDims(const Dims<4>& dims1, const Dims<4>& dims2) {
+  for (int i = 0; i < 4; i++) {
+    if (dims1.sizes[i] != dims2.sizes[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                          const float* filter_data, const Dims<4>& filter_dims,
+                          const float* bias_data, const Dims<4>& bias_dims,
+                          int stride_width, int stride_height,
+                          int dilation_width_factor, int dilation_height_factor,
+                          int pad_width, int pad_height, int depth_multiplier,
+                          float output_activation_min,
+                          float output_activation_max, float* output_data,
+                          const Dims<4>& output_dims) {
+  tflite::DepthwiseParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+  op_params.dilation_width_factor = dilation_width_factor;
+  op_params.dilation_height_factor = dilation_height_factor;
+  op_params.depth_multiplier = depth_multiplier;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  DepthwiseConv(op_params, DimsToShape(input_dims), input_data,
+                DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
+                bias_data, DimsToShape(output_dims), output_data);
+}
+
+inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                          const float* filter_data, const Dims<4>& filter_dims,
+                          const float* bias_data, const Dims<4>& bias_dims,
+                          int stride_width, int stride_height, int pad_width,
+                          int pad_height, int depth_multiplier,
+                          float output_activation_min,
+                          float output_activation_max, float* output_data,
+                          const Dims<4>& output_dims) {
+  DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
+                bias_dims, stride_width, stride_height, 1, 1, pad_width,
+                pad_height, depth_multiplier, output_activation_min,
+                output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                   const float* filter_data, const Dims<4>& filter_dims,
+                   const float* bias_data, const Dims<4>& bias_dims,
+                   int stride_width, int stride_height, int pad_width,
+                   int pad_height, int depth_multiplier, float* output_data,
+                   const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
+                bias_dims, stride_width, stride_height, pad_width, pad_height,
+                depth_multiplier, output_activation_min, output_activation_max,
+                output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                   const float* filter_data, const Dims<4>& filter_dims,
+                   const float* bias_data, const Dims<4>& bias_dims, int stride,
+                   int pad_width, int pad_height, int depth_multiplier,
+                   float* output_data, const Dims<4>& output_dims) {
+  DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+                    bias_dims, stride, stride, pad_width, pad_height,
+                    depth_multiplier, output_data, output_dims);
+}
+
+inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                          int32 input_offset, const uint8* filter_data,
+                          const Dims<4>& filter_dims, int32 filter_offset,
+                          const int32* bias_data, const Dims<4>& bias_dims,
+                          int stride_width, int stride_height,
+                          int dilation_width_factor, int dilation_height_factor,
+                          int pad_width, int pad_height, int depth_multiplier,
+                          int32 output_offset, int32 output_multiplier,
+                          int output_shift, int32 output_activation_min,
+                          int32 output_activation_max, uint8* output_data,
+                          const Dims<4>& output_dims) {
+  tflite::DepthwiseParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+  op_params.dilation_width_factor = dilation_width_factor;
+  op_params.dilation_height_factor = dilation_height_factor;
+  op_params.depth_multiplier = depth_multiplier;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kDepthwiseReverseShift * output_shift;
+
+  DepthwiseConv(op_params, DimsToShape(input_dims), input_data,
+                DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
+                bias_data, DimsToShape(output_dims), output_data);
+}
+
+inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                          int32 input_offset, const uint8* filter_data,
+                          const Dims<4>& filter_dims, int32 filter_offset,
+                          const int32* bias_data, const Dims<4>& bias_dims,
+                          int stride_width, int stride_height, int pad_width,
+                          int pad_height, int depth_multiplier,
+                          int32 output_offset, int32 output_multiplier,
+                          int output_shift, int32 output_activation_min,
+                          int32 output_activation_max, uint8* output_data,
+                          const Dims<4>& output_dims) {
+  DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
+                filter_offset, bias_data, bias_dims, stride_width,
+                stride_height, 1, 1, pad_width, pad_height, depth_multiplier,
+                output_offset, output_multiplier, output_shift,
+                output_activation_min, output_activation_max, output_data,
+                output_dims);
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                   int32 input_offset, const uint8* filter_data,
+                   const Dims<4>& filter_dims, int32 filter_offset,
+                   const int32* bias_data, const Dims<4>& bias_dims,
+                   int stride_width, int stride_height, int pad_width,
+                   int pad_height, int depth_multiplier, int32 output_offset,
+                   int32 output_multiplier, int output_shift,
+                   int32 output_activation_min, int32 output_activation_max,
+                   uint8* output_data, const Dims<4>& output_dims) {
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
+                filter_offset, bias_data, bias_dims, stride_width,
+                stride_height, pad_width, pad_height, depth_multiplier,
+                output_offset, output_multiplier, output_shift,
+                output_activation_min, output_activation_max, output_data,
+                output_dims);
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                   int32 input_offset, const uint8* filter_data,
+                   const Dims<4>& filter_dims, int32 filter_offset,
+                   const int32* bias_data, const Dims<4>& bias_dims, int stride,
+                   int pad_width, int pad_height, int depth_multiplier,
+                   int32 output_offset, int32 output_multiplier,
+                   int output_shift, int32 output_activation_min,
+                   int32 output_activation_max, uint8* output_data,
+                   const Dims<4>& output_dims) {
+  DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data,
+                    filter_dims, filter_offset, bias_data, bias_dims, stride,
+                    stride, pad_width, pad_height, depth_multiplier,
+                    output_offset, output_multiplier, output_shift,
+                    output_activation_min, output_activation_max, output_data,
+                    output_dims);
+}
+
+inline void AddBiasAndEvalActivationFunction(const float* bias_data,
+                                             const Dims<4>& bias_dims,
+                                             float* array_data,
+                                             const Dims<4>& array_dims,
+                                             float output_activation_min,
+                                             float output_activation_max) {
+  AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max,
+                                   DimsToShape(bias_dims), bias_data,
+                                   DimsToShape(array_dims), array_data);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AddBiasAndEvalActivationFunction(const float* bias_data,
+                                      const Dims<4>& bias_dims,
+                                      float* array_data,
+                                      const Dims<4>& array_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  AddBiasAndEvalActivationFunction(bias_data, bias_dims, array_data, array_dims,
+                                   output_activation_min,
+                                   output_activation_max);
+}
+
+inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+                           const float* weights_data,
+                           const Dims<4>& weights_dims, const float* bias_data,
+                           const Dims<4>& bias_dims,
+                           float output_activation_min,
+                           float output_activation_max, float* output_data,
+                           const Dims<4>& output_dims) {
+  tflite::FullyConnectedParams op_params;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  FullyConnected(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(weights_dims), weights_data,
+                 DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
+                 output_data);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+                    const float* weights_data, const Dims<4>& weights_dims,
+                    const float* bias_data, const Dims<4>& bias_dims,
+                    float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
+                 bias_dims, output_activation_min, output_activation_max,
+                 output_data, output_dims);
+}
+
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                           int32 input_offset, const uint8* filter_data,
+                           const Dims<4>& filter_dims, int32 filter_offset,
+                           const int32* bias_data, const Dims<4>& bias_dims,
+                           int32 output_offset, int32 output_multiplier,
+                           int output_shift, int32 output_activation_min,
+                           int32 output_activation_max, uint8* output_data,
+                           const Dims<4>& output_dims,
+                           gemmlowp::GemmContext* gemm_context) {
+  tflite::FullyConnectedParams op_params;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
+  FullyConnected(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
+                 bias_data, DimsToShape(output_dims), output_data,
+                 gemm_context);
+}
+
+inline void FullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
+    const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
+    const int32* bias_data_int32, const Dims<4>& bias_dims, int32 output_offset,
+    int32 output_multiplier, int output_shift, int32 output_activation_min,
+    int32 output_activation_max, int16* output_data, const Dims<4>& output_dims,
+    gemmlowp::GemmContext* gemm_context) {
+  tflite::FullyConnectedParams op_params;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
+  FullyConnected(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
+                 bias_data_int32, DimsToShape(output_dims), output_data,
+                 gemm_context);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                    int32 input_offset, const uint8* filter_data,
+                    const Dims<4>& filter_dims, int32 filter_offset,
+                    const int32* bias_data, const Dims<4>& bias_dims,
+                    int32 output_offset, int32 output_multiplier,
+                    int output_shift, int32 output_activation_min,
+                    int32 output_activation_max, uint8* output_data,
+                    const Dims<4>& output_dims,
+                    gemmlowp::GemmContext* gemm_context) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
+                 filter_offset, bias_data, bias_dims, output_offset,
+                 output_multiplier, output_shift, output_activation_min,
+                 output_activation_max, output_data, output_dims, gemm_context);
+}
+
+inline void ShuffledFullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    int16* output_data, const Dims<4>& output_dims,
+    uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) {
+  tflite::FullyConnectedParams op_params;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
+  ShuffledFullyConnected(op_params, DimsToShape(input_dims), input_data,
+                         DimsToShape(weights_dims), shuffled_weights_data,
+                         DimsToShape(bias_dims), bias_data,
+                         DimsToShape(output_dims), output_data,
+                         shuffled_input_workspace_data, gemm_context);
+}
+
+template <typename T>
+inline void ExtractPatchIntoBufferColumn(
+    const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth,
+    int stride_width, int stride_height, int pad_width, int pad_height,
+    int in_width, int in_height, int in_depth, int single_buffer_length,
+    int buffer_id, const T* in_data, T* conv_buffer_data, uint8 zero_byte) {
+  ExtractPatchIntoBufferColumn(
+      DimsToShape(input_dims), w, h, b, kheight, kwidth, stride_width,
+      stride_height, pad_width, pad_height, in_width, in_height, in_depth,
+      single_buffer_length, buffer_id, in_data, conv_buffer_data, zero_byte);
+}
+
+template <typename T>
+void DilatedIm2col(const T* input_data, const Dims<4>& input_dims,
+                   const Dims<4>& filter_dims, int stride_width,
+                   int stride_height, int dilation_width_factor,
+                   int dilation_height_factor, int pad_width, int pad_height,
+                   const Dims<4>& output_dims, uint8 zero_byte,
+                   T* im2col_data) {
+  tflite::ConvParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+  op_params.dilation_width_factor = dilation_width_factor;
+  op_params.dilation_height_factor = dilation_height_factor;
+
+  DilatedIm2col(op_params, zero_byte, DimsToShape(input_dims), input_data,
+                DimsToShape(filter_dims), DimsToShape(output_dims),
+                im2col_data);
+}
+
+template <typename T>
+void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width,
+            int stride_height, int pad_width, int pad_height, int kheight,
+            int kwidth, uint8 zero_byte, T* output_data,
+            const Dims<4>& output_dims) {
+  tflite::ConvParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+  op_params.dilation_width_factor = 1;
+  op_params.dilation_height_factor = 1;
+
+  Im2col(op_params, kheight, kwidth, zero_byte, DimsToShape(input_dims),
+         input_data, DimsToShape(output_dims), output_data);
+}
+
+// legacy, for compatibility with old checked-in code
+template <typename T>
+void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
+            int pad_width, int pad_height, int kheight, int kwidth,
+            uint8 zero_byte, T* output_data, const Dims<4>& output_dims) {
+  Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
+         kwidth, zero_byte, output_data, output_dims);
+}
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+                 const float* filter_data, const Dims<4>& filter_dims,
+                 const float* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int dilation_width_factor,
+                 int dilation_height_factor, int pad_width, int pad_height,
+                 float output_activation_min, float output_activation_max,
+                 float* output_data, const Dims<4>& output_dims,
+                 float* im2col_data, const Dims<4>& im2col_dims) {
+  tflite::ConvParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+  op_params.dilation_width_factor = dilation_width_factor;
+  op_params.dilation_height_factor = dilation_height_factor;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims),
+       filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
+       output_data, DimsToShape(im2col_dims), im2col_data);
+}
+
+inline void HybridConv(const int8_t* input_data, const Dims<4>& input_dims,
+                       const int8_t* filter_data, const Dims<4>& filter_dims,
+                       const float* bias_data, const Dims<4>& bias_dims,
+                       int stride_width, int stride_height, int pad_width,
+                       int pad_height, float* scaling_factors_ptr,
+                       float output_activation_min, float output_activation_max,
+                       float* output_data, const Dims<4>& output_dims,
+                       int8_t* im2col_data, const Dims<4>& im2col_dims) {
+  tflite::ConvParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  HybridConv(op_params, scaling_factors_ptr, DimsToShape(input_dims),
+             input_data, DimsToShape(filter_dims), filter_data,
+             DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
+             output_data, DimsToShape(im2col_dims), im2col_data);
+}
+
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
+          int stride_height, int dilation_width_factor,
+          int dilation_height_factor, int pad_width, int pad_height,
+          float* output_data, const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
+       stride_width, stride_height, dilation_width_factor,
+       dilation_height_factor, pad_width, pad_height, output_activation_min,
+       output_activation_max, output_data, output_dims, im2col_data,
+       im2col_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
+          int stride_height, int pad_width, int pad_height, float* output_data,
+          const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
+       stride_width, stride_height, 1, 1, pad_width, pad_height,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride,
+          int pad_width, int pad_height, float* output_data,
+          const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+           bias_dims, stride, stride, 1, 1, pad_width, pad_height, output_data,
+           output_dims, im2col_data, im2col_dims);
+}
+
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int dilation_width_factor,
+                 int dilation_height_factor, int pad_width, int pad_height,
+                 int32 output_offset, int32 output_multiplier, int output_shift,
+                 int32 output_activation_min, int32 output_activation_max,
+                 uint8* output_data, const Dims<4>& output_dims,
+                 uint8* im2col_data, const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  tflite::ConvParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+  op_params.dilation_width_factor = dilation_width_factor;
+  op_params.dilation_height_factor = dilation_height_factor;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
+  Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims),
+       filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
+       output_data, DimsToShape(im2col_dims), im2col_data, gemm_context);
+}
+
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1,
+       pad_width, pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride_width, stride_height,
+       pad_width, pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const uint8* input_data, const Dims<4>& input_dims,
+          int32 input_offset, const uint8* filter_data,
+          const Dims<4>& filter_dims, int32 filter_offset,
+          const int32* bias_data, const Dims<4>& bias_dims, int stride,
+          int pad_width, int pad_height, int32 output_offset,
+          int32 output_multiplier, int output_shift,
+          int32 output_activation_min, int32 output_activation_max,
+          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
+          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride, stride, pad_width,
+       pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
+            int pad_width, int pad_height, int kheight, int kwidth,
+            uint8 zero_byte, T* output_data, const Dims<4>& output_dims) {
+  Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
+         kwidth, zero_byte, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void ConvAsGemm(const float* input_data, const Dims<4>& input_dims,
+                const float* filter_data, const Dims<4>& filter_dims,
+                const float* bias_data, const Dims<4>& bias_dims,
+                float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("ConvAsGemm");
+
+  const auto input_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+  const auto filter_matrix_map =
+      MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
+  auto output_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+
+  Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
+
+  AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data,
+                                       output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims,
+                int32 input_offset, const uint8* filter_data,
+                const Dims<4>& filter_dims, int32 filter_offset,
+                const int32* bias_data, const Dims<4>& bias_dims,
+                int32 output_offset, int32 output_multiplier, int output_shift,
+                int32 output_activation_min, int32 output_activation_max,
+                uint8* output_data, const Dims<4>& output_dims,
+                gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit");
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  const int input_rows = input_dims.sizes[0];
+  const int input_cols = FlatSizeSkipDim(input_dims, 0);
+  const int filter_rows = filter_dims.sizes[3];
+  const int filter_cols = FlatSizeSkipDim(filter_dims, 3);
+  const int output_rows = output_dims.sizes[0];
+  const int output_cols = FlatSizeSkipDim(output_dims, 0);
+  TFLITE_DCHECK_EQ(output_rows, filter_rows);
+  TFLITE_DCHECK_EQ(output_cols, input_cols);
+  TFLITE_DCHECK_EQ(filter_cols, input_rows);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+      filter_data, output_rows, filter_cols, filter_cols);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      input_data, filter_cols, output_cols, filter_cols);
+  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_rows, output_cols, output_rows);
+  const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp(
+      bias_data, output_rows, output_offset, output_multiplier, -output_shift,
+      output_activation_min, output_activation_max);
+  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
+                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
+      input_offset, output_pipeline);
+}
+
+inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
+                          const float* filter_data, const Dims<4>& filter_dims,
+                          int stride_width, int stride_height, int pad_width,
+                          int pad_height, float* output_data,
+                          const Dims<4>& output_dims, float* im2col_data,
+                          const Dims<4>& im2col_dims) {
+  tflite::ConvParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+
+  TransposeConv(op_params, DimsToShape(input_dims), input_data,
+                DimsToShape(filter_dims), filter_data, DimsToShape(output_dims),
+                output_data, DimsToShape(im2col_dims), im2col_data);
+}
+
+template <typename T>
+void TransposeIm2col(const T* input_data, const Dims<4>& input_dims,
+                     const Dims<4>& filter_dims, int stride_width,
+                     int stride_height, int pad_width, int pad_height,
+                     const Dims<4>& output_dims, uint8 zero_byte,
+                     T* im2col_data) {
+  tflite::ConvParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+
+  TransposeIm2col(op_params, zero_byte, DimsToShape(input_dims), input_data,
+                  DimsToShape(filter_dims), DimsToShape(output_dims),
+                  im2col_data);
+}
+
+inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
+                     const float* prev_activ_data,
+                     const Dims<4>& prev_activ_dims, const float* weights_data,
+                     const Dims<4>& weights_dims, const float* bias_data,
+                     const Dims<4>& bias_dims, const float* prev_state_data,
+                     const Dims<4>& prev_state_dims, float* output_state_data,
+                     const Dims<4>& output_state_dims, float* output_activ_data,
+                     const Dims<4>& output_activ_dims, float* concat_temp_data,
+                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
+                     const Dims<4>& activ_temp_dims) {
+  tflite::LstmCellParams op_params;
+  // Float LSTM cell does not need parameters to be set: leave untouched.
+
+  LstmCell(op_params, DimsToShape(input_dims), input_data,
+           DimsToShape(prev_activ_dims), prev_activ_data,
+           DimsToShape(weights_dims), weights_data, DimsToShape(bias_dims),
+           bias_data, DimsToShape(prev_state_dims), prev_state_data,
+           DimsToShape(output_state_dims), output_state_data,
+           DimsToShape(output_activ_dims), output_activ_data,
+           DimsToShape(concat_temp_dims), concat_temp_data,
+           DimsToShape(activ_temp_dims), activ_temp_data);
+}
+
+template <int StateIntegerBits>
+void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
+              const uint8* prev_activ_data_uint8,
+              const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
+              const Dims<4>& weights_dims, const int32* bias_data_int32,
+              const Dims<4>& bias_dims, const int16* prev_state_data_int16,
+              const Dims<4>& prev_state_dims, int16* output_state_data_int16,
+              const Dims<4>& output_state_dims, uint8* output_activ_data_uint8,
+              const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8,
+              const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16,
+              const Dims<4>& activ_temp_dims, int32 weights_zero_point,
+              int32 accum_multiplier, int accum_shift,
+              gemmlowp::GemmContext* gemm_context) {
+  tflite::LstmCellParams op_params;
+  op_params.weights_zero_point = weights_zero_point;
+  op_params.accum_multiplier = accum_multiplier;
+  op_params.accum_shift = accum_shift;
+
+  LstmCell<StateIntegerBits>(
+      op_params, DimsToShape(input_dims), input_data_uint8,
+      DimsToShape(prev_activ_dims), prev_activ_data_uint8,
+      DimsToShape(weights_dims), weights_data_uint8, DimsToShape(bias_dims),
+      bias_data_int32, DimsToShape(prev_state_dims), prev_state_data_int16,
+      DimsToShape(output_state_dims), output_state_data_int16,
+      DimsToShape(output_activ_dims), output_activ_data_uint8,
+      DimsToShape(concat_temp_dims), concat_temp_data_uint8,
+      DimsToShape(activ_temp_dims), activ_temp_data_int16, gemm_context);
+}
+
+template <typename T>
+void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  tflite::ArithmeticParams op_params;
+  SetActivationParams(output_activation_min, output_activation_max, &op_params);
+
+  BroadcastDiv4DSlow(op_params, DimsToShape(input1_dims), input1_data,
+                     DimsToShape(input2_dims), input2_data,
+                     DimsToShape(output_dims), output_data);
+}
 
 template <FusedActivationFunctionType Ac>
 void L2Normalization(const float* input_data, const RuntimeShape& input_shape,
@@ -574,6 +1424,14 @@ void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride,
              filter_width, filter_height, output_data, output_dims);
 }
 
+inline void Softmax(const float* input_data, const RuntimeShape& input_shape,
+                    float beta, float* output_data,
+                    const RuntimeShape& output_shape) {
+  SoftmaxParams params;
+  params.beta = beta;
+  Softmax(params, input_shape, input_data, output_shape, output_data);
+}
+
 inline void Softmax(const float* input_data, const Dims<4>& input_dims,
                     float beta, float* output_data,
                     const Dims<4>& output_dims) {
@@ -581,6 +1439,16 @@ inline void Softmax(const float* input_data, const Dims<4>& input_dims,
           DimsToShape(output_dims));
 }
 
+inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape,
+                    int32 input_beta_multiplier, int32 input_beta_left_shift,
+                    int diff_min, uint8* output_data,
+                    const RuntimeShape& output_shape) {
+  SoftmaxParams params;
+  params.input_multiplier = input_beta_multiplier;
+  params.input_left_shift = input_beta_left_shift;
+  params.diff_min = diff_min;
+  Softmax(params, input_shape, input_data, output_shape, output_data);
+}
 inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
                     int32 input_beta_multiplier, int32 input_beta_left_shift,
                     int diff_min, uint8* output_data,
@@ -590,12 +1458,33 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
           DimsToShape(output_dims));
 }
 
+inline void LogSoftmax(const float* input_data, const RuntimeShape& input_shape,
+                       float* output_data, const RuntimeShape& output_shape) {
+  SoftmaxParams params;
+  // No params currently used for float LogSoftmax.
+  LogSoftmax(params, input_shape, input_data, output_shape, output_data);
+}
+
 inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
                        float* output_data, const Dims<4>& output_dims) {
   LogSoftmax(input_data, DimsToShape(input_dims), output_data,
              DimsToShape(output_dims));
 }
 
+inline void LogSoftmax(const uint8* input_data, const RuntimeShape& input_shape,
+                       int32 input_multiplier, int32 input_left_shift,
+                       int32 reverse_scaling_divisor,
+                       int32 reverse_scaling_right_shift, int diff_min,
+                       uint8* output_data, const RuntimeShape& output_shape) {
+  SoftmaxParams params;
+  params.input_multiplier = input_multiplier;
+  params.input_left_shift = input_left_shift;
+  params.reverse_scaling_divisor = reverse_scaling_divisor;
+  params.reverse_scaling_right_shift = reverse_scaling_right_shift;
+  params.diff_min = diff_min;
+  LogSoftmax(params, input_shape, input_data, output_shape, output_data);
+}
+
 inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
                        int32 input_multiplier, int32 input_left_shift,
                        int32 reverse_scaling_divisor,
@@ -607,6 +1496,18 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
              DimsToShape(output_dims));
 }
 
+inline void Logistic(const uint8* input_data, const RuntimeShape& input_shape,
+                     int32 input_zero_point, int32 input_range_radius,
+                     int32 input_multiplier, int input_left_shift,
+                     uint8* output_data, const RuntimeShape& output_shape) {
+  LogisticParams params;
+  params.input_zero_point = input_zero_point;
+  params.input_range_radius = input_range_radius;
+  params.input_multiplier = input_multiplier;
+  params.input_left_shift = input_left_shift;
+  Logistic(params, input_shape, input_data, output_shape, output_data);
+}
+
 inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
   Logistic(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
@@ -622,6 +1523,20 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
            DimsToShape(output_dims));
 }
 
+inline void Logistic(const RuntimeShape& input_shape, const int16* input_data,
+                     const RuntimeShape& output_shape, int16* output_data) {
+  LogisticParams params;
+  // No params currently needed by int16 Logistic.
+  Logistic(params, input_shape, input_data, output_shape, output_data);
+}
+
+inline void Logistic(const int16* input_data, const RuntimeShape& input_shape,
+                     int16* output_data, const RuntimeShape& output_shape) {
+  LogisticParams params;
+  // No params currently needed by int16 Logistic.
+  Logistic(params, input_shape, input_data, output_shape, output_data);
+}
+
 inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
                      int16* output_data, const Dims<4>& output_dims) {
   Logistic(input_data, DimsToShape(input_dims), output_data,
@@ -634,6 +1549,18 @@ inline void Tanh(const float* input_data, const Dims<4>& input_dims,
        output_data);
 }
 
+inline void Tanh(const uint8* input_data, const RuntimeShape& input_shape,
+                 int32 input_zero_point, int32 input_range_radius,
+                 int32 input_multiplier, int input_left_shift,
+                 uint8* output_data, const RuntimeShape& output_shape) {
+  TanhParams params;
+  params.input_zero_point = input_zero_point;
+  params.input_range_radius = input_range_radius;
+  params.input_multiplier = input_multiplier;
+  params.input_left_shift = input_left_shift;
+  Tanh(params, input_shape, input_data, output_shape, output_data);
+}
+
 inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
                  int32 input_zero_point, int32 input_range_radius,
                  int32 input_multiplier, int input_left_shift,
@@ -643,6 +1570,14 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
        DimsToShape(output_dims));
 }
 
+inline void Tanh(const int16* input_data, const RuntimeShape& input_shape,
+                 int input_left_shift, int16* output_data,
+                 const RuntimeShape& output_shape) {
+  TanhParams params;
+  params.input_left_shift = input_left_shift;
+  Tanh(params, input_shape, input_data, output_shape, output_data);
+}
+
 inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
                  int input_left_shift, int16* output_data,
                  const Dims<4>& output_dims) {
@@ -777,7 +1712,6 @@ inline void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims,
                      DimsToShape(output_dims), output_data);
 }
 
-// Legacy Dims<4>.
 inline void LocalResponseNormalization(const float* input_data,
                                        const Dims<4>& input_dims, int range,
                                        float bias, float alpha, float beta,
@@ -793,7 +1727,6 @@ inline void LocalResponseNormalization(const float* input_data,
                              DimsToShape(output_dims), output_data);
 }
 
-// Legacy Dims<4> version.
 template <typename SrcT, typename DstT>
 void Cast(const SrcT* input_data, const Dims<4>& input_dims, DstT* output_data,
           const Dims<4>& output_dims) {
@@ -801,14 +1734,12 @@ void Cast(const SrcT* input_data, const Dims<4>& input_dims, DstT* output_data,
        output_data);
 }
 
-// Legacy Dims<4> version.
 inline void Floor(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
   Floor(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
         output_data);
 }
 
-// Legacy Dims<4>
 inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                            const int32* output_size_data,
                            const Dims<4>& output_size_dims, float* output_data,
@@ -820,7 +1751,6 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                  DimsToShape(output_dims), output_data);
 }
 
-// Legacy Dims<4>
 inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims,
                            const int32* output_size_data,
                            const Dims<4>& output_size_dims, uint8* output_data,
@@ -850,7 +1780,6 @@ inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims,
                  output_data, output_dims, /*align_corners=*/false);
 }
 
-// Legacy Dims<4>.
 template <typename T>
 inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 732880d9da..77f84e0c1c 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -52,10 +52,6 @@ using reference_ops::Broadcast4DSlowLessEqual;
 using reference_ops::Broadcast4DSlowLessEqualWithScaling;
 using reference_ops::Broadcast4DSlowLessWithScaling;
 using reference_ops::BroadcastAdd4DSlow;
-using reference_ops::BroadcastGreater;
-using reference_ops::BroadcastGreaterEqual;
-using reference_ops::BroadcastLess;
-using reference_ops::BroadcastLessEqual;
 using reference_ops::BroadcastMul4DSlow;
 using reference_ops::BroadcastSub4DSlow;
 using reference_ops::Concatenation;
@@ -82,7 +78,6 @@ using reference_ops::Select;
 using reference_ops::SpaceToBatchND;
 using reference_ops::Split;
 using reference_ops::StridedSlice;
-using reference_ops::TensorFlowSplit;
 using reference_ops::Transpose;
 
 // TODO(b/80247582) Remove this constant.
@@ -112,12 +107,6 @@ VectorMap<Scalar> MapAsVector(Scalar* data, const RuntimeShape& shape) {
   return VectorMap<Scalar>(data, size, 1);
 }
 
-template <typename Scalar, int N>
-VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) {
-  const int size = FlatSize(dims);
-  return VectorMap<Scalar>(data, size, 1);
-}
-
 // Make a local VectorMap typedef allowing to map a float array
 // as a Eigen matrix expression. The same explanation as for VectorMap
 // above also applies here.
@@ -145,28 +134,6 @@ MatrixMap<Scalar> MapAsMatrixWithFirstDimAsCols(Scalar* data,
   return MatrixMap<Scalar>(data, rows, cols);
 }
 
-template <typename Scalar, int N>
-MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data,
-                                                const Dims<N>& dims) {
-  const int rows = dims.sizes[0];
-  int cols = 1;
-  for (int d = 1; d < N; d++) {
-    cols *= dims.sizes[d];
-  }
-  return MatrixMap<Scalar>(data, rows, cols);
-}
-
-template <typename Scalar, int N>
-MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data,
-                                               const Dims<N>& dims) {
-  const int cols = dims.sizes[N - 1];
-  int rows = 1;
-  for (int d = 0; d < N - 1; d++) {
-    rows *= dims.sizes[d];
-  }
-  return MatrixMap<Scalar>(data, rows, cols);
-}
-
 template <typename Scalar>
 using ArrayMap = typename std::conditional<
     std::is_const<Scalar>::value,
@@ -174,17 +141,6 @@ using ArrayMap = typename std::conditional<
                                   Eigen::Dynamic, Eigen::Dynamic>>,
     Eigen::Map<Eigen::Array<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
 
-template <typename Scalar, int N>
-ArrayMap<Scalar> MapAsArrayWithFirstDimAsRows(Scalar* data,
-                                              const Dims<N>& dims) {
-  const int rows = dims.sizes[0];
-  int cols = 1;
-  for (int d = 1; d < N; d++) {
-    cols *= dims.sizes[d];
-  }
-  return ArrayMap<Scalar>(data, rows, cols);
-}
-
 template <typename Scalar>
 ArrayMap<Scalar> MapAsArrayWithLastDimAsRows(Scalar* data,
                                              const RuntimeShape& shape) {
@@ -206,20 +162,6 @@ struct TTypes {
       UnalignedConstMatrix;
 };
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-// TODO(b/62193649): this function is only needed as long
-// as we have the --variable_batch hack.
-template <typename Scalar, int N>
-MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
-                                                   const Dims<N>& dims,
-                                                   int rows) {
-  const int flatsize = FlatSize(dims);
-  TFLITE_DCHECK((flatsize % rows) == 0);
-  const int cols = flatsize / rows;
-  return MatrixMap<Scalar>(data, rows, cols);
-}
-
 // TODO(b/62193649): this function is only needed as long
 // as we have the --variable_batch hack.
 template <typename Scalar>
@@ -271,15 +213,6 @@ SaturatingRoundingMultiplyByPOTParam(
       SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
 }
 
-inline bool AreSameDims(const Dims<4>& dims1, const Dims<4>& dims2) {
-  for (int i = 0; i < 4; i++) {
-    if (dims1.sizes[i] != dims2.sizes[i]) {
-      return false;
-    }
-  }
-  return true;
-}
-
 inline void AddBiasAndEvalActivationFunction(float output_activation_min,
                                              float output_activation_max,
                                              const RuntimeShape& bias_shape,
@@ -353,33 +286,6 @@ inline void AddBiasAndEvalActivationFunction(float output_activation_min,
 #endif
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void AddBiasAndEvalActivationFunction(const float* bias_data,
-                                             const Dims<4>& bias_dims,
-                                             float* array_data,
-                                             const Dims<4>& array_dims,
-                                             float output_activation_min,
-                                             float output_activation_max) {
-  AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max,
-                                   DimsToShape(bias_dims), bias_data,
-                                   DimsToShape(array_dims), array_data);
-}
-
-// Note: This to be converted to RuntimeShapes along with Conv.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void AddBiasAndEvalActivationFunction(const float* bias_data,
-                                      const Dims<4>& bias_dims,
-                                      float* array_data,
-                                      const Dims<4>& array_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  AddBiasAndEvalActivationFunction(bias_data, bias_dims, array_data, array_dims,
-                                   output_activation_min,
-                                   output_activation_max);
-}
-
 template <typename Lhs, typename Rhs, typename Result>
 void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs,
           Eigen::MatrixBase<Result>* result) {
@@ -926,38 +832,6 @@ inline void FullyConnected(
                                    output_data);
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
-                           const float* weights_data,
-                           const Dims<4>& weights_dims, const float* bias_data,
-                           const Dims<4>& bias_dims,
-                           float output_activation_min,
-                           float output_activation_max, float* output_data,
-                           const Dims<4>& output_dims) {
-  tflite::FullyConnectedParams op_params;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  FullyConnected(op_params, DimsToShape(input_dims), input_data,
-                 DimsToShape(weights_dims), weights_data,
-                 DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
-                 output_data);
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void FullyConnected(const float* input_data, const Dims<4>& input_dims,
-                    const float* weights_data, const Dims<4>& weights_dims,
-                    const float* bias_data, const Dims<4>& bias_dims,
-                    float* output_data, const Dims<4>& output_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
-                 bias_dims, output_activation_min, output_activation_max,
-                 output_data, output_dims);
-}
-
 #ifdef USE_NEON
 inline void FullyConnectedAsGEMV(
     const RuntimeShape& input_shape, const uint8* input_data,
@@ -1204,33 +1078,6 @@ inline void FullyConnected(
       input_offset, output_pipeline);
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                           int32 input_offset, const uint8* filter_data,
-                           const Dims<4>& filter_dims, int32 filter_offset,
-                           const int32* bias_data, const Dims<4>& bias_dims,
-                           int32 output_offset, int32 output_multiplier,
-                           int output_shift, int32 output_activation_min,
-                           int32 output_activation_max, uint8* output_data,
-                           const Dims<4>& output_dims,
-                           gemmlowp::GemmContext* gemm_context) {
-  tflite::FullyConnectedParams op_params;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kReverseShift * output_shift;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-
-  FullyConnected(op_params, DimsToShape(input_dims), input_data,
-                 DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
-                 bias_data, DimsToShape(output_dims), output_data,
-                 gemm_context);
-}
-
 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const uint8* input_data, const RuntimeShape& filter_shape,
@@ -1318,54 +1165,6 @@ inline void FullyConnected(
       input_offset, output_pipeline);
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void FullyConnected(
-    const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
-    const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
-    const int32* bias_data_int32, const Dims<4>& bias_dims, int32 output_offset,
-    int32 output_multiplier, int output_shift, int32 output_activation_min,
-    int32 output_activation_max, int16* output_data, const Dims<4>& output_dims,
-    gemmlowp::GemmContext* gemm_context) {
-  tflite::FullyConnectedParams op_params;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kReverseShift * output_shift;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-
-  FullyConnected(op_params, DimsToShape(input_dims), input_data,
-                 DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
-                 bias_data_int32, DimsToShape(output_dims), output_data,
-                 gemm_context);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                    int32 input_offset, const uint8* filter_data,
-                    const Dims<4>& filter_dims, int32 filter_offset,
-                    const int32* bias_data, const Dims<4>& bias_dims,
-                    int32 output_offset, int32 output_multiplier,
-                    int output_shift, int32 output_activation_min,
-                    int32 output_activation_max, uint8* output_data,
-                    const Dims<4>& output_dims,
-                    gemmlowp::GemmContext* gemm_context) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
-                 filter_offset, bias_data, bias_dims, output_offset,
-                 output_multiplier, output_shift, output_activation_min,
-                 output_activation_max, output_data, output_dims, gemm_context);
-}
-
 // Internal function doing the actual arithmetic work for
 // ShuffledFullyConnected.
 // May be called either directly by it (single-threaded case) or may be used
@@ -1810,29 +1609,6 @@ inline void ShuffledFullyConnected(
   gemm_context->workers_pool()->Execute(tasks);
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void ShuffledFullyConnected(
-    const uint8* input_data, const Dims<4>& input_dims,
-    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
-    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
-    int output_shift, int32 output_activation_min, int32 output_activation_max,
-    int16* output_data, const Dims<4>& output_dims,
-    uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) {
-  tflite::FullyConnectedParams op_params;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kReverseShift * output_shift;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-
-  ShuffledFullyConnected(op_params, DimsToShape(input_dims), input_data,
-                         DimsToShape(weights_dims), shuffled_weights_data,
-                         DimsToShape(bias_dims), bias_data,
-                         DimsToShape(output_dims), output_data,
-                         shuffled_input_workspace_data, gemm_context);
-}
-
 template <typename T>
 inline void ExtractPatchIntoBufferColumn(const RuntimeShape& input_shape, int w,
                                          int h, int b, int kheight, int kwidth,
@@ -1923,20 +1699,6 @@ inline void ExtractPatchIntoBufferColumn(const RuntimeShape& input_shape, int w,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename T>
-inline void ExtractPatchIntoBufferColumn(
-    const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth,
-    int stride_width, int stride_height, int pad_width, int pad_height,
-    int in_width, int in_height, int in_depth, int single_buffer_length,
-    int buffer_id, const T* in_data, T* conv_buffer_data, uint8 zero_byte) {
-  ExtractPatchIntoBufferColumn(
-      DimsToShape(input_dims), w, h, b, kheight, kwidth, stride_width,
-      stride_height, pad_width, pad_height, in_width, in_height, in_depth,
-      single_buffer_length, buffer_id, in_data, conv_buffer_data, zero_byte);
-}
-
 template <typename T>
 void DilatedIm2col(const ConvParams& params, uint8 zero_byte,
                    const RuntimeShape& input_shape, const T* input_data,
@@ -2020,30 +1782,6 @@ void DilatedIm2col(const ConvParams& params, uint8 zero_byte,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename T>
-void DilatedIm2col(const T* input_data, const Dims<4>& input_dims,
-                   const Dims<4>& filter_dims, int stride_width,
-                   int stride_height, int dilation_width_factor,
-                   int dilation_height_factor, int pad_width, int pad_height,
-                   const Dims<4>& output_dims, uint8 zero_byte,
-                   T* im2col_data) {
-  tflite::ConvParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-  op_params.dilation_width_factor = dilation_width_factor;
-  op_params.dilation_height_factor = dilation_height_factor;
-
-  DilatedIm2col(op_params, zero_byte, DimsToShape(input_dims), input_data,
-                DimsToShape(filter_dims), DimsToShape(output_dims),
-                im2col_data);
-}
-
 template <typename T>
 void Im2col(const ConvParams& params, int kheight, int kwidth, uint8 zero_byte,
             const RuntimeShape& input_shape, const T* input_data,
@@ -2079,36 +1817,6 @@ void Im2col(const ConvParams& params, int kheight, int kwidth, uint8 zero_byte,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename T>
-void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width,
-            int stride_height, int pad_width, int pad_height, int kheight,
-            int kwidth, uint8 zero_byte, T* output_data,
-            const Dims<4>& output_dims) {
-  tflite::ConvParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-  op_params.dilation_width_factor = 1;
-  op_params.dilation_height_factor = 1;
-
-  Im2col(op_params, kheight, kwidth, zero_byte, DimsToShape(input_dims),
-         input_data, DimsToShape(output_dims), output_data);
-}
-
-// legacy, for compatibility with old checked-in code
-template <typename T>
-void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
-            int pad_width, int pad_height, int kheight, int kwidth,
-            uint8 zero_byte, T* output_data, const Dims<4>& output_dims) {
-  Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
-         kwidth, zero_byte, output_data, output_dims);
-}
-
 inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                  const float* input_data, const RuntimeShape& filter_shape,
                  const float* filter_data, const RuntimeShape& bias_shape,
@@ -2172,33 +1880,6 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                                    output_data);
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Conv(const float* input_data, const Dims<4>& input_dims,
-                 const float* filter_data, const Dims<4>& filter_dims,
-                 const float* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int dilation_width_factor,
-                 int dilation_height_factor, int pad_width, int pad_height,
-                 float output_activation_min, float output_activation_max,
-                 float* output_data, const Dims<4>& output_dims,
-                 float* im2col_data, const Dims<4>& im2col_dims) {
-  tflite::ConvParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-  op_params.dilation_width_factor = dilation_width_factor;
-  op_params.dilation_height_factor = dilation_height_factor;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims),
-       filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
-       output_data, DimsToShape(im2col_dims), im2col_data);
-}
-
 inline void HybridConv(const ConvParams& params, float* scaling_factors_ptr,
                        const RuntimeShape& input_shape,
                        const int8_t* input_data,
@@ -2279,82 +1960,6 @@ inline void HybridConv(const ConvParams& params, float* scaling_factors_ptr,
                                    output_data);
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void HybridConv(const int8_t* input_data, const Dims<4>& input_dims,
-                       const int8_t* filter_data, const Dims<4>& filter_dims,
-                       const float* bias_data, const Dims<4>& bias_dims,
-                       int stride_width, int stride_height, int pad_width,
-                       int pad_height, float* scaling_factors_ptr,
-                       float output_activation_min, float output_activation_max,
-                       float* output_data, const Dims<4>& output_dims,
-                       int8_t* im2col_data, const Dims<4>& im2col_dims) {
-  tflite::ConvParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  HybridConv(op_params, scaling_factors_ptr, DimsToShape(input_dims),
-             input_data, DimsToShape(filter_dims), filter_data,
-             DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
-             output_data, DimsToShape(im2col_dims), im2col_data);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <FusedActivationFunctionType Ac>
-void Conv(const float* input_data, const Dims<4>& input_dims,
-          const float* filter_data, const Dims<4>& filter_dims,
-          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
-          int stride_height, int dilation_width_factor,
-          int dilation_height_factor, int pad_width, int pad_height,
-          float* output_data, const Dims<4>& output_dims, float* im2col_data,
-          const Dims<4>& im2col_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
-       stride_width, stride_height, dilation_width_factor,
-       dilation_height_factor, pad_width, pad_height, output_activation_min,
-       output_activation_max, output_data, output_dims, im2col_data,
-       im2col_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Conv(const float* input_data, const Dims<4>& input_dims,
-          const float* filter_data, const Dims<4>& filter_dims,
-          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
-          int stride_height, int pad_width, int pad_height, float* output_data,
-          const Dims<4>& output_dims, float* im2col_data,
-          const Dims<4>& im2col_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
-       stride_width, stride_height, 1, 1, pad_width, pad_height,
-       output_activation_min, output_activation_max, output_data, output_dims,
-       im2col_data, im2col_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Conv(const float* input_data, const Dims<4>& input_dims,
-          const float* filter_data, const Dims<4>& filter_dims,
-          const float* bias_data, const Dims<4>& bias_dims, int stride,
-          int pad_width, int pad_height, float* output_data,
-          const Dims<4>& output_dims, float* im2col_data,
-          const Dims<4>& im2col_dims) {
-  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
-           bias_dims, stride, stride, 1, 1, pad_width, pad_height, output_data,
-           output_dims, im2col_data, im2col_dims);
-}
-
 inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                  const uint8* input_data, const RuntimeShape& filter_shape,
                  const uint8* filter_data, const RuntimeShape& bias_shape,
@@ -2446,192 +2051,6 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
       input_offset, output_pipeline);
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
-                 int32 input_offset, const uint8* filter_data,
-                 const Dims<4>& filter_dims, int32 filter_offset,
-                 const int32* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int dilation_width_factor,
-                 int dilation_height_factor, int pad_width, int pad_height,
-                 int32 output_offset, int32 output_multiplier, int output_shift,
-                 int32 output_activation_min, int32 output_activation_max,
-                 uint8* output_data, const Dims<4>& output_dims,
-                 uint8* im2col_data, const Dims<4>& im2col_dims,
-                 gemmlowp::GemmContext* gemm_context) {
-  tflite::ConvParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-  op_params.dilation_width_factor = dilation_width_factor;
-  op_params.dilation_height_factor = dilation_height_factor;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kReverseShift * output_shift;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-
-  Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims),
-       filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
-       output_data, DimsToShape(im2col_dims), im2col_data, gemm_context);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
-                 int32 input_offset, const uint8* filter_data,
-                 const Dims<4>& filter_dims, int32 filter_offset,
-                 const int32* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, int32 output_offset, int32 output_multiplier,
-                 int output_shift, int32 output_activation_min,
-                 int32 output_activation_max, uint8* output_data,
-                 const Dims<4>& output_dims, uint8* im2col_data,
-                 const Dims<4>& im2col_dims,
-                 gemmlowp::GemmContext* gemm_context) {
-  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
-       filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1,
-       pad_width, pad_height, output_offset, output_multiplier, output_shift,
-       output_activation_min, output_activation_max, output_data, output_dims,
-       im2col_data, im2col_dims, gemm_context);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
-                 int32 input_offset, const uint8* filter_data,
-                 const Dims<4>& filter_dims, int32 filter_offset,
-                 const int32* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, int32 output_offset, int32 output_multiplier,
-                 int output_shift, int32 output_activation_min,
-                 int32 output_activation_max, uint8* output_data,
-                 const Dims<4>& output_dims, uint8* im2col_data,
-                 const Dims<4>& im2col_dims,
-                 gemmlowp::GemmContext* gemm_context) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
-       filter_offset, bias_data, bias_dims, stride_width, stride_height,
-       pad_width, pad_height, output_offset, output_multiplier, output_shift,
-       output_activation_min, output_activation_max, output_data, output_dims,
-       im2col_data, im2col_dims, gemm_context);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Conv(const uint8* input_data, const Dims<4>& input_dims,
-          int32 input_offset, const uint8* filter_data,
-          const Dims<4>& filter_dims, int32 filter_offset,
-          const int32* bias_data, const Dims<4>& bias_dims, int stride,
-          int pad_width, int pad_height, int32 output_offset,
-          int32 output_multiplier, int output_shift,
-          int32 output_activation_min, int32 output_activation_max,
-          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
-          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
-       filter_offset, bias_data, bias_dims, stride, stride, pad_width,
-       pad_height, output_offset, output_multiplier, output_shift,
-       output_activation_min, output_activation_max, output_data, output_dims,
-       im2col_data, im2col_dims, gemm_context);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
-            int pad_width, int pad_height, int kheight, int kwidth,
-            uint8 zero_byte, T* output_data, const Dims<4>& output_dims) {
-  Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
-         kwidth, zero_byte, output_data, output_dims);
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void ConvAsGemm(const float* input_data, const Dims<4>& input_dims,
-                const float* filter_data, const Dims<4>& filter_dims,
-                const float* bias_data, const Dims<4>& bias_dims,
-                float* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("ConvAsGemm");
-
-  const auto input_matrix_map =
-      MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
-  const auto filter_matrix_map =
-      MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
-  auto output_matrix_map =
-      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
-
-  Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
-
-  AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data,
-                                       output_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims,
-                int32 input_offset, const uint8* filter_data,
-                const Dims<4>& filter_dims, int32 filter_offset,
-                const int32* bias_data, const Dims<4>& bias_dims,
-                int32 output_offset, int32 output_multiplier, int output_shift,
-                int32 output_activation_min, int32 output_activation_max,
-                uint8* output_data, const Dims<4>& output_dims,
-                gemmlowp::GemmContext* gemm_context) {
-  gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit");
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  const int input_rows = input_dims.sizes[0];
-  const int input_cols = FlatSizeSkipDim(input_dims, 0);
-  const int filter_rows = filter_dims.sizes[3];
-  const int filter_cols = FlatSizeSkipDim(filter_dims, 3);
-  const int output_rows = output_dims.sizes[0];
-  const int output_cols = FlatSizeSkipDim(output_dims, 0);
-  TFLITE_DCHECK_EQ(output_rows, filter_rows);
-  TFLITE_DCHECK_EQ(output_cols, input_cols);
-  TFLITE_DCHECK_EQ(filter_cols, input_rows);
-  TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
-  TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
-  TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
-  TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
-      filter_data, output_rows, filter_cols, filter_cols);
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
-      input_data, filter_cols, output_cols, filter_cols);
-  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
-      output_data, output_rows, output_cols, output_rows);
-  const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp(
-      bias_data, output_rows, output_offset, output_multiplier, -output_shift,
-      output_activation_min, output_activation_max);
-  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
-                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
-      gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
-      input_offset, output_pipeline);
-}
-
 template <typename T>
 inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
                          const RuntimeShape& unextended_input_shape,
@@ -3548,21 +2967,6 @@ void BroadcastDiv4DSlow(const ArithmeticParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy Dims<4>.
-template <typename T>
-void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  tflite::ArithmeticParams op_params;
-  SetActivationParams(output_activation_min, output_activation_max, &op_params);
-
-  BroadcastDiv4DSlow(op_params, DimsToShape(input1_dims), input1_data,
-                     DimsToShape(input2_dims), input2_data,
-                     DimsToShape(output_dims), output_data);
-}
-
 // TODO(aselle): This is not actually optimized yet.
 inline void SubNonBroadcast(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
@@ -3756,31 +3160,6 @@ inline void LstmCell(
       output_state_map.tanh();
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
-                     const float* prev_activ_data,
-                     const Dims<4>& prev_activ_dims, const float* weights_data,
-                     const Dims<4>& weights_dims, const float* bias_data,
-                     const Dims<4>& bias_dims, const float* prev_state_data,
-                     const Dims<4>& prev_state_dims, float* output_state_data,
-                     const Dims<4>& output_state_dims, float* output_activ_data,
-                     const Dims<4>& output_activ_dims, float* concat_temp_data,
-                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
-                     const Dims<4>& activ_temp_dims) {
-  tflite::LstmCellParams op_params;
-  // Float LSTM cell does not need parameters to be set: leave untouched.
-
-  LstmCell(op_params, DimsToShape(input_dims), input_data,
-           DimsToShape(prev_activ_dims), prev_activ_data,
-           DimsToShape(weights_dims), weights_data, DimsToShape(bias_dims),
-           bias_data, DimsToShape(prev_state_dims), prev_state_data,
-           DimsToShape(output_state_dims), output_state_data,
-           DimsToShape(output_activ_dims), output_activ_data,
-           DimsToShape(concat_temp_dims), concat_temp_data,
-           DimsToShape(activ_temp_dims), activ_temp_data);
-}
-
 // Quantized LSTM cell. Currently just a copy of the reference impl in
 // reference_ops.h. See the big function comment there, not replicating it
 // here.
@@ -4071,37 +3450,6 @@ inline void LstmCell(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <int StateIntegerBits>
-void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
-              const uint8* prev_activ_data_uint8,
-              const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
-              const Dims<4>& weights_dims, const int32* bias_data_int32,
-              const Dims<4>& bias_dims, const int16* prev_state_data_int16,
-              const Dims<4>& prev_state_dims, int16* output_state_data_int16,
-              const Dims<4>& output_state_dims, uint8* output_activ_data_uint8,
-              const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8,
-              const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16,
-              const Dims<4>& activ_temp_dims, int32 weights_zero_point,
-              int32 accum_multiplier, int accum_shift,
-              gemmlowp::GemmContext* gemm_context) {
-  tflite::LstmCellParams op_params;
-  op_params.weights_zero_point = weights_zero_point;
-  op_params.accum_multiplier = accum_multiplier;
-  op_params.accum_shift = accum_shift;
-
-  LstmCell<StateIntegerBits>(
-      op_params, DimsToShape(input_dims), input_data_uint8,
-      DimsToShape(prev_activ_dims), prev_activ_data_uint8,
-      DimsToShape(weights_dims), weights_data_uint8, DimsToShape(bias_dims),
-      bias_data_int32, DimsToShape(prev_state_dims), prev_state_data_int16,
-      DimsToShape(output_state_dims), output_state_data_int16,
-      DimsToShape(output_activ_dims), output_activ_data_uint8,
-      DimsToShape(concat_temp_dims), concat_temp_data_uint8,
-      DimsToShape(activ_temp_dims), activ_temp_data_int16, gemm_context);
-}
-
 inline int NodeOffset(int b, int h, int w, int height, int width) {
   return (b * height + h) * width + w;
 }
@@ -4561,16 +3909,6 @@ inline void Softmax(const SoftmaxParams& params,
   out_mat.array().rowwise() *= scale;
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Softmax(const float* input_data, const RuntimeShape& input_shape,
-                    float beta, float* output_data,
-                    const RuntimeShape& output_shape) {
-  SoftmaxParams params;
-  params.beta = beta;
-  Softmax(params, input_shape, input_data, output_shape, output_data);
-}
-
 inline void Softmax(const SoftmaxParams& params,
                     const RuntimeShape& input_shape, const uint8* input_data,
                     const RuntimeShape& output_shape, uint8* output_data) {
@@ -4782,19 +4120,6 @@ inline void Softmax(const SoftmaxParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape,
-                    int32 input_beta_multiplier, int32 input_beta_left_shift,
-                    int diff_min, uint8* output_data,
-                    const RuntimeShape& output_shape) {
-  SoftmaxParams params;
-  params.input_multiplier = input_beta_multiplier;
-  params.input_left_shift = input_beta_left_shift;
-  params.diff_min = diff_min;
-  Softmax(params, input_shape, input_data, output_shape, output_data);
-}
-
 // TODO(myenik): This is the same as the reference implementation, not actually
 // optimized yet.
 inline void LogSoftmax(const SoftmaxParams& params,
@@ -4832,15 +4157,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy
-inline void LogSoftmax(const float* input_data, const RuntimeShape& input_shape,
-                       float* output_data, const RuntimeShape& output_shape) {
-  SoftmaxParams params;
-  // No params currently used for float LogSoftmax.
-  LogSoftmax(params, input_shape, input_data, output_shape, output_data);
-}
-
 template <int OutputIntegerBits, int InputIntegerBits>
 inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
 log_x_for_x_greater_than_or_equal_to_1_impl(
@@ -5045,22 +4361,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void LogSoftmax(const uint8* input_data, const RuntimeShape& input_shape,
-                       int32 input_multiplier, int32 input_left_shift,
-                       int32 reverse_scaling_divisor,
-                       int32 reverse_scaling_right_shift, int diff_min,
-                       uint8* output_data, const RuntimeShape& output_shape) {
-  SoftmaxParams params;
-  params.input_multiplier = input_multiplier;
-  params.input_left_shift = input_left_shift;
-  params.reverse_scaling_divisor = reverse_scaling_divisor;
-  params.reverse_scaling_right_shift = reverse_scaling_right_shift;
-  params.diff_min = diff_min;
-  LogSoftmax(params, input_shape, input_data, output_shape, output_data);
-}
-
 inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
                      const RuntimeShape& output_shape, float* output_data) {
   gemmlowp::ScopedProfilingLabel label("Logistic");
@@ -5219,20 +4519,6 @@ inline void Logistic(const LogisticParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Logistic(const uint8* input_data, const RuntimeShape& input_shape,
-                     int32 input_zero_point, int32 input_range_radius,
-                     int32 input_multiplier, int input_left_shift,
-                     uint8* output_data, const RuntimeShape& output_shape) {
-  LogisticParams params;
-  params.input_zero_point = input_zero_point;
-  params.input_range_radius = input_range_radius;
-  params.input_multiplier = input_multiplier;
-  params.input_left_shift = input_left_shift;
-  Logistic(params, input_shape, input_data, output_shape, output_data);
-}
-
 inline void Logistic(const LogisticParams& params,
                      const RuntimeShape& input_shape, const int16* input_data,
                      const RuntimeShape& output_shape, int16* output_data) {
@@ -5294,24 +4580,6 @@ inline void Logistic(const LogisticParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy version.
-inline void Logistic(const RuntimeShape& input_shape, const int16* input_data,
-                     const RuntimeShape& output_shape, int16* output_data) {
-  LogisticParams params;
-  // No params currently needed by int16 Logistic.
-  Logistic(params, input_shape, input_data, output_shape, output_data);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy version.
-inline void Logistic(const int16* input_data, const RuntimeShape& input_shape,
-                     int16* output_data, const RuntimeShape& output_shape) {
-  LogisticParams params;
-  // No params currently needed by int16 Logistic.
-  Logistic(params, input_shape, input_data, output_shape, output_data);
-}
-
 inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
                  const RuntimeShape& output_shape, float* output_data) {
   gemmlowp::ScopedProfilingLabel label("Tanh");
@@ -5479,20 +4747,6 @@ inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Tanh(const uint8* input_data, const RuntimeShape& input_shape,
-                 int32 input_zero_point, int32 input_range_radius,
-                 int32 input_multiplier, int input_left_shift,
-                 uint8* output_data, const RuntimeShape& output_shape) {
-  TanhParams params;
-  params.input_zero_point = input_zero_point;
-  params.input_range_radius = input_range_radius;
-  params.input_multiplier = input_multiplier;
-  params.input_left_shift = input_left_shift;
-  Tanh(params, input_shape, input_data, output_shape, output_data);
-}
-
 inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
                  const int16* input_data, const RuntimeShape& output_shape,
                  int16* output_data) {
@@ -5594,16 +4848,6 @@ inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Tanh(const int16* input_data, const RuntimeShape& input_shape,
-                 int input_left_shift, int16* output_data,
-                 const RuntimeShape& output_shape) {
-  TanhParams params;
-  params.input_left_shift = input_left_shift;
-  Tanh(params, input_shape, input_data, output_shape, output_data);
-}
-
 template <typename SrcT, typename DstT>
 inline void Cast(const RuntimeShape& input_shape, const SrcT* input_data,
                  const RuntimeShape& output_shape, DstT* output_data) {
@@ -6486,27 +5730,6 @@ void TransposeIm2col(const ConvParams& params, uint8 zero_byte,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename T>
-void TransposeIm2col(const T* input_data, const Dims<4>& input_dims,
-                     const Dims<4>& filter_dims, int stride_width,
-                     int stride_height, int pad_width, int pad_height,
-                     const Dims<4>& output_dims, uint8 zero_byte,
-                     T* im2col_data) {
-  tflite::ConvParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-
-  TransposeIm2col(op_params, zero_byte, DimsToShape(input_dims), input_data,
-                  DimsToShape(filter_dims), DimsToShape(output_dims),
-                  im2col_data);
-}
-
 inline void TransposeConv(
     const ConvParams& params, const RuntimeShape& input_shape,
     const float* input_data, const RuntimeShape& filter_shape,
@@ -6530,27 +5753,6 @@ inline void TransposeConv(
   Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map);
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
-                          const float* filter_data, const Dims<4>& filter_dims,
-                          int stride_width, int stride_height, int pad_width,
-                          int pad_height, float* output_data,
-                          const Dims<4>& output_dims, float* im2col_data,
-                          const Dims<4>& im2col_dims) {
-  tflite::ConvParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-
-  TransposeConv(op_params, DimsToShape(input_dims), input_data,
-                DimsToShape(filter_dims), filter_data, DimsToShape(output_dims),
-                output_data, DimsToShape(im2col_dims), im2col_data);
-}
-
 }  // namespace optimized_ops
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
index a8428528c9..11224270a4 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
@@ -94,81 +94,6 @@ inline void DepthwiseConv(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
-                          const float* filter_data, const Dims<4>& filter_dims,
-                          const float* bias_data, const Dims<4>& bias_dims,
-                          int stride_width, int stride_height,
-                          int dilation_width_factor, int dilation_height_factor,
-                          int pad_width, int pad_height, int depth_multiplier,
-                          float output_activation_min,
-                          float output_activation_max, float* output_data,
-                          const Dims<4>& output_dims) {
-  tflite::DepthwiseParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-  op_params.dilation_width_factor = dilation_width_factor;
-  op_params.dilation_height_factor = dilation_height_factor;
-  op_params.depth_multiplier = depth_multiplier;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  DepthwiseConv(op_params, DimsToShape(input_dims), input_data,
-                DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
-                bias_data, DimsToShape(output_dims), output_data);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
-                          const float* filter_data, const Dims<4>& filter_dims,
-                          const float* bias_data, const Dims<4>& bias_dims,
-                          int stride_width, int stride_height, int pad_width,
-                          int pad_height, int depth_multiplier,
-                          float output_activation_min,
-                          float output_activation_max, float* output_data,
-                          const Dims<4>& output_dims) {
-  DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
-                bias_dims, stride_width, stride_height, 1, 1, pad_width,
-                pad_height, depth_multiplier, output_activation_min,
-                output_activation_max, output_data, output_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy, for compatibility with old checked-in code.
-template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
-                   const float* filter_data, const Dims<4>& filter_dims,
-                   const float* bias_data, const Dims<4>& bias_dims,
-                   int stride_width, int stride_height, int pad_width,
-                   int pad_height, int depth_multiplier, float* output_data,
-                   const Dims<4>& output_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
-                bias_dims, stride_width, stride_height, pad_width, pad_height,
-                depth_multiplier, output_activation_min, output_activation_max,
-                output_data, output_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy, for compatibility with old checked-in code.
-template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
-                   const float* filter_data, const Dims<4>& filter_dims,
-                   const float* bias_data, const Dims<4>& bias_dims, int stride,
-                   int pad_width, int pad_height, int depth_multiplier,
-                   float* output_data, const Dims<4>& output_dims) {
-  DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
-                    bias_dims, stride, stride, pad_width, pad_height,
-                    depth_multiplier, output_data, output_dims);
-}
-
 }  // end namespace reference_ops
 }  // end namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
index e8fc566502..eab28e6c84 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -25,9 +25,6 @@ limitations under the License.
 namespace tflite {
 namespace reference_ops {
 
-// TODO(b/80418076): Move to legacy ops file, along with invocations.
-static constexpr int kDepthwiseReverseShift = -1;
-
 inline void DepthwiseConv(
     const DepthwiseParams& params, const RuntimeShape& input_shape,
     const uint8* input_data, const RuntimeShape& filter_shape,
@@ -109,106 +106,6 @@ inline void DepthwiseConv(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
-                          int32 input_offset, const uint8* filter_data,
-                          const Dims<4>& filter_dims, int32 filter_offset,
-                          const int32* bias_data, const Dims<4>& bias_dims,
-                          int stride_width, int stride_height,
-                          int dilation_width_factor, int dilation_height_factor,
-                          int pad_width, int pad_height, int depth_multiplier,
-                          int32 output_offset, int32 output_multiplier,
-                          int output_shift, int32 output_activation_min,
-                          int32 output_activation_max, uint8* output_data,
-                          const Dims<4>& output_dims) {
-  tflite::DepthwiseParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-  op_params.dilation_width_factor = dilation_width_factor;
-  op_params.dilation_height_factor = dilation_height_factor;
-  op_params.depth_multiplier = depth_multiplier;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kDepthwiseReverseShift * output_shift;
-
-  DepthwiseConv(op_params, DimsToShape(input_dims), input_data,
-                DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
-                bias_data, DimsToShape(output_dims), output_data);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
-                          int32 input_offset, const uint8* filter_data,
-                          const Dims<4>& filter_dims, int32 filter_offset,
-                          const int32* bias_data, const Dims<4>& bias_dims,
-                          int stride_width, int stride_height, int pad_width,
-                          int pad_height, int depth_multiplier,
-                          int32 output_offset, int32 output_multiplier,
-                          int output_shift, int32 output_activation_min,
-                          int32 output_activation_max, uint8* output_data,
-                          const Dims<4>& output_dims) {
-  DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
-                filter_offset, bias_data, bias_dims, stride_width,
-                stride_height, 1, 1, pad_width, pad_height, depth_multiplier,
-                output_offset, output_multiplier, output_shift,
-                output_activation_min, output_activation_max, output_data,
-                output_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy, for compatibility with old checked-in code.
-template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
-                   int32 input_offset, const uint8* filter_data,
-                   const Dims<4>& filter_dims, int32 filter_offset,
-                   const int32* bias_data, const Dims<4>& bias_dims,
-                   int stride_width, int stride_height, int pad_width,
-                   int pad_height, int depth_multiplier, int32 output_offset,
-                   int32 output_multiplier, int output_shift,
-                   int32 output_activation_min, int32 output_activation_max,
-                   uint8* output_data, const Dims<4>& output_dims) {
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
-                filter_offset, bias_data, bias_dims, stride_width,
-                stride_height, pad_width, pad_height, depth_multiplier,
-                output_offset, output_multiplier, output_shift,
-                output_activation_min, output_activation_max, output_data,
-                output_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy, for compatibility with old checked-in code.
-template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
-                   int32 input_offset, const uint8* filter_data,
-                   const Dims<4>& filter_dims, int32 filter_offset,
-                   const int32* bias_data, const Dims<4>& bias_dims, int stride,
-                   int pad_width, int pad_height, int depth_multiplier,
-                   int32 output_offset, int32 output_multiplier,
-                   int output_shift, int32 output_activation_min,
-                   int32 output_activation_max, uint8* output_data,
-                   const Dims<4>& output_dims) {
-  DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data,
-                    filter_dims, filter_offset, bias_data, bias_dims, stride,
-                    stride, pad_width, pad_height, depth_multiplier,
-                    output_offset, output_multiplier, output_shift,
-                    output_activation_min, output_activation_max, output_data,
-                    output_dims);
-}
-
 }  // end namespace reference_ops
 }  // end namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h b/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h
index 23325e8c4c..3c7fd29256 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h
@@ -62,39 +62,6 @@ inline void FullyConnected(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
-                           const float* weights_data,
-                           const Dims<4>& weights_dims, const float* bias_data,
-                           const Dims<4>& bias_dims,
-                           float output_activation_min,
-                           float output_activation_max, float* output_data,
-                           const Dims<4>& output_dims) {
-  tflite::FullyConnectedParams op_params;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  FullyConnected(op_params, DimsToShape(input_dims), input_data,
-                 DimsToShape(weights_dims), weights_data,
-                 DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
-                 output_data);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void FullyConnected(const float* input_data, const Dims<4>& input_dims,
-                    const float* weights_data, const Dims<4>& weights_dims,
-                    const float* bias_data, const Dims<4>& bias_dims,
-                    float* output_data, const Dims<4>& output_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
-                 bias_dims, output_activation_min, output_activation_max,
-                 output_data, output_dims);
-}
-
 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const uint8* input_data, const RuntimeShape& filter_shape,
@@ -144,32 +111,6 @@ inline void FullyConnected(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                           int32 input_offset, const uint8* filter_data,
-                           const Dims<4>& filter_dims, int32 filter_offset,
-                           const int32* bias_data, const Dims<4>& bias_dims,
-                           int32 output_offset, int32 output_multiplier,
-                           int output_shift, int32 output_activation_min,
-                           int32 output_activation_max, uint8* output_data,
-                           const Dims<4>& output_dims, void* gemm_context) {
-  tflite::FullyConnectedParams op_params;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kReverseShift * output_shift;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-
-  FullyConnected(op_params, DimsToShape(input_dims), input_data,
-                 DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
-                 bias_data, DimsToShape(output_dims), output_data,
-                 gemm_context);
-}
-
 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const uint8* input_data, const RuntimeShape& filter_shape,
@@ -224,32 +165,6 @@ inline void FullyConnected(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                           int32 input_offset, const uint8* filter_data,
-                           const Dims<4>& filter_dims, int32 filter_offset,
-                           const int32* bias_data, const Dims<4>& bias_dims,
-                           int32 output_offset, int32 output_multiplier,
-                           int output_shift, int32 output_activation_min,
-                           int32 output_activation_max, int16* output_data,
-                           const Dims<4>& output_dims, void* gemm_context) {
-  tflite::FullyConnectedParams op_params;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kReverseShift * output_shift;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-
-  FullyConnected(op_params, DimsToShape(input_dims), input_data,
-                 DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
-                 bias_data, DimsToShape(output_dims), output_data,
-                 gemm_context);
-}
-
 inline void ShuffledFullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const uint8* input_data, const RuntimeShape& weights_shape,
@@ -405,55 +320,6 @@ inline void ShuffledFullyConnected(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void ShuffledFullyConnected(
-    const uint8* input_data, const Dims<4>& input_dims,
-    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
-    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
-    int output_shift, int32 output_activation_min, int32 output_activation_max,
-    int16* output_data, const Dims<4>& output_dims,
-    uint8* shuffled_input_workspace_data, void* gemm_context) {
-  tflite::FullyConnectedParams op_params;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kReverseShift * output_shift;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-
-  ShuffledFullyConnected(op_params, DimsToShape(input_dims), input_data,
-                         DimsToShape(weights_dims), shuffled_weights_data,
-                         DimsToShape(bias_dims), bias_data,
-                         DimsToShape(output_dims), output_data,
-                         shuffled_input_workspace_data, gemm_context);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                    int32 input_offset, const uint8* filter_data,
-                    const Dims<4>& filter_dims, int32 filter_offset,
-                    const int32* bias_data, const Dims<4>& bias_dims,
-                    int32 output_offset, int32 output_multiplier,
-                    int output_shift, int32 output_activation_min,
-                    int32 output_activation_max, uint8* output_data,
-                    const Dims<4>& output_dims, void* gemm_context) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
-                 filter_offset, bias_data, bias_dims, output_offset,
-                 output_multiplier, output_shift, output_activation_min,
-                 output_activation_max, output_data, output_dims, gemm_context);
-}
-
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
index 683ccdc74d..be99240b1f 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
@@ -19,6 +19,8 @@ limitations under the License.
 #include <sys/types.h>
 
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
 
@@ -26,6 +28,1070 @@ namespace tflite {
 
 namespace reference_ops {
 
+static constexpr int kDepthwiseReverseShift = -1;
+
+inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                          const float* filter_data, const Dims<4>& filter_dims,
+                          const float* bias_data, const Dims<4>& bias_dims,
+                          int stride_width, int stride_height,
+                          int dilation_width_factor, int dilation_height_factor,
+                          int pad_width, int pad_height, int depth_multiplier,
+                          float output_activation_min,
+                          float output_activation_max, float* output_data,
+                          const Dims<4>& output_dims) {
+  tflite::DepthwiseParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+  op_params.dilation_width_factor = dilation_width_factor;
+  op_params.dilation_height_factor = dilation_height_factor;
+  op_params.depth_multiplier = depth_multiplier;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  DepthwiseConv(op_params, DimsToShape(input_dims), input_data,
+                DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
+                bias_data, DimsToShape(output_dims), output_data);
+}
+
+inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                          const float* filter_data, const Dims<4>& filter_dims,
+                          const float* bias_data, const Dims<4>& bias_dims,
+                          int stride_width, int stride_height, int pad_width,
+                          int pad_height, int depth_multiplier,
+                          float output_activation_min,
+                          float output_activation_max, float* output_data,
+                          const Dims<4>& output_dims) {
+  DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
+                bias_dims, stride_width, stride_height, 1, 1, pad_width,
+                pad_height, depth_multiplier, output_activation_min,
+                output_activation_max, output_data, output_dims);
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                   const float* filter_data, const Dims<4>& filter_dims,
+                   const float* bias_data, const Dims<4>& bias_dims,
+                   int stride_width, int stride_height, int pad_width,
+                   int pad_height, int depth_multiplier, float* output_data,
+                   const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
+                bias_dims, stride_width, stride_height, pad_width, pad_height,
+                depth_multiplier, output_activation_min, output_activation_max,
+                output_data, output_dims);
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                   const float* filter_data, const Dims<4>& filter_dims,
+                   const float* bias_data, const Dims<4>& bias_dims, int stride,
+                   int pad_width, int pad_height, int depth_multiplier,
+                   float* output_data, const Dims<4>& output_dims) {
+  DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+                    bias_dims, stride, stride, pad_width, pad_height,
+                    depth_multiplier, output_data, output_dims);
+}
+
+inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                          int32 input_offset, const uint8* filter_data,
+                          const Dims<4>& filter_dims, int32 filter_offset,
+                          const int32* bias_data, const Dims<4>& bias_dims,
+                          int stride_width, int stride_height,
+                          int dilation_width_factor, int dilation_height_factor,
+                          int pad_width, int pad_height, int depth_multiplier,
+                          int32 output_offset, int32 output_multiplier,
+                          int output_shift, int32 output_activation_min,
+                          int32 output_activation_max, uint8* output_data,
+                          const Dims<4>& output_dims) {
+  tflite::DepthwiseParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+  op_params.dilation_width_factor = dilation_width_factor;
+  op_params.dilation_height_factor = dilation_height_factor;
+  op_params.depth_multiplier = depth_multiplier;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kDepthwiseReverseShift * output_shift;
+
+  DepthwiseConv(op_params, DimsToShape(input_dims), input_data,
+                DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
+                bias_data, DimsToShape(output_dims), output_data);
+}
+
+inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                          int32 input_offset, const uint8* filter_data,
+                          const Dims<4>& filter_dims, int32 filter_offset,
+                          const int32* bias_data, const Dims<4>& bias_dims,
+                          int stride_width, int stride_height, int pad_width,
+                          int pad_height, int depth_multiplier,
+                          int32 output_offset, int32 output_multiplier,
+                          int output_shift, int32 output_activation_min,
+                          int32 output_activation_max, uint8* output_data,
+                          const Dims<4>& output_dims) {
+  DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
+                filter_offset, bias_data, bias_dims, stride_width,
+                stride_height, 1, 1, pad_width, pad_height, depth_multiplier,
+                output_offset, output_multiplier, output_shift,
+                output_activation_min, output_activation_max, output_data,
+                output_dims);
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                   int32 input_offset, const uint8* filter_data,
+                   const Dims<4>& filter_dims, int32 filter_offset,
+                   const int32* bias_data, const Dims<4>& bias_dims,
+                   int stride_width, int stride_height, int pad_width,
+                   int pad_height, int depth_multiplier, int32 output_offset,
+                   int32 output_multiplier, int output_shift,
+                   int32 output_activation_min, int32 output_activation_max,
+                   uint8* output_data, const Dims<4>& output_dims) {
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
+                filter_offset, bias_data, bias_dims, stride_width,
+                stride_height, pad_width, pad_height, depth_multiplier,
+                output_offset, output_multiplier, output_shift,
+                output_activation_min, output_activation_max, output_data,
+                output_dims);
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                   int32 input_offset, const uint8* filter_data,
+                   const Dims<4>& filter_dims, int32 filter_offset,
+                   const int32* bias_data, const Dims<4>& bias_dims, int stride,
+                   int pad_width, int pad_height, int depth_multiplier,
+                   int32 output_offset, int32 output_multiplier,
+                   int output_shift, int32 output_activation_min,
+                   int32 output_activation_max, uint8* output_data,
+                   const Dims<4>& output_dims) {
+  DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data,
+                    filter_dims, filter_offset, bias_data, bias_dims, stride,
+                    stride, pad_width, pad_height, depth_multiplier,
+                    output_offset, output_multiplier, output_shift,
+                    output_activation_min, output_activation_max, output_data,
+                    output_dims);
+}
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+                 const float* filter_data, const Dims<4>& filter_dims,
+                 const float* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int dilation_width_factor,
+                 int dilation_height_factor, int pad_width, int pad_height,
+                 float output_activation_min, float output_activation_max,
+                 float* output_data, const Dims<4>& output_dims,
+                 float* im2col_data, const Dims<4>& im2col_dims) {
+  tflite::ConvParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+  op_params.dilation_width_factor = dilation_width_factor;
+  op_params.dilation_height_factor = dilation_height_factor;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims),
+       filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
+       output_data, DimsToShape(im2col_dims), im2col_data);
+}
+
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
+          int stride_height, int dilation_width_factor,
+          int dilation_height_factor, int pad_width, int pad_height,
+          float* output_data, const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
+       stride_width, stride_height, dilation_width_factor,
+       dilation_height_factor, pad_width, pad_height, output_activation_min,
+       output_activation_max, output_data, output_dims, im2col_data,
+       im2col_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
+          int stride_height, int pad_width, int pad_height, float* output_data,
+          const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
+       stride_width, stride_height, 1, 1, pad_width, pad_height,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride,
+          int pad_width, int pad_height, float* output_data,
+          const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+           bias_dims, stride, stride, 1, 1, pad_width, pad_height, output_data,
+           output_dims, im2col_data, im2col_dims);
+}
+
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int dilation_width_factor,
+                 int dilation_height_factor, int pad_width, int pad_height,
+                 int32 output_offset, int32 output_multiplier, int output_shift,
+                 int32 output_activation_min, int32 output_activation_max,
+                 uint8* output_data, const Dims<4>& output_dims,
+                 uint8* im2col_data, const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  tflite::ConvParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+  op_params.dilation_width_factor = dilation_width_factor;
+  op_params.dilation_height_factor = dilation_height_factor;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
+  Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims),
+       filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
+       output_data, DimsToShape(im2col_dims), im2col_data, gemm_context);
+}
+
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1,
+       pad_width, pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride_width, stride_height,
+       pad_width, pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const uint8* input_data, const Dims<4>& input_dims,
+          int32 input_offset, const uint8* filter_data,
+          const Dims<4>& filter_dims, int32 filter_offset,
+          const int32* bias_data, const Dims<4>& bias_dims, int stride,
+          int pad_width, int pad_height, int32 output_offset,
+          int32 output_multiplier, int output_shift,
+          int32 output_activation_min, int32 output_activation_max,
+          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
+          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
+  Conv<Ac>(input_data, input_dims, input_offset, filter_data, filter_dims,
+           filter_offset, bias_data, bias_dims, stride, stride, pad_width,
+           pad_height, output_offset, output_multiplier, output_shift,
+           output_activation_min, output_activation_max, output_data,
+           output_dims, im2col_data, im2col_dims, gemm_context);
+}
+
+inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
+                          const float* filter_data, const Dims<4>& filter_dims,
+                          int stride_width, int stride_height, int pad_width,
+                          int pad_height, float* output_data,
+                          const Dims<4>& output_dims, float* im2col_data,
+                          const Dims<4>& im2col_dims) {
+  tflite::ConvParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride_width;
+  op_params.stride_height = stride_height;
+
+  TransposeConv(op_params, DimsToShape(input_dims), input_data,
+                DimsToShape(filter_dims), filter_data, DimsToShape(output_dims),
+                output_data, DimsToShape(im2col_dims), im2col_data);
+}
+
+inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+                           const float* weights_data,
+                           const Dims<4>& weights_dims, const float* bias_data,
+                           const Dims<4>& bias_dims,
+                           float output_activation_min,
+                           float output_activation_max, float* output_data,
+                           const Dims<4>& output_dims) {
+  tflite::FullyConnectedParams op_params;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  FullyConnected(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(weights_dims), weights_data,
+                 DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
+                 output_data);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+                    const float* weights_data, const Dims<4>& weights_dims,
+                    const float* bias_data, const Dims<4>& bias_dims,
+                    float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
+                 bias_dims, output_activation_min, output_activation_max,
+                 output_data, output_dims);
+}
+
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                           int32 input_offset, const uint8* filter_data,
+                           const Dims<4>& filter_dims, int32 filter_offset,
+                           const int32* bias_data, const Dims<4>& bias_dims,
+                           int32 output_offset, int32 output_multiplier,
+                           int output_shift, int32 output_activation_min,
+                           int32 output_activation_max, uint8* output_data,
+                           const Dims<4>& output_dims,
+                           gemmlowp::GemmContext* gemm_context) {
+  tflite::FullyConnectedParams op_params;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
+  FullyConnected(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
+                 bias_data, DimsToShape(output_dims), output_data,
+                 gemm_context);
+}
+
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                           int32 input_offset, const uint8* filter_data,
+                           const Dims<4>& filter_dims, int32 filter_offset,
+                           const int32* bias_data, const Dims<4>& bias_dims,
+                           int32 output_offset, int32 output_multiplier,
+                           int output_shift, int32 output_activation_min,
+                           int32 output_activation_max, int16* output_data,
+                           const Dims<4>& output_dims,
+                           gemmlowp::GemmContext* gemm_context) {
+  tflite::FullyConnectedParams op_params;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
+  FullyConnected(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
+                 bias_data, DimsToShape(output_dims), output_data,
+                 gemm_context);
+}
+
+inline void ShuffledFullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    int16* output_data, const Dims<4>& output_dims,
+    uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) {
+  tflite::FullyConnectedParams op_params;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
+  ShuffledFullyConnected(op_params, DimsToShape(input_dims), input_data,
+                         DimsToShape(weights_dims), shuffled_weights_data,
+                         DimsToShape(bias_dims), bias_data,
+                         DimsToShape(output_dims), output_data,
+                         shuffled_input_workspace_data, gemm_context);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                    int32 input_offset, const uint8* filter_data,
+                    const Dims<4>& filter_dims, int32 filter_offset,
+                    const int32* bias_data, const Dims<4>& bias_dims,
+                    int32 output_offset, int32 output_multiplier,
+                    int output_shift, int32 output_activation_min,
+                    int32 output_activation_max, uint8* output_data,
+                    const Dims<4>& output_dims,
+                    gemmlowp::GemmContext* gemm_context) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
+                 filter_offset, bias_data, bias_dims, output_offset,
+                 output_multiplier, output_shift, output_activation_min,
+                 output_activation_max, output_data, output_dims, gemm_context);
+}
+
+inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
+                     const float* prev_activ_data,
+                     const Dims<4>& prev_activ_dims, const float* weights_data,
+                     const Dims<4>& weights_dims, const float* bias_data,
+                     const Dims<4>& bias_dims, const float* prev_state_data,
+                     const Dims<4>& prev_state_dims, float* output_state_data,
+                     const Dims<4>& output_state_dims, float* output_activ_data,
+                     const Dims<4>& output_activ_dims, float* concat_temp_data,
+                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
+                     const Dims<4>& activ_temp_dims) {
+  tflite::LstmCellParams op_params;
+  // Float LSTM cell does not need parameters to be set: leave untouched.
+
+  LstmCell(op_params, DimsToShape(input_dims), input_data,
+           DimsToShape(prev_activ_dims), prev_activ_data,
+           DimsToShape(weights_dims), weights_data, DimsToShape(bias_dims),
+           bias_data, DimsToShape(prev_state_dims), prev_state_data,
+           DimsToShape(output_state_dims), output_state_data,
+           DimsToShape(output_activ_dims), output_activ_data,
+           DimsToShape(concat_temp_dims), concat_temp_data,
+           DimsToShape(activ_temp_dims), activ_temp_data);
+}
+
+template <int StateIntegerBits>
+void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
+              const uint8* prev_activ_data_uint8,
+              const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
+              const Dims<4>& weights_dims, const int32* bias_data_int32,
+              const Dims<4>& bias_dims, const int16* prev_state_data_int16,
+              const Dims<4>& prev_state_dims, int16* output_state_data_int16,
+              const Dims<4>& output_state_dims, uint8* output_activ_data_uint8,
+              const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8,
+              const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16,
+              const Dims<4>& activ_temp_dims, int32 weights_zero_point,
+              int32 accum_multiplier, int accum_shift,
+              gemmlowp::GemmContext* gemm_context) {
+  tflite::LstmCellParams op_params;
+  op_params.weights_zero_point = weights_zero_point;
+  op_params.accum_multiplier = accum_multiplier;
+  op_params.accum_shift = accum_shift;
+
+  LstmCell<StateIntegerBits>(
+      op_params, DimsToShape(input_dims), input_data_uint8,
+      DimsToShape(prev_activ_dims), prev_activ_data_uint8,
+      DimsToShape(weights_dims), weights_data_uint8, DimsToShape(bias_dims),
+      bias_data_int32, DimsToShape(prev_state_dims), prev_state_data_int16,
+      DimsToShape(output_state_dims), output_state_data_int16,
+      DimsToShape(output_activ_dims), output_activ_data_uint8,
+      DimsToShape(concat_temp_dims), concat_temp_data_uint8,
+      DimsToShape(activ_temp_dims), activ_temp_data_int16, gemm_context);
+}
+
+template <typename T>
+void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  tflite::ArithmeticParams op_params;
+  SetActivationParams(output_activation_min, output_activation_max, &op_params);
+
+  BroadcastDiv4DSlow(op_params, DimsToShape(input1_dims), input1_data,
+                     DimsToShape(input2_dims), input2_data,
+                     DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
+inline void Div(const T* input1_data, const Dims<4>& input1_dims,
+                const T* input2_data, const Dims<4>& input2_dims,
+                T output_activation_min, T output_activation_max,
+                T* output_data, const Dims<4>& output_dims) {
+  tflite::ArithmeticParams op_params;
+  SetActivationParams(output_activation_min, output_activation_max, &op_params);
+
+  Div(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+inline void Concatenation(int concat_dim, const Scalar* const* input_data,
+                          const Dims<4>* const* input_dims, int inputs_count,
+                          Scalar* output_data, const Dims<4>& output_dims) {
+  // For now we don't have a model with a Concatenation with fused activation.
+  TFLITE_DCHECK_EQ(Ac, FusedActivationFunctionType::kNone);
+
+  std::vector<RuntimeShape> input_shapes(inputs_count);
+  std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count);
+  for (int i = 0; i < inputs_count; ++i) {
+    ShapeFromDims(*input_dims[i], &input_shapes[i]);
+    input_shapes_indirect[i] = &input_shapes[i];
+  }
+  tflite::ConcatenationParams op_params;
+  op_params.axis = 3 - concat_dim;
+  op_params.inputs_count = inputs_count;
+
+  Concatenation(op_params, input_shapes_indirect.data(), input_data,
+                DimsToShape(output_dims), output_data);
+}
+
+inline void Concatenation(int concat_dim, const uint8* const* input_data,
+                          const Dims<4>* const* input_dims,
+                          const int32* input_zeropoint,
+                          const float* input_scale, int inputs_count,
+                          uint8* output_data, const Dims<4>& output_dims,
+                          const int32 output_zeropoint,
+                          const float output_scale) {
+  std::vector<RuntimeShape> input_shapes(inputs_count);
+  std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count);
+  for (int i = 0; i < inputs_count; ++i) {
+    ShapeFromDims(*input_dims[i], &input_shapes[i]);
+    input_shapes_indirect[i] = &input_shapes[i];
+  }
+  tflite::ConcatenationParams op_params;
+  op_params.axis = 3 - concat_dim;
+  op_params.input_zeropoint = input_zeropoint;
+  op_params.input_scale = input_scale;
+  op_params.inputs_count = inputs_count;
+  op_params.output_zeropoint = output_zeropoint;
+  op_params.output_scale = output_scale;
+
+  ConcatenationWithScaling(op_params, input_shapes_indirect.data(), input_data,
+                           DimsToShape(output_dims), output_data);
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void DepthConcatenation(const Scalar* const* input_data,
+                        const Dims<4>* const* input_dims, int inputs_count,
+                        Scalar* output_data, const Dims<4>& output_dims) {
+  // For now we don't have a model with a Concatenation with fused activation.
+  TFLITE_DCHECK_EQ(Ac, FusedActivationFunctionType::kNone);
+
+  std::vector<RuntimeShape> input_shapes(inputs_count);
+  std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count);
+  for (int i = 0; i < inputs_count; ++i) {
+    ShapeFromDims(*input_dims[i], &input_shapes[i]);
+    input_shapes_indirect[i] = &input_shapes[i];
+  }
+  tflite::ConcatenationParams op_params;
+  op_params.inputs_count = inputs_count;
+
+  DepthConcatenation(op_params, input_shapes_indirect.data(), input_data,
+                     DimsToShape(output_dims), output_data);
+}
+
+template <typename Scalar>
+void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
+                     int axis, int outputs_count, Scalar* const* output_data,
+                     const Dims<4>* const* output_dims) {
+  std::vector<RuntimeShape> output_shapes(outputs_count);
+  std::vector<const RuntimeShape*> output_shapes_indirect(outputs_count);
+  for (int i = 0; i < outputs_count; ++i) {
+    ShapeFromDims(*output_dims[i], &output_shapes[i]);
+    output_shapes_indirect[i] = &output_shapes[i];
+  }
+  tflite::SplitParams op_params;
+  op_params.axis = 3 - axis;
+  op_params.num_split = outputs_count;
+
+  Split(op_params, DimsToShape(input_dims), input_data,
+        output_shapes_indirect.data(), output_data);
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
+                     int outputs_count, Scalar* const* output_data,
+                     const Dims<4>* const* output_dims) {
+  TFLITE_DCHECK_GE(outputs_count, 1);
+  for (int i = 0; i < outputs_count; i++) {
+    /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
+    /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
+    /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
+  }
+  // For now we don't have a model with a Split with fused activation.
+  TFLITE_DCHECK_EQ(Ac, FusedActivationFunctionType::kNone);
+
+  TensorFlowSplit(input_data, input_dims, /*axis=*/0, outputs_count,
+                  output_data, output_dims);
+}
+
+inline void Softmax(const float* input_data, const RuntimeShape& input_shape,
+                    float beta, float* output_data,
+                    const RuntimeShape& output_shape) {
+  SoftmaxParams params;
+  params.beta = beta;
+  Softmax(params, input_shape, input_data, output_shape, output_data);
+}
+
+inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape,
+                    int32 input_beta_multiplier, int32 input_beta_left_shift,
+                    int diff_min, uint8* output_data,
+                    const RuntimeShape& output_shape) {
+  SoftmaxParams params;
+  params.input_multiplier = input_beta_multiplier;
+  params.input_left_shift = input_beta_left_shift;
+  params.diff_min = diff_min;
+  Softmax(params, input_shape, input_data, output_shape, output_data);
+}
+
+inline void LogSoftmax(const float* input_data, const RuntimeShape& input_shape,
+                       float* output_data, const RuntimeShape& output_shape) {
+  SoftmaxParams params;
+  // No params currently used for float LogSoftmax.
+  LogSoftmax(params, input_shape, input_data, output_shape, output_data);
+}
+
+inline void LogSoftmax(const uint8* input_data, const RuntimeShape& input_shape,
+                       int32 input_multiplier, int32 input_left_shift,
+                       int32 reverse_scaling_divisor,
+                       int32 reverse_scaling_right_shift, int diff_min,
+                       uint8* output_data, const RuntimeShape& output_shape) {
+  SoftmaxParams params;
+  params.input_multiplier = input_multiplier;
+  params.input_left_shift = input_left_shift;
+  params.reverse_scaling_divisor = reverse_scaling_divisor;
+  params.reverse_scaling_right_shift = reverse_scaling_right_shift;
+  params.diff_min = diff_min;
+  LogSoftmax(params, input_shape, input_data, output_shape, output_data);
+}
+
+inline void Logistic(const uint8* input_data, const RuntimeShape& input_shape,
+                     int32 input_zero_point, int32 input_range_radius,
+                     int32 input_multiplier, int input_left_shift,
+                     uint8* output_data, const RuntimeShape& output_shape) {
+  LogisticParams params;
+  params.input_zero_point = input_zero_point;
+  params.input_range_radius = input_range_radius;
+  params.input_multiplier = input_multiplier;
+  params.input_left_shift = input_left_shift;
+  Logistic(params, input_shape, input_data, output_shape, output_data);
+}
+
+inline void Logistic(const RuntimeShape& input_shape, const int16* input_data,
+                     const RuntimeShape& output_shape, int16* output_data) {
+  LogisticParams params;
+  // No params currently needed by int16 Logistic.
+  Logistic(params, input_shape, input_data, output_shape, output_data);
+}
+
+inline void Tanh(const uint8* input_data, const RuntimeShape& input_shape,
+                 int32 input_zero_point, int32 input_range_radius,
+                 int32 input_multiplier, int input_left_shift,
+                 uint8* output_data, const RuntimeShape& output_shape) {
+  TanhParams params;
+  params.input_zero_point = input_zero_point;
+  params.input_range_radius = input_range_radius;
+  params.input_multiplier = input_multiplier;
+  params.input_left_shift = input_left_shift;
+  Tanh(params, input_shape, input_data, output_shape, output_data);
+}
+
+inline void Tanh(const int16* input_data, const RuntimeShape& input_shape,
+                 int input_left_shift, int16* output_data,
+                 const RuntimeShape& output_shape) {
+  TanhParams params;
+  params.input_left_shift = input_left_shift;
+  Tanh(params, input_shape, input_data, output_shape, output_data);
+}
+
+inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
+                       int32 zero_point, double scale, float* output_data,
+                       const Dims<4>& output_dims) {
+  tflite::DequantizationParams op_params;
+  op_params.zero_point = zero_point;
+  op_params.scale = scale;
+
+  Dequantize(op_params, DimsToShape(input_dims), input_data,
+             DimsToShape(output_dims), output_data);
+}
+
+inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
+                      float rmin, float rmax, int num_bits, float* output_data,
+                      const Dims<4>& output_dims) {
+  tflite::FakeQuantParams op_params;
+  op_params.num_bits = num_bits;
+  op_params.minmax.min = rmin;
+  op_params.minmax.max = rmax;
+
+  FakeQuant(op_params, DimsToShape(input_dims), input_data,
+            DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
+inline void Gather(const T* input_data, const Dims<4>& input_dims,
+                   int input_rank, const int32* coords_data,
+                   const Dims<4>& coords_dims, T* output_data,
+                   const Dims<4>& output_dims) {
+  tflite::GatherParams op_params;
+  op_params.input_rank = input_rank;
+
+  Gather(op_params, DimsToShape(input_dims), input_data,
+         DimsToShape(coords_dims), coords_data, DimsToShape(output_dims),
+         output_data);
+}
+
+inline uint32 LegacyReverseBits32(uint32 n) {
+  n = ((n >> 1) & 0x55555555) | ((n & 0x55555555) << 1);
+  n = ((n >> 2) & 0x33333333) | ((n & 0x33333333) << 2);
+  n = ((n >> 4) & 0x0F0F0F0F) | ((n & 0x0F0F0F0F) << 4);
+  return (((n & 0xFF) << 24) | ((n & 0xFF00) << 8) | ((n & 0xFF0000) >> 8) |
+          ((n & 0xFF000000) >> 24));
+}
+
+inline void StridedSliceReverseIndices(tflite::StridedSliceParams* p) {
+  TFLITE_CHECK_EQ(p->start_indices_count, p->stop_indices_count);
+  TFLITE_CHECK_EQ(p->stop_indices_count, p->strides_count);
+
+  std::reverse(p->start_indices, p->start_indices + p->start_indices_count);
+  std::reverse(p->stop_indices, p->stop_indices + p->stop_indices_count);
+  std::reverse(p->strides, p->strides + p->strides_count);
+
+  p->begin_mask = LegacyReverseBits32(static_cast<uint32>(p->begin_mask)) >>
+                  (32 - p->start_indices_count);
+  p->ellipsis_mask =
+      LegacyReverseBits32(static_cast<uint32>(p->ellipsis_mask)) >>
+      (32 - p->start_indices_count);
+  p->end_mask = LegacyReverseBits32(static_cast<uint32>(p->end_mask)) >>
+                (32 - p->start_indices_count);
+  p->new_axis_mask =
+      LegacyReverseBits32(static_cast<uint32>(p->new_axis_mask)) >>
+      (32 - p->start_indices_count);
+  p->shrink_axis_mask =
+      LegacyReverseBits32(static_cast<uint32>(p->shrink_axis_mask)) >>
+      (32 - p->start_indices_count);
+}
+
+template <typename T>
+inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
+                         int begin_mask, int end_mask, int shrink_axis_mask,
+                         const std::vector<int>& start_indices,
+                         const std::vector<int>& stop_indices,
+                         const std::vector<int>& strides, T* output_data,
+                         const Dims<4>& output_dims) {
+  TFLITE_DCHECK_EQ(start_indices.size(), 4);
+  auto op_params = strided_slice::BuildStridedSliceParams(
+      begin_mask, end_mask, shrink_axis_mask, start_indices, stop_indices,
+      strides);
+  StridedSliceReverseIndices(&op_params);
+
+  StridedSlice(op_params, DimsToShape(input_dims), input_data,
+               DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
+inline void Mean(const T* input_data, const Dims<4>& input_dims,
+                 const std::vector<int>& reduction_indices, T* output_data,
+                 const Dims<4>& output_dims) {
+  tflite::MeanParams op_params;
+  op_params.axis_count = reduction_indices.size();
+  for (int i = 0; i < op_params.axis_count; ++i) {
+    op_params.axis[i] = reduction_indices[op_params.axis_count - 1 - i];
+  }
+
+  Mean(op_params, DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+       output_data);
+}
+
+template <typename T>
+void Transpose(const T* input, const Dims<4>& input_dims, T* output,
+               const Dims<4>& output_dims, const int* permuted_axes) {
+  TransposeParams params;
+  params.perm_count = 4;
+  for (int i = 0; i < 4; ++i) {
+    params.perm[i] = 3 - permuted_axes[3 - i];
+  }
+  Transpose(params, DimsToShape(input_dims), input, DimsToShape(output_dims),
+            output);
+}
+
+template <typename T, ComparisonFn<T> F>
+inline void Comparison(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, const Dims<4>& input2_dims,
+                       bool* output_data, const Dims<4>& output_dims) {
+  ComparisonParams op_params;
+  // No parameters needed.
+  ComparisonImpl<T, F>(op_params, DimsToShape(input1_dims), input1_data,
+                       DimsToShape(input2_dims), input2_data,
+                       DimsToShape(output_dims), output_data);
+}
+
+template <typename T, ComparisonFn<int32> F>
+inline void Comparison(int left_shift, const T* input1_data,
+                       const Dims<4>& input1_dims, int32 input1_offset,
+                       int32 input1_multiplier, int input1_shift,
+                       const T* input2_data, const Dims<4>& input2_dims,
+                       int32 input2_offset, int32 input2_multiplier,
+                       int input2_shift, bool* output_data,
+                       const Dims<4>& output_dims) {
+  tflite::ComparisonParams op_params;
+  op_params.left_shift = left_shift;
+  op_params.input1_offset = input1_offset;
+  op_params.input1_multiplier = input1_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.input1_shift = kReverseShift * input1_shift;
+  op_params.input2_offset = input2_offset;
+  op_params.input2_multiplier = input2_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.input2_shift = kReverseShift * input2_shift;
+
+  ComparisonWithScaling<T, F>(op_params, DimsToShape(input1_dims), input1_data,
+                              DimsToShape(input2_dims), input2_data,
+                              DimsToShape(output_dims), output_data);
+}
+
+template <typename T, ComparisonFn<T> F>
+inline void BroadcastComparison(const T* input1_data,
+                                const Dims<4>& input1_dims,
+                                const T* input2_data,
+                                const Dims<4>& input2_dims, bool* output_data,
+                                const Dims<4>& output_dims) {
+  ComparisonParams op_params;
+  // No parameters needed.
+  BroadcastComparison4DSlowImpl<T, F>(op_params, DimsToShape(input1_dims),
+                                      input1_data, DimsToShape(input2_dims),
+                                      input2_data, DimsToShape(output_dims),
+                                      output_data);
+}
+
+template <typename T, ComparisonFn<int32> F>
+inline void BroadcastComparison(int left_shift, const T* input1_data,
+                                const Dims<4>& input1_dims, int32 input1_offset,
+                                int32 input1_multiplier, int input1_shift,
+                                const T* input2_data,
+                                const Dims<4>& input2_dims, int32 input2_offset,
+                                int32 input2_multiplier, int input2_shift,
+                                bool* output_data, const Dims<4>& output_dims) {
+  ComparisonParams op_params;
+
+  op_params.left_shift = left_shift;
+  op_params.input1_offset = input1_offset;
+  op_params.input1_multiplier = input1_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.input1_shift = kReverseShift * input1_shift;
+  op_params.input2_offset = input2_offset;
+  op_params.input2_multiplier = input2_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.input2_shift = kReverseShift * input2_shift;
+
+  BroadcastComparison4DSlowWithScaling<T, F>(
+      op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
+#define TFLITE_LEGACY_COMPARISON_OP(name)                                     \
+  template <typename T>                                                       \
+  inline void name(const T* input1_data, const Dims<4>& input1_dims,          \
+                   const T* input2_data, const Dims<4>& input2_dims,          \
+                   bool* output_data, const Dims<4>& output_dims) {           \
+    gemmlowp::ScopedProfilingLabel label(#name);                              \
+    Comparison<T, name##Fn>(input1_data, input1_dims, input2_data,            \
+                            input2_dims, output_data, output_dims);           \
+  }                                                                           \
+  template <typename T>                                                       \
+  inline void name(                                                           \
+      int left_shift, const T* input1_data, const Dims<4>& input1_dims,       \
+      int32 input1_offset, int32 input1_multiplier, int input1_shift,         \
+      const T* input2_data, const Dims<4>& input2_dims, int32 input2_offset,  \
+      int32 input2_multiplier, int input2_shift, bool* output_data,           \
+      const Dims<4>& output_dims) {                                           \
+    gemmlowp::ScopedProfilingLabel label(#name "/8bit");                      \
+    Comparison<T, name##Fn>(left_shift, input1_data, input1_dims,             \
+                            input1_offset, input1_multiplier, input1_shift,   \
+                            input2_data, input2_dims, input2_offset,          \
+                            input2_multiplier, input2_shift, output_data,     \
+                            output_dims);                                     \
+  }                                                                           \
+  template <typename T>                                                       \
+  inline void Broadcast##name(                                                \
+      const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, \
+      const Dims<4>& input2_dims, bool* output_data,                          \
+      const Dims<4>& output_dims) {                                           \
+    gemmlowp::ScopedProfilingLabel label("Broadcast" #name);                  \
+    BroadcastComparison<T, name##Fn>(input1_data, input1_dims, input2_data,   \
+                                     input2_dims, output_data, output_dims);  \
+  }                                                                           \
+  template <typename T>                                                       \
+  inline void Broadcast##name(                                                \
+      int left_shift, const T* input1_data, const Dims<4>& input1_dims,       \
+      int32 input1_offset, int32 input1_multiplier, int input1_shift,         \
+      const T* input2_data, const Dims<4>& input2_dims, int32 input2_offset,  \
+      int32 input2_multiplier, int input2_shift, bool* output_data,           \
+      const Dims<4>& output_dims) {                                           \
+    gemmlowp::ScopedProfilingLabel label("Broadcast" #name "/8bit");          \
+    BroadcastComparison<T, name##Fn>(left_shift, input1_data, input1_dims,    \
+                                     input1_offset, input1_multiplier,        \
+                                     input1_shift, input2_data, input2_dims,  \
+                                     input2_offset, input2_multiplier,        \
+                                     input2_shift, output_data, output_dims); \
+  }
+TFLITE_LEGACY_COMPARISON_OP(Equal);
+TFLITE_LEGACY_COMPARISON_OP(NotEqual);
+TFLITE_LEGACY_COMPARISON_OP(Greater);
+TFLITE_LEGACY_COMPARISON_OP(GreaterEqual);
+TFLITE_LEGACY_COMPARISON_OP(Less);
+TFLITE_LEGACY_COMPARISON_OP(LessEqual);
+#undef TFLITE_LEGACY_COMPARISON_OP
+
+template <typename D, typename T>
+inline void Select(const D* input_condition_data,
+                   const Dims<4>& input_condition_dims, const T* input_x_data,
+                   const Dims<4>& input_x_dims, const T* input_y_data,
+                   const Dims<4>& input_y_dims, T* output_data,
+                   const Dims<4>& output_dims) {
+  Select(DimsToShape(input_condition_dims), input_condition_data,
+         DimsToShape(input_x_dims), input_x_data, DimsToShape(input_y_dims),
+         input_y_data, DimsToShape(output_dims), output_data);
+}
+
+template <typename D, typename T>
+inline void RankOneSelect(const D* input_condition_data,
+                          const Dims<4>& input_condition_dims,
+                          const T* input_x_data, const Dims<4>& input_x_dims,
+                          const T* input_y_data, const Dims<4>& input_y_dims,
+                          T* output_data, const Dims<4>& output_dims) {
+  RankOneSelect(DimsToShape(input_condition_dims), input_condition_data,
+                DimsToShape(input_x_dims), input_x_data,
+                DimsToShape(input_y_dims), input_y_data,
+                DimsToShape(output_dims), output_data);
+}
+
+template <typename T, typename TI>
+inline void SparseToDense(const std::vector<std::vector<TI>>& indices,
+                          const T* values, T default_value, T* output_data,
+                          const Dims<4>& output_dims, bool value_is_scalar) {
+  SparseToDense(indices, values, default_value, value_is_scalar,
+                DimsToShape(output_dims), output_data);
+}
+
+template <typename Scalar>
+void Pack(int dim, const Scalar* const* input_data,
+          const Dims<4>* const* input_dims, int inputs_count,
+          Scalar* output_data, const Dims<4>& output_dims) {
+  std::vector<RuntimeShape> input_shapes(inputs_count);
+  std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count);
+  for (int i = 0; i < inputs_count; ++i) {
+    ShapeFromDims(*input_dims[i], &input_shapes[i]);
+    input_shapes_indirect[i] = &input_shapes[i];
+  }
+  tflite::PackParams op_params;
+  op_params.axis = 3 - dim;
+  op_params.inputs_count = inputs_count;
+
+  Pack(op_params, input_shapes_indirect.data(), input_data,
+       DimsToShape(output_dims), output_data);
+}
+
+template <typename Scalar>
+void Unpack(int axis, const Scalar* input_data, const Dims<4>& input_dims,
+            int dimensions, int outputs_count, Scalar* const* output_datas,
+            const Dims<4>& output_dims) {
+  tflite::UnpackParams op_params;
+  op_params.axis = 3 - axis;
+  op_params.num_split = outputs_count;
+
+  Unpack(op_params, DimsToShape(input_dims), input_data,
+         DimsToShape(output_dims), output_datas);
+}
+
+template <typename Scalar>
+void Pack(int dim, const Scalar* const* input_data,
+          const Dims<4>* const* input_dims, const int32* input_zeropoint,
+          const float* input_scale, int inputs_count, Scalar* output_data,
+          const Dims<4>& output_dims, const int32 output_zeropoint,
+          const float output_scale) {
+  std::vector<RuntimeShape> input_shapes(inputs_count);
+  std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count);
+  for (int i = 0; i < inputs_count; ++i) {
+    ShapeFromDims(*input_dims[i], &input_shapes[i]);
+    input_shapes_indirect[i] = &input_shapes[i];
+  }
+  tflite::PackParams op_params;
+  op_params.axis = 3 - dim;
+  op_params.input_zeropoint = input_zeropoint;
+  op_params.input_scale = input_scale;
+  op_params.inputs_count = inputs_count;
+  op_params.output_zeropoint = output_zeropoint;
+  op_params.output_scale = output_scale;
+
+  PackWithScaling(op_params, input_shapes_indirect.data(), input_data,
+                  DimsToShape(output_dims), output_data);
+}
+
 template <FusedActivationFunctionType Ac>
 void L2Normalization(const float* input_data, const RuntimeShape& input_shape,
                      float* output_data, const RuntimeShape& output_shape) {
@@ -342,7 +1408,6 @@ inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
               DimsToShape(output_dims), output_data);
 }
 
-// Legacy.
 // Transitional version that will be moved shortly to legacy_reference_ops, as
 // part of RuntimeShape revisions.
 inline void BroadcastMul4DSlow(const uint8* input1_data,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index f3f1595035..59f17ae854 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -231,83 +231,6 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Conv(const float* input_data, const Dims<4>& input_dims,
-                 const float* filter_data, const Dims<4>& filter_dims,
-                 const float* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int dilation_width_factor,
-                 int dilation_height_factor, int pad_width, int pad_height,
-                 float output_activation_min, float output_activation_max,
-                 float* output_data, const Dims<4>& output_dims,
-                 float* im2col_data, const Dims<4>& im2col_dims) {
-  tflite::ConvParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-  op_params.dilation_width_factor = dilation_width_factor;
-  op_params.dilation_height_factor = dilation_height_factor;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims),
-       filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
-       output_data, DimsToShape(im2col_dims), im2col_data);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <FusedActivationFunctionType Ac>
-void Conv(const float* input_data, const Dims<4>& input_dims,
-          const float* filter_data, const Dims<4>& filter_dims,
-          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
-          int stride_height, int dilation_width_factor,
-          int dilation_height_factor, int pad_width, int pad_height,
-          float* output_data, const Dims<4>& output_dims, float* im2col_data,
-          const Dims<4>& im2col_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
-       stride_width, stride_height, dilation_width_factor,
-       dilation_height_factor, pad_width, pad_height, output_activation_min,
-       output_activation_max, output_data, output_dims, im2col_data,
-       im2col_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Conv(const float* input_data, const Dims<4>& input_dims,
-          const float* filter_data, const Dims<4>& filter_dims,
-          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
-          int stride_height, int pad_width, int pad_height, float* output_data,
-          const Dims<4>& output_dims, float* im2col_data,
-          const Dims<4>& im2col_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
-       stride_width, stride_height, 1, 1, pad_width, pad_height,
-       output_activation_min, output_activation_max, output_data, output_dims,
-       im2col_data, im2col_dims);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Conv(const float* input_data, const Dims<4>& input_dims,
-          const float* filter_data, const Dims<4>& filter_dims,
-          const float* bias_data, const Dims<4>& bias_dims, int stride,
-          int pad_width, int pad_height, float* output_data,
-          const Dims<4>& output_dims, float* im2col_data,
-          const Dims<4>& im2col_dims) {
-  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
-           bias_dims, stride, stride, 1, 1, pad_width, pad_height, output_data,
-           output_dims, im2col_data, im2col_dims);
-}
-
 inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                  const uint8* input_data, const RuntimeShape& filter_shape,
                  const uint8* filter_data, const RuntimeShape& bias_shape,
@@ -391,111 +314,6 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
-                 int32 input_offset, const uint8* filter_data,
-                 const Dims<4>& filter_dims, int32 filter_offset,
-                 const int32* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int dilation_width_factor,
-                 int dilation_height_factor, int pad_width, int pad_height,
-                 int32 output_offset, int32 output_multiplier, int output_shift,
-                 int32 output_activation_min, int32 output_activation_max,
-                 uint8* output_data, const Dims<4>& output_dims,
-                 uint8* im2col_data, const Dims<4>& im2col_dims,
-                 gemmlowp::GemmContext* gemm_context) {
-  tflite::ConvParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-  op_params.dilation_width_factor = dilation_width_factor;
-  op_params.dilation_height_factor = dilation_height_factor;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kReverseShift * output_shift;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-
-  Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims),
-       filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
-       output_data, DimsToShape(im2col_dims), im2col_data, gemm_context);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
-                 int32 input_offset, const uint8* filter_data,
-                 const Dims<4>& filter_dims, int32 filter_offset,
-                 const int32* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, int32 output_offset, int32 output_multiplier,
-                 int output_shift, int32 output_activation_min,
-                 int32 output_activation_max, uint8* output_data,
-                 const Dims<4>& output_dims, uint8* im2col_data,
-                 const Dims<4>& im2col_dims,
-                 gemmlowp::GemmContext* gemm_context) {
-  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
-       filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1,
-       pad_width, pad_height, output_offset, output_multiplier, output_shift,
-       output_activation_min, output_activation_max, output_data, output_dims,
-       im2col_data, im2col_dims, gemm_context);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
-                 int32 input_offset, const uint8* filter_data,
-                 const Dims<4>& filter_dims, int32 filter_offset,
-                 const int32* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, int32 output_offset, int32 output_multiplier,
-                 int output_shift, int32 output_activation_min,
-                 int32 output_activation_max, uint8* output_data,
-                 const Dims<4>& output_dims, uint8* im2col_data,
-                 const Dims<4>& im2col_dims,
-                 gemmlowp::GemmContext* gemm_context) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
-       filter_offset, bias_data, bias_dims, stride_width, stride_height,
-       pad_width, pad_height, output_offset, output_multiplier, output_shift,
-       output_activation_min, output_activation_max, output_data, output_dims,
-       im2col_data, im2col_dims, gemm_context);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Conv(const uint8* input_data, const Dims<4>& input_dims,
-          int32 input_offset, const uint8* filter_data,
-          const Dims<4>& filter_dims, int32 filter_offset,
-          const int32* bias_data, const Dims<4>& bias_dims, int stride,
-          int pad_width, int pad_height, int32 output_offset,
-          int32 output_multiplier, int output_shift,
-          int32 output_activation_min, int32 output_activation_max,
-          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
-          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
-  Conv<Ac>(input_data, input_dims, input_offset, filter_data, filter_dims,
-           filter_offset, bias_data, bias_dims, stride, stride, pad_width,
-           pad_height, output_offset, output_multiplier, output_shift,
-           output_activation_min, output_activation_max, output_data,
-           output_dims, im2col_data, im2col_dims, gemm_context);
-}
-
 template <typename T>
 inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
                          const RuntimeShape& unextended_input_shape,
@@ -1385,21 +1203,6 @@ void BroadcastDiv4DSlow(const ArithmeticParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy Dims<4>.
-template <typename T>
-void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  tflite::ArithmeticParams op_params;
-  SetActivationParams(output_activation_min, output_activation_max, &op_params);
-
-  BroadcastDiv4DSlow(op_params, DimsToShape(input1_dims), input1_data,
-                     DimsToShape(input2_dims), input2_data,
-                     DimsToShape(output_dims), output_data);
-}
-
 template <typename T>
 inline void Div(const ArithmeticParams& params,
                 const RuntimeShape& input1_shape, const T* input1_data,
@@ -1418,21 +1221,6 @@ inline void Div(const ArithmeticParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy Dims<4>.
-template <typename T>
-inline void Div(const T* input1_data, const Dims<4>& input1_dims,
-                const T* input2_data, const Dims<4>& input2_dims,
-                T output_activation_min, T output_activation_max,
-                T* output_data, const Dims<4>& output_dims) {
-  tflite::ArithmeticParams op_params;
-  SetActivationParams(output_activation_min, output_activation_max, &op_params);
-
-  Div(op_params, DimsToShape(input1_dims), input1_data,
-      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
-      output_data);
-}
-
 inline void SubNonBroadcast(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const float* input1_data,
@@ -1772,34 +1560,10 @@ inline void Concatenation(const ConcatenationParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy Dims<4>.
-template <FusedActivationFunctionType Ac, typename Scalar>
-inline void Concatenation(int concat_dim, const Scalar* const* input_data,
-                          const Dims<4>* const* input_dims, int inputs_count,
-                          Scalar* output_data, const Dims<4>& output_dims) {
-  // For now we don't have a model with a Concatenation with fused activation.
-  TFLITE_DCHECK_EQ(Ac, FusedActivationFunctionType::kNone);
-
-  std::vector<RuntimeShape> input_shapes(inputs_count);
-  std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count);
-  for (int i = 0; i < inputs_count; ++i) {
-    ShapeFromDims(*input_dims[i], &input_shapes[i]);
-    input_shapes_indirect[i] = &input_shapes[i];
-  }
-  tflite::ConcatenationParams op_params;
-  op_params.axis = 3 - concat_dim;
-  op_params.inputs_count = inputs_count;
-
-  Concatenation(op_params, input_shapes_indirect.data(), input_data,
-                DimsToShape(output_dims), output_data);
-}
-
 // TODO(prabhumk): This is the same as the optimized implementation.
 // TODO(prabhumk): The quantized implementation of concatentation isn't fully
 // quantized as it takes scale as a floating point value. This should be fixed
 // when optimizng this routine further.
-
 inline void ConcatenationWithScaling(const ConcatenationParams& params,
                                      const RuntimeShape* const* input_shapes,
                                      const uint8* const* input_data,
@@ -1862,33 +1626,6 @@ inline void ConcatenationWithScaling(const ConcatenationParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy Dims<4>.
-inline void Concatenation(int concat_dim, const uint8* const* input_data,
-                          const Dims<4>* const* input_dims,
-                          const int32* input_zeropoint,
-                          const float* input_scale, int inputs_count,
-                          uint8* output_data, const Dims<4>& output_dims,
-                          const int32 output_zeropoint,
-                          const float output_scale) {
-  std::vector<RuntimeShape> input_shapes(inputs_count);
-  std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count);
-  for (int i = 0; i < inputs_count; ++i) {
-    ShapeFromDims(*input_dims[i], &input_shapes[i]);
-    input_shapes_indirect[i] = &input_shapes[i];
-  }
-  tflite::ConcatenationParams op_params;
-  op_params.axis = 3 - concat_dim;
-  op_params.input_zeropoint = input_zeropoint;
-  op_params.input_scale = input_scale;
-  op_params.inputs_count = inputs_count;
-  op_params.output_zeropoint = output_zeropoint;
-  op_params.output_scale = output_scale;
-
-  ConcatenationWithScaling(op_params, input_shapes_indirect.data(), input_data,
-                           DimsToShape(output_dims), output_data);
-}
-
 template <typename Scalar>
 void Pack(const PackParams& params, const RuntimeShape* const* input_shapes,
           const Scalar* const* input_data, const RuntimeShape& output_shape,
@@ -2002,26 +1739,6 @@ void DepthConcatenation(const ConcatenationParams& params,
                 output_data);
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-template <FusedActivationFunctionType Ac, typename Scalar>
-void DepthConcatenation(const Scalar* const* input_data,
-                        const Dims<4>* const* input_dims, int inputs_count,
-                        Scalar* output_data, const Dims<4>& output_dims) {
-  // For now we don't have a model with a Concatenation with fused activation.
-  TFLITE_DCHECK_EQ(Ac, FusedActivationFunctionType::kNone);
-  std::vector<RuntimeShape> input_shapes(inputs_count);
-  std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count);
-  for (int i = 0; i < inputs_count; ++i) {
-    ShapeFromDims(*input_dims[i], &input_shapes[i]);
-    input_shapes_indirect[i] = &input_shapes[i];
-  }
-  tflite::ConcatenationParams op_params;
-  op_params.inputs_count = inputs_count;
-
-  DepthConcatenation(op_params, input_shapes_indirect.data(), input_data,
-                     DimsToShape(output_dims), output_data);
-}
-
 inline void LstmCell(
     const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
     const float* input_data, const RuntimeShape& unextended_prev_activ_shape,
@@ -2139,31 +1856,6 @@ inline void LstmCell(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
-                     const float* prev_activ_data,
-                     const Dims<4>& prev_activ_dims, const float* weights_data,
-                     const Dims<4>& weights_dims, const float* bias_data,
-                     const Dims<4>& bias_dims, const float* prev_state_data,
-                     const Dims<4>& prev_state_dims, float* output_state_data,
-                     const Dims<4>& output_state_dims, float* output_activ_data,
-                     const Dims<4>& output_activ_dims, float* concat_temp_data,
-                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
-                     const Dims<4>& activ_temp_dims) {
-  tflite::LstmCellParams op_params;
-  // Float LSTM cell does not need parameters to be set: leave untouched.
-
-  LstmCell(op_params, DimsToShape(input_dims), input_data,
-           DimsToShape(prev_activ_dims), prev_activ_data,
-           DimsToShape(weights_dims), weights_data, DimsToShape(bias_dims),
-           bias_data, DimsToShape(prev_state_dims), prev_state_data,
-           DimsToShape(output_state_dims), output_state_data,
-           DimsToShape(output_activ_dims), output_activ_data,
-           DimsToShape(concat_temp_dims), concat_temp_data,
-           DimsToShape(activ_temp_dims), activ_temp_data);
-}
-
 // Quantized LSTM cell implementation.
 // The quantization of the input, output arrays is as follows:
 //  - The input activations are quantized as uint8 on the interval
@@ -2438,37 +2130,6 @@ inline void LstmCell(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <int StateIntegerBits>
-void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
-              const uint8* prev_activ_data_uint8,
-              const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
-              const Dims<4>& weights_dims, const int32* bias_data_int32,
-              const Dims<4>& bias_dims, const int16* prev_state_data_int16,
-              const Dims<4>& prev_state_dims, int16* output_state_data_int16,
-              const Dims<4>& output_state_dims, uint8* output_activ_data_uint8,
-              const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8,
-              const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16,
-              const Dims<4>& activ_temp_dims, int32 weights_zero_point,
-              int32 accum_multiplier, int accum_shift,
-              gemmlowp::GemmContext* gemm_context) {
-  tflite::LstmCellParams op_params;
-  op_params.weights_zero_point = weights_zero_point;
-  op_params.accum_multiplier = accum_multiplier;
-  op_params.accum_shift = accum_shift;
-
-  LstmCell<StateIntegerBits>(
-      op_params, DimsToShape(input_dims), input_data_uint8,
-      DimsToShape(prev_activ_dims), prev_activ_data_uint8,
-      DimsToShape(weights_dims), weights_data_uint8, DimsToShape(bias_dims),
-      bias_data_int32, DimsToShape(prev_state_dims), prev_state_data_int16,
-      DimsToShape(output_state_dims), output_state_data_int16,
-      DimsToShape(output_activ_dims), output_activ_data_uint8,
-      DimsToShape(concat_temp_dims), concat_temp_data_uint8,
-      DimsToShape(activ_temp_dims), activ_temp_data_int16, gemm_context);
-}
-
 template <typename Scalar>
 void Split(const SplitParams& params, const RuntimeShape& input_shape,
            const Scalar* input_data, const RuntimeShape* const* output_shapes,
@@ -2511,45 +2172,6 @@ void Split(const SplitParams& params, const RuntimeShape& input_shape,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy Dims<4>.
-template <typename Scalar>
-void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
-                     int axis, int outputs_count, Scalar* const* output_data,
-                     const Dims<4>* const* output_dims) {
-  std::vector<RuntimeShape> output_shapes(outputs_count);
-  std::vector<const RuntimeShape*> output_shapes_indirect(outputs_count);
-  for (int i = 0; i < outputs_count; ++i) {
-    ShapeFromDims(*output_dims[i], &output_shapes[i]);
-    output_shapes_indirect[i] = &output_shapes[i];
-  }
-  tflite::SplitParams op_params;
-  op_params.axis = 3 - axis;
-  op_params.num_split = outputs_count;
-
-  Split(op_params, DimsToShape(input_dims), input_data,
-        output_shapes_indirect.data(), output_data);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy Dims<4>.
-template <FusedActivationFunctionType Ac, typename Scalar>
-void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
-                     int outputs_count, Scalar* const* output_data,
-                     const Dims<4>* const* output_dims) {
-  TFLITE_DCHECK_GE(outputs_count, 1);
-  for (int i = 0; i < outputs_count; i++) {
-    /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
-    /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
-    /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
-  }
-  // For now we don't have a model with a Split with fused activation.
-  TFLITE_DCHECK_EQ(Ac, FusedActivationFunctionType::kNone);
-
-  TensorFlowSplit(input_data, input_dims, /*axis=*/0, outputs_count,
-                  output_data, output_dims);
-}
-
 inline int NodeOffset(int b, int h, int w, int height, int width) {
   return (b * height + h) * width + w;
 }
@@ -2880,15 +2502,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy
-inline void LogSoftmax(const float* input_data, const RuntimeShape& input_shape,
-                       float* output_data, const RuntimeShape& output_shape) {
-  SoftmaxParams params;
-  // No params currently used for float LogSoftmax.
-  LogSoftmax(params, input_shape, input_data, output_shape, output_data);
-}
-
 // Although currently the name of this function says that it cannot handle
 // values less than 1, in practice it can handle as low as 1/x_max, where
 // x_max is the largest representable input.  In other words, the output range
@@ -3093,22 +2706,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void LogSoftmax(const uint8* input_data, const RuntimeShape& input_shape,
-                       int32 input_multiplier, int32 input_left_shift,
-                       int32 reverse_scaling_divisor,
-                       int32 reverse_scaling_right_shift, int diff_min,
-                       uint8* output_data, const RuntimeShape& output_shape) {
-  SoftmaxParams params;
-  params.input_multiplier = input_multiplier;
-  params.input_left_shift = input_left_shift;
-  params.reverse_scaling_divisor = reverse_scaling_divisor;
-  params.reverse_scaling_right_shift = reverse_scaling_right_shift;
-  params.diff_min = diff_min;
-  LogSoftmax(params, input_shape, input_data, output_shape, output_data);
-}
-
 inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
                      const RuntimeShape& output_shape, float* output_data) {
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
@@ -3170,20 +2767,6 @@ inline void Logistic(const LogisticParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Logistic(const uint8* input_data, const RuntimeShape& input_shape,
-                     int32 input_zero_point, int32 input_range_radius,
-                     int32 input_multiplier, int input_left_shift,
-                     uint8* output_data, const RuntimeShape& output_shape) {
-  LogisticParams params;
-  params.input_zero_point = input_zero_point;
-  params.input_range_radius = input_range_radius;
-  params.input_multiplier = input_multiplier;
-  params.input_left_shift = input_left_shift;
-  Logistic(params, input_shape, input_data, output_shape, output_data);
-}
-
 inline void Logistic(const LogisticParams& params,
                      const RuntimeShape& input_shape, const int16* input_data,
                      const RuntimeShape& output_shape, int16* output_data) {
@@ -3203,15 +2786,6 @@ inline void Logistic(const LogisticParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Logistic(const RuntimeShape& input_shape, const int16* input_data,
-                     const RuntimeShape& output_shape, int16* output_data) {
-  LogisticParams params;
-  // No params currently needed by int16 Logistic.
-  Logistic(params, input_shape, input_data, output_shape, output_data);
-}
-
 inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
                  const RuntimeShape& output_shape, float* output_data) {
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
@@ -3275,20 +2849,6 @@ inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Tanh(const uint8* input_data, const RuntimeShape& input_shape,
-                 int32 input_zero_point, int32 input_range_radius,
-                 int32 input_multiplier, int input_left_shift,
-                 uint8* output_data, const RuntimeShape& output_shape) {
-  TanhParams params;
-  params.input_zero_point = input_zero_point;
-  params.input_range_radius = input_range_radius;
-  params.input_multiplier = input_multiplier;
-  params.input_left_shift = input_left_shift;
-  Tanh(params, input_shape, input_data, output_shape, output_data);
-}
-
 inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
                  const int16* input_data, const RuntimeShape& output_shape,
                  int16* output_data) {
@@ -3323,16 +2883,6 @@ inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Tanh(const int16* input_data, const RuntimeShape& input_shape,
-                 int input_left_shift, int16* output_data,
-                 const RuntimeShape& output_shape) {
-  TanhParams params;
-  params.input_left_shift = input_left_shift;
-  Tanh(params, input_shape, input_data, output_shape, output_data);
-}
-
 inline void Dequantize(const tflite::DequantizationParams& op_params,
                        const RuntimeShape& input_shape, const uint8* input_data,
                        const RuntimeShape& output_shape, float* output_data) {
@@ -3347,19 +2897,6 @@ inline void Dequantize(const tflite::DequantizationParams& op_params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy Dims<4>.
-inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
-                       int32 zero_point, double scale, float* output_data,
-                       const Dims<4>& output_dims) {
-  tflite::DequantizationParams op_params;
-  op_params.zero_point = zero_point;
-  op_params.scale = scale;
-
-  Dequantize(op_params, DimsToShape(input_dims), input_data,
-             DimsToShape(output_dims), output_data);
-}
-
 inline void FakeQuant(const tflite::FakeQuantParams& op_params,
                       const RuntimeShape& input_shape, const float* input_data,
                       const RuntimeShape& output_shape, float* output_data) {
@@ -3383,20 +2920,6 @@ inline void FakeQuant(const tflite::FakeQuantParams& op_params,
                     output_data, flat_size);
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy Dims<4>.
-inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
-                      float rmin, float rmax, int num_bits, float* output_data,
-                      const Dims<4>& output_dims) {
-  tflite::FakeQuantParams op_params;
-  op_params.num_bits = num_bits;
-  op_params.minmax.min = rmin;
-  op_params.minmax.max = rmax;
-
-  FakeQuant(op_params, DimsToShape(input_dims), input_data,
-            DimsToShape(output_dims), output_data);
-}
-
 template <typename SrcT, typename DstT>
 inline void Cast(const RuntimeShape& input_shape, const SrcT* input_data,
                  const RuntimeShape& output_shape, DstT* output_data) {
@@ -3456,23 +2979,6 @@ inline void Gather(const tflite::GatherParams& op_params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy Dims<4> version.
-// When moving legacy ops to legacy_reference_ops, replace content with looser
-// implementation.
-template <typename T>
-inline void Gather(const T* input_data, const Dims<4>& input_dims,
-                   int input_rank, const int32* coords_data,
-                   const Dims<4>& coords_dims, T* output_data,
-                   const Dims<4>& output_dims) {
-  tflite::GatherParams op_params;
-  op_params.input_rank = input_rank;
-
-  Gather(op_params, DimsToShape(input_dims), input_data,
-         DimsToShape(coords_dims), coords_data, DimsToShape(output_dims),
-         output_data);
-}
-
 template <typename T>
 inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
                            const RuntimeShape& unextended_input_shape,
@@ -3802,58 +3308,6 @@ inline void StridedSlice(const tflite::StridedSliceParams& op_params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline uint32 LegacyReverseBits32(uint32 n) {
-  n = ((n >> 1) & 0x55555555) | ((n & 0x55555555) << 1);
-  n = ((n >> 2) & 0x33333333) | ((n & 0x33333333) << 2);
-  n = ((n >> 4) & 0x0F0F0F0F) | ((n & 0x0F0F0F0F) << 4);
-  return (((n & 0xFF) << 24) | ((n & 0xFF00) << 8) | ((n & 0xFF0000) >> 8) |
-          ((n & 0xFF000000) >> 24));
-}
-
-inline void StridedSliceReverseIndices(tflite::StridedSliceParams* p) {
-  TFLITE_CHECK_EQ(p->start_indices_count, p->stop_indices_count);
-  TFLITE_CHECK_EQ(p->stop_indices_count, p->strides_count);
-
-  std::reverse(p->start_indices, p->start_indices + p->start_indices_count);
-  std::reverse(p->stop_indices, p->stop_indices + p->stop_indices_count);
-  std::reverse(p->strides, p->strides + p->strides_count);
-
-  p->begin_mask = LegacyReverseBits32(static_cast<uint32>(p->begin_mask)) >>
-                  (32 - p->start_indices_count);
-  p->ellipsis_mask =
-      LegacyReverseBits32(static_cast<uint32>(p->ellipsis_mask)) >>
-      (32 - p->start_indices_count);
-  p->end_mask = LegacyReverseBits32(static_cast<uint32>(p->end_mask)) >>
-                (32 - p->start_indices_count);
-  p->new_axis_mask =
-      LegacyReverseBits32(static_cast<uint32>(p->new_axis_mask)) >>
-      (32 - p->start_indices_count);
-  p->shrink_axis_mask =
-      LegacyReverseBits32(static_cast<uint32>(p->shrink_axis_mask)) >>
-      (32 - p->start_indices_count);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename T>
-inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
-                         int begin_mask, int end_mask, int shrink_axis_mask,
-                         const std::vector<int>& start_indices,
-                         const std::vector<int>& stop_indices,
-                         const std::vector<int>& strides, T* output_data,
-                         const Dims<4>& output_dims) {
-  TFLITE_DCHECK_EQ(start_indices.size(), 4);
-  auto op_params = strided_slice::BuildStridedSliceParams(
-      begin_mask, end_mask, shrink_axis_mask, start_indices, stop_indices,
-      strides);
-  StridedSliceReverseIndices(&op_params);
-
-  StridedSlice(op_params, DimsToShape(input_dims), input_data,
-               DimsToShape(output_dims), output_data);
-}
-
 template <typename T>
 inline void Slice(const tflite::SliceParams& op_params,
                   const RuntimeShape& input_shape, const T* input_data,
@@ -4119,22 +3573,6 @@ inline void Mean(const tflite::MeanParams& op_params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy Dims<4>.
-template <typename T>
-inline void Mean(const T* input_data, const Dims<4>& input_dims,
-                 const std::vector<int>& reduction_indices, T* output_data,
-                 const Dims<4>& output_dims) {
-  tflite::MeanParams op_params;
-  op_params.axis_count = reduction_indices.size();
-  for (int i = 0; i < op_params.axis_count; ++i) {
-    op_params.axis[i] = reduction_indices[op_params.axis_count - 1 - i];
-  }
-
-  Mean(op_params, DimsToShape(input_dims), input_data, DimsToShape(output_dims),
-       output_data);
-}
-
 // Computes the mean of elements across dimensions given in axis.
 // It does so in two stages, first calculates the sum of elements along the axis
 // then divides it by the number of element in axis for quantized values.
@@ -4392,20 +3830,6 @@ void Transpose(const TransposeParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename T>
-void Transpose(const T* input, const Dims<4>& input_dims, T* output,
-               const Dims<4>& output_dims, const int* permuted_axes) {
-  TransposeParams params;
-  params.perm_count = 4;
-  for (int i = 0; i < 4; ++i) {
-    params.perm[i] = 3 - permuted_axes[3 - i];
-  }
-  Transpose(params, DimsToShape(input_dims), input, DimsToShape(output_dims),
-            output);
-}
-
 inline void TransposeConv(
     const ConvParams& params, const RuntimeShape& input_shape,
     const float* input_data, const RuntimeShape& filter_shape,
@@ -4479,27 +3903,6 @@ inline void TransposeConv(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
-                          const float* filter_data, const Dims<4>& filter_dims,
-                          int stride_width, int stride_height, int pad_width,
-                          int pad_height, float* output_data,
-                          const Dims<4>& output_dims, float* im2col_data,
-                          const Dims<4>& im2col_dims) {
-  tflite::ConvParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride_width;
-  op_params.stride_height = stride_height;
-
-  TransposeConv(op_params, DimsToShape(input_dims), input_data,
-                DimsToShape(filter_dims), filter_data, DimsToShape(output_dims),
-                output_data, DimsToShape(im2col_dims), im2col_data);
-}
-
 template <typename T>
 inline bool EqualFn(T lhs, T rhs) {
   return lhs == rhs;
@@ -4553,19 +3956,6 @@ inline void Comparison(const ComparisonParams& op_params,
                            input2_data, output_shape, output_data);
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename T, ComparisonFn<T> F>
-inline void Comparison(const T* input1_data, const Dims<4>& input1_dims,
-                       const T* input2_data, const Dims<4>& input2_dims,
-                       bool* output_data, const Dims<4>& output_dims) {
-  ComparisonParams op_params;
-  // No parameters needed.
-  ComparisonImpl<T, F>(op_params, DimsToShape(input1_dims), input1_data,
-                       DimsToShape(input2_dims), input2_data,
-                       DimsToShape(output_dims), output_data);
-}
-
 template <typename T, ComparisonFn<int32> F>
 inline void ComparisonWithScaling(
     const ComparisonParams& op_params, const RuntimeShape& input1_shape,
@@ -4596,32 +3986,6 @@ inline void ComparisonWithScaling(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename T, ComparisonFn<int32> F>
-inline void Comparison(int left_shift, const T* input1_data,
-                       const Dims<4>& input1_dims, int32 input1_offset,
-                       int32 input1_multiplier, int input1_shift,
-                       const T* input2_data, const Dims<4>& input2_dims,
-                       int32 input2_offset, int32 input2_multiplier,
-                       int input2_shift, bool* output_data,
-                       const Dims<4>& output_dims) {
-  tflite::ComparisonParams op_params;
-  op_params.left_shift = left_shift;
-  op_params.input1_offset = input1_offset;
-  op_params.input1_multiplier = input1_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.input1_shift = kReverseShift * input1_shift;
-  op_params.input2_offset = input2_offset;
-  op_params.input2_multiplier = input2_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.input2_shift = kReverseShift * input2_shift;
-
-  ComparisonWithScaling<T, F>(op_params, DimsToShape(input1_dims), input1_data,
-                              DimsToShape(input2_dims), input2_data,
-                              DimsToShape(output_dims), output_data);
-}
-
 template <typename T, ComparisonFn<T> F>
 inline void BroadcastComparison4DSlowImpl(
     const ComparisonParams& op_params,
@@ -4665,22 +4029,6 @@ inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
                                           output_shape, output_data);
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename T, ComparisonFn<T> F>
-inline void BroadcastComparison(const T* input1_data,
-                                const Dims<4>& input1_dims,
-                                const T* input2_data,
-                                const Dims<4>& input2_dims, bool* output_data,
-                                const Dims<4>& output_dims) {
-  ComparisonParams op_params;
-  // No parameters needed.
-  BroadcastComparison4DSlowImpl<T, F>(op_params, DimsToShape(input1_dims),
-                                      input1_data, DimsToShape(input2_dims),
-                                      input2_data, DimsToShape(output_dims),
-                                      output_data);
-}
-
 template <typename T, ComparisonFn<int32> F>
 inline void BroadcastComparison4DSlowWithScaling(
     const ComparisonParams& op_params,
@@ -4731,80 +4079,7 @@ inline void BroadcastComparison4DSlowWithScaling(
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename T, ComparisonFn<int32> F>
-inline void BroadcastComparison(int left_shift, const T* input1_data,
-                                const Dims<4>& input1_dims, int32 input1_offset,
-                                int32 input1_multiplier, int input1_shift,
-                                const T* input2_data,
-                                const Dims<4>& input2_dims, int32 input2_offset,
-                                int32 input2_multiplier, int input2_shift,
-                                bool* output_data, const Dims<4>& output_dims) {
-  ComparisonParams op_params;
-
-  op_params.left_shift = left_shift;
-  op_params.input1_offset = input1_offset;
-  op_params.input1_multiplier = input1_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.input1_shift = kReverseShift * input1_shift;
-  op_params.input2_offset = input2_offset;
-  op_params.input2_multiplier = input2_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.input2_shift = kReverseShift * input2_shift;
-
-  BroadcastComparison4DSlowWithScaling<T, F>(
-      op_params, DimsToShape(input1_dims), input1_data,
-      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
-      output_data);
-}
-
 #define TFLITE_COMPARISON_OP(name)                                             \
-  template <typename T>                                                        \
-  inline void name(const T* input1_data, const Dims<4>& input1_dims,           \
-                   const T* input2_data, const Dims<4>& input2_dims,           \
-                   bool* output_data, const Dims<4>& output_dims) {            \
-    gemmlowp::ScopedProfilingLabel label(#name);                               \
-    Comparison<T, name##Fn>(input1_data, input1_dims, input2_data,             \
-                            input2_dims, output_data, output_dims);            \
-  }                                                                            \
-  template <typename T>                                                        \
-  inline void name(                                                            \
-      int left_shift, const T* input1_data, const Dims<4>& input1_dims,        \
-      int32 input1_offset, int32 input1_multiplier, int input1_shift,          \
-      const T* input2_data, const Dims<4>& input2_dims, int32 input2_offset,   \
-      int32 input2_multiplier, int input2_shift, bool* output_data,            \
-      const Dims<4>& output_dims) {                                            \
-    gemmlowp::ScopedProfilingLabel label(#name "/8bit");                       \
-    Comparison<T, name##Fn>(left_shift, input1_data, input1_dims,              \
-                            input1_offset, input1_multiplier, input1_shift,    \
-                            input2_data, input2_dims, input2_offset,           \
-                            input2_multiplier, input2_shift, output_data,      \
-                            output_dims);                                      \
-  }                                                                            \
-  template <typename T>                                                        \
-  inline void Broadcast##name(                                                 \
-      const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,  \
-      const Dims<4>& input2_dims, bool* output_data,                           \
-      const Dims<4>& output_dims) {                                            \
-    gemmlowp::ScopedProfilingLabel label("Broadcast" #name);                   \
-    BroadcastComparison<T, name##Fn>(input1_data, input1_dims, input2_data,    \
-                                     input2_dims, output_data, output_dims);   \
-  }                                                                            \
-  template <typename T>                                                        \
-  inline void Broadcast##name(                                                 \
-      int left_shift, const T* input1_data, const Dims<4>& input1_dims,        \
-      int32 input1_offset, int32 input1_multiplier, int input1_shift,          \
-      const T* input2_data, const Dims<4>& input2_dims, int32 input2_offset,   \
-      int32 input2_multiplier, int input2_shift, bool* output_data,            \
-      const Dims<4>& output_dims) {                                            \
-    gemmlowp::ScopedProfilingLabel label("Broadcast" #name "/8bit");           \
-    BroadcastComparison<T, name##Fn>(left_shift, input1_data, input1_dims,     \
-                                     input1_offset, input1_multiplier,         \
-                                     input1_shift, input2_data, input2_dims,   \
-                                     input2_offset, input2_multiplier,         \
-                                     input2_shift, output_data, output_dims);  \
-  }                                                                            \
   inline void name(const ComparisonParams& op_params,                          \
                    const RuntimeShape& input1_shape, const float* input1_data, \
                    const RuntimeShape& input2_shape, const float* input2_data, \
@@ -4889,19 +4164,6 @@ void Select(const RuntimeShape& input_condition_shape,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename D, typename T>
-inline void Select(const D* input_condition_data,
-                   const Dims<4>& input_condition_dims, const T* input_x_data,
-                   const Dims<4>& input_x_dims, const T* input_y_data,
-                   const Dims<4>& input_y_dims, T* output_data,
-                   const Dims<4>& output_dims) {
-  Select(DimsToShape(input_condition_dims), input_condition_data,
-         DimsToShape(input_x_dims), input_x_data, DimsToShape(input_y_dims),
-         input_y_data, DimsToShape(output_dims), output_data);
-}
-
 template <typename D, typename T>
 void RankOneSelect(const RuntimeShape& input_condition_shape,
                    const D* input_condition_data,
@@ -4923,20 +4185,6 @@ void RankOneSelect(const RuntimeShape& input_condition_shape,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename D, typename T>
-inline void RankOneSelect(const D* input_condition_data,
-                          const Dims<4>& input_condition_dims,
-                          const T* input_x_data, const Dims<4>& input_x_dims,
-                          const T* input_y_data, const Dims<4>& input_y_dims,
-                          T* output_data, const Dims<4>& output_dims) {
-  RankOneSelect(DimsToShape(input_condition_dims), input_condition_data,
-                DimsToShape(input_x_dims), input_x_data,
-                DimsToShape(input_y_dims), input_y_data,
-                DimsToShape(output_dims), output_data);
-}
-
 // For easy implementation, the indices is always a vector of size-4 vectors.
 template <typename T, typename TI>
 inline void SparseToDense(const std::vector<std::vector<TI>>& indices,
@@ -4978,16 +4226,6 @@ inline void SparseToDense(const std::vector<std::vector<TI>>& indices,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-template <typename T, typename TI>
-inline void SparseToDense(const std::vector<std::vector<TI>>& indices,
-                          const T* values, T default_value, T* output_data,
-                          const Dims<4>& output_dims, bool value_is_scalar) {
-  SparseToDense(indices, values, default_value, value_is_scalar,
-                DimsToShape(output_dims), output_data);
-}
-
 template <typename T>
 inline void Pow(const RuntimeShape& input1_shape, const T* input1_data,
                 const RuntimeShape& input2_shape, const T* input2_data,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/softmax.h b/tensorflow/contrib/lite/kernels/internal/reference/softmax.h
index 006174e8db..7d44296134 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/softmax.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/softmax.h
@@ -57,16 +57,6 @@ inline void Softmax(const SoftmaxParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Softmax(const float* input_data, const RuntimeShape& input_shape,
-                    float beta, float* output_data,
-                    const RuntimeShape& output_shape) {
-  SoftmaxParams params;
-  params.beta = beta;
-  Softmax(params, input_shape, input_data, output_shape, output_data);
-}
-
 inline void Softmax(const SoftmaxParams& params,
                     const RuntimeShape& input_shape, const uint8* input_data,
                     const RuntimeShape& output_shape, uint8* output_data) {
@@ -151,19 +141,6 @@ inline void Softmax(const SoftmaxParams& params,
   }
 }
 
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy
-inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape,
-                    int32 input_beta_multiplier, int32 input_beta_left_shift,
-                    int diff_min, uint8* output_data,
-                    const RuntimeShape& output_shape) {
-  SoftmaxParams params;
-  params.input_multiplier = input_beta_multiplier;
-  params.input_left_shift = input_beta_left_shift;
-  params.diff_min = diff_min;
-  Softmax(params, input_shape, input_data, output_shape, output_data);
-}
-
 // Performs softmax along the input of size (input_size * batch_size).
 inline void Softmax(const float* in, const int input_size, const int batch_size,
                     const float beta, float* out) {
-- 
GitLab


From 370d385c3029a7972ba201c8303942b30f09521c Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Thu, 27 Sep 2018 20:52:53 -0700
Subject: [PATCH 123/570] Creating a LinearModel that works with V2 feature
 columns.

In subsequent change I'll change canned estimators to support FeatureColumn V2
and use this LinearModel.

PiperOrigin-RevId: 214882241
---
 .../feature_column/feature_column_v2.py       |  574 ++---
 .../feature_column/feature_column_v2_test.py  | 2042 ++++-------------
 2 files changed, 597 insertions(+), 2019 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index 538641c251..a8d5bfb437 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -136,14 +136,11 @@ import six
 
 
 from tensorflow.python.eager import context
-from tensorflow.python.feature_column import feature_column as fc_old
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.engine.base_layer import Layer
-from tensorflow.python.layers import base
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -153,7 +150,6 @@ from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variable_scope
@@ -245,28 +241,19 @@ class StateManager(object):
     raise NotImplementedError('StateManager.get_resource')
 
 
-class _InputLayerStateManager(StateManager):
-  """Manages the state of InputLayer."""
+class _StateManagerImpl(StateManager):
+  """Manages the state of FeatureLayer and LinearModel."""
 
-  def __init__(self, layer, feature_columns, trainable):
-    """Creates an _InputLayerStateManager object.
+  def __init__(self, layer, trainable):
+    """Creates an _StateManagerImpl object.
 
     Args:
       layer: The input layer this state manager is associated with.
-      feature_columns: List of feature columns for the input layer
       trainable: Whether by default, variables created are trainable or not.
     """
     self._trainable = trainable
     self._layer = layer
-    self._cols_to_vars_map = {}
-    self._cols_to_names_map = {}
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      self._cols_to_vars_map[column] = {}
-      base_name = column.name
-      if isinstance(column, SharedEmbeddingColumn):
-        base_name = column.shared_collection_name
-      with variable_scope.variable_scope(base_name) as vs:
-        self._cols_to_names_map[column] = _strip_leading_slashes(vs.name)
+    self._cols_to_vars_map = collections.defaultdict(lambda: {})
 
   def create_variable(self,
                       feature_column,
@@ -277,19 +264,19 @@ class _InputLayerStateManager(StateManager):
                       initializer=None):
     if name in self._cols_to_vars_map[feature_column]:
       raise ValueError('Variable already exists.')
-    with variable_scope.variable_scope(self._cols_to_names_map[feature_column]):
-      var = self._layer.add_variable(
-          name=name,
-          shape=shape,
-          dtype=dtype,
-          initializer=initializer,
-          trainable=self._trainable and trainable,
-          # TODO(rohanj): Get rid of this hack once we have a mechanism for
-          # specifying a default partitioner for an entire layer. In that case,
-          # the default getter for Layers should work.
-          getter=variable_scope.get_variable)
-      self._cols_to_vars_map[feature_column][name] = var
-      return var
+
+    var = self._layer.add_variable(
+        name=name,
+        shape=shape,
+        dtype=dtype,
+        initializer=initializer,
+        trainable=self._trainable and trainable,
+        # TODO(rohanj): Get rid of this hack once we have a mechanism for
+        # specifying a default partitioner for an entire layer. In that case,
+        # the default getter for Layers should work.
+        getter=variable_scope.get_variable)
+    self._cols_to_vars_map[feature_column][name] = var
+    return var
 
   def get_variable(self, feature_column, name):
     if name in self._cols_to_vars_map[feature_column]:
@@ -313,12 +300,15 @@ class FeatureLayer(Layer):
   keywords_embedded = embedding_column(
       categorical_column_with_hash_bucket("keywords", 10K), dimensions=16)
   columns = [price, keywords_embedded, ...]
-  features = tf.parse_example(..., features=make_parse_example_spec(columns))
   feature_layer = FeatureLayer(columns)
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
   dense_tensor = feature_layer(features)
   for units in [128, 64, 32]:
     dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
-  prediction = tf.layers.dense(dense_tensor, 1)."""
+  prediction = tf.layers.dense(dense_tensor, 1).
+  ```
+  """
 
   def __init__(self,
                feature_columns,
@@ -375,8 +365,7 @@ class FeatureLayer(Layer):
     super(FeatureLayer, self).__init__(name=name, trainable=trainable, **kwargs)
 
     self._feature_columns = _normalize_feature_columns(feature_columns)
-    self._state_manager = _InputLayerStateManager(self, self._feature_columns,
-                                                  self.trainable)
+    self._state_manager = _StateManagerImpl(self, self.trainable)
     self._shared_state_manager = shared_state_manager
     for column in sorted(self._feature_columns, key=lambda x: x.name):
       if not isinstance(column, DenseColumn):
@@ -395,7 +384,8 @@ class FeatureLayer(Layer):
         column.create_state(self._shared_state_manager)
       else:
         with variable_scope.variable_scope(None, default_name=self.name):
-          column.create_state(self._state_manager)
+          with variable_scope.variable_scope(None, default_name=column.name):
+            column.create_state(self._state_manager)
       super(FeatureLayer, self).build(None)
 
   def call(self, features, cols_to_output_tensors=None):
@@ -448,20 +438,18 @@ class FeatureLayer(Layer):
     return (input_shape[0], total_elements)
 
 
-def linear_model(features,
-                 feature_columns,
-                 units=1,
-                 sparse_combiner='sum',
-                 weight_collections=None,
-                 trainable=True,
-                 cols_to_vars=None):
-  """Returns a linear prediction `Tensor` based on given `feature_columns`.
+def _strip_leading_slashes(name):
+  return name.rsplit('/', 1)[-1]
+
+
+class LinearModel(Layer):
+  """Produces a linear prediction `Tensor` based on given `feature_columns`.
 
-  This function generates a weighted sum based on output dimension `units`.
+  This layer generates a weighted sum based on output dimension `units`.
   Weighted sum refers to logits in classification problems. It refers to the
   prediction itself for linear regression problems.
 
-  Note on supported columns: `linear_model` treats categorical columns as
+  Note on supported columns: `LinearModel` treats categorical columns as
   `indicator_column`s. To be specific, assume the input as `SparseTensor` looks
   like:
 
@@ -486,308 +474,189 @@ def linear_model(features,
   keywords = categorical_column_with_hash_bucket("keywords", 10K)
   keywords_price = crossed_column('keywords', price_buckets, ...)
   columns = [price_buckets, keywords, keywords_price ...]
+  linear_model = LinearModel(columns)
+
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  prediction = linear_model(features, columns)
+  prediction = linear_model(features)
   ```
-
-  Args:
-    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
-      keys. For example `numeric_column('price')` will look at 'price' key in
-      this dict. Values are `Tensor` or `SparseTensor` depending on
-      corresponding `_FeatureColumn`.
-    feature_columns: An iterable containing the FeatureColumns to use as inputs
-      to your model. All items should be instances of classes derived from
-      `_FeatureColumn`s.
-    units: An integer, dimensionality of the output space. Default value is 1.
-    sparse_combiner: A string specifying how to reduce if a categorical column
-      is multivalent. Except `numeric_column`, almost all columns passed to
-      `linear_model` are considered as categorical columns.  It combines each
-      categorical column independently. Currently "mean", "sqrtn" and "sum" are
-      supported, with "sum" the default for linear model. "sqrtn" often achieves
-      good accuracy, in particular with bag-of-words columns.
-        * "sum": do not normalize features in the column
-        * "mean": do l1 normalization on features in the column
-        * "sqrtn": do l2 normalization on features in the column
-      For example, for two features represented as the categorical columns:
-
-      ```python
-        # Feature 1
-
-        shape = [2, 2]
-        {
-            [0, 0]: "a"
-            [0, 1]: "b"
-            [1, 0]: "c"
-        }
-
-        # Feature 2
-
-        shape = [2, 3]
-        {
-            [0, 0]: "d"
-            [1, 0]: "e"
-            [1, 1]: "f"
-            [1, 2]: "g"
-        }
-      ```
-      with `sparse_combiner` as "mean", the linear model outputs conceptly are:
-      ```
-        y_0 = 1.0 / 2.0 * ( w_a + w_ b) + w_c + b_0
-        y_1 = w_d + 1.0 / 3.0 * ( w_e + w_ f + w_g) + b_1
-      ```
-      where `y_i` is the output, `b_i` is the bias, and `w_x` is the weight
-      assigned to the presence of `x` in the input features.
-    weight_collections: A list of collection names to which the Variable will be
-      added. Note that, variables will also be added to collections
-      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
-    trainable: If `True` also add the variable to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    cols_to_vars: If not `None`, must be a dictionary that will be filled with a
-      mapping from `_FeatureColumn` to associated list of `Variable`s.  For
-      example, after the call, we might have cols_to_vars = {
-        _NumericColumn(
-          key='numeric_feature1', shape=(1,):
-        [<tf.Variable 'linear_model/price2/weights:0' shape=(1, 1)>],
-        'bias': [<tf.Variable 'linear_model/bias_weights:0' shape=(1,)>],
-        _NumericColumn(
-          key='numeric_feature2', shape=(2,)):
-        [<tf.Variable 'linear_model/price1/weights:0' shape=(2, 1)>]}
-      If a column creates no variables, its value will be an empty list. Note
-      that cols_to_vars will also contain a string key 'bias' that maps to a
-      list of Variables.
-
-  Returns:
-    A `Tensor` which represents predictions/logits of a linear model. Its shape
-    is (batch_size, units) and its dtype is `float32`.
-
-  Raises:
-    ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
-      nor `_CategoricalColumn`.
-  """
-  with variable_scope.variable_scope(None, 'linear_model') as vs:
-    model_name = _strip_leading_slashes(vs.name)
-  linear_model_layer = _LinearModel(
-      feature_columns=feature_columns,
-      units=units,
-      sparse_combiner=sparse_combiner,
-      weight_collections=weight_collections,
-      trainable=trainable,
-      name=model_name)
-  retval = linear_model_layer(features)  # pylint: disable=not-callable
-  if cols_to_vars is not None:
-    cols_to_vars.update(linear_model_layer.cols_to_vars())
-  return retval
-
-
-def _add_to_collections(var, weight_collections):
-  """Adds a var to the list of weight_collections provided.
-
-  Handles the case for partitioned and non-partitioned variables.
-
-  Args:
-    var: A variable or Partitioned Variable.
-    weight_collections: List of collections to add variable to.
-  """
-  for weight_collection in weight_collections:
-    # The layer self.add_variable call already adds it to GLOBAL_VARIABLES.
-    if weight_collection == ops.GraphKeys.GLOBAL_VARIABLES:
-      continue
-    # TODO(rohanj): Explore adding a _get_variable_list method on `Variable`
-    # so that we don't have to do this check.
-    if isinstance(var, variables.PartitionedVariable):
-      for constituent_var in list(var):
-        ops.add_to_collection(weight_collection, constituent_var)
-    else:
-      ops.add_to_collection(weight_collection, var)
-
-
-class _FCLinearWrapper(base.Layer):
-  """Wraps a _FeatureColumn in a layer for use in a linear model.
-
-  See `linear_model` above.
   """
 
   def __init__(self,
-               feature_column,
+               feature_columns,
                units=1,
                sparse_combiner='sum',
-               weight_collections=None,
                trainable=True,
                name=None,
+               shared_state_manager=None,
                **kwargs):
-    super(_FCLinearWrapper, self).__init__(
-        trainable=trainable, name=name, **kwargs)
-    self._feature_column = feature_column
-    self._units = units
-    self._sparse_combiner = sparse_combiner
-    self._weight_collections = weight_collections
+    """Constructs a LinearModel.
 
-  def build(self, _):
-    if isinstance(self._feature_column, fc_old._CategoricalColumn):  # pylint: disable=protected-access
-      weight = self.add_variable(
-          name='weights',
-          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
-          initializer=init_ops.zeros_initializer(),
-          trainable=self.trainable)
-    else:
-      num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
-      weight = self.add_variable(
-          name='weights',
-          shape=[num_elements, self._units],
-          initializer=init_ops.zeros_initializer(),
-          trainable=self.trainable)
-    _add_to_collections(weight, self._weight_collections)
-    self._weight_var = weight
-    self.built = True
-
-  def call(self, builder):
-    weighted_sum = fc_old._create_weighted_sum(  # pylint: disable=protected-access
-        column=self._feature_column,
-        builder=builder,
-        units=self._units,
-        sparse_combiner=self._sparse_combiner,
-        weight_collections=self._weight_collections,
-        trainable=self.trainable,
-        weight_var=self._weight_var)
-    return weighted_sum
+    Args:
+      feature_columns: An iterable containing the FeatureColumns to use as
+        inputs to your model. All items should be instances of classes derived
+        from `_FeatureColumn`s.
+      units: An integer, dimensionality of the output space. Default value is 1.
+      sparse_combiner: A string specifying how to reduce if a categorical column
+        is multivalent. Except `numeric_column`, almost all columns passed to
+        `linear_model` are considered as categorical columns.  It combines each
+        categorical column independently. Currently "mean", "sqrtn" and "sum"
+        are supported, with "sum" the default for linear model. "sqrtn" often
+        achieves good accuracy, in particular with bag-of-words columns.
+          * "sum": do not normalize features in the column
+          * "mean": do l1 normalization on features in the column
+          * "sqrtn": do l2 normalization on features in the column
+        For example, for two features represented as the categorical columns:
+
+          ```python
+          # Feature 1
+
+          shape = [2, 2]
+          {
+              [0, 0]: "a"
+              [0, 1]: "b"
+              [1, 0]: "c"
+          }
+
+          # Feature 2
+
+          shape = [2, 3]
+          {
+              [0, 0]: "d"
+              [1, 0]: "e"
+              [1, 1]: "f"
+              [1, 2]: "g"
+          }
+          ```
+
+        with `sparse_combiner` as "mean", the linear model outputs conceptly are
+        ```
+        y_0 = 1.0 / 2.0 * ( w_a + w_ b) + w_c + b_0
+        y_1 = w_d + 1.0 / 3.0 * ( w_e + w_ f + w_g) + b_1
+        ```
+        where `y_i` is the output, `b_i` is the bias, and `w_x` is the weight
+        assigned to the presence of `x` in the input features.
+      trainable: If `True` also add the variable to the graph collection
+        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+      name: Name to give to the Linear Model. All variables and ops created will
+        be scoped by this name.
+      shared_state_manager: SharedEmbeddingStateManager that manages the state
+        of SharedEmbeddingColumns. For more info, look at `FeatureLayer`.
+      **kwargs: Keyword arguments to construct a layer.
 
+    Raises:
+      ValueError: if an item in `feature_columns` is neither a `DenseColumn`
+        nor `CategoricalColumn`.
+    """
+    super(LinearModel, self).__init__(name=name, trainable=trainable, **kwargs)
 
-class _BiasLayer(base.Layer):
-  """A layer for the bias term.
-  """
+    self._feature_columns = _normalize_feature_columns(feature_columns)
+    self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
+    for column in self._feature_columns:
+      if not isinstance(column, (DenseColumn, CategoricalColumn)):
+        raise ValueError(
+            'Items of feature_columns must be either a '
+            'DenseColumn or CategoricalColumn. Given: {}'.format(column))
 
-  def __init__(self,
-               units=1,
-               trainable=True,
-               weight_collections=None,
-               name=None,
-               **kwargs):
-    super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs)
     self._units = units
-    self._weight_collections = weight_collections
-
-  def build(self, _):
-    self._bias_variable = self.add_variable(
-        'bias_weights',
-        shape=[self._units],
-        initializer=init_ops.zeros_initializer(),
-        trainable=self.trainable)
-    _add_to_collections(self._bias_variable, self._weight_collections)
-    self.built = True
-
-  def call(self, _):
-    return self._bias_variable
+    self._sparse_combiner = sparse_combiner
 
+    self._state_manager = _StateManagerImpl(self, self.trainable)
+    self._shared_state_manager = shared_state_manager
+    self._bias_variable = None
 
-def _get_expanded_variable_list(var_list):
-  returned_list = []
-  for variable in var_list:
-    if (isinstance(variable, variables.Variable) or
-        resource_variable_ops.is_resource_variable(variable)):
-      returned_list.append(variable)  # Single variable case.
-    else:  # Must be a PartitionedVariable, so convert into a list.
-      returned_list.extend(list(variable))
-  return returned_list
+  def build(self, _):
+    # Create state for shared embedding columns.
+    for column in self._feature_columns:
+      if isinstance(column, SharedEmbeddingColumn):
+        column.create_state(self._shared_state_manager)
 
+    # We need variable scopes for now because we want the variable partitioning
+    # information to percolate down. We also use _pure_variable_scope's here
+    # since we want to open up a name_scope in the `call` method while creating
+    # the ops.
+    with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
+      for column in self._feature_columns:
+        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
+          # Create the state for each feature column
+          if not isinstance(column, SharedEmbeddingColumn):
+            column.create_state(self._state_manager)
+
+          # Create a weight variable for each column.
+          if isinstance(column, CategoricalColumn):
+            first_dim = column.num_buckets
+          else:
+            first_dim = column.variable_shape.num_elements()
+          self._state_manager.create_variable(
+              column,
+              name='weights',
+              dtype=dtypes.float32,
+              shape=(first_dim, self._units),
+              initializer=init_ops.zeros_initializer(),
+              trainable=self.trainable)
+
+      # Create a bias variable.
+      self._bias_variable = self.add_variable(
+          name='bias_weights',
+          dtype=dtypes.float32,
+          shape=[self._units],
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable,
+          # TODO(rohanj): Get rid of this hack once we have a mechanism for
+          # specifying a default partitioner for an entire layer. In that case,
+          # the default getter for Layers should work.
+          getter=variable_scope.get_variable)
 
-def _strip_leading_slashes(name):
-  return name.rsplit('/', 1)[-1]
+    super(LinearModel, self).build(None)
 
+  def call(self, features):
+    """Returns a `Tensor` the represents the predictions of a linear model.
 
-class _LinearModel(training.Model):
-  """Creates a linear model using feature columns.
+    Args:
+      features: A mapping from key to tensors. `_FeatureColumn`s look up via
+        these keys. For example `numeric_column('price')` will look at 'price'
+        key in this dict. Values are `Tensor` or `SparseTensor` depending on
+        corresponding `_FeatureColumn`.
 
-  See `linear_model` for details.
-  """
+    Returns:
+      A `Tensor` which represents predictions/logits of a linear model. Its
+      shape is (batch_size, units) and its dtype is `float32`.
 
-  def __init__(self,
-               feature_columns,
-               units=1,
-               sparse_combiner='sum',
-               weight_collections=None,
-               trainable=True,
-               name=None,
-               **kwargs):
-    super(_LinearModel, self).__init__(name=name, **kwargs)
-    self._feature_columns = fc_old._normalize_feature_columns(  # pylint: disable=protected-access
-        feature_columns)
-    self._weight_collections = list(weight_collections or [])
-    if ops.GraphKeys.GLOBAL_VARIABLES not in self._weight_collections:
-      self._weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
-    if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections:
-      self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
-
-    column_layers = {}
-    for column in sorted(self._feature_columns, key=lambda x: x.name):
-      with variable_scope.variable_scope(
-          None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
-        # Having the fully expressed variable scope name ends up doubly
-        # expressing the outer scope (scope with which this method was called)
-        # in the name of the variable that would get created.
-        column_name = _strip_leading_slashes(vs.name)
-      column_layer = _FCLinearWrapper(column, units, sparse_combiner,
-                                      self._weight_collections, trainable,
-                                      column_name, **kwargs)
-      column_layers[column_name] = column_layer
-    self._column_layers = self._add_layers(column_layers)
-    self._bias_layer = _BiasLayer(
-        units=units,
-        trainable=trainable,
-        weight_collections=self._weight_collections,
-        name='bias_layer',
-        **kwargs)
-    self._cols_to_vars = {}
-
-  def cols_to_vars(self):
-    """Returns a dict mapping _FeatureColumns to variables.
-
-    See `linear_model` for more information.
-    This is not populated till `call` is called i.e. layer is built.
+    Raises:
+      ValueError: If features are not a dictionary.
     """
-    return self._cols_to_vars
-
-  def call(self, features):
-    with variable_scope.variable_scope(self.name):
-      for column in self._feature_columns:
-        if not isinstance(
-            column,
-            (
-                fc_old._DenseColumn,  # pylint: disable=protected-access
-                fc_old._CategoricalColumn)):  # pylint: disable=protected-access
-          raise ValueError(
-              'Items of feature_columns must be either a '
-              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
-      weighted_sums = []
-      ordered_columns = []
-      builder = fc_old._LazyBuilder(features)  # pylint: disable=protected-access
-      for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
-        column = layer._feature_column  # pylint: disable=protected-access
-        ordered_columns.append(column)
-        weighted_sum = layer(builder)
+    if not isinstance(features, dict):
+      raise ValueError('We expected a dictionary here. Instead we got: ',
+                       features)
+    transformation_cache = FeatureTransformationCache(features)
+    weighted_sums = []
+    for column in self._feature_columns:
+      with ops.name_scope(column.name):
+        # All the weights used in the linear model are owned by the state
+        # manager associated with this Linear Model.
+        weight_var = self._state_manager.get_variable(column, 'weights')
+
+        # The embedding weights for the SharedEmbeddingColumn are owned by
+        # the shared_state_manager and so we need to pass that in while
+        # creating the weighted sum. For all other columns, the state is owned
+        # by the Linear Model's state manager.
+        if isinstance(column, SharedEmbeddingColumn):
+          state_manager = self._shared_state_manager
+        else:
+          state_manager = self._state_manager
+        weighted_sum = _create_weighted_sum(
+            column=column,
+            transformation_cache=transformation_cache,
+            state_manager=state_manager,
+            sparse_combiner=self._sparse_combiner,
+            weight_var=weight_var)
         weighted_sums.append(weighted_sum)
-        self._cols_to_vars[column] = ops.get_collection(
-            ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)
-
-      _verify_static_batch_size_equality(weighted_sums, ordered_columns)
-      predictions_no_bias = math_ops.add_n(
-          weighted_sums, name='weighted_sum_no_bias')
-      predictions = nn_ops.bias_add(
-          predictions_no_bias,
-          self._bias_layer(  # pylint: disable=not-callable
-              builder,
-              scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
-          name='weighted_sum')
-      bias = self._bias_layer.variables[0]
-      self._cols_to_vars['bias'] = _get_expanded_variable_list([bias])
-    return predictions
 
-  def _add_layers(self, layers):
-    # "Magic" required for keras.Model classes to track all the variables in
-    # a list of layers.Layer objects.
-    # TODO(ashankar): Figure out API so user code doesn't have to do this.
-    for name, layer in layers.items():
-      setattr(self, 'layer-%s' % name, layer)
-    return layers
+    _verify_static_batch_size_equality(weighted_sums, self._feature_columns)
+    predictions_no_bias = math_ops.add_n(
+        weighted_sums, name='weighted_sum_no_bias')
+    predictions = nn_ops.bias_add(
+        predictions_no_bias, self._bias_variable, name='weighted_sum')
+    return predictions
 
 
 def _transform_features(features, feature_columns, state_manager):
@@ -2053,58 +1922,32 @@ def is_feature_column_v2(feature_columns):
   return True
 
 
-def _create_weighted_sum(column,
-                         transformation_cache,
-                         state_manager,
-                         units,
-                         sparse_combiner,
-                         weight_collections,
-                         trainable,
-                         weight_var=None):
+def _create_weighted_sum(column, transformation_cache, state_manager,
+                         sparse_combiner, weight_var):
   """Creates a weighted sum for a dense/categorical column for linear_model."""
   if isinstance(column, CategoricalColumn):
     return _create_categorical_column_weighted_sum(
         column=column,
         transformation_cache=transformation_cache,
         state_manager=state_manager,
-        units=units,
         sparse_combiner=sparse_combiner,
-        weight_collections=weight_collections,
-        trainable=trainable,
         weight_var=weight_var)
   else:
     return _create_dense_column_weighted_sum(
         column=column,
         transformation_cache=transformation_cache,
         state_manager=state_manager,
-        units=units,
-        weight_collections=weight_collections,
-        trainable=trainable,
         weight_var=weight_var)
 
 
-def _create_dense_column_weighted_sum(column,
-                                      transformation_cache,
-                                      state_manager,
-                                      units,
-                                      weight_collections,
-                                      trainable,
-                                      weight_var=None):
+def _create_dense_column_weighted_sum(column, transformation_cache,
+                                      state_manager, weight_var):
   """Create a weighted sum of a dense column for linear_model."""
   tensor = column.get_dense_tensor(transformation_cache, state_manager)
   num_elements = column.variable_shape.num_elements()
   batch_size = array_ops.shape(tensor)[0]
   tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
-  if weight_var is not None:
-    weight = weight_var
-  else:
-    weight = variable_scope.get_variable(
-        name='weights',
-        shape=[num_elements, units],
-        initializer=init_ops.zeros_initializer(),
-        trainable=trainable,
-        collections=weight_collections)
-  return math_ops.matmul(tensor, weight, name='weighted_sum')
+  return math_ops.matmul(tensor, weight_var, name='weighted_sum')
 
 
 class CategoricalColumn(FeatureColumn):
@@ -2145,14 +1988,8 @@ class CategoricalColumn(FeatureColumn):
     pass
 
 
-def _create_categorical_column_weighted_sum(column,
-                                            transformation_cache,
-                                            state_manager,
-                                            units,
-                                            sparse_combiner,
-                                            weight_collections,
-                                            trainable,
-                                            weight_var=None):
+def _create_categorical_column_weighted_sum(
+    column, transformation_cache, state_manager, sparse_combiner, weight_var):
   # pylint: disable=g-doc-return-or-yield,g-doc-args
   """Create a weighted sum of a categorical column for linear_model.
 
@@ -2191,17 +2028,8 @@ def _create_categorical_column_weighted_sum(column,
     weight_tensor = sparse_ops.sparse_reshape(
         weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
 
-  if weight_var is not None:
-    weight = weight_var
-  else:
-    weight = variable_scope.get_variable(
-        name='weights',
-        shape=(column.num_buckets, units),
-        initializer=init_ops.zeros_initializer(),
-        trainable=trainable,
-        collections=weight_collections)
   return _safe_embedding_lookup_sparse(
-      weight,
+      weight_var,
       id_tensor,
       sparse_weights=weight_tensor,
       combiner=sparse_combiner,
@@ -2836,6 +2664,10 @@ class SharedEmbeddingColumn(
 
   def create_state(self, state_manager):
     """Creates the shared embedding lookup variable."""
+    if not isinstance(state_manager, SharedEmbeddingStateManager):
+      raise ValueError('Expected state_manager to be of type '
+                       'SharedEmbeddingStateManager. Obtained type: {}'.format(
+                           type(state_manager)))
     embedding_shape = (self.categorical_column.num_buckets, self.dimension)
     state_manager.create_variable(
         name=self.shared_collection_name,
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index 2970431167..a13a5010e1 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -31,9 +31,7 @@ from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column as fc_old
 from tensorflow.python.feature_column import feature_column_v2 as fc
-from tensorflow.python.feature_column.feature_column_v2 import _LinearModel
 from tensorflow.python.feature_column.feature_column_v2 import _transform_features
 from tensorflow.python.feature_column.feature_column_v2 import FeatureColumn
 from tensorflow.python.feature_column.feature_column_v2 import FeatureLayer
@@ -48,7 +46,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
@@ -360,26 +357,12 @@ class NumericColumnTest(test.TestCase):
     self.assertEqual(a.default_value, ((3., 2.),))
 
   def test_linear_model(self):
-    price = fc_old.numeric_column('price')
-    with ops.Graph().as_default():
-      features = {'price': [[1.], [5.]]}
-      predictions = fc.linear_model(features, [price])
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
-      with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
-        sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[10.], [50.]], predictions.eval())
-
-  def test_keras_linear_model(self):
-    price = fc_old.numeric_column('price')
+    price = fc.numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      predictions = get_keras_linear_model_predictions(features, [price])
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
+      model = fc.LinearModel([price])
+      predictions = model(features)
+      price_var, bias = model.variables
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
         self.assertAllClose([[0.]], price_var.eval())
@@ -564,13 +547,13 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_linear_model_one_input_value(self):
     """Tests linear_model() for input with shape=[1]."""
-    price = fc_old.numeric_column('price', shape=[1])
-    bucketized_price = fc_old.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc.numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1.], [1.], [5.], [6.]]}
-      predictions = fc.linear_model(features, [bucketized_price])
-      bias = get_linear_model_bias()
-      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      model = fc.LinearModel([bucketized_price])
+      predictions = model(features)
+      bucketized_price_var, bias = model.variables
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
         # One weight variable per bucket, all initialized to zero.
@@ -589,13 +572,13 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_linear_model_two_input_values(self):
     """Tests linear_model() for input with shape=[2]."""
-    price = fc_old.numeric_column('price', shape=[2])
-    bucketized_price = fc_old.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1., 1.], [5., 6.]]}
-      predictions = fc.linear_model(features, [bucketized_price])
-      bias = get_linear_model_bias()
-      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      model = fc.LinearModel([bucketized_price])
+      predictions = model(features)
+      bucketized_price_var, bias = model.variables
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
         # One weight per bucket per input column, all initialized to zero.
@@ -616,62 +599,6 @@ class BucketizedColumnTest(test.TestCase):
         sess.run(bias.assign([1.]))
         self.assertAllClose([[81.], [141.]], predictions.eval())
 
-  def test_keras_linear_model_one_input_value(self):
-    """Tests _LinearModel for input with shape=[1]."""
-    price = fc_old.numeric_column('price', shape=[1])
-    bucketized_price = fc_old.bucketized_column(price, boundaries=[0, 2, 4, 6])
-    with ops.Graph().as_default():
-      features = {'price': [[-1.], [1.], [5.], [6.]]}
-      predictions = get_keras_linear_model_predictions(features,
-                                                       [bucketized_price])
-      bias = get_linear_model_bias()
-      bucketized_price_var = get_linear_model_column_var(bucketized_price)
-      with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        # One weight variable per bucket, all initialized to zero.
-        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
-                            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
-        sess.run(
-            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
-        # price -1. is in the 0th bucket, whose weight is 10.
-        # price 1. is in the 1st bucket, whose weight is 20.
-        # price 5. is in the 3rd bucket, whose weight is 40.
-        # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
-        sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
-
-  def test_keras_linear_model_two_input_values(self):
-    """Tests _LinearModel for input with shape=[2]."""
-    price = fc_old.numeric_column('price', shape=[2])
-    bucketized_price = fc_old.bucketized_column(price, boundaries=[0, 2, 4, 6])
-    with ops.Graph().as_default():
-      features = {'price': [[-1., 1.], [5., 6.]]}
-      predictions = get_keras_linear_model_predictions(features,
-                                                       [bucketized_price])
-      bias = get_linear_model_bias()
-      bucketized_price_var = get_linear_model_column_var(bucketized_price)
-      with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        # One weight per bucket per input column, all initialized to zero.
-        self.assertAllClose(
-            [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
-            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
-        sess.run(
-            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
-                                         [60.], [70.], [80.], [90.], [100.]]))
-        # 1st example:
-        #   price -1. is in the 0th bucket, whose weight is 10.
-        #   price 1. is in the 6th bucket, whose weight is 70.
-        # 2nd example:
-        #   price 5. is in the 3rd bucket, whose weight is 40.
-        #   price 6. is in the 9th bucket, whose weight is 100.
-        self.assertAllClose([[80.], [140.]], predictions.eval())
-        sess.run(bias.assign([1.]))
-        self.assertAllClose([[81.], [141.]], predictions.eval())
-
 
 class HashedCategoricalColumnTest(test.TestCase):
 
@@ -852,39 +779,18 @@ class HashedCategoricalColumnTest(test.TestCase):
         transformation_cache.get(hashed_sparse, None), id_weight_pair.id_tensor)
 
   def test_linear_model(self):
-    wire_column = fc_old.categorical_column_with_hash_bucket('wire', 4)
-    self.assertEqual(4, wire_column._num_buckets)
-    with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          wire_column.name: sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=('marlo', 'skywalker', 'omar'),
-              dense_shape=(2, 2))
-      }, (wire_column,))
-      bias = get_linear_model_bias()
-      wire_var = get_linear_model_column_var(wire_column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
-        # 'marlo' -> 3: wire_var[3] = 4
-        # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
-        self.assertAllClose(((4.,), (6.,)), predictions.eval())
-
-  def test_keras_linear_model(self):
-    wire_column = fc_old.categorical_column_with_hash_bucket('wire', 4)
-    self.assertEqual(4, wire_column._num_buckets)
+    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    self.assertEqual(4, wire_column.num_buckets)
     with ops.Graph().as_default():
-      predictions = get_keras_linear_model_predictions({
+      model = fc.LinearModel((wire_column,))
+      predictions = model({
           wire_column.name:
               sparse_tensor.SparseTensorValue(
                   indices=((0, 0), (1, 0), (1, 1)),
                   values=('marlo', 'skywalker', 'omar'),
                   dense_shape=(2, 2))
-      }, (wire_column,))
-      bias = get_linear_model_bias()
-      wire_var = get_linear_model_column_var(wire_column)
+      })
+      wire_var, bias = model.variables
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
         self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
@@ -1103,93 +1009,12 @@ class CrossedColumnTest(test.TestCase):
 
     Uses data from test_get_sparse_tesnsors_simple.
     """
-    a = fc_old.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc_old.bucketized_column(a, boundaries=(0, 1))
-    crossed = fc_old.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
-    with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          'a': constant_op.constant(((-1., .5), (.5, 1.))),
-          'c': sparse_tensor.SparseTensor(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=['cA', 'cB', 'cC'],
-              dense_shape=(2, 2)),
-      }, (crossed,))
-      bias = get_linear_model_bias()
-      crossed_var = get_linear_model_column_var(crossed)
-      with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(
-            ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
-        # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
-        sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
-
-  def test_linear_model_with_weights(self):
-
-    class _TestColumnWithWeights(fc_old._CategoricalColumn):
-      """Produces sparse IDs and sparse weights."""
-
-      @property
-      def name(self):
-        return 'test_column'
-
-      @property
-      def _parse_example_spec(self):
-        return {
-            self.name: parsing_ops.VarLenFeature(dtypes.int32),
-            '{}_weights'.format(self.name): parsing_ops.VarLenFeature(
-                dtypes.float32),
-            }
-
-      @property
-      def _num_buckets(self):
-        return 5
-
-      def _transform_feature(self, inputs):
-        return (inputs.get(self.name),
-                inputs.get('{}_weights'.format(self.name)))
-
-      def _get_sparse_tensors(self, inputs, weight_collections=None,
-                              trainable=None):
-        """Populates both id_tensor and weight_tensor."""
-        ids_and_weights = inputs.get(self)
-        return fc_old._CategoricalColumn.IdWeightPair(
-            id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
-
-    t = _TestColumnWithWeights()
-    crossed = fc_old.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
-    with ops.Graph().as_default():
-      with self.assertRaisesRegexp(
-          ValueError,
-          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
-        fc.linear_model({
-            t.name: sparse_tensor.SparseTensor(
-                indices=((0, 0), (1, 0), (1, 1)),
-                values=[0, 1, 2],
-                dense_shape=(2, 2)),
-            '{}_weights'.format(t.name): sparse_tensor.SparseTensor(
-                indices=((0, 0), (1, 0), (1, 1)),
-                values=[1., 10., 2.],
-                dense_shape=(2, 2)),
-            'c': sparse_tensor.SparseTensor(
-                indices=((0, 0), (1, 0), (1, 1)),
-                values=['cA', 'cB', 'cC'],
-                dense_shape=(2, 2)),
-        }, (crossed,))
-
-  def test_keras_linear_model(self):
-    """Tests _LinearModel.
-
-    Uses data from test_get_sparse_tesnsors_simple.
-    """
-    a = fc_old.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc_old.bucketized_column(a, boundaries=(0, 1))
-    crossed = fc_old.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
-      predictions = get_keras_linear_model_predictions({
+      model = fc.LinearModel((crossed,))
+      predictions = model({
           'a':
               constant_op.constant(((-1., .5), (.5, 1.))),
           'c':
@@ -1197,847 +1022,126 @@ class CrossedColumnTest(test.TestCase):
                   indices=((0, 0), (1, 0), (1, 1)),
                   values=['cA', 'cB', 'cC'],
                   dense_shape=(2, 2)),
-      }, (crossed,))
-      bias = get_linear_model_bias()
-      crossed_var = get_linear_model_column_var(crossed)
+      })
+      crossed_var, bias = model.variables
       with _initialized_session() as sess:
         self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
-                            crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
-        # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
-        sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
-
-  def test_keras_linear_model_with_weights(self):
-
-    class _TestColumnWithWeights(fc_old._CategoricalColumn):
-      """Produces sparse IDs and sparse weights."""
-
-      @property
-      def name(self):
-        return 'test_column'
-
-      @property
-      def _parse_example_spec(self):
-        return {
-            self.name:
-                parsing_ops.VarLenFeature(dtypes.int32),
-            '{}_weights'.format(self.name):
-                parsing_ops.VarLenFeature(dtypes.float32),
-        }
-
-      @property
-      def _num_buckets(self):
-        return 5
-
-      def _transform_feature(self, inputs):
-        return (inputs.get(self.name),
-                inputs.get('{}_weights'.format(self.name)))
-
-      def _get_sparse_tensors(self,
-                              inputs,
-                              weight_collections=None,
-                              trainable=None):
-        """Populates both id_tensor and weight_tensor."""
-        ids_and_weights = inputs.get(self)
-        return fc_old._CategoricalColumn.IdWeightPair(
-            id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
-
-    t = _TestColumnWithWeights()
-    crossed = fc_old.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
-    with ops.Graph().as_default():
-      with self.assertRaisesRegexp(
-          ValueError,
-          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
-        get_keras_linear_model_predictions({
-            t.name:
-                sparse_tensor.SparseTensor(
-                    indices=((0, 0), (1, 0), (1, 1)),
-                    values=[0, 1, 2],
-                    dense_shape=(2, 2)),
-            '{}_weights'.format(t.name):
-                sparse_tensor.SparseTensor(
-                    indices=((0, 0), (1, 0), (1, 1)),
-                    values=[1., 10., 2.],
-                    dense_shape=(2, 2)),
-            'c':
-                sparse_tensor.SparseTensor(
-                    indices=((0, 0), (1, 0), (1, 1)),
-                    values=['cA', 'cB', 'cC'],
-                    dense_shape=(2, 2)),
-        }, (crossed,))
-
-
-def get_linear_model_bias(name='linear_model'):
-  with variable_scope.variable_scope(name, reuse=True):
-    return variable_scope.get_variable('bias_weights')
-
-
-def get_linear_model_column_var(column, name='linear_model'):
-  return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
-                            name + '/' + column.name)[0]
-
-
-def get_keras_linear_model_predictions(features,
-                                       feature_columns,
-                                       units=1,
-                                       sparse_combiner='sum',
-                                       weight_collections=None,
-                                       trainable=True,
-                                       cols_to_vars=None):
-  keras_linear_model = _LinearModel(
-      feature_columns,
-      units,
-      sparse_combiner,
-      weight_collections,
-      trainable,
-      name='linear_model')
-  retval = keras_linear_model(features)  # pylint: disable=not-callable
-  if cols_to_vars is not None:
-    cols_to_vars.update(keras_linear_model.cols_to_vars())
-  return retval
-
-
-class LinearModelTest(test.TestCase):
-
-  def test_raises_if_empty_feature_columns(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 'feature_columns must not be empty'):
-      fc.linear_model(features={}, feature_columns=[])
-
-  def test_should_be_feature_column(self):
-    with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'):
-      fc.linear_model(features={'a': [[0]]}, feature_columns='NotSupported')
-
-  def test_should_be_dense_or_categorical_column(self):
-
-    class NotSupportedColumn(fc_old._FeatureColumn):
-
-      @property
-      def name(self):
-        return 'NotSupportedColumn'
-
-      def _transform_feature(self, cache):
-        pass
-
-      @property
-      def _parse_example_spec(self):
-        pass
-
-    with self.assertRaisesRegexp(
-        ValueError, 'must be either a _DenseColumn or _CategoricalColumn'):
-      fc.linear_model(
-          features={'a': [[0]]}, feature_columns=[NotSupportedColumn()])
-
-  def test_does_not_support_dict_columns(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'Expected feature_columns to be iterable, found dict.'):
-      fc.linear_model(
-          features={'a': [[0]]},
-          feature_columns={'a': fc_old.numeric_column('a')})
-
-  def test_raises_if_duplicate_name(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'Duplicate feature column name found for columns'):
-      fc.linear_model(
-          features={'a': [[0]]},
-          feature_columns=[
-              fc_old.numeric_column('a'),
-              fc_old.numeric_column('a')
-          ])
-
-  def test_dense_bias(self):
-    price = fc_old.numeric_column('price')
-    with ops.Graph().as_default():
-      features = {'price': [[1.], [5.]]}
-      predictions = fc.linear_model(features, [price])
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
-      with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        sess.run(price_var.assign([[10.]]))
-        sess.run(bias.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions.eval())
-
-  def test_sparse_bias(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    with ops.Graph().as_default():
-      wire_tensor = sparse_tensor.SparseTensor(
-          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
-          indices=[[0, 0], [1, 0], [1, 1]],
-          dense_shape=[2, 2])
-      features = {'wire_cast': wire_tensor}
-      predictions = fc.linear_model(features, [wire_cast])
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
-      with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
-        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
-        sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
-
-  def test_dense_and_sparse_bias(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    price = fc_old.numeric_column('price')
-    with ops.Graph().as_default():
-      wire_tensor = sparse_tensor.SparseTensor(
-          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
-          indices=[[0, 0], [1, 0], [1, 1]],
-          dense_shape=[2, 2])
-      features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
-      predictions = fc.linear_model(features, [wire_cast, price])
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
-      price_var = get_linear_model_column_var(price)
-      with _initialized_session() as sess:
-        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
-        sess.run(bias.assign([5.]))
-        sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[1015.], [10065.]], predictions.eval())
-
-  def test_dense_and_sparse_column(self):
-    """When the column is both dense and sparse, uses sparse tensors."""
-
-    class _DenseAndSparseColumn(fc_old._DenseColumn, fc_old._CategoricalColumn):
-
-      @property
-      def name(self):
-        return 'dense_and_sparse_column'
-
-      @property
-      def _parse_example_spec(self):
-        return {self.name: parsing_ops.VarLenFeature(self.dtype)}
-
-      def _transform_feature(self, inputs):
-        return inputs.get(self.name)
-
-      @property
-      def _variable_shape(self):
-        raise ValueError('Should not use this method.')
-
-      def _get_dense_tensor(self, inputs, weight_collections=None,
-                            trainable=None):
-        raise ValueError('Should not use this method.')
-
-      @property
-      def _num_buckets(self):
-        return 4
-
-      def _get_sparse_tensors(self, inputs, weight_collections=None,
-                              trainable=None):
-        sp_tensor = sparse_tensor.SparseTensor(
-            indices=[[0, 0], [1, 0], [1, 1]],
-            values=[2, 0, 3],
-            dense_shape=[2, 2])
-        return fc_old._CategoricalColumn.IdWeightPair(sp_tensor, None)
-
-    dense_and_sparse_column = _DenseAndSparseColumn()
-    with ops.Graph().as_default():
-      sp_tensor = sparse_tensor.SparseTensor(
-          values=['omar', 'stringer', 'marlo'],
-          indices=[[0, 0], [1, 0], [1, 1]],
-          dense_shape=[2, 2])
-      features = {dense_and_sparse_column.name: sp_tensor}
-      predictions = fc.linear_model(features, [dense_and_sparse_column])
-      bias = get_linear_model_bias()
-      dense_and_sparse_column_var = get_linear_model_column_var(
-          dense_and_sparse_column)
-      with _initialized_session() as sess:
-        sess.run(dense_and_sparse_column_var.assign(
-            [[10.], [100.], [1000.], [10000.]]))
-        sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
-
-  def test_dense_multi_output(self):
-    price = fc_old.numeric_column('price')
-    with ops.Graph().as_default():
-      features = {'price': [[1.], [5.]]}
-      predictions = fc.linear_model(features, [price], units=3)
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
-      with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
-        sess.run(price_var.assign([[10., 100., 1000.]]))
-        sess.run(bias.assign([5., 6., 7.]))
-        self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
-                            predictions.eval())
-
-  def test_sparse_multi_output(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    with ops.Graph().as_default():
-      wire_tensor = sparse_tensor.SparseTensor(
-          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
-          indices=[[0, 0], [1, 0], [1, 1]],
-          dense_shape=[2, 2])
-      features = {'wire_cast': wire_tensor}
-      predictions = fc.linear_model(features, [wire_cast], units=3)
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
-      with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
-        sess.run(
-            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
-                1000., 1100., 1200.
-            ], [10000., 11000., 12000.]]))
-        sess.run(bias.assign([5., 6., 7.]))
-        self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
-                            predictions.eval())
-
-  def test_dense_multi_dimension(self):
-    price = fc_old.numeric_column('price', shape=2)
-    with ops.Graph().as_default():
-      features = {'price': [[1., 2.], [5., 6.]]}
-      predictions = fc.linear_model(features, [price])
-      price_var = get_linear_model_column_var(price)
-      with _initialized_session() as sess:
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
-
-  def test_sparse_multi_rank(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    with ops.Graph().as_default():
-      wire_tensor = array_ops.sparse_placeholder(dtypes.string)
-      wire_value = sparse_tensor.SparseTensorValue(
-          values=['omar', 'stringer', 'marlo', 'omar'],  # hashed = [2, 0, 3, 2]
-          indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]],
-          dense_shape=[2, 2, 2])
-      features = {'wire_cast': wire_tensor}
-      predictions = fc.linear_model(features, [wire_cast])
-      wire_cast_var = get_linear_model_column_var(wire_cast)
-      with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
-        self.assertAllClose(
-            np.zeros((2, 1)),
-            predictions.eval(feed_dict={wire_tensor: wire_value}))
-        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         self.assertAllClose(
-            [[1010.], [11000.]],
-            predictions.eval(feed_dict={wire_tensor: wire_value}))
-
-  def test_sparse_combiner(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    with ops.Graph().as_default():
-      wire_tensor = sparse_tensor.SparseTensor(
-          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
-          indices=[[0, 0], [1, 0], [1, 1]],
-          dense_shape=[2, 2])
-      features = {'wire_cast': wire_tensor}
-      predictions = fc.linear_model(
-          features, [wire_cast], sparse_combiner='mean')
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
-      with _initialized_session() as sess:
-        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
-        sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [5010.]], predictions.eval())
-
-  def test_sparse_combiner_with_negative_weights(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    wire_cast_weights = fc_old.weighted_categorical_column(wire_cast, 'weights')
-
-    with ops.Graph().as_default():
-      wire_tensor = sparse_tensor.SparseTensor(
-          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
-          indices=[[0, 0], [1, 0], [1, 1]],
-          dense_shape=[2, 2])
-      features = {
-          'wire_cast': wire_tensor,
-          'weights': constant_op.constant([[1., 1., -1.0]])
-      }
-      predictions = fc.linear_model(
-          features, [wire_cast_weights], sparse_combiner='sum')
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
-      with _initialized_session() as sess:
-        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
-        sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
-
-  def test_dense_multi_dimension_multi_output(self):
-    price = fc_old.numeric_column('price', shape=2)
-    with ops.Graph().as_default():
-      features = {'price': [[1., 2.], [5., 6.]]}
-      predictions = fc.linear_model(features, [price], units=3)
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
-      with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
-        sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
-        sess.run(bias.assign([2., 3., 4.]))
-        self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
-                            predictions.eval())
-
-  def test_raises_if_shape_mismatch(self):
-    price = fc_old.numeric_column('price', shape=2)
-    with ops.Graph().as_default():
-      features = {'price': [[1.], [5.]]}
-      with self.assertRaisesRegexp(
-          Exception,
-          r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
-        fc.linear_model(features, [price])
-
-  def test_dense_reshaping(self):
-    price = fc_old.numeric_column('price', shape=[1, 2])
-    with ops.Graph().as_default():
-      features = {'price': [[[1., 2.]], [[5., 6.]]]}
-      predictions = fc.linear_model(features, [price])
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
-      with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
-        sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
-
-  def test_dense_multi_column(self):
-    price1 = fc_old.numeric_column('price1', shape=2)
-    price2 = fc_old.numeric_column('price2')
-    with ops.Graph().as_default():
-      features = {
-          'price1': [[1., 2.], [5., 6.]],
-          'price2': [[3.], [4.]]
-      }
-      predictions = fc.linear_model(features, [price1, price2])
-      bias = get_linear_model_bias()
-      price1_var = get_linear_model_column_var(price1)
-      price2_var = get_linear_model_column_var(price2)
-      with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price1_var.eval())
-        self.assertAllClose([[0.]], price2_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
-        sess.run(price1_var.assign([[10.], [100.]]))
-        sess.run(price2_var.assign([[1000.]]))
-        sess.run(bias.assign([7.]))
-        self.assertAllClose([[3217.], [4657.]], predictions.eval())
-
-  def test_fills_cols_to_vars(self):
-    price1 = fc_old.numeric_column('price1', shape=2)
-    price2 = fc_old.numeric_column('price2')
-    with ops.Graph().as_default():
-      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
-      cols_to_vars = {}
-      fc.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars)
-      bias = get_linear_model_bias()
-      price1_var = get_linear_model_column_var(price1)
-      price2_var = get_linear_model_column_var(price2)
-      self.assertAllEqual(cols_to_vars['bias'], [bias])
-      self.assertAllEqual(cols_to_vars[price1], [price1_var])
-      self.assertAllEqual(cols_to_vars[price2], [price2_var])
-
-  def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc_old.numeric_column('price1', shape=2)
-    price2 = fc_old.numeric_column('price2', shape=3)
-    with ops.Graph().as_default():
-      features = {
-          'price1': [[1., 2.], [6., 7.]],
-          'price2': [[3., 4., 5.], [8., 9., 10.]]
-      }
-      cols_to_vars = {}
-      with variable_scope.variable_scope(
-          'linear',
-          partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)):
-        fc.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars)
-      with _initialized_session():
-        self.assertEqual([0.], cols_to_vars['bias'][0].eval())
-        # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables.
-        self.assertAllEqual([[0.]], cols_to_vars[price1][0].eval())
-        self.assertAllEqual([[0.]], cols_to_vars[price1][1].eval())
-        # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and
-        # a [1, 1] Variable.
-        self.assertAllEqual([[0.], [0.]], cols_to_vars[price2][0].eval())
-        self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval())
-
-  def test_dense_collection(self):
-    price = fc_old.numeric_column('price')
-    with ops.Graph().as_default() as g:
-      features = {'price': [[1.], [5.]]}
-      fc.linear_model(features, [price], weight_collections=['my-vars'])
-      my_vars = g.get_collection('my-vars')
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
-      self.assertIn(bias, my_vars)
-      self.assertIn(price_var, my_vars)
-
-  def test_sparse_collection(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    with ops.Graph().as_default() as g:
-      wire_tensor = sparse_tensor.SparseTensor(
-          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
-      features = {'wire_cast': wire_tensor}
-      fc.linear_model(
-          features, [wire_cast], weight_collections=['my-vars'])
-      my_vars = g.get_collection('my-vars')
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
-      self.assertIn(bias, my_vars)
-      self.assertIn(wire_cast_var, my_vars)
-
-  def test_dense_trainable_default(self):
-    price = fc_old.numeric_column('price')
-    with ops.Graph().as_default() as g:
-      features = {'price': [[1.], [5.]]}
-      fc.linear_model(features, [price])
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
-      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      self.assertIn(bias, trainable_vars)
-      self.assertIn(price_var, trainable_vars)
-
-  def test_sparse_trainable_default(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    with ops.Graph().as_default() as g:
-      wire_tensor = sparse_tensor.SparseTensor(
-          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
-      features = {'wire_cast': wire_tensor}
-      fc.linear_model(features, [wire_cast])
-      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
-      self.assertIn(bias, trainable_vars)
-      self.assertIn(wire_cast_var, trainable_vars)
-
-  def test_dense_trainable_false(self):
-    price = fc_old.numeric_column('price')
-    with ops.Graph().as_default() as g:
-      features = {'price': [[1.], [5.]]}
-      fc.linear_model(features, [price], trainable=False)
-      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      self.assertEqual([], trainable_vars)
-
-  def test_sparse_trainable_false(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    with ops.Graph().as_default() as g:
-      wire_tensor = sparse_tensor.SparseTensor(
-          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
-      features = {'wire_cast': wire_tensor}
-      fc.linear_model(features, [wire_cast], trainable=False)
-      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      self.assertEqual([], trainable_vars)
-
-  def test_column_order(self):
-    price_a = fc_old.numeric_column('price_a')
-    price_b = fc_old.numeric_column('price_b')
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    with ops.Graph().as_default() as g:
-      features = {
-          'price_a': [[1.]],
-          'price_b': [[3.]],
-          'wire_cast':
-              sparse_tensor.SparseTensor(
-                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
-      }
-      fc.linear_model(
-          features, [price_a, wire_cast, price_b],
-          weight_collections=['my-vars'])
-      my_vars = g.get_collection('my-vars')
-      self.assertIn('price_a', my_vars[0].name)
-      self.assertIn('price_b', my_vars[1].name)
-      self.assertIn('wire_cast', my_vars[2].name)
-
-    with ops.Graph().as_default() as g:
-      features = {
-          'price_a': [[1.]],
-          'price_b': [[3.]],
-          'wire_cast':
-              sparse_tensor.SparseTensor(
-                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
-      }
-      fc.linear_model(
-          features, [wire_cast, price_b, price_a],
-          weight_collections=['my-vars'])
-      my_vars = g.get_collection('my-vars')
-      self.assertIn('price_a', my_vars[0].name)
-      self.assertIn('price_b', my_vars[1].name)
-      self.assertIn('wire_cast', my_vars[2].name)
-
-  def test_static_batch_size_mismatch(self):
-    price1 = fc_old.numeric_column('price1')
-    price2 = fc_old.numeric_column('price2')
-    with ops.Graph().as_default():
-      features = {
-          'price1': [[1.], [5.], [7.]],  # batchsize = 3
-          'price2': [[3.], [4.]]  # batchsize = 2
-      }
-    with self.assertRaisesRegexp(
-        ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-      fc.linear_model(features, [price1, price2])
-
-  def test_subset_of_static_batch_size_mismatch(self):
-    price1 = fc_old.numeric_column('price1')
-    price2 = fc_old.numeric_column('price2')
-    price3 = fc_old.numeric_column('price3')
-    with ops.Graph().as_default():
-      features = {
-          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
-          'price2': [[3.], [4.]],  # batchsize = 2
-          'price3': [[3.], [4.], [5.]]  # batchsize = 3
-      }
-      with self.assertRaisesRegexp(
-          ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-        fc.linear_model(features, [price1, price2, price3])
-
-  def test_runtime_batch_size_mismatch(self):
-    price1 = fc_old.numeric_column('price1')
-    price2 = fc_old.numeric_column('price2')
-    with ops.Graph().as_default():
-      features = {
-          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
-          'price2': [[3.], [4.]]  # batchsize = 2
-      }
-      predictions = fc.linear_model(features, [price1, price2])
-      with _initialized_session() as sess:
-        with self.assertRaisesRegexp(errors.OpError,
-                                     'must have the same size and shape'):
-          sess.run(
-              predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
-
-  def test_runtime_batch_size_matches(self):
-    price1 = fc_old.numeric_column('price1')
-    price2 = fc_old.numeric_column('price2')
-    with ops.Graph().as_default():
-      features = {
-          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
-          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
-      }
-      predictions = fc.linear_model(features, [price1, price2])
-      with _initialized_session() as sess:
-        sess.run(
-            predictions,
-            feed_dict={
-                features['price1']: [[1.], [5.]],
-                features['price2']: [[1.], [5.]],
-            })
-
-  def test_with_numpy_input_fn(self):
-    price = fc_old.numeric_column('price')
-    price_buckets = fc_old.bucketized_column(
-        price, boundaries=[
-            0.,
-            10.,
-            100.,
-        ])
-    body_style = fc_old.categorical_column_with_vocabulary_list(
-        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'price': np.array([-1., 2., 13., 104.]),
-            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
-        },
-        batch_size=2,
-        shuffle=False)
-    features = input_fn()
-    net = fc.linear_model(features, [price_buckets, body_style])
-    # self.assertEqual(1 + 3 + 5, net.shape[1])
-    with _initialized_session() as sess:
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
-
-      bias = get_linear_model_bias()
-      price_buckets_var = get_linear_model_column_var(price_buckets)
-      body_style_var = get_linear_model_column_var(body_style)
-
-      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
-      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
-      sess.run(bias.assign([5.]))
-
-      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def test_with_1d_sparse_tensor(self):
-    price = fc_old.numeric_column('price')
-    price_buckets = fc_old.bucketized_column(
-        price, boundaries=[
-            0.,
-            10.,
-            100.,
-        ])
-    body_style = fc_old.categorical_column_with_vocabulary_list(
-        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-
-    # Provides 1-dim tensor and dense tensor.
-    features = {
-        'price': constant_op.constant([-1., 12.,]),
-        'body-style': sparse_tensor.SparseTensor(
-            indices=((0,), (1,)),
-            values=('sedan', 'hardtop'),
-            dense_shape=(2,)),
-    }
-    self.assertEqual(1, features['price'].shape.ndims)
-    self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
-
-    net = fc.linear_model(features, [price_buckets, body_style])
-    with _initialized_session() as sess:
-      bias = get_linear_model_bias()
-      price_buckets_var = get_linear_model_column_var(price_buckets)
-      body_style_var = get_linear_model_column_var(body_style)
-
-      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
-      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
-      sess.run(bias.assign([5.]))
-
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
-
-  def test_with_1d_unknown_shape_sparse_tensor(self):
-    price = fc_old.numeric_column('price')
-    price_buckets = fc_old.bucketized_column(
-        price, boundaries=[
-            0.,
-            10.,
-            100.,
-        ])
-    body_style = fc_old.categorical_column_with_vocabulary_list(
-        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    country = fc_old.categorical_column_with_vocabulary_list(
-        'country', vocabulary_list=['US', 'JP', 'CA'])
+            ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
+        # Expected ids after cross = (1, 0, 1, 3, 4, 2)
+        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        sess.run(bias.assign((.1,)))
+        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
 
-    # Provides 1-dim tensor and dense tensor.
-    features = {
-        'price': array_ops.placeholder(dtypes.float32),
-        'body-style': array_ops.sparse_placeholder(dtypes.string),
-        'country': array_ops.placeholder(dtypes.string),
-    }
-    self.assertIsNone(features['price'].shape.ndims)
-    self.assertIsNone(features['body-style'].get_shape().ndims)
+  def test_linear_model_with_weights(self):
 
-    price_data = np.array([-1., 12.])
-    body_style_data = sparse_tensor.SparseTensorValue(
-        indices=((0,), (1,)),
-        values=('sedan', 'hardtop'),
-        dense_shape=(2,))
-    country_data = np.array(['US', 'CA'])
+    class _TestColumnWithWeights(fc.CategoricalColumn):
+      """Produces sparse IDs and sparse weights."""
 
-    net = fc.linear_model(features, [price_buckets, body_style, country])
-    bias = get_linear_model_bias()
-    price_buckets_var = get_linear_model_column_var(price_buckets)
-    body_style_var = get_linear_model_column_var(body_style)
-    with _initialized_session() as sess:
-      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
-      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
-      sess.run(bias.assign([5.]))
+      @property
+      def name(self):
+        return 'test_column'
 
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
-                          sess.run(
-                              net,
-                              feed_dict={
-                                  features['price']: price_data,
-                                  features['body-style']: body_style_data,
-                                  features['country']: country_data
-                              }))
+      @property
+      def parse_example_spec(self):
+        return {
+            self.name: parsing_ops.VarLenFeature(dtypes.int32),
+            '{}_weights'.format(self.name): parsing_ops.VarLenFeature(
+                dtypes.float32),
+            }
 
-  def test_with_rank_0_feature(self):
-    price = fc_old.numeric_column('price')
-    features = {
-        'price': constant_op.constant(0),
-    }
-    self.assertEqual(0, features['price'].shape.ndims)
+      @property
+      def num_buckets(self):
+        return 5
 
-    # Static rank 0 should fail
-    with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
-      fc.linear_model(features, [price])
+      def transform_feature(self, transformation_cache, state_manager):
+        return (transformation_cache.get(self.name, state_manager),
+                transformation_cache.get('{}_weights'.format(self.name),
+                                         state_manager))
 
-    # Dynamic rank 0 should fail
-    features = {
-        'price': array_ops.placeholder(dtypes.float32),
-    }
-    net = fc.linear_model(features, [price])
-    self.assertEqual(1, net.shape[1])
-    with _initialized_session() as sess:
-      with self.assertRaisesOpError('Feature .* cannot have rank 0'):
-        sess.run(net, feed_dict={features['price']: np.array(1)})
+      def get_sparse_tensors(self, transformation_cache, state_manager):
+        """Populates both id_tensor and weight_tensor."""
+        ids_and_weights = transformation_cache.get(self, state_manager)
+        return fc.CategoricalColumn.IdWeightPair(
+            id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
 
-  def test_multiple_linear_models(self):
-    price = fc_old.numeric_column('price')
+    t = _TestColumnWithWeights()
+    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
-      features1 = {'price': [[1.], [5.]]}
-      features2 = {'price': [[2.], [10.]]}
-      predictions1 = fc.linear_model(features1, [price])
-      predictions2 = fc.linear_model(features2, [price])
-      bias1 = get_linear_model_bias(name='linear_model')
-      bias2 = get_linear_model_bias(name='linear_model_1')
-      price_var1 = get_linear_model_column_var(price, name='linear_model')
-      price_var2 = get_linear_model_column_var(price, name='linear_model_1')
-      with _initialized_session() as sess:
-        self.assertAllClose([0.], bias1.eval())
-        sess.run(price_var1.assign([[10.]]))
-        sess.run(bias1.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions1.eval())
-        self.assertAllClose([0.], bias2.eval())
-        sess.run(price_var2.assign([[10.]]))
-        sess.run(bias2.assign([5.]))
-        self.assertAllClose([[25.], [105.]], predictions2.eval())
+      with self.assertRaisesRegexp(
+          ValueError,
+          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
+        model = fc.LinearModel((crossed,))
+        model({
+            t.name:
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=[0, 1, 2],
+                    dense_shape=(2, 2)),
+            '{}_weights'.format(t.name):
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=[1., 10., 2.],
+                    dense_shape=(2, 2)),
+            'c':
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=['cA', 'cB', 'cC'],
+                    dense_shape=(2, 2)),
+        })
 
 
-class _LinearModelTest(test.TestCase):
+class LinearModelTest(test.TestCase):
 
   def test_raises_if_empty_feature_columns(self):
     with self.assertRaisesRegexp(ValueError,
                                  'feature_columns must not be empty'):
-      get_keras_linear_model_predictions(features={}, feature_columns=[])
+      fc.LinearModel(feature_columns=[])
 
   def test_should_be_feature_column(self):
-    with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'):
-      get_keras_linear_model_predictions(
-          features={'a': [[0]]}, feature_columns='NotSupported')
+    with self.assertRaisesRegexp(ValueError, 'must be a FeatureColumn'):
+      fc.LinearModel(feature_columns='NotSupported')
 
   def test_should_be_dense_or_categorical_column(self):
 
-    class NotSupportedColumn(fc_old._FeatureColumn):
+    class NotSupportedColumn(fc.FeatureColumn):
 
       @property
       def name(self):
         return 'NotSupportedColumn'
 
-      def _transform_feature(self, cache):
+      def transform_feature(self, transformation_cache, state_manager):
         pass
 
       @property
-      def _parse_example_spec(self):
+      def parse_example_spec(self):
         pass
 
     with self.assertRaisesRegexp(
-        ValueError, 'must be either a _DenseColumn or _CategoricalColumn'):
-      get_keras_linear_model_predictions(
-          features={'a': [[0]]}, feature_columns=[NotSupportedColumn()])
+        ValueError, 'must be either a DenseColumn or CategoricalColumn'):
+      fc.LinearModel(feature_columns=[NotSupportedColumn()])
 
   def test_does_not_support_dict_columns(self):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
-      fc.linear_model(
-          features={'a': [[0]]},
-          feature_columns={'a': fc_old.numeric_column('a')})
+      fc.LinearModel(feature_columns={'a': fc.numeric_column('a')})
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
-      get_keras_linear_model_predictions(
-          features={'a': [[0]]},
-          feature_columns=[
-              fc_old.numeric_column('a'),
-              fc_old.numeric_column('a')
-          ])
+      fc.LinearModel(
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
 
   def test_dense_bias(self):
-    price = fc_old.numeric_column('price')
+    price = fc.numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      predictions = get_keras_linear_model_predictions(features, [price])
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
+      model = fc.LinearModel([price])
+      predictions = model(features)
+      price_var, bias = model.variables
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
         sess.run(price_var.assign([[10.]]))
@@ -2045,16 +1149,16 @@ class _LinearModelTest(test.TestCase):
         self.assertAllClose([[15.], [55.]], predictions.eval())
 
   def test_sparse_bias(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor}
-      predictions = get_keras_linear_model_predictions(features, [wire_cast])
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
+      model = fc.LinearModel([wire_cast])
+      predictions = model(features)
+      wire_cast_var, bias = model.variables
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
         self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
@@ -2063,19 +1167,17 @@ class _LinearModelTest(test.TestCase):
         self.assertAllClose([[1005.], [10015.]], predictions.eval())
 
   def test_dense_and_sparse_bias(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    price = fc_old.numeric_column('price')
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc.numeric_column('price')
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
-      predictions = get_keras_linear_model_predictions(features,
-                                                       [wire_cast, price])
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
-      price_var = get_linear_model_column_var(price)
+      model = fc.LinearModel([wire_cast, price])
+      predictions = model(features)
+      price_var, wire_cast_var, bias = model.variables
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
@@ -2085,42 +1187,36 @@ class _LinearModelTest(test.TestCase):
   def test_dense_and_sparse_column(self):
     """When the column is both dense and sparse, uses sparse tensors."""
 
-    class _DenseAndSparseColumn(fc_old._DenseColumn, fc_old._CategoricalColumn):
+    class _DenseAndSparseColumn(fc.DenseColumn, fc.CategoricalColumn):
 
       @property
       def name(self):
         return 'dense_and_sparse_column'
 
       @property
-      def _parse_example_spec(self):
+      def parse_example_spec(self):
         return {self.name: parsing_ops.VarLenFeature(self.dtype)}
 
-      def _transform_feature(self, inputs):
-        return inputs.get(self.name)
+      def transform_feature(self, transformation_cache, state_manager):
+        return transformation_cache.get(self.name, state_manager)
 
       @property
-      def _variable_shape(self):
+      def variable_shape(self):
         raise ValueError('Should not use this method.')
 
-      def _get_dense_tensor(self,
-                            inputs,
-                            weight_collections=None,
-                            trainable=None):
+      def get_dense_tensor(self, transformation_cache, state_manager):
         raise ValueError('Should not use this method.')
 
       @property
-      def _num_buckets(self):
+      def num_buckets(self):
         return 4
 
-      def _get_sparse_tensors(self,
-                              inputs,
-                              weight_collections=None,
-                              trainable=None):
+      def get_sparse_tensors(self, transformation_cache, state_manager):
         sp_tensor = sparse_tensor.SparseTensor(
             indices=[[0, 0], [1, 0], [1, 1]],
             values=[2, 0, 3],
             dense_shape=[2, 2])
-        return fc_old._CategoricalColumn.IdWeightPair(sp_tensor, None)
+        return fc.CategoricalColumn.IdWeightPair(sp_tensor, None)
 
     dense_and_sparse_column = _DenseAndSparseColumn()
     with ops.Graph().as_default():
@@ -2129,26 +1225,22 @@ class _LinearModelTest(test.TestCase):
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {dense_and_sparse_column.name: sp_tensor}
-      predictions = get_keras_linear_model_predictions(
-          features, [dense_and_sparse_column])
-      bias = get_linear_model_bias()
-      dense_and_sparse_column_var = get_linear_model_column_var(
-          dense_and_sparse_column)
+      model = fc.LinearModel([dense_and_sparse_column])
+      predictions = model(features)
+      dense_and_sparse_column_var, bias = model.variables
       with _initialized_session() as sess:
-        sess.run(
-            dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
-                                                [10000.]]))
+        sess.run(dense_and_sparse_column_var.assign(
+            [[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         self.assertAllClose([[1005.], [10015.]], predictions.eval())
 
   def test_dense_multi_output(self):
-    price = fc_old.numeric_column('price')
+    price = fc.numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      predictions = get_keras_linear_model_predictions(
-          features, [price], units=3)
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
+      model = fc.LinearModel([price], units=3)
+      predictions = model(features)
+      price_var, bias = model.variables
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
         self.assertAllClose(np.zeros((1, 3)), price_var.eval())
@@ -2158,41 +1250,41 @@ class _LinearModelTest(test.TestCase):
                             predictions.eval())
 
   def test_sparse_multi_output(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor}
-      predictions = get_keras_linear_model_predictions(
-          features, [wire_cast], units=3)
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
+      model = fc.LinearModel([wire_cast], units=3)
+      predictions = model(features)
+      wire_cast_var, bias = model.variables
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
         self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
         sess.run(
-            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
-                                  [1000., 1100.,
-                                   1200.], [10000., 11000., 12000.]]))
+            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
+                1000., 1100., 1200.
+            ], [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
                             predictions.eval())
 
   def test_dense_multi_dimension(self):
-    price = fc_old.numeric_column('price', shape=2)
+    price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
-      predictions = get_keras_linear_model_predictions(features, [price])
-      price_var = get_linear_model_column_var(price)
+      model = fc.LinearModel([price])
+      predictions = model(features)
+      price_var, _ = model.variables
       with _initialized_session() as sess:
         self.assertAllClose([[0.], [0.]], price_var.eval())
         sess.run(price_var.assign([[10.], [100.]]))
         self.assertAllClose([[210.], [650.]], predictions.eval())
 
   def test_sparse_multi_rank(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = array_ops.sparse_placeholder(dtypes.string)
       wire_value = sparse_tensor.SparseTensorValue(
@@ -2200,8 +1292,9 @@ class _LinearModelTest(test.TestCase):
           indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]],
           dense_shape=[2, 2, 2])
       features = {'wire_cast': wire_tensor}
-      predictions = get_keras_linear_model_predictions(features, [wire_cast])
-      wire_cast_var = get_linear_model_column_var(wire_cast)
+      model = fc.LinearModel([wire_cast])
+      predictions = model(features)
+      wire_cast_var, _ = model.variables
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
         self.assertAllClose(
@@ -2213,30 +1306,49 @@ class _LinearModelTest(test.TestCase):
             predictions.eval(feed_dict={wire_tensor: wire_value}))
 
   def test_sparse_combiner(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor}
-      predictions = get_keras_linear_model_predictions(
-          features, [wire_cast], sparse_combiner='mean')
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
+      model = fc.LinearModel([wire_cast], sparse_combiner='mean')
+      predictions = model(features)
+      wire_cast_var, bias = model.variables
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         self.assertAllClose([[1005.], [5010.]], predictions.eval())
 
+  def test_sparse_combiner_with_negative_weights(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast_weights = fc.weighted_categorical_column(wire_cast, 'weights')
+
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {
+          'wire_cast': wire_tensor,
+          'weights': constant_op.constant([[1., 1., -1.0]])
+      }
+      model = fc.LinearModel([wire_cast_weights], sparse_combiner='sum')
+      predictions = model(features)
+      wire_cast_var, bias = model.variables
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+
   def test_dense_multi_dimension_multi_output(self):
-    price = fc_old.numeric_column('price', shape=2)
+    price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
-      predictions = get_keras_linear_model_predictions(
-          features, [price], units=3)
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
+      model = fc.LinearModel([price], units=3)
+      predictions = model(features)
+      price_var, bias = model.variables
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
         self.assertAllClose(np.zeros((2, 3)), price_var.eval())
@@ -2246,21 +1358,22 @@ class _LinearModelTest(test.TestCase):
                             predictions.eval())
 
   def test_raises_if_shape_mismatch(self):
-    price = fc_old.numeric_column('price', shape=2)
+    price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       with self.assertRaisesRegexp(
           Exception,
           r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
-        get_keras_linear_model_predictions(features, [price])
+        model = fc.LinearModel([price])
+        model(features)
 
   def test_dense_reshaping(self):
-    price = fc_old.numeric_column('price', shape=[1, 2])
+    price = fc.numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
-      predictions = get_keras_linear_model_predictions(features, [price])
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
+      model = fc.LinearModel([price])
+      predictions = model(features)
+      price_var, bias = model.variables
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
         self.assertAllClose([[0.], [0.]], price_var.eval())
@@ -2269,15 +1382,16 @@ class _LinearModelTest(test.TestCase):
         self.assertAllClose([[210.], [650.]], predictions.eval())
 
   def test_dense_multi_column(self):
-    price1 = fc_old.numeric_column('price1', shape=2)
-    price2 = fc_old.numeric_column('price2')
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
-      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
-      predictions = get_keras_linear_model_predictions(features,
-                                                       [price1, price2])
-      bias = get_linear_model_bias()
-      price1_var = get_linear_model_column_var(price1)
-      price2_var = get_linear_model_column_var(price2)
+      features = {
+          'price1': [[1., 2.], [5., 6.]],
+          'price2': [[3.], [4.]]
+      }
+      model = fc.LinearModel([price1, price2])
+      predictions = model(features)
+      price1_var, price2_var, bias = model.variables
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
         self.assertAllClose([[0.], [0.]], price1_var.eval())
@@ -2288,118 +1402,55 @@ class _LinearModelTest(test.TestCase):
         sess.run(bias.assign([7.]))
         self.assertAllClose([[3217.], [4657.]], predictions.eval())
 
-  def test_fills_cols_to_vars(self):
-    price1 = fc_old.numeric_column('price1', shape=2)
-    price2 = fc_old.numeric_column('price2')
-    with ops.Graph().as_default():
-      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
-      cols_to_vars = {}
-      get_keras_linear_model_predictions(
-          features, [price1, price2], cols_to_vars=cols_to_vars)
-      bias = get_linear_model_bias()
-      price1_var = get_linear_model_column_var(price1)
-      price2_var = get_linear_model_column_var(price2)
-      self.assertAllEqual(cols_to_vars['bias'], [bias])
-      self.assertAllEqual(cols_to_vars[price1], [price1_var])
-      self.assertAllEqual(cols_to_vars[price2], [price2_var])
-
-  def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc_old.numeric_column('price1', shape=2)
-    price2 = fc_old.numeric_column('price2', shape=3)
-    with ops.Graph().as_default():
-      features = {
-          'price1': [[1., 2.], [6., 7.]],
-          'price2': [[3., 4., 5.], [8., 9., 10.]]
-      }
-      cols_to_vars = {}
-      with variable_scope.variable_scope(
-          'linear',
-          partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)):
-        get_keras_linear_model_predictions(
-            features, [price1, price2], cols_to_vars=cols_to_vars)
-      with _initialized_session():
-        self.assertEqual([0.], cols_to_vars['bias'][0].eval())
-        # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables.
-        self.assertAllEqual([[0.]], cols_to_vars[price1][0].eval())
-        self.assertAllEqual([[0.]], cols_to_vars[price1][1].eval())
-        # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and
-        # a [1, 1] Variable.
-        self.assertAllEqual([[0.], [0.]], cols_to_vars[price2][0].eval())
-        self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval())
-
-  def test_dense_collection(self):
-    price = fc_old.numeric_column('price')
-    with ops.Graph().as_default() as g:
-      features = {'price': [[1.], [5.]]}
-      get_keras_linear_model_predictions(
-          features, [price], weight_collections=['my-vars'])
-      my_vars = g.get_collection('my-vars')
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
-      self.assertIn(bias, my_vars)
-      self.assertIn(price_var, my_vars)
-
-  def test_sparse_collection(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    with ops.Graph().as_default() as g:
-      wire_tensor = sparse_tensor.SparseTensor(
-          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
-      features = {'wire_cast': wire_tensor}
-      get_keras_linear_model_predictions(
-          features, [wire_cast], weight_collections=['my-vars'])
-      my_vars = g.get_collection('my-vars')
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
-      self.assertIn(bias, my_vars)
-      self.assertIn(wire_cast_var, my_vars)
-
   def test_dense_trainable_default(self):
-    price = fc_old.numeric_column('price')
+    price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
-      get_keras_linear_model_predictions(features, [price])
-      bias = get_linear_model_bias()
-      price_var = get_linear_model_column_var(price)
+      model = fc.LinearModel([price])
+      model(features)
+      price_var, bias = model.variables
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       self.assertIn(bias, trainable_vars)
       self.assertIn(price_var, trainable_vars)
 
   def test_sparse_trainable_default(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       features = {'wire_cast': wire_tensor}
-      get_keras_linear_model_predictions(features, [wire_cast])
+      model = fc.LinearModel([wire_cast])
+      model(features)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      bias = get_linear_model_bias()
-      wire_cast_var = get_linear_model_column_var(wire_cast)
+      wire_cast_var, bias = model.variables
       self.assertIn(bias, trainable_vars)
       self.assertIn(wire_cast_var, trainable_vars)
 
   def test_dense_trainable_false(self):
-    price = fc_old.numeric_column('price')
+    price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
-      get_keras_linear_model_predictions(features, [price], trainable=False)
+      model = fc.LinearModel([price], trainable=False)
+      model(features)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       self.assertEqual([], trainable_vars)
 
   def test_sparse_trainable_false(self):
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       features = {'wire_cast': wire_tensor}
-      get_keras_linear_model_predictions(features, [wire_cast], trainable=False)
+      model = fc.LinearModel([wire_cast], trainable=False)
+      model(features)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       self.assertEqual([], trainable_vars)
 
   def test_column_order(self):
-    price_a = fc_old.numeric_column('price_a')
-    price_b = fc_old.numeric_column('price_b')
-    wire_cast = fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-    with ops.Graph().as_default() as g:
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
       features = {
           'price_a': [[1.]],
           'price_b': [[3.]],
@@ -2407,15 +1458,15 @@ class _LinearModelTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       }
-      get_keras_linear_model_predictions(
-          features, [price_a, wire_cast, price_b],
-          weight_collections=['my-vars'])
-      my_vars = g.get_collection('my-vars')
+      model = fc.LinearModel([price_a, wire_cast, price_b])
+      model(features)
+
+      my_vars = model.variables
       self.assertIn('price_a', my_vars[0].name)
       self.assertIn('price_b', my_vars[1].name)
       self.assertIn('wire_cast', my_vars[2].name)
 
-    with ops.Graph().as_default() as g:
+    with ops.Graph().as_default():
       features = {
           'price_a': [[1.]],
           'price_b': [[3.]],
@@ -2423,17 +1474,45 @@ class _LinearModelTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       }
-      get_keras_linear_model_predictions(
-          features, [wire_cast, price_b, price_a],
-          weight_collections=['my-vars'])
-      my_vars = g.get_collection('my-vars')
+      model = fc.LinearModel([wire_cast, price_b, price_a])
+      model(features)
+
+      my_vars = model.variables
       self.assertIn('price_a', my_vars[0].name)
       self.assertIn('price_b', my_vars[1].name)
       self.assertIn('wire_cast', my_vars[2].name)
 
+  def test_variable_names(self):
+    price1 = fc.numeric_column('price1')
+    dense_feature = fc.numeric_column('dense_feature')
+    dense_feature_bucketized = fc.bucketized_column(
+        dense_feature, boundaries=[0.])
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)
+    all_cols = [price1, dense_feature_bucketized, some_embedding_column]
+
+    with ops.Graph().as_default():
+      model = fc.LinearModel(all_cols)
+      features = {
+          'price1': [[3.], [4.]],
+          'dense_feature': [[-1.], [4.]],
+          'sparse_feature': [['a'], ['x']],
+      }
+      model(features)
+      variable_names = [var.name for var in model.variables]
+      self.assertItemsEqual([
+          'linear_model/dense_feature_bucketized/weights:0',
+          'linear_model/price1/weights:0',
+          'linear_model/sparse_feature_embedding/embedding_weights:0',
+          'linear_model/sparse_feature_embedding/weights:0',
+          'linear_model/bias_weights:0',
+      ], variable_names)
+
   def test_static_batch_size_mismatch(self):
-    price1 = fc_old.numeric_column('price1')
-    price2 = fc_old.numeric_column('price2')
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1.], [5.], [7.]],  # batchsize = 3
@@ -2442,12 +1521,13 @@ class _LinearModelTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-      get_keras_linear_model_predictions(features, [price1, price2])
+      model = fc.LinearModel([price1, price2])
+      model(features)
 
   def test_subset_of_static_batch_size_mismatch(self):
-    price1 = fc_old.numeric_column('price1')
-    price2 = fc_old.numeric_column('price2')
-    price3 = fc_old.numeric_column('price3')
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    price3 = fc.numeric_column('price3')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2457,18 +1537,19 @@ class _LinearModelTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-        get_keras_linear_model_predictions(features, [price1, price2, price3])
+        model = fc.LinearModel([price1, price2, price3])
+        model(features)
 
   def test_runtime_batch_size_mismatch(self):
-    price1 = fc_old.numeric_column('price1')
-    price2 = fc_old.numeric_column('price2')
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
           'price2': [[3.], [4.]]  # batchsize = 2
       }
-      predictions = get_keras_linear_model_predictions(features,
-                                                       [price1, price2])
+      model = fc.LinearModel([price1, price2])
+      predictions = model(features)
       with _initialized_session() as sess:
         with self.assertRaisesRegexp(errors.OpError,
                                      'must have the same size and shape'):
@@ -2476,15 +1557,15 @@ class _LinearModelTest(test.TestCase):
               predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
 
   def test_runtime_batch_size_matches(self):
-    price1 = fc_old.numeric_column('price1')
-    price2 = fc_old.numeric_column('price2')
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
           'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
       }
-      predictions = get_keras_linear_model_predictions(features,
-                                                       [price1, price2])
+      model = fc.LinearModel([price1, price2])
+      predictions = model(features)
       with _initialized_session() as sess:
         sess.run(
             predictions,
@@ -2494,14 +1575,14 @@ class _LinearModelTest(test.TestCase):
             })
 
   def test_with_numpy_input_fn(self):
-    price = fc_old.numeric_column('price')
-    price_buckets = fc_old.bucketized_column(
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(
         price, boundaries=[
             0.,
             10.,
             100.,
         ])
-    body_style = fc_old.categorical_column_with_vocabulary_list(
+    body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
 
     input_fn = numpy_io.numpy_input_fn(
@@ -2512,16 +1593,14 @@ class _LinearModelTest(test.TestCase):
         batch_size=2,
         shuffle=False)
     features = input_fn()
-    net = get_keras_linear_model_predictions(features,
-                                             [price_buckets, body_style])
+    model = fc.LinearModel([price_buckets, body_style])
+    net = model(features)
     # self.assertEqual(1 + 3 + 5, net.shape[1])
     with _initialized_session() as sess:
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
 
-      bias = get_linear_model_bias()
-      price_buckets_var = get_linear_model_column_var(price_buckets)
-      body_style_var = get_linear_model_column_var(body_style)
+      body_style_var, price_buckets_var, bias = model.variables
 
       sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
@@ -2533,38 +1612,31 @@ class _LinearModelTest(test.TestCase):
       coord.join(threads)
 
   def test_with_1d_sparse_tensor(self):
-    price = fc_old.numeric_column('price')
-    price_buckets = fc_old.bucketized_column(
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(
         price, boundaries=[
             0.,
             10.,
             100.,
         ])
-    body_style = fc_old.categorical_column_with_vocabulary_list(
+    body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
 
     # Provides 1-dim tensor and dense tensor.
     features = {
-        'price':
-            constant_op.constant([
-                -1.,
-                12.,
-            ]),
-        'body-style':
-            sparse_tensor.SparseTensor(
-                indices=((0,), (1,)),
-                values=('sedan', 'hardtop'),
-                dense_shape=(2,)),
+        'price': constant_op.constant([-1., 12.,]),
+        'body-style': sparse_tensor.SparseTensor(
+            indices=((0,), (1,)),
+            values=('sedan', 'hardtop'),
+            dense_shape=(2,)),
     }
     self.assertEqual(1, features['price'].shape.ndims)
     self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
 
-    net = get_keras_linear_model_predictions(features,
-                                             [price_buckets, body_style])
+    model = fc.LinearModel([price_buckets, body_style])
+    net = model(features)
     with _initialized_session() as sess:
-      bias = get_linear_model_bias()
-      price_buckets_var = get_linear_model_column_var(price_buckets)
-      body_style_var = get_linear_model_column_var(body_style)
+      body_style_var, price_buckets_var, bias = model.variables
 
       sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
@@ -2573,16 +1645,16 @@ class _LinearModelTest(test.TestCase):
       self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
 
   def test_with_1d_unknown_shape_sparse_tensor(self):
-    price = fc_old.numeric_column('price')
-    price_buckets = fc_old.bucketized_column(
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(
         price, boundaries=[
             0.,
             10.,
             100.,
         ])
-    body_style = fc_old.categorical_column_with_vocabulary_list(
+    body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    country = fc_old.categorical_column_with_vocabulary_list(
+    country = fc.categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -2596,14 +1668,14 @@ class _LinearModelTest(test.TestCase):
 
     price_data = np.array([-1., 12.])
     body_style_data = sparse_tensor.SparseTensorValue(
-        indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,))
+        indices=((0,), (1,)),
+        values=('sedan', 'hardtop'),
+        dense_shape=(2,))
     country_data = np.array(['US', 'CA'])
 
-    net = get_keras_linear_model_predictions(
-        features, [price_buckets, body_style, country])
-    bias = get_linear_model_bias()
-    price_buckets_var = get_linear_model_column_var(price_buckets)
-    body_style_var = get_linear_model_column_var(body_style)
+    model = fc.LinearModel([price_buckets, body_style, country])
+    net = model(features)
+    body_style_var, _, price_buckets_var, bias = model.variables
     with _initialized_session() as sess:
       sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
@@ -2619,7 +1691,7 @@ class _LinearModelTest(test.TestCase):
                               }))
 
   def test_with_rank_0_feature(self):
-    price = fc_old.numeric_column('price')
+    price = fc.numeric_column('price')
     features = {
         'price': constant_op.constant(0),
     }
@@ -2627,18 +1699,41 @@ class _LinearModelTest(test.TestCase):
 
     # Static rank 0 should fail
     with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
-      get_keras_linear_model_predictions(features, [price])
+      model = fc.LinearModel([price])
+      model(features)
 
     # Dynamic rank 0 should fail
     features = {
         'price': array_ops.placeholder(dtypes.float32),
     }
-    net = get_keras_linear_model_predictions(features, [price])
+    model = fc.LinearModel([price])
+    net = model(features)
     self.assertEqual(1, net.shape[1])
     with _initialized_session() as sess:
       with self.assertRaisesOpError('Feature .* cannot have rank 0'):
         sess.run(net, feed_dict={features['price']: np.array(1)})
 
+  def test_multiple_linear_models(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features1 = {'price': [[1.], [5.]]}
+      features2 = {'price': [[2.], [10.]]}
+      model1 = fc.LinearModel([price])
+      model2 = fc.LinearModel([price])
+      predictions1 = model1(features1)
+      predictions2 = model2(features2)
+      price_var1, bias1 = model1.variables
+      price_var2, bias2 = model2.variables
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias1.eval())
+        sess.run(price_var1.assign([[10.]]))
+        sess.run(bias1.assign([5.]))
+        self.assertAllClose([[15.], [55.]], predictions1.eval())
+        self.assertAllClose([0.], bias2.eval())
+        sess.run(price_var2.assign([[10.]]))
+        sess.run(bias2.assign([5.]))
+        self.assertAllClose([[25.], [105.]], predictions2.eval())
+
 
 class FeatureLayerTest(test.TestCase):
 
@@ -3739,47 +2834,22 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_linear_model(self):
-    wire_column = fc_old.categorical_column_with_vocabulary_file(
-        key='wire',
-        vocabulary_file=self._wire_vocabulary_file_name,
-        vocabulary_size=self._wire_vocabulary_size,
-        num_oov_buckets=1)
-    self.assertEqual(4, wire_column._num_buckets)
-    with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          wire_column.name: sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=('marlo', 'skywalker', 'omar'),
-              dense_shape=(2, 2))
-      }, (wire_column,))
-      bias = get_linear_model_bias()
-      wire_var = get_linear_model_column_var(wire_column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
-        # 'marlo' -> 2: wire_var[2] = 3
-        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
-
-  def test_keras_linear_model(self):
-    wire_column = fc_old.categorical_column_with_vocabulary_file(
+    wire_column = fc.categorical_column_with_vocabulary_file(
         key='wire',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
         num_oov_buckets=1)
-    self.assertEqual(4, wire_column._num_buckets)
+    self.assertEqual(4, wire_column.num_buckets)
     with ops.Graph().as_default():
-      predictions = get_keras_linear_model_predictions({
+      model = fc.LinearModel((wire_column,))
+      predictions = model({
           wire_column.name:
               sparse_tensor.SparseTensorValue(
                   indices=((0, 0), (1, 0), (1, 1)),
                   values=('marlo', 'skywalker', 'omar'),
                   dense_shape=(2, 2))
-      }, (wire_column,))
-      bias = get_linear_model_bias()
-      wire_var = get_linear_model_column_var(wire_column)
+      })
+      wire_var, bias = model.variables
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
         self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
@@ -4131,54 +3201,30 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
     with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 60, 0, 4), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
-
-  def test_linear_model(self):
-    wire_column = fc_old.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'),
-        num_oov_buckets=1)
-    self.assertEqual(4, wire_column._num_buckets)
-    with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          wire_column.name: sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=('marlo', 'skywalker', 'omar'),
-              dense_shape=(2, 2))
-      }, (wire_column,))
-      bias = get_linear_model_bias()
-      wire_var = get_linear_model_column_var(wire_column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
-        # 'marlo' -> 2: wire_var[2] = 3
-        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 60, 0, 4), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
 
-  def test_keras_linear_model(self):
-    wire_column = fc_old.categorical_column_with_vocabulary_list(
+  def test_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         num_oov_buckets=1)
-    self.assertEqual(4, wire_column._num_buckets)
+    self.assertEqual(4, wire_column.num_buckets)
     with ops.Graph().as_default():
-      predictions = get_keras_linear_model_predictions({
+      model = fc.LinearModel((wire_column,))
+      predictions = model({
           wire_column.name:
               sparse_tensor.SparseTensorValue(
                   indices=((0, 0), (1, 0), (1, 1)),
                   values=('marlo', 'skywalker', 'omar'),
                   dense_shape=(2, 2))
-      }, (wire_column,))
-      bias = get_linear_model_bias()
-      wire_var = get_linear_model_column_var(wire_column)
+      })
+      wire_var, bias = model.variables
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
         self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
@@ -4398,39 +3444,18 @@ class IdentityCategoricalColumnTest(test.TestCase):
           }))
 
   def test_linear_model(self):
-    column = fc_old.categorical_column_with_identity(key='aaa', num_buckets=3)
-    self.assertEqual(3, column.num_buckets)
-    with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          column.name: sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=(0, 2, 1),
-              dense_shape=(2, 2))
-      }, (column,))
-      bias = get_linear_model_bias()
-      weight_var = get_linear_model_column_var(column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] = 1
-        # weight_var[2] + weight_var[1] = 3+2 = 5
-        self.assertAllClose(((1.,), (5.,)), predictions.eval())
-
-  def test_keras_linear_model(self):
-    column = fc_old.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual(3, column.num_buckets)
     with ops.Graph().as_default():
-      predictions = get_keras_linear_model_predictions({
+      model = fc.LinearModel((column,))
+      predictions = model({
           column.name:
               sparse_tensor.SparseTensorValue(
                   indices=((0, 0), (1, 0), (1, 1)),
                   values=(0, 2, 1),
                   dense_shape=(2, 2))
-      }, (column,))
-      bias = get_linear_model_bias()
-      weight_var = get_linear_model_column_var(column)
+      })
+      weight_var, bias = model.variables
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
         self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
@@ -4656,27 +3681,8 @@ class IndicatorColumnTest(test.TestCase):
       self.assertAllEqual([[0., 1., 1.]], indicator_tensor.eval())
 
   def test_linear_model(self):
-    animal = fc_old.indicator_column(
-        fc_old.categorical_column_with_identity('animal', num_buckets=4))
-    with ops.Graph().as_default():
-      features = {
-          'animal':
-              sparse_tensor.SparseTensor(
-                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
-      }
-
-      predictions = fc.linear_model(features, [animal])
-      weight_var = get_linear_model_column_var(animal)
-      with _initialized_session():
-        # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
-        weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
-
-  def test_keras_linear_model(self):
-    animal = fc_old.indicator_column(
-        fc_old.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -4684,8 +3690,9 @@ class IndicatorColumnTest(test.TestCase):
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
 
-      predictions = get_keras_linear_model_predictions(features, [animal])
-      weight_var = get_linear_model_column_var(animal)
+      model = fc.LinearModel([animal])
+      predictions = model(features)
+      weight_var, _ = model.variables
       with _initialized_session():
         # All should be zero-initialized.
         self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
@@ -5137,17 +4144,16 @@ class EmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column = fc_old.categorical_column_with_identity(
+    categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old.embedding_column(
+    embedding_column = fc.embedding_column(
         categorical_column,
         dimension=embedding_dimension,
         initializer=_initializer)
 
     with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          categorical_column.name: sparse_input
-      }, (embedding_column,))
+      model = fc.LinearModel((embedding_column,))
+      predictions = model({categorical_column.name: sparse_input})
       expected_var_names = (
           'linear_model/bias_weights:0',
           'linear_model/aaa_embedding/weights:0',
@@ -5189,82 +4195,6 @@ class EmbeddingColumnTest(test.TestCase):
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
         self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
 
-  def test_keras_linear_model(self):
-    # Inputs.
-    batch_size = 4
-    vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(batch_size, 5))
-
-    # Embedding variable.
-    embedding_dimension = 2
-    embedding_shape = (vocabulary_size, embedding_dimension)
-    zeros_embedding_values = np.zeros(embedding_shape)
-
-    def _initializer(shape, dtype, partition_info):
-      self.assertAllEqual(embedding_shape, shape)
-      self.assertEqual(dtypes.float32, dtype)
-      self.assertIsNone(partition_info)
-      return zeros_embedding_values
-
-    # Build columns.
-    categorical_column = fc_old.categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old.embedding_column(
-        categorical_column,
-        dimension=embedding_dimension,
-        initializer=_initializer)
-
-    with ops.Graph().as_default():
-      predictions = get_keras_linear_model_predictions({
-          categorical_column.name: sparse_input
-      }, (embedding_column,))
-      expected_var_names = (
-          'linear_model/bias_weights:0',
-          'linear_model/aaa_embedding/weights:0',
-          'linear_model/aaa_embedding/embedding_weights:0',
-      )
-      self.assertItemsEqual(
-          expected_var_names,
-          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
-      trainable_vars = {
-          v.name: v
-          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      }
-      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
-      bias = trainable_vars['linear_model/bias_weights:0']
-      embedding_weights = trainable_vars[
-          'linear_model/aaa_embedding/embedding_weights:0']
-      linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
-      with _initialized_session():
-        # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
-        self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
-
-        # Predictions with all non-zero weights.
-        embedding_weights.assign((
-            (1., 2.),  # id 0
-            (3., 5.),  # id 1
-            (7., 11.)  # id 2
-        )).eval()
-        linear_weights.assign(((4.,), (6.,))).eval()
-        # example 0, ids [2], embedding[0] = [7, 11]
-        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
-        # example 2, ids [], embedding[2] = [0, 0]
-        # example 3, ids [1], embedding[3] = [3, 5]
-        # sum(embeddings * linear_weights)
-        # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
-
   def test_feature_layer(self):
     # Inputs.
     vocabulary_size = 3
@@ -5765,27 +4695,31 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc_old.categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc_old.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
 
     with ops.Graph().as_default():
-      predictions = fc.linear_model({
+      model = fc.LinearModel(
+          (embedding_column_a, embedding_column_b),
+          shared_state_manager=fc.SharedEmbeddingStateManager())
+      predictions = model({
           categorical_column_a.name: input_a,
-          categorical_column_b.name: input_b,
-      }, (embedding_column_a, embedding_column_b))
+          categorical_column_b.name: input_b
+      })
+
       # Linear weights do not follow the column name. But this is a rare use
       # case, and fixing it would add too much complexity to the code.
       expected_var_names = (
           'linear_model/bias_weights:0',
-          'linear_model/aaa_bbb_shared_embedding/weights:0',
-          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
-          'linear_model/aaa_bbb_shared_embedding_1/weights:0',
+          'linear_model/aaa_shared_embedding/weights:0',
+          'shared_embedding_state_manager/aaa_bbb_shared_embedding:0',
+          'linear_model/bbb_shared_embedding/weights:0',
       )
       self.assertItemsEqual(
           expected_var_names,
@@ -5797,102 +4731,11 @@ class SharedEmbeddingColumnTest(test.TestCase):
       self.assertItemsEqual(expected_var_names, trainable_vars.keys())
       bias = trainable_vars['linear_model/bias_weights:0']
       embedding_weights = trainable_vars[
-          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
-      linear_weights_a = trainable_vars[
-          'linear_model/aaa_bbb_shared_embedding/weights:0']
-      linear_weights_b = trainable_vars[
-          'linear_model/aaa_bbb_shared_embedding_1/weights:0']
-      with _initialized_session():
-        # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
-        self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
-        self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
-
-        # Predictions with all non-zero weights.
-        embedding_weights.assign((
-            (1., 2.),  # id 0
-            (3., 5.),  # id 1
-            (7., 11.)  # id 2
-        )).eval()
-        linear_weights_a.assign(((4.,), (6.,))).eval()
-        # example 0, ids [2], embedding[0] = [7, 11]
-        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
-        # sum(embeddings * linear_weights)
-        # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
-        linear_weights_b.assign(((3.,), (5.,))).eval()
-        # example 0, ids [0], embedding[0] = [1, 2]
-        # example 1, ids [], embedding[1] = 0, 0]
-        # sum(embeddings * linear_weights)
-        # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
-        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
-
-  def test_keras_linear_model(self):
-    # Inputs.
-    batch_size = 2
-    vocabulary_size = 3
-    # -1 values are ignored.
-    input_a = np.array([
-        [2, -1, -1],  # example 0, ids [2]
-        [0, 1, -1]
-    ])  # example 1, ids [0, 1]
-    input_b = np.array([
-        [0, -1, -1],  # example 0, ids [0]
-        [-1, -1, -1]
-    ])  # example 1, ids []
-
-    # Embedding variable.
-    embedding_dimension = 2
-    embedding_shape = (vocabulary_size, embedding_dimension)
-    zeros_embedding_values = np.zeros(embedding_shape)
-
-    def _initializer(shape, dtype, partition_info):
-      self.assertAllEqual(embedding_shape, shape)
-      self.assertEqual(dtypes.float32, dtype)
-      self.assertIsNone(partition_info)
-      return zeros_embedding_values
-
-    # Build columns.
-    categorical_column_a = fc_old.categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc_old.categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc_old.shared_embedding_columns(
-        [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension,
-        initializer=_initializer)
-
-    with ops.Graph().as_default():
-      predictions = get_keras_linear_model_predictions({
-          categorical_column_a.name: input_a,
-          categorical_column_b.name: input_b,
-      }, (embedding_column_a, embedding_column_b))
-      # Linear weights do not follow the column name. But this is a rare use
-      # case, and fixing it would add too much complexity to the code.
-      expected_var_names = (
-          'linear_model/bias_weights:0',
-          'linear_model/aaa_bbb_shared_embedding/weights:0',
-          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
-          'linear_model/aaa_bbb_shared_embedding_1/weights:0',
-      )
-      self.assertItemsEqual(
-          expected_var_names,
-          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
-      trainable_vars = {
-          v.name: v
-          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      }
-      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
-      bias = trainable_vars['linear_model/bias_weights:0']
-      embedding_weights = trainable_vars[
-          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
+          'shared_embedding_state_manager/aaa_bbb_shared_embedding:0']
       linear_weights_a = trainable_vars[
-          'linear_model/aaa_bbb_shared_embedding/weights:0']
+          'linear_model/aaa_shared_embedding/weights:0']
       linear_weights_b = trainable_vars[
-          'linear_model/aaa_bbb_shared_embedding_1/weights:0']
+          'linear_model/bbb_shared_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
         self.assertAllClose(np.zeros((1,)), bias.eval())
@@ -6291,13 +5134,14 @@ class WeightedCategoricalColumnTest(test.TestCase):
               dense_shape=(2, 2)),
           weight_tensor.eval())
 
-  def test_keras_linear_model(self):
-    column = fc_old.weighted_categorical_column(
-        categorical_column=fc_old.categorical_column_with_identity(
+  def test_linear_model(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
-      predictions = get_keras_linear_model_predictions({
+      model = fc.LinearModel((column,))
+      predictions = model({
           'ids':
               sparse_tensor.SparseTensorValue(
                   indices=((0, 0), (1, 0), (1, 1)),
@@ -6308,9 +5152,8 @@ class WeightedCategoricalColumnTest(test.TestCase):
                   indices=((0, 0), (1, 0), (1, 1)),
                   values=(.5, 1., .1),
                   dense_shape=(2, 2))
-      }, (column,))
-      bias = get_linear_model_bias()
-      weight_var = get_linear_model_column_var(column)
+      })
+      weight_var, bias = model.variables
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
         self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
@@ -6321,15 +5164,16 @@ class WeightedCategoricalColumnTest(test.TestCase):
         # = 3*1 + 2*.1 = 3+.2 = 3.2
         self.assertAllClose(((.5,), (3.2,)), predictions.eval())
 
-  def test_keras_linear_model_mismatched_shape(self):
-    column = fc_old.weighted_categorical_column(
-        categorical_column=fc_old.categorical_column_with_identity(
+  def test_linear_model_mismatched_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError,
-                                   r'Dimensions.*are not compatible'):
-        get_keras_linear_model_predictions({
+      with self.assertRaisesRegexp(
+          ValueError, r'Dimensions.*are not compatible'):
+        model = fc.LinearModel((column,))
+        model({
             'ids':
                 sparse_tensor.SparseTensorValue(
                     indices=((0, 0), (1, 0), (1, 1)),
@@ -6340,122 +5184,23 @@ class WeightedCategoricalColumnTest(test.TestCase):
                     indices=((0, 0), (0, 1), (1, 0), (1, 1)),
                     values=(.5, 11., 1., .1),
                     dense_shape=(2, 2))
-        }, (column,))
-
-  def test_keras_linear_model_mismatched_dense_values(self):
-    column = fc_old.weighted_categorical_column(
-        categorical_column=fc_old.categorical_column_with_identity(
-            key='ids', num_buckets=3),
-        weight_feature_key='values')
-    with ops.Graph().as_default():
-      predictions = get_keras_linear_model_predictions(
-          {
-              'ids':
-                  sparse_tensor.SparseTensorValue(
-                      indices=((0, 0), (1, 0), (1, 1)),
-                      values=(0, 2, 1),
-                      dense_shape=(2, 2)),
-              'values': ((.5,), (1.,))
-          }, (column,),
-          sparse_combiner='mean')
-      # Disabling the constant folding optimizer here since it changes the
-      # error message differently on CPU and GPU.
-      config = config_pb2.ConfigProto()
-      config.graph_options.rewrite_options.constant_folding = (
-          rewriter_config_pb2.RewriterConfig.OFF)
-      with _initialized_session(config):
-        with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-          predictions.eval()
+        })
 
-  def test_keras_linear_model_mismatched_dense_shape(self):
-    column = fc_old.weighted_categorical_column(
-        categorical_column=fc_old.categorical_column_with_identity(
+  def test_linear_model_mismatched_dense_values(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
-      predictions = get_keras_linear_model_predictions({
+      model = fc.LinearModel((column,), sparse_combiner='mean')
+      predictions = model({
           'ids':
               sparse_tensor.SparseTensorValue(
                   indices=((0, 0), (1, 0), (1, 1)),
                   values=(0, 2, 1),
                   dense_shape=(2, 2)),
-          'values': ((.5,), (1.,), (.1,))
-      }, (column,))
-      bias = get_linear_model_bias()
-      weight_var = get_linear_model_column_var(column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
-        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
-        # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
-
-  def test_linear_model(self):
-    column = fc_old.weighted_categorical_column(
-        categorical_column=fc_old.categorical_column_with_identity(
-            key='ids', num_buckets=3),
-        weight_feature_key='values')
-    with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          'ids': sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=(0, 2, 1),
-              dense_shape=(2, 2)),
-          'values': sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=(.5, 1., .1),
-              dense_shape=(2, 2))
-      }, (column,))
-      bias = get_linear_model_bias()
-      weight_var = get_linear_model_column_var(column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
-        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
-        # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
-
-  def test_linear_model_mismatched_shape(self):
-    column = fc_old.weighted_categorical_column(
-        categorical_column=fc_old.categorical_column_with_identity(
-            key='ids', num_buckets=3),
-        weight_feature_key='values')
-    with ops.Graph().as_default():
-      with self.assertRaisesRegexp(
-          ValueError, r'Dimensions.*are not compatible'):
-        fc.linear_model({
-            'ids': sparse_tensor.SparseTensorValue(
-                indices=((0, 0), (1, 0), (1, 1)),
-                values=(0, 2, 1),
-                dense_shape=(2, 2)),
-            'values': sparse_tensor.SparseTensorValue(
-                indices=((0, 0), (0, 1), (1, 0), (1, 1)),
-                values=(.5, 11., 1., .1),
-                dense_shape=(2, 2))
-        }, (column,))
-
-  def test_linear_model_mismatched_dense_values(self):
-    column = fc_old.weighted_categorical_column(
-        categorical_column=fc_old.categorical_column_with_identity(
-            key='ids', num_buckets=3),
-        weight_feature_key='values')
-    with ops.Graph().as_default():
-      predictions = fc.linear_model(
-          {
-              'ids':
-                  sparse_tensor.SparseTensorValue(
-                      indices=((0, 0), (1, 0), (1, 1)),
-                      values=(0, 2, 1),
-                      dense_shape=(2, 2)),
-              'values': ((.5,), (1.,))
-          }, (column,),
-          sparse_combiner='mean')
+          'values': ((.5,), (1.,))
+      })
       # Disabling the constant folding optimizer here since it changes the
       # error message differently on CPU and GPU.
       config = config_pb2.ConfigProto()
@@ -6466,20 +5211,21 @@ class WeightedCategoricalColumnTest(test.TestCase):
           predictions.eval()
 
   def test_linear_model_mismatched_dense_shape(self):
-    column = fc_old.weighted_categorical_column(
-        categorical_column=fc_old.categorical_column_with_identity(
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          'ids': sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=(0, 2, 1),
-              dense_shape=(2, 2)),
+      model = fc.LinearModel((column,))
+      predictions = model({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
           'values': ((.5,), (1.,), (.1,))
-      }, (column,))
-      bias = get_linear_model_bias()
-      weight_var = get_linear_model_column_var(column)
+      })
+      weight_var, bias = model.variables
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
         self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-- 
GitLab


From 97cba0b88cb3ce6a3f3cc66a8c4fd414bd3ac1a8 Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Thu, 27 Sep 2018 20:59:37 -0700
Subject: [PATCH 124/570] Allowing source_device to be set to /cpu:0 for multi
 device iterator in distribution strategies. That is always the appropriate
 option.

In the existing code, we would set it to a partially specified "worker" name that was ambiguous and end up on the GPU.

PiperOrigin-RevId: 214882658
---
 tensorflow/contrib/distribute/python/mirrored_strategy.py | 3 +--
 tensorflow/contrib/distribute/python/values.py            | 5 +----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 945f450387..504f45a695 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -482,8 +482,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
       return values.PerDeviceDataset(
           self._call_dataset_fn(dataset_fn),
           self._devices,
-          self._prefetch_on_device,
-          source_device=device_util.resolve("/device:CPU:0"))
+          self._prefetch_on_device)
 
   # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
   def _run_steps_on_dataset(self, fn, iterator, iterations,
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index a0cd029f51..cce41e7717 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -708,10 +708,8 @@ class PerDeviceDataset(object):
       dataset,
       devices,
       prefetch_on_device=None,
-      source_device="/cpu:0",
   ):
     self._devices = devices
-    self._source_device = source_device if source_device is not None else "/cpu:0"
 
     # Default to using prefetching in graph mode, unless specified.
     # TODO(rohanj): Enable prefetching in eager mode.
@@ -750,7 +748,7 @@ class PerDeviceDataset(object):
                        "Please use `make_one_shot_iterator` instead.")
     if self._prefetch_on_device:
       dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
-          self._dataset, self._devices, source_device=self._source_device)
+          self._dataset, self._devices)
     else:
       dataset_iterator = self._dataset.make_initializable_iterator()
     return PerDeviceDataIterator(
@@ -838,7 +836,6 @@ class MultiWorkerDataset(object):
         self._datasets[worker] = PerDeviceDataset(
             worker_input,
             worker_devices,
-            source_device=worker,
             prefetch_on_device=prefetch_on_device)
 
   def make_one_shot_iterator(self):
-- 
GitLab


From 7fd14feb9cbc690b362633639b27393576472c79 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 27 Sep 2018 21:11:42 -0700
Subject: [PATCH 125/570] Kernel signature reworking, remove Dims from tensor
 functions.

PiperOrigin-RevId: 214883775
---
 .../contrib/lite/kernels/internal/tensor.h    |  4 ---
 .../lite/kernels/internal/tensor_ctypes.h     | 29 ---------------
 .../lite/kernels/internal/tensor_test.cc      | 36 ++++++++++---------
 3 files changed, 20 insertions(+), 49 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h
index 765c3a03ef..689cea03e7 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor.h
@@ -37,10 +37,6 @@ inline const std::complex<float>* GetTensorData(const TfLiteTensor* tensor) {
              : nullptr;
 }
 
-inline Dims<4> GetTensorDims(std::vector<int32_t> data) {
-  return GetTensorDims(data.data(), data.size());
-}
-
 inline RuntimeShape GetTensorShape(std::vector<int32_t> data) {
   return RuntimeShape(data.size(), data.data());
 }
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h b/tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h
index 5e688ce452..9f5b33d217 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h
@@ -86,35 +86,6 @@ inline const bool* GetTensorData(const TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.b : nullptr;
 }
 
-// TODO(ahentz): the implementations in kernels/internal/ take a Dims<4> object
-// even if the original tensors were not 4D. We should consider rewriting them
-// to take a more generic 'shape' object.
-inline Dims<4> GetTensorDims(const int data[], const int size) {
-  Dims<4> d;
-  for (int i = 0; i < 4; ++i) {
-    int src = size - i - 1;
-    if (src >= 0) {
-      d.sizes[i] = data[src];
-    } else {
-      d.sizes[i] = 1;
-    }
-  }
-  d.strides[0] = 1;
-  for (int i = 1; i < 4; i++) {
-    d.strides[i] = d.strides[i - 1] * d.sizes[i - 1];
-  }
-  return d;
-}
-
-inline Dims<4> GetTensorDims(const TfLiteTensor* tensor) {
-  if (tensor == nullptr) {
-    return Dims<4>();
-  }
-
-  auto* dims = tensor->dims;
-  return GetTensorDims(dims->data, dims->size);
-}
-
 inline RuntimeShape GetTensorShape(const TfLiteTensor* tensor) {
   if (tensor == nullptr) {
     return RuntimeShape();
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_test.cc b/tensorflow/contrib/lite/kernels/internal/tensor_test.cc
index bf2068d320..2ed73ba82d 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_test.cc
@@ -21,28 +21,32 @@ namespace {
 
 using ::testing::ElementsAre;
 
-TEST(TensorTest, GetTensorDims4D) {
-  Dims<4> d = GetTensorDims({2, 3, 4, 5});
-  EXPECT_THAT(d.sizes, ElementsAre(5, 4, 3, 2));
-  EXPECT_THAT(d.strides, ElementsAre(1, 5, 20, 60));
+TEST(TensorTest, GetTensorShape4D) {
+  RuntimeShape d = GetTensorShape({2, 3, 4, 5});
+  EXPECT_THAT(
+      std::vector<int32>(d.DimsData(), d.DimsData() + d.DimensionsCount()),
+      ElementsAre(2, 3, 4, 5));
 }
 
-TEST(TensorTest, GetTensorDims3D) {
-  Dims<4> d = GetTensorDims({3, 4, 5});
-  EXPECT_THAT(d.sizes, ElementsAre(5, 4, 3, 1));
-  EXPECT_THAT(d.strides, ElementsAre(1, 5, 20, 60));
+TEST(TensorTest, GetTensorShape3D) {
+  RuntimeShape d = GetTensorShape({3, 4, 5});
+  EXPECT_THAT(
+      std::vector<int32>(d.DimsData(), d.DimsData() + d.DimensionsCount()),
+      ElementsAre(3, 4, 5));
 }
 
-TEST(TensorTest, GetTensorDims2D) {
-  Dims<4> d = GetTensorDims({4, 5});
-  EXPECT_THAT(d.sizes, ElementsAre(5, 4, 1, 1));
-  EXPECT_THAT(d.strides, ElementsAre(1, 5, 20, 20));
+TEST(TensorTest, GetTensorShape2D) {
+  RuntimeShape d = GetTensorShape({4, 5});
+  EXPECT_THAT(
+      std::vector<int32>(d.DimsData(), d.DimsData() + d.DimensionsCount()),
+      ElementsAre(4, 5));
 }
 
-TEST(TensorTest, GetTensorDims1D) {
-  Dims<4> d = GetTensorDims({5});
-  EXPECT_THAT(d.sizes, ElementsAre(5, 1, 1, 1));
-  EXPECT_THAT(d.strides, ElementsAre(1, 5, 5, 5));
+TEST(TensorTest, GetTensorShape1D) {
+  RuntimeShape d = GetTensorShape({5});
+  EXPECT_THAT(
+      std::vector<int32>(d.DimsData(), d.DimsData() + d.DimensionsCount()),
+      ElementsAre(5));
 }
 
 }  // namespace
-- 
GitLab


From efe17306442aa91192df953ae537d3f9b824dae6 Mon Sep 17 00:00:00 2001
From: IMBurbank <bassmanburbank@gmail.com>
Date: Thu, 27 Sep 2018 22:21:47 -0600
Subject: [PATCH 126/570] Updated python3 tf_inspect.getargspec calls to use
 getfullargspec and repackage the return values into the getargspec struct.

---
 .../python/losses/python/tuple_losses_impl.py |  2 +-
 .../labeled_tensor/python/ops/_typecheck.py   |  2 +-
 .../layers/python/layers/rev_block_lib.py     |  3 +-
 .../python/learn/estimators/estimator.py      |  4 +-
 .../learn/python/learn/estimators/head.py     |  2 +-
 .../learn/python/learn/experiment_test.py     |  2 +-
 .../learn/python/learn/export_strategy.py     |  2 +-
 .../contrib/learn/python/learn/metric_spec.py |  2 +-
 .../contrib/learn/python/learn/monitors.py    |  2 +-
 .../contrib/tpu/python/tpu/tpu_function.py    |  2 +-
 tensorflow/python/framework/errors_impl.py    |  2 +-
 tensorflow/python/framework/function.py       |  6 +-
 tensorflow/python/keras/backend_test.py       |  2 +-
 tensorflow/python/keras/testing_utils.py      |  2 +-
 .../kernel_tests/variable_scope_test.py       |  4 +-
 tensorflow/python/ops/variable_scope.py       |  4 +-
 tensorflow/python/util/tf_contextlib_test.py  |  2 +-
 tensorflow/python/util/tf_inspect.py          | 89 ++++++++++++-------
 .../api/lib/python_object_to_proto_visitor.py |  4 +-
 19 files changed, 79 insertions(+), 59 deletions(-)

diff --git a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
index 00a83e5e55..221c70c38b 100644
--- a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
@@ -101,7 +101,7 @@ def _args_to_gan_model(loss_fn):
   """
   # Match arguments in `loss_fn` to elements of `namedtuple`.
   # TODO(joelshor): Properly handle `varargs` and `keywords`.
-  argspec = tf_inspect.getfullargspec(loss_fn)
+  argspec = tf_inspect.getargspec(loss_fn)
   defaults = argspec.defaults or []
 
   required_args = set(argspec.args[:-len(defaults)])
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py b/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py
index 0e23039847..80fa17ec1f 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py
@@ -230,7 +230,7 @@ def accepts(*types):
 
   def check_accepts(f):
     """Check the types."""
-    spec = tf_inspect.getfullargspec(f)
+    spec = tf_inspect.getargspec(f)
 
     num_function_arguments = len(spec.args)
     if len(types) != num_function_arguments:
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index 55979cc391..06da32072f 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -576,8 +576,7 @@ def _recomputing_grad_fn(compute_fn,
 
 def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
   """See recompute_grad."""
-  has_is_recompute_kwarg = (
-      "is_recomputing" in tf_inspect.getfullargspec(fn).args)
+  has_is_recompute_kwarg = "is_recomputing" in tf_inspect.getargspec(fn).args
   for arg in args:
     if not isinstance(arg, framework_ops.Tensor):
       raise ValueError("All inputs to function must be Tensors")
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index b88923bca2..c1de42782e 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -199,11 +199,11 @@ def _model_fn_args(fn):
   if hasattr(fn, 'func') and hasattr(fn, 'keywords') and hasattr(fn, 'args'):
     # Handle functools.partial and similar objects.
     return tuple([
-        arg for arg in tf_inspect.getfullargspec(fn.func).args[len(fn.args):]
+        arg for arg in tf_inspect.getargspec(fn.func).args[len(fn.args):]
         if arg not in set(fn.keywords.keys())
     ])
   # Handle function.
-  return tuple(tf_inspect.getfullargspec(fn).args)
+  return tuple(tf_inspect.getargspec(fn).args)
 
 
 def _get_replica_device_setter(config):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index 63dd08316b..c6f79e00d5 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -1861,7 +1861,7 @@ def _get_arguments(func):
   _, func = tf_decorator.unwrap(func)
   if hasattr(func, "__code__"):
     # Regular function.
-    return tf_inspect.getfullargspec(func)
+    return tf_inspect.getargspec(func)
   elif hasattr(func, "func"):
     # Partial function.
     return _get_arguments(func.func)
diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py
index 6926696fb6..fb16c94c29 100644
--- a/tensorflow/contrib/learn/python/learn/experiment_test.py
+++ b/tensorflow/contrib/learn/python/learn/experiment_test.py
@@ -126,7 +126,7 @@ class TestBaseEstimator(object):
 
 def _check_method_supports_args(method, kwargs):
   """Checks that the given method supports the given args."""
-  supported_args = tuple(tf_inspect.getfullargspec(method).args)
+  supported_args = tuple(tf_inspect.getargspec(method).args)
   for kwarg in kwargs:
     if kwarg not in supported_args:
       raise ValueError(
diff --git a/tensorflow/contrib/learn/python/learn/export_strategy.py b/tensorflow/contrib/learn/python/learn/export_strategy.py
index 0d6e0cdc18..075cab536e 100644
--- a/tensorflow/contrib/learn/python/learn/export_strategy.py
+++ b/tensorflow/contrib/learn/python/learn/export_strategy.py
@@ -96,7 +96,7 @@ class ExportStrategy(
     """
     # don't break existing export_fns that don't accept checkpoint_path and
     # eval_result
-    export_fn_args = tf_inspect.getfullargspec(self.export_fn).args
+    export_fn_args = tf_inspect.getargspec(self.export_fn).args
     kwargs = {}
     if 'checkpoint_path' in export_fn_args:
       kwargs['checkpoint_path'] = checkpoint_path
diff --git a/tensorflow/contrib/learn/python/learn/metric_spec.py b/tensorflow/contrib/learn/python/learn/metric_spec.py
index 604d6d46b4..97220365d5 100644
--- a/tensorflow/contrib/learn/python/learn/metric_spec.py
+++ b/tensorflow/contrib/learn/python/learn/metric_spec.py
@@ -51,7 +51,7 @@ def _args(fn):
     return tuple(
         [arg for arg in _args(fn.func) if arg not in set(fn.keywords.keys())])
   # Handle function.
-  return tuple(tf_inspect.getfullargspec(fn).args)
+  return tuple(tf_inspect.getargspec(fn).args)
 
 
 _CANONICAL_LABELS_ARG = 'labels'
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index 5f61e0264f..3d691d4340 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -1303,7 +1303,7 @@ class RunHookAdapterForMonitors(session_run_hook.SessionRunHook):
   def end(self, session):
     self._last_step = None
     for m in self._monitors:
-      if "session" in tf_inspect.getfullargspec(m.end).args:
+      if "session" in tf_inspect.getargspec(m.end).args:
         m.end(session=session)
       else:
         m.end()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_function.py b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
index 9c4bd1c4d1..0c7a38dbbb 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_function.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
@@ -80,7 +80,7 @@ def check_function_argument_count(func, input_arity, infeed_queue):
   number_of_arguments_needed = input_arity
   if infeed_queue is not None:
     number_of_arguments_needed += infeed_queue.number_of_tuple_elements
-  arg_spec = tf_inspect.getfullargspec(func)
+  arg_spec = tf_inspect.getargspec(func)
   number_of_args = len(arg_spec.args)
   if arg_spec.defaults is None:
     number_of_defaults = 0
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index c373e75a74..5af71f2cfb 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -55,7 +55,7 @@ class OpError(Exception):
 
   def __reduce__(self):
     # Allow the subclasses to accept less arguments in their __init__.
-    init_argspec = tf_inspect.getfullargspec(self.__class__.__init__)
+    init_argspec = tf_inspect.getargspec(self.__class__.__init__)
     args = tuple(getattr(self, arg) for arg in init_argspec.args[1:])
     return self.__class__, args
 
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 3db6f683c9..225208944e 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -132,9 +132,9 @@ class Defun(object):
       raise ValueError("func %s must be callable" % func)
 
     # Func should not use kwargs and defaults.
-    argspec = tf_inspect.getfullargspec(func)
-    if argspec.varkw or argspec.defaults:
-      raise ValueError("Functions with argument defaults or varkw "
+    argspec = tf_inspect.getargspec(func)
+    if argspec.keywords or argspec.defaults:
+      raise ValueError("Functions with argument defaults or keywords "
                        "arguments are not supported.")
 
     # Computes how many arguments 'func' has.
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index 31191d0d35..ab71589940 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -452,7 +452,7 @@ class BackendLinearAlgebraTest(test.TestCase):
         compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
                                          keras_kwargs={'axis': -1},
                                          np_kwargs={'axis': -1})
-        if 'keepdims' in tf_inspect.getfullargspec(keras_op).args:
+        if 'keepdims' in tf_inspect.getargspec(keras_op).args:
           compare_single_input_op_to_numpy(keras_op, np_op,
                                            input_shape=(4, 7, 5),
                                            keras_kwargs={'axis': 1,
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index 1afaba5653..501b50ba5f 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -102,7 +102,7 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   layer.set_weights(weights)
 
   # test and instantiation from weights
-  if 'weights' in tf_inspect.getfullargspec(layer_cls.__init__):
+  if 'weights' in tf_inspect.getargspec(layer_cls.__init__):
     kwargs['weights'] = weights
     layer = layer_cls(**kwargs)
 
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 1d0b72b17a..401e1ae102 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -998,8 +998,8 @@ class VariableScopeTest(test.TestCase):
 
   def testSignatureGetVarVsGetLocalVar(self):
     """get_{local,}variable() must take the same list of args."""
-    arg_names = tf_inspect.getfullargspec(variable_scope.get_variable)[0]
-    local_arg_names = tf_inspect.getfullargspec(
+    arg_names = tf_inspect.getargspec(variable_scope.get_variable)[0]
+    local_arg_names = tf_inspect.getargspec(
         variable_scope.get_local_variable)[0]
     self.assertEqual(arg_names, local_arg_names)
 
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 3cc1eb916d..a43676cd70 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -892,14 +892,14 @@ class _VariableStore(object):
         if shape and shape.is_fully_defined():
           init_val = lambda: initializer(  # pylint: disable=g-long-lambda
               shape.as_list(), dtype=dtype, partition_info=partition_info)
-        elif not tf_inspect.getfullargspec(initializer).args:
+        elif not tf_inspect.getargspec(initializer).args:
           init_val = initializer
         else:
           raise ValueError("You can only pass an initializer function that "
                            "expects no arguments to its callable when the "
                            "shape is not fully defined. The given initializer "
                            "function expects the following args %s" %
-                           tf_inspect.getfullargspec(initializer).args)
+                           tf_inspect.getargspec(initializer).args)
         variable_dtype = dtype.base_dtype
 
     # Create the variable.
diff --git a/tensorflow/python/util/tf_contextlib_test.py b/tensorflow/python/util/tf_contextlib_test.py
index 1e921b5ea3..4a5bf388a6 100644
--- a/tensorflow/python/util/tf_contextlib_test.py
+++ b/tensorflow/python/util/tf_contextlib_test.py
@@ -83,7 +83,7 @@ class TfContextlibTest(test.TestCase):
     self.assertFalse(isinstance(target, tf_decorator.TFDecorator))
 
   def testGetArgSpecReturnsWrappedArgSpec(self):
-    argspec = tf_inspect.getfullargspec(test_params_and_defaults)
+    argspec = tf_inspect.getargspec(test_params_and_defaults)
     self.assertEqual(['a', 'b', 'c', 'd'], argspec.args)
     self.assertEqual((2, True, 'hello'), argspec.defaults)
 
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 234850ac3f..3cd6c515b9 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -36,6 +36,53 @@ else:
       'annotations'
   ])
 
+if hasattr(_inspect, 'getfullargspec'):
+  _getfullargspec = _inspect.getfullargspec  # pylint: disable=invalid-name
+  
+  def _getargspec(target):
+    """A python3 version of getargspec.
+
+    Calls `getfullargspec` and assigns args, varargs, 
+    varkw, and defaults to a python 2/3 compatible `ArgSpec`.
+
+    The parameter name 'varkw' is changed to 'keywords' to fit the 
+    `ArgSpec` struct.
+
+    Args:
+      target: the target object to inspect.
+    Returns:
+      An ArgSpec with args, varargs, keywords, and defaults parameters
+      from FullArgSpec.
+    """
+    fullargspecs = getfullargspec(target)
+    argspecs = ArgSpec(
+        args=fullargspecs.args,
+        varargs=fullargspecs.varargs,
+        keywords=fullargspecs.varkw,
+        defaults=fullargspecs.defaults)
+    return argspecs
+else:
+  _getargspec = _inspect.getargspec
+
+  def _getfullargspec(target):
+    """A python2 version of getfullargspec.
+
+    Args:
+      target: the target object to inspect.
+    Returns:
+      A FullArgSpec with empty kwonlyargs, kwonlydefaults and annotations.
+    """
+    argspecs = getargspec(target)
+    fullargspecs = FullArgSpec(
+        args=argspecs.args,
+        varargs=argspecs.varargs,
+        varkw=argspecs.keywords,
+        defaults=argspecs.defaults,
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
+    return fullargspecs
+
 
 def currentframe():
   """TFDecorator-aware replacement for inspect.currentframe."""
@@ -45,10 +92,8 @@ def currentframe():
 def getargspec(obj):
   """TFDecorator-aware replacement for `inspect.getargspec`.
 
-  This should not be called from other modules. It is deprecated in python3.
-
-  Use `getfullargspec`. It is a TFDecorator-aware replacement for 
-  `inspect.getfullargspec` compatible with both python2 and python3.
+  Note: `getfullargspec` is recommended as the python 2/3 compatible 
+  replacement for this function.
 
   Args:
     obj: A function, partial function, or callable object, possibly
@@ -56,8 +101,8 @@ def getargspec(obj):
 
   Returns:
     The `ArgSpec` that describes the signature of the outermost decorator that
-    changes the callable's signature. If the callable is not decorated,
-    `inspect.getargspec()` will be called directly on the object.
+    changes the callable's signature, or the `ArgSpec` that describes 
+    the object if not decorated.
 
   Raises:
     ValueError: When callable's signature can not be expressed with
@@ -77,24 +122,24 @@ def getargspec(obj):
 
   try:
     # Python3 will handle most callables here (not partial).
-    return _inspect.getargspec(target)
+    return _getargspec(target)
   except TypeError:
     pass
 
   if isinstance(target, type):
     try:
-      return _inspect.getargspec(target.__init__)
+      return _getargspec(target.__init__)
     except TypeError:
       pass
 
     try:
-      return _inspect.getargspec(target.__new__)
+      return _getargspec(target.__new__)
     except TypeError:
       pass
 
   # The `type(target)` ensures that if a class is received we don't return
   # the signature of it's __call__ method.
-  return _inspect.getargspec(type(target).__call__)
+  return _getargspec(type(target).__call__)
 
 
 def _get_argspec_for_partial(obj):
@@ -177,30 +222,6 @@ def _get_argspec_for_partial(obj):
   return ArgSpec(args, varargs, keywords, tuple(all_defaults[first_default:]))
 
 
-if hasattr(_inspect, 'getfullargspec'):
-  _getfullargspec = _inspect.getfullargspec
-else:
-
-  def _getfullargspec(target):
-    """A python2 version of getfullargspec.
-
-    Args:
-      target: the target object to inspect.
-    Returns:
-      A FullArgSpec with empty kwonlyargs, kwonlydefaults and annotations.
-    """
-    argspecs = getargspec(target)
-    fullargspecs = FullArgSpec(
-        args=argspecs.args,
-        varargs=argspecs.varargs,
-        varkw=argspecs.keywords,
-        defaults=argspecs.defaults,
-        kwonlyargs=[],
-        kwonlydefaults=None,
-        annotations={})
-    return fullargspecs
-
-
 def getfullargspec(obj):
   """TFDecorator-aware replacement for `inspect.getfullargspec`.
 
diff --git a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
index a8e69fda4f..3a48cf683c 100644
--- a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
+++ b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
@@ -47,9 +47,9 @@ def _SanitizedArgSpec(obj):
     string, a string representation of the argspec.
   """
   output_string = ''
-  unsanitized_arg_spec = tf_inspect.getfullargspec(obj)
+  unsanitized_arg_spec = tf_inspect.getargspec(obj)
 
-  for clean_attr in ('args', 'varargs', 'varkw'):
+  for clean_attr in ('args', 'varargs', 'keywords'):
     output_string += '%s=%s, ' % (clean_attr,
                                   getattr(unsanitized_arg_spec, clean_attr))
 
-- 
GitLab


From f4eccdda0ca2b06328363191975fa8364ba14728 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Thu, 27 Sep 2018 21:54:33 -0700
Subject: [PATCH 127/570] Run buildifier on workspace.bzl.

PiperOrigin-RevId: 214886657
---
 tensorflow/workspace.bzl | 380 +++++++++++++++++++--------------------
 1 file changed, 190 insertions(+), 190 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 6966783efd..70bade060e 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -57,39 +57,39 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     # Point //external/local_config_arm_compiler to //external/arm_compiler
     arm_compiler_configure(
         name = "local_config_arm_compiler",
-        remote_config_repo = "../arm_compiler",
         build_file = clean_dep("//third_party/toolchains/cpus/arm:BUILD"),
+        remote_config_repo = "../arm_compiler",
     )
 
     mkl_repository(
         name = "mkl_linux",
+        build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
+        sha256 = "e2233534a9d15c387e22260997af4312a39e9f86f791768409be273b5453c4e6",
+        strip_prefix = "mklml_lnx_2019.0.20180710",
         urls = [
             "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.16/mklml_lnx_2019.0.20180710.tgz",
             "https://github.com/intel/mkl-dnn/releases/download/v0.16/mklml_lnx_2019.0.20180710.tgz",
         ],
-        sha256 = "e2233534a9d15c387e22260997af4312a39e9f86f791768409be273b5453c4e6",
-        strip_prefix = "mklml_lnx_2019.0.20180710",
-        build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
     )
     mkl_repository(
         name = "mkl_windows",
+        build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
+        sha256 = "3fdcff17b018a0082491adf3ba143358265336a801646e46e0191ec8d58d24a2",
+        strip_prefix = "mklml_win_2019.0.20180710",
         urls = [
             "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.16/mklml_win_2019.0.20180710.zip",
             "https://github.com/intel/mkl-dnn/releases/download/v0.16/mklml_win_2019.0.20180710.zip",
         ],
-        sha256 = "3fdcff17b018a0082491adf3ba143358265336a801646e46e0191ec8d58d24a2",
-        strip_prefix = "mklml_win_2019.0.20180710",
-        build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
     )
     mkl_repository(
         name = "mkl_darwin",
+        build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
+        sha256 = "411a30014a938eb83fb9f37b3dbe8e371b106fc1dd621fc23123cadc72737ce6",
+        strip_prefix = "mklml_mac_2019.0.20180710",
         urls = [
             "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.16/mklml_mac_2019.0.20180710.tgz",
             "https://github.com/intel/mkl-dnn/releases/download/v0.16/mklml_mac_2019.0.20180710.tgz",
         ],
-        sha256 = "411a30014a938eb83fb9f37b3dbe8e371b106fc1dd621fc23123cadc72737ce6",
-        strip_prefix = "mklml_mac_2019.0.20180710",
-        build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
     )
 
     if path_prefix:
@@ -98,39 +98,40 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "mkl_dnn",
+        build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
+        sha256 = "363cc9239eacf8e7917753c6d8c94f767e4cd049160d0654a61ef32d5e1b3049",
+        strip_prefix = "mkl-dnn-4e333787e0d66a1dca1218e99a891d493dbc8ef1",
         urls = [
             "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/4e333787e0d66a1dca1218e99a891d493dbc8ef1.tar.gz",
             "https://github.com/intel/mkl-dnn/archive/4e333787e0d66a1dca1218e99a891d493dbc8ef1.tar.gz",
         ],
-        sha256 = "363cc9239eacf8e7917753c6d8c94f767e4cd049160d0654a61ef32d5e1b3049",
-        strip_prefix = "mkl-dnn-4e333787e0d66a1dca1218e99a891d493dbc8ef1",
-        build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
     )
 
     tf_http_archive(
         name = "com_google_absl",
+        build_file = clean_dep("//third_party:com_google_absl.BUILD"),
+        sha256 = "278a1af58b633be886fe81bf7061dca6b5fea99566850d1319fffdaa1a061792",
+        strip_prefix = "abseil-cpp-e291c279e458761e77a69b09b129d3d1e81f1e80",
         urls = [
             "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/e291c279e458761e77a69b09b129d3d1e81f1e80.tar.gz",
             "https://github.com/abseil/abseil-cpp/archive/e291c279e458761e77a69b09b129d3d1e81f1e80.tar.gz",
         ],
-        sha256 = "278a1af58b633be886fe81bf7061dca6b5fea99566850d1319fffdaa1a061792",
-        strip_prefix = "abseil-cpp-e291c279e458761e77a69b09b129d3d1e81f1e80",
-        build_file = clean_dep("//third_party:com_google_absl.BUILD"),
     )
 
     tf_http_archive(
         name = "eigen_archive",
+        build_file = clean_dep("//third_party:eigen.BUILD"),
+        sha256 = "d956415d784fa4e42b6a2a45c32556d6aec9d0a3d8ef48baee2522ab762556a9",
+        strip_prefix = "eigen-eigen-fd6845384b86",
         urls = [
             "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/fd6845384b86.tar.gz",
             "https://bitbucket.org/eigen/eigen/get/fd6845384b86.tar.gz",
         ],
-        sha256 = "d956415d784fa4e42b6a2a45c32556d6aec9d0a3d8ef48baee2522ab762556a9",
-        strip_prefix = "eigen-eigen-fd6845384b86",
-        build_file = clean_dep("//third_party:eigen.BUILD"),
     )
 
     tf_http_archive(
         name = "arm_compiler",
+        build_file = clean_dep("//:arm_compiler.BUILD"),
         sha256 = "970285762565c7890c6c087d262b0a18286e7d0384f13a37786d8521773bc969",
         strip_prefix = "tools-0e906ebc527eab1cdbf7adabff5b474da9562e9f/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf",
         urls = [
@@ -139,216 +140,211 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
             # remove the whitelist entry in third_party/repo.bzl.
             # "https://github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
         ],
-        build_file = clean_dep("//:arm_compiler.BUILD"),
     )
 
     tf_http_archive(
         name = "libxsmm_archive",
+        build_file = clean_dep("//third_party:libxsmm.BUILD"),
+        sha256 = "cd8532021352b4a0290d209f7f9bfd7c2411e08286a893af3577a43457287bfa",
+        strip_prefix = "libxsmm-1.9",
         urls = [
             "https://mirror.bazel.build/github.com/hfp/libxsmm/archive/1.9.tar.gz",
             "https://github.com/hfp/libxsmm/archive/1.9.tar.gz",
         ],
-        sha256 = "cd8532021352b4a0290d209f7f9bfd7c2411e08286a893af3577a43457287bfa",
-        strip_prefix = "libxsmm-1.9",
-        build_file = clean_dep("//third_party:libxsmm.BUILD"),
     )
 
     tf_http_archive(
         name = "ortools_archive",
+        build_file = clean_dep("//third_party:ortools.BUILD"),
+        sha256 = "d025a95f78b5fc5eaa4da5f395f23d11c23cf7dbd5069f1f627f002de87b86b9",
+        strip_prefix = "or-tools-6.7.2/src",
         urls = [
             "https://mirror.bazel.build/github.com/google/or-tools/archive/v6.7.2.tar.gz",
             "https://github.com/google/or-tools/archive/v6.7.2.tar.gz",
         ],
-        sha256 = "d025a95f78b5fc5eaa4da5f395f23d11c23cf7dbd5069f1f627f002de87b86b9",
-        strip_prefix = "or-tools-6.7.2/src",
-        build_file = clean_dep("//third_party:ortools.BUILD"),
     )
 
     tf_http_archive(
         name = "com_googlesource_code_re2",
+        sha256 = "803c7811146edeef8f91064de37c6f19136ff01a2a8cdb3230e940b2fd9f07fe",
+        strip_prefix = "re2-2018-07-01",
+        system_build_file = clean_dep("//third_party/systemlibs:re2.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/google/re2/archive/2018-07-01.tar.gz",
             "https://github.com/google/re2/archive/2018-07-01.tar.gz",
         ],
-        sha256 = "803c7811146edeef8f91064de37c6f19136ff01a2a8cdb3230e940b2fd9f07fe",
-        strip_prefix = "re2-2018-07-01",
-        system_build_file = clean_dep("//third_party/systemlibs:re2.BUILD"),
     )
 
     tf_http_archive(
         name = "com_github_googlecloudplatform_google_cloud_cpp",
-        urls = [
-            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/14760a86c4ffab9943b476305c4fe927ad95db1c.tar.gz",
-            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/14760a86c4ffab9943b476305c4fe927ad95db1c.tar.gz",
-        ],
         sha256 = "fdd3b3aecce60987e5525e55bf3a21d68a8695320bd5b980775af6507eec3944",
         strip_prefix = "google-cloud-cpp-14760a86c4ffab9943b476305c4fe927ad95db1c",
         system_build_file = clean_dep("//third_party/systemlibs:google_cloud_cpp.BUILD"),
         system_link_files = {
             "//third_party/systemlibs:google_cloud_cpp.google.cloud.bigtable.BUILD": "google/cloud/bigtable/BUILD",
         },
+        urls = [
+            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/14760a86c4ffab9943b476305c4fe927ad95db1c.tar.gz",
+            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/14760a86c4ffab9943b476305c4fe927ad95db1c.tar.gz",
+        ],
     )
 
     tf_http_archive(
         name = "com_github_googleapis_googleapis",
+        build_file = clean_dep("//third_party:googleapis.BUILD"),
+        sha256 = "824870d87a176f26bcef663e92051f532fac756d1a06b404055dc078425f4378",
+        strip_prefix = "googleapis-f81082ea1e2f85c43649bee26e0d9871d4b41cdb",
+        system_build_file = clean_dep("//third_party/systemlibs:googleapis.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/googleapis/googleapis/archive/f81082ea1e2f85c43649bee26e0d9871d4b41cdb.zip",
             "https://github.com/googleapis/googleapis/archive/f81082ea1e2f85c43649bee26e0d9871d4b41cdb.zip",
         ],
-        sha256 = "824870d87a176f26bcef663e92051f532fac756d1a06b404055dc078425f4378",
-        strip_prefix = "googleapis-f81082ea1e2f85c43649bee26e0d9871d4b41cdb",
-        build_file = clean_dep("//third_party:googleapis.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:googleapis.BUILD"),
     )
 
     tf_http_archive(
         name = "gemmlowp",
+        sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658",
+        strip_prefix = "gemmlowp-38ebac7b059e84692f53e5938f97a9943c120d98",
         urls = [
             "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
             "https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
         ],
-        sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658",
-        strip_prefix = "gemmlowp-38ebac7b059e84692f53e5938f97a9943c120d98",
     )
 
     tf_http_archive(
         name = "farmhash_archive",
+        build_file = clean_dep("//third_party:farmhash.BUILD"),
+        sha256 = "6560547c63e4af82b0f202cb710ceabb3f21347a4b996db565a411da5b17aba0",
+        strip_prefix = "farmhash-816a4ae622e964763ca0862d9dbd19324a1eaf45",
         urls = [
             "https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz",
             "https://github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz",
         ],
-        sha256 = "6560547c63e4af82b0f202cb710ceabb3f21347a4b996db565a411da5b17aba0",
-        strip_prefix = "farmhash-816a4ae622e964763ca0862d9dbd19324a1eaf45",
-        build_file = clean_dep("//third_party:farmhash.BUILD"),
     )
 
     tf_http_archive(
         name = "highwayhash",
+        build_file = clean_dep("//third_party:highwayhash.BUILD"),
+        sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37",
+        strip_prefix = "highwayhash-fd3d9af80465e4383162e4a7c5e2f406e82dd968",
         urls = [
             "http://mirror.bazel.build/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz",
             "https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz",
         ],
-        sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37",
-        strip_prefix = "highwayhash-fd3d9af80465e4383162e4a7c5e2f406e82dd968",
-        build_file = clean_dep("//third_party:highwayhash.BUILD"),
     )
 
     tf_http_archive(
         name = "nasm",
+        build_file = clean_dep("//third_party:nasm.BUILD"),
+        sha256 = "63ec86477ad3f0f6292325fd89e1d93aea2e2fd490070863f17d48f7cd387011",
+        strip_prefix = "nasm-2.13.03",
+        system_build_file = clean_dep("//third_party/systemlibs:nasm.BUILD"),
         urls = [
             "https://mirror.bazel.build/www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2",
             "http://pkgs.fedoraproject.org/repo/pkgs/nasm/nasm-2.13.03.tar.bz2/sha512/d7a6b4cee8dfd603d8d4c976e5287b5cc542fa0b466ff989b743276a6e28114e64289bf02a7819eca63142a5278aa6eed57773007e5f589e15768e6456a8919d/nasm-2.13.03.tar.bz2",
             "http://www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2",
         ],
-        sha256 = "63ec86477ad3f0f6292325fd89e1d93aea2e2fd490070863f17d48f7cd387011",
-        strip_prefix = "nasm-2.13.03",
-        build_file = clean_dep("//third_party:nasm.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:nasm.BUILD"),
     )
 
     tf_http_archive(
         name = "jpeg",
+        build_file = clean_dep("//third_party/jpeg:jpeg.BUILD"),
+        sha256 = "f892fff427ab3adffc289363eac26d197ce3ccacefe5f5822377348a8166069b",
+        strip_prefix = "libjpeg-turbo-2.0.0",
+        system_build_file = clean_dep("//third_party/systemlibs:jpeg.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz",
             "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz",
         ],
-        sha256 = "f892fff427ab3adffc289363eac26d197ce3ccacefe5f5822377348a8166069b",
-        strip_prefix = "libjpeg-turbo-2.0.0",
-        build_file = clean_dep("//third_party/jpeg:jpeg.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:jpeg.BUILD"),
     )
 
     tf_http_archive(
         name = "png_archive",
+        build_file = clean_dep("//third_party:png.BUILD"),
+        patch_file = clean_dep("//third_party:png_fix_rpi.patch"),
+        sha256 = "e45ce5f68b1d80e2cb9a2b601605b374bdf51e1798ef1c2c2bd62131dfcf9eef",
+        strip_prefix = "libpng-1.6.34",
+        system_build_file = clean_dep("//third_party/systemlibs:png.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.6.34.tar.gz",
             "https://github.com/glennrp/libpng/archive/v1.6.34.tar.gz",
         ],
-        sha256 = "e45ce5f68b1d80e2cb9a2b601605b374bdf51e1798ef1c2c2bd62131dfcf9eef",
-        strip_prefix = "libpng-1.6.34",
-        build_file = clean_dep("//third_party:png.BUILD"),
-        patch_file = clean_dep("//third_party:png_fix_rpi.patch"),
-        system_build_file = clean_dep("//third_party/systemlibs:png.BUILD"),
     )
 
     tf_http_archive(
         name = "org_sqlite",
+        build_file = clean_dep("//third_party:sqlite.BUILD"),
+        sha256 = "ad68c1216c3a474cf360c7581a4001e952515b3649342100f2d7ca7c8e313da6",
+        strip_prefix = "sqlite-amalgamation-3240000",
+        system_build_file = clean_dep("//third_party/systemlibs:sqlite.BUILD"),
         urls = [
             "https://mirror.bazel.build/www.sqlite.org/2018/sqlite-amalgamation-3240000.zip",
             "https://www.sqlite.org/2018/sqlite-amalgamation-3240000.zip",
         ],
-        sha256 = "ad68c1216c3a474cf360c7581a4001e952515b3649342100f2d7ca7c8e313da6",
-        strip_prefix = "sqlite-amalgamation-3240000",
-        build_file = clean_dep("//third_party:sqlite.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:sqlite.BUILD"),
     )
 
     tf_http_archive(
         name = "gif_archive",
+        build_file = clean_dep("//third_party:gif.BUILD"),
+        sha256 = "34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1",
+        strip_prefix = "giflib-5.1.4",
+        system_build_file = clean_dep("//third_party/systemlibs:gif.BUILD"),
         urls = [
             "https://mirror.bazel.build/ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
             "http://pilotfiber.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
         ],
-        sha256 = "34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1",
-        strip_prefix = "giflib-5.1.4",
-        build_file = clean_dep("//third_party:gif.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:gif.BUILD"),
     )
 
     tf_http_archive(
         name = "six_archive",
+        build_file = clean_dep("//third_party:six.BUILD"),
+        sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
+        strip_prefix = "six-1.10.0",
+        system_build_file = clean_dep("//third_party/systemlibs:six.BUILD"),
         urls = [
             "https://mirror.bazel.build/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
             "https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
         ],
-        sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
-        strip_prefix = "six-1.10.0",
-        build_file = clean_dep("//third_party:six.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:six.BUILD"),
     )
 
     tf_http_archive(
         name = "astor_archive",
+        build_file = clean_dep("//third_party:astor.BUILD"),
+        sha256 = "ff6d2e2962d834acb125cc4dcc80c54a8c17c253f4cc9d9c43b5102a560bb75d",
+        strip_prefix = "astor-0.6.2",
+        system_build_file = clean_dep("//third_party/systemlibs:astor.BUILD"),
         urls = [
             "https://mirror.bazel.build/pypi.python.org/packages/d8/be/c4276b3199ec3feee2a88bc64810fbea8f26d961e0a4cd9c68387a9f35de/astor-0.6.2.tar.gz",
             "https://pypi.python.org/packages/d8/be/c4276b3199ec3feee2a88bc64810fbea8f26d961e0a4cd9c68387a9f35de/astor-0.6.2.tar.gz",
         ],
-        sha256 = "ff6d2e2962d834acb125cc4dcc80c54a8c17c253f4cc9d9c43b5102a560bb75d",
-        strip_prefix = "astor-0.6.2",
-        build_file = clean_dep("//third_party:astor.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:astor.BUILD"),
     )
 
     tf_http_archive(
         name = "gast_archive",
+        build_file = clean_dep("//third_party:gast.BUILD"),
+        sha256 = "7068908321ecd2774f145193c4b34a11305bd104b4551b09273dfd1d6a374930",
+        strip_prefix = "gast-0.2.0",
+        system_build_file = clean_dep("//third_party/systemlibs:gast.BUILD"),
         urls = [
             "https://mirror.bazel.build/pypi.python.org/packages/5c/78/ff794fcae2ce8aa6323e789d1f8b3b7765f601e7702726f430e814822b96/gast-0.2.0.tar.gz",
             "https://pypi.python.org/packages/5c/78/ff794fcae2ce8aa6323e789d1f8b3b7765f601e7702726f430e814822b96/gast-0.2.0.tar.gz",
         ],
-        sha256 = "7068908321ecd2774f145193c4b34a11305bd104b4551b09273dfd1d6a374930",
-        strip_prefix = "gast-0.2.0",
-        build_file = clean_dep("//third_party:gast.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:gast.BUILD"),
     )
 
     tf_http_archive(
         name = "termcolor_archive",
+        build_file = clean_dep("//third_party:termcolor.BUILD"),
+        sha256 = "1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b",
+        strip_prefix = "termcolor-1.1.0",
+        system_build_file = clean_dep("//third_party/systemlibs:termcolor.BUILD"),
         urls = [
             "https://mirror.bazel.build/pypi.python.org/packages/8a/48/a76be51647d0eb9f10e2a4511bf3ffb8cc1e6b14e9e4fab46173aa79f981/termcolor-1.1.0.tar.gz",
             "https://pypi.python.org/packages/8a/48/a76be51647d0eb9f10e2a4511bf3ffb8cc1e6b14e9e4fab46173aa79f981/termcolor-1.1.0.tar.gz",
         ],
-        sha256 = "1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b",
-        strip_prefix = "termcolor-1.1.0",
-        build_file = clean_dep("//third_party:termcolor.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:termcolor.BUILD"),
     )
 
     tf_http_archive(
         name = "absl_py",
-        urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz",
-            "https://github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz",
-        ],
         sha256 = "95160f778a62c7a60ddeadc7bf2d83f85a23a27359814aca12cf949e896fa82c",
         strip_prefix = "abseil-py-pypi-v0.2.2",
         system_build_file = clean_dep("//third_party/systemlibs:absl_py.BUILD"),
@@ -356,17 +352,21 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
             "//third_party/systemlibs:absl_py.absl.flags.BUILD": "absl/flags/BUILD",
             "//third_party/systemlibs:absl_py.absl.testing.BUILD": "absl/testing/BUILD",
         },
+        urls = [
+            "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz",
+            "https://github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz",
+        ],
     )
 
     tf_http_archive(
         name = "org_python_pypi_backports_weakref",
+        build_file = clean_dep("//third_party:backports_weakref.BUILD"),
+        sha256 = "8813bf712a66b3d8b85dc289e1104ed220f1878cf981e2fe756dfaabe9a82892",
+        strip_prefix = "backports.weakref-1.0rc1/src",
         urls = [
             "https://mirror.bazel.build/pypi.python.org/packages/bc/cc/3cdb0a02e7e96f6c70bd971bc8a90b8463fda83e264fa9c5c1c98ceabd81/backports.weakref-1.0rc1.tar.gz",
             "https://pypi.python.org/packages/bc/cc/3cdb0a02e7e96f6c70bd971bc8a90b8463fda83e264fa9c5c1c98ceabd81/backports.weakref-1.0rc1.tar.gz",
         ],
-        sha256 = "8813bf712a66b3d8b85dc289e1104ed220f1878cf981e2fe756dfaabe9a82892",
-        strip_prefix = "backports.weakref-1.0rc1/src",
-        build_file = clean_dep("//third_party:backports_weakref.BUILD"),
     )
 
     filegroup_external(
@@ -389,9 +389,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "protobuf_archive",
-        urls = PROTOBUF_URLS,
         sha256 = PROTOBUF_SHA256,
         strip_prefix = PROTOBUF_STRIP_PREFIX,
+        urls = PROTOBUF_URLS,
     )
 
     # We need to import the protobuf library under the names com_google_protobuf
@@ -399,222 +399,222 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     # Unfortunately there is no way to alias http_archives at the moment.
     tf_http_archive(
         name = "com_google_protobuf",
-        urls = PROTOBUF_URLS,
         sha256 = PROTOBUF_SHA256,
         strip_prefix = PROTOBUF_STRIP_PREFIX,
+        urls = PROTOBUF_URLS,
     )
 
     tf_http_archive(
         name = "com_google_protobuf_cc",
-        urls = PROTOBUF_URLS,
         sha256 = PROTOBUF_SHA256,
         strip_prefix = PROTOBUF_STRIP_PREFIX,
+        urls = PROTOBUF_URLS,
     )
 
     tf_http_archive(
         name = "nsync",
+        sha256 = "692f9b30e219f71a6371b98edd39cef3cbda35ac3abc4cd99ce19db430a5591a",
+        strip_prefix = "nsync-1.20.1",
+        system_build_file = clean_dep("//third_party/systemlibs:nsync.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/google/nsync/archive/1.20.1.tar.gz",
             "https://github.com/google/nsync/archive/1.20.1.tar.gz",
         ],
-        sha256 = "692f9b30e219f71a6371b98edd39cef3cbda35ac3abc4cd99ce19db430a5591a",
-        strip_prefix = "nsync-1.20.1",
-        system_build_file = clean_dep("//third_party/systemlibs:nsync.BUILD"),
     )
 
     tf_http_archive(
         name = "com_google_googletest",
+        sha256 = "353ab86e35cea1cd386115279cf4b16695bbf21b897bfbf2721cf4cb5f64ade8",
+        strip_prefix = "googletest-997d343dd680e541ef96ce71ee54a91daf2577a0",
         urls = [
             "https://mirror.bazel.build/github.com/google/googletest/archive/997d343dd680e541ef96ce71ee54a91daf2577a0.zip",
             "https://github.com/google/googletest/archive/997d343dd680e541ef96ce71ee54a91daf2577a0.zip",
         ],
-        sha256 = "353ab86e35cea1cd386115279cf4b16695bbf21b897bfbf2721cf4cb5f64ade8",
-        strip_prefix = "googletest-997d343dd680e541ef96ce71ee54a91daf2577a0",
     )
 
     tf_http_archive(
         name = "com_github_gflags_gflags",
+        sha256 = "ae27cdbcd6a2f935baa78e4f21f675649271634c092b1be01469440495609d0e",
+        strip_prefix = "gflags-2.2.1",
         urls = [
             "https://mirror.bazel.build/github.com/gflags/gflags/archive/v2.2.1.tar.gz",
             "https://github.com/gflags/gflags/archive/v2.2.1.tar.gz",
         ],
-        sha256 = "ae27cdbcd6a2f935baa78e4f21f675649271634c092b1be01469440495609d0e",
-        strip_prefix = "gflags-2.2.1",
     )
 
     tf_http_archive(
         name = "pcre",
+        build_file = clean_dep("//third_party:pcre.BUILD"),
         sha256 = "69acbc2fbdefb955d42a4c606dfde800c2885711d2979e356c0636efde9ec3b5",
+        strip_prefix = "pcre-8.42",
+        system_build_file = clean_dep("//third_party/systemlibs:pcre.BUILD"),
         urls = [
             "https://mirror.bazel.build/ftp.exim.org/pub/pcre/pcre-8.42.tar.gz",
             "http://ftp.exim.org/pub/pcre/pcre-8.42.tar.gz",
         ],
-        strip_prefix = "pcre-8.42",
-        build_file = clean_dep("//third_party:pcre.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:pcre.BUILD"),
     )
 
     tf_http_archive(
         name = "swig",
+        build_file = clean_dep("//third_party:swig.BUILD"),
         sha256 = "58a475dbbd4a4d7075e5fe86d4e54c9edde39847cdb96a3053d87cb64a23a453",
+        strip_prefix = "swig-3.0.8",
+        system_build_file = clean_dep("//third_party/systemlibs:swig.BUILD"),
         urls = [
             "https://mirror.bazel.build/ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
             "http://ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
             "http://pilotfiber.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
         ],
-        strip_prefix = "swig-3.0.8",
-        build_file = clean_dep("//third_party:swig.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:swig.BUILD"),
     )
 
     tf_http_archive(
         name = "curl",
+        build_file = clean_dep("//third_party:curl.BUILD"),
         sha256 = "e9c37986337743f37fd14fe8737f246e97aec94b39d1b71e8a5973f72a9fc4f5",
+        strip_prefix = "curl-7.60.0",
+        system_build_file = clean_dep("//third_party/systemlibs:curl.BUILD"),
         urls = [
             "https://mirror.bazel.build/curl.haxx.se/download/curl-7.60.0.tar.gz",
             "https://curl.haxx.se/download/curl-7.60.0.tar.gz",
         ],
-        strip_prefix = "curl-7.60.0",
-        build_file = clean_dep("//third_party:curl.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:curl.BUILD"),
     )
 
     tf_http_archive(
         name = "grpc",
+        sha256 = "50db9cf2221354485eb7c3bd55a4c27190caef7048a2a1a15fbe60a498f98b44",
+        strip_prefix = "grpc-1.13.0",
+        system_build_file = clean_dep("//third_party/systemlibs:grpc.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/grpc/grpc/archive/v1.13.0.tar.gz",
             "https://github.com/grpc/grpc/archive/v1.13.0.tar.gz",
         ],
-        sha256 = "50db9cf2221354485eb7c3bd55a4c27190caef7048a2a1a15fbe60a498f98b44",
-        strip_prefix = "grpc-1.13.0",
-        system_build_file = clean_dep("//third_party/systemlibs:grpc.BUILD"),
     )
 
     tf_http_archive(
         name = "linenoise",
+        build_file = clean_dep("//third_party:linenoise.BUILD"),
         sha256 = "7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7",
+        strip_prefix = "linenoise-c894b9e59f02203dbe4e2be657572cf88c4230c3",
         urls = [
             "https://mirror.bazel.build/github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
             "https://github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
         ],
-        strip_prefix = "linenoise-c894b9e59f02203dbe4e2be657572cf88c4230c3",
-        build_file = clean_dep("//third_party:linenoise.BUILD"),
     )
 
     # TODO(phawkins): currently, this rule uses an unofficial LLVM mirror.
     # Switch to an official source of snapshots if/when possible.
     tf_http_archive(
         name = "llvm",
+        build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
+        sha256 = "a4f8bfe7e3e69069934a87e612a1d4d3b8b6af13e0f1213a42a6046e1bcd50d8",
+        strip_prefix = "llvm-d3429e96fe1e45b1dc0106463832523f37faf271",
         urls = [
             "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/d3429e96fe1e45b1dc0106463832523f37faf271.tar.gz",
             "https://github.com/llvm-mirror/llvm/archive/d3429e96fe1e45b1dc0106463832523f37faf271.tar.gz",
         ],
-        sha256 = "a4f8bfe7e3e69069934a87e612a1d4d3b8b6af13e0f1213a42a6046e1bcd50d8",
-        strip_prefix = "llvm-d3429e96fe1e45b1dc0106463832523f37faf271",
-        build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
     )
 
     tf_http_archive(
         name = "lmdb",
+        build_file = clean_dep("//third_party:lmdb.BUILD"),
+        sha256 = "f3927859882eb608868c8c31586bb7eb84562a40a6bf5cc3e13b6b564641ea28",
+        strip_prefix = "lmdb-LMDB_0.9.22/libraries/liblmdb",
+        system_build_file = clean_dep("//third_party/systemlibs:lmdb.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.22.tar.gz",
             "https://github.com/LMDB/lmdb/archive/LMDB_0.9.22.tar.gz",
         ],
-        sha256 = "f3927859882eb608868c8c31586bb7eb84562a40a6bf5cc3e13b6b564641ea28",
-        strip_prefix = "lmdb-LMDB_0.9.22/libraries/liblmdb",
-        build_file = clean_dep("//third_party:lmdb.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:lmdb.BUILD"),
     )
 
     tf_http_archive(
         name = "jsoncpp_git",
+        build_file = clean_dep("//third_party:jsoncpp.BUILD"),
+        sha256 = "c49deac9e0933bcb7044f08516861a2d560988540b23de2ac1ad443b219afdb6",
+        strip_prefix = "jsoncpp-1.8.4",
+        system_build_file = clean_dep("//third_party/systemlibs:jsoncpp.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/1.8.4.tar.gz",
             "https://github.com/open-source-parsers/jsoncpp/archive/1.8.4.tar.gz",
         ],
-        sha256 = "c49deac9e0933bcb7044f08516861a2d560988540b23de2ac1ad443b219afdb6",
-        strip_prefix = "jsoncpp-1.8.4",
-        build_file = clean_dep("//third_party:jsoncpp.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:jsoncpp.BUILD"),
     )
 
     tf_http_archive(
         name = "boringssl",
+        sha256 = "1188e29000013ed6517168600fc35a010d58c5d321846d6a6dfee74e4c788b45",
+        strip_prefix = "boringssl-7f634429a04abc48e2eb041c81c5235816c96514",
+        system_build_file = clean_dep("//third_party/systemlibs:boringssl.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/google/boringssl/archive/7f634429a04abc48e2eb041c81c5235816c96514.tar.gz",
             "https://github.com/google/boringssl/archive/7f634429a04abc48e2eb041c81c5235816c96514.tar.gz",
         ],
-        sha256 = "1188e29000013ed6517168600fc35a010d58c5d321846d6a6dfee74e4c788b45",
-        strip_prefix = "boringssl-7f634429a04abc48e2eb041c81c5235816c96514",
-        system_build_file = clean_dep("//third_party/systemlibs:boringssl.BUILD"),
     )
 
     tf_http_archive(
         name = "zlib_archive",
+        build_file = clean_dep("//third_party:zlib.BUILD"),
+        sha256 = "c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1",
+        strip_prefix = "zlib-1.2.11",
+        system_build_file = clean_dep("//third_party/systemlibs:zlib.BUILD"),
         urls = [
             "https://mirror.bazel.build/zlib.net/zlib-1.2.11.tar.gz",
             "https://zlib.net/zlib-1.2.11.tar.gz",
         ],
-        sha256 = "c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1",
-        strip_prefix = "zlib-1.2.11",
-        build_file = clean_dep("//third_party:zlib.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:zlib.BUILD"),
     )
 
     tf_http_archive(
         name = "fft2d",
+        build_file = clean_dep("//third_party/fft2d:fft2d.BUILD"),
+        sha256 = "52bb637c70b971958ec79c9c8752b1df5ff0218a4db4510e60826e0cb79b5296",
         urls = [
             "https://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
             "http://www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
         ],
-        sha256 = "52bb637c70b971958ec79c9c8752b1df5ff0218a4db4510e60826e0cb79b5296",
-        build_file = clean_dep("//third_party/fft2d:fft2d.BUILD"),
     )
 
     tf_http_archive(
         name = "snappy",
+        build_file = clean_dep("//third_party:snappy.BUILD"),
+        sha256 = "3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4",
+        strip_prefix = "snappy-1.1.7",
+        system_build_file = clean_dep("//third_party/systemlibs:snappy.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.7.tar.gz",
             "https://github.com/google/snappy/archive/1.1.7.tar.gz",
         ],
-        sha256 = "3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4",
-        strip_prefix = "snappy-1.1.7",
-        build_file = clean_dep("//third_party:snappy.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:snappy.BUILD"),
     )
 
     tf_http_archive(
         name = "nccl_archive",
+        build_file = clean_dep("//third_party:nccl/nccl_archive.BUILD"),
+        sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
+        strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
         urls = [
             "https://mirror.bazel.build/github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
             "https://github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
         ],
-        sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
-        strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
-        build_file = clean_dep("//third_party:nccl/nccl_archive.BUILD"),
     )
 
     tf_http_archive(
         name = "kafka",
+        build_file = clean_dep("//third_party:kafka/BUILD"),
+        patch_file = clean_dep("//third_party/kafka:config.patch"),
+        sha256 = "cc6ebbcd0a826eec1b8ce1f625ffe71b53ef3290f8192b6cae38412a958f4fd3",
+        strip_prefix = "librdkafka-0.11.5",
         urls = [
             "https://mirror.bazel.build/github.com/edenhill/librdkafka/archive/v0.11.5.tar.gz",
             "https://github.com/edenhill/librdkafka/archive/v0.11.5.tar.gz",
         ],
-        sha256 = "cc6ebbcd0a826eec1b8ce1f625ffe71b53ef3290f8192b6cae38412a958f4fd3",
-        strip_prefix = "librdkafka-0.11.5",
-        build_file = clean_dep("//third_party:kafka/BUILD"),
-        patch_file = clean_dep("//third_party/kafka:config.patch"),
     )
 
     tf_http_archive(
         name = "aws",
+        build_file = clean_dep("//third_party:aws.BUILD"),
+        sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
+        strip_prefix = "aws-sdk-cpp-1.3.15",
         urls = [
             "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
             "https://github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
         ],
-        sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
-        strip_prefix = "aws-sdk-cpp-1.3.15",
-        build_file = clean_dep("//third_party:aws.BUILD"),
     )
 
     java_import_external(
@@ -644,14 +644,14 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "jemalloc",
+        build_file = clean_dep("//third_party:jemalloc.BUILD"),
+        sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
+        strip_prefix = "jemalloc-4.4.0",
+        system_build_file = clean_dep("//third_party/systemlibs:jemalloc.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
             "https://github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
         ],
-        sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
-        strip_prefix = "jemalloc-4.4.0",
-        build_file = clean_dep("//third_party:jemalloc.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:jemalloc.BUILD"),
     )
 
     java_import_external(
@@ -700,196 +700,196 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "com_google_pprof",
+        build_file = clean_dep("//third_party:pprof.BUILD"),
+        sha256 = "e0928ca4aa10ea1e0551e2d7ce4d1d7ea2d84b2abbdef082b0da84268791d0c4",
+        strip_prefix = "pprof-c0fb62ec88c411cc91194465e54db2632845b650",
         urls = [
             "https://mirror.bazel.build/github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
             "https://github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
         ],
-        sha256 = "e0928ca4aa10ea1e0551e2d7ce4d1d7ea2d84b2abbdef082b0da84268791d0c4",
-        strip_prefix = "pprof-c0fb62ec88c411cc91194465e54db2632845b650",
-        build_file = clean_dep("//third_party:pprof.BUILD"),
     )
 
     tf_http_archive(
         name = "cub_archive",
+        build_file = clean_dep("//third_party:cub.BUILD"),
+        sha256 = "6bfa06ab52a650ae7ee6963143a0bbc667d6504822cbd9670369b598f18c58c3",
+        strip_prefix = "cub-1.8.0",
         urls = [
             "https://mirror.bazel.build/github.com/NVlabs/cub/archive/1.8.0.zip",
             "https://github.com/NVlabs/cub/archive/1.8.0.zip",
         ],
-        sha256 = "6bfa06ab52a650ae7ee6963143a0bbc667d6504822cbd9670369b598f18c58c3",
-        strip_prefix = "cub-1.8.0",
-        build_file = clean_dep("//third_party:cub.BUILD"),
     )
 
     tf_http_archive(
         name = "cython",
+        build_file = clean_dep("//third_party:cython.BUILD"),
+        delete = ["BUILD.bazel"],
         sha256 = "bccc9aa050ea02595b2440188813b936eaf345e85fb9692790cecfe095cf91aa",
+        strip_prefix = "cython-0.28.4",
+        system_build_file = clean_dep("//third_party/systemlibs:cython.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/cython/cython/archive/0.28.4.tar.gz",
             "https://github.com/cython/cython/archive/0.28.4.tar.gz",
         ],
-        strip_prefix = "cython-0.28.4",
-        build_file = clean_dep("//third_party:cython.BUILD"),
-        delete = ["BUILD.bazel"],
-        system_build_file = clean_dep("//third_party/systemlibs:cython.BUILD"),
     )
 
     tf_http_archive(
         name = "bazel_toolchains",
+        sha256 = "3b604699685c5c65dd3f6f17425570a4b2f00ddba2f750db15acc72e55bb098b",
+        strip_prefix = "bazel-toolchains-37acf1841ab1475c98a152cb9e446460c8ae29e1",
         urls = [
             "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/37acf1841ab1475c98a152cb9e446460c8ae29e1.tar.gz",
             "https://github.com/bazelbuild/bazel-toolchains/archive/37acf1841ab1475c98a152cb9e446460c8ae29e1.tar.gz",
         ],
-        strip_prefix = "bazel-toolchains-37acf1841ab1475c98a152cb9e446460c8ae29e1",
-        sha256 = "3b604699685c5c65dd3f6f17425570a4b2f00ddba2f750db15acc72e55bb098b",
     )
 
     tf_http_archive(
         name = "arm_neon_2_x86_sse",
+        build_file = clean_dep("//third_party:arm_neon_2_x86_sse.BUILD"),
         sha256 = "c8d90aa4357f8079d427e87a6f4c493da1fa4140aee926c05902d7ec1533d9a5",
         strip_prefix = "ARM_NEON_2_x86_SSE-0f77d9d182265259b135dad949230ecbf1a2633d",
         urls = [
             "https://mirror.bazel.build/github.com/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz",
             "https://github.com/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz",
         ],
-        build_file = clean_dep("//third_party:arm_neon_2_x86_sse.BUILD"),
     )
 
     tf_http_archive(
         name = "double_conversion",
+        build_file = clean_dep("//third_party:double_conversion.BUILD"),
+        sha256 = "2f7fbffac0d98d201ad0586f686034371a6d152ca67508ab611adc2386ad30de",
+        strip_prefix = "double-conversion-3992066a95b823efc8ccc1baf82a1cfc73f6e9b8",
+        system_build_file = clean_dep("//third_party/systemlibs:double_conversion.BUILD"),
         urls = [
             "https://mirror.bazel.build/github.com/google/double-conversion/archive/3992066a95b823efc8ccc1baf82a1cfc73f6e9b8.zip",
             "https://github.com/google/double-conversion/archive/3992066a95b823efc8ccc1baf82a1cfc73f6e9b8.zip",
         ],
-        sha256 = "2f7fbffac0d98d201ad0586f686034371a6d152ca67508ab611adc2386ad30de",
-        strip_prefix = "double-conversion-3992066a95b823efc8ccc1baf82a1cfc73f6e9b8",
-        build_file = clean_dep("//third_party:double_conversion.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:double_conversion.BUILD"),
     )
 
     tf_http_archive(
         name = "tflite_mobilenet",
+        build_file = clean_dep("//third_party:tflite_mobilenet.BUILD"),
         sha256 = "23f814d1c076bdf03715dfb6cab3713aa4fbdf040fd5448c43196bd2e97a4c1b",
         urls = [
             "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
             "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
         ],
-        build_file = clean_dep("//third_party:tflite_mobilenet.BUILD"),
     )
 
     tf_http_archive(
         name = "tflite_mobilenet_ssd",
+        build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
         sha256 = "767057f2837a46d97882734b03428e8dd640b93236052b312b2f0e45613c1cf0",
         urls = [
             "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip",
             "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip",
         ],
-        build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
     )
 
     tf_http_archive(
         name = "tflite_mobilenet_ssd_quant",
+        build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
         sha256 = "a809cd290b4d6a2e8a9d5dad076e0bd695b8091974e0eed1052b480b2f21b6dc",
         urls = [
             "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_0.75_quant_2018_06_29.zip",
             "https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_0.75_quant_2018_06_29.zip",
         ],
-        build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
     )
 
     tf_http_archive(
         name = "tflite_mobilenet_ssd_quant_protobuf",
+        build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
         sha256 = "09280972c5777f1aa775ef67cb4ac5d5ed21970acd8535aeca62450ef14f0d79",
+        strip_prefix = "ssd_mobilenet_v1_quantized_300x300_coco14_sync_2018_07_18",
         urls = [
             "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_quantized_300x300_coco14_sync_2018_07_18.tar.gz",
             "http://storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_quantized_300x300_coco14_sync_2018_07_18.tar.gz",
         ],
-        strip_prefix = "ssd_mobilenet_v1_quantized_300x300_coco14_sync_2018_07_18",
-        build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
     )
 
     tf_http_archive(
         name = "tflite_conv_actions_frozen",
+        build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
         sha256 = "d947b38cba389b5e2d0bfc3ea6cc49c784e187b41a071387b3742d1acac7691e",
         urls = [
             "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip",
             "https://storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip",
         ],
-        build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
     )
 
     tf_http_archive(
         name = "tflite_smartreply",
+        build_file = clean_dep("//third_party:tflite_smartreply.BUILD"),
         sha256 = "8980151b85a87a9c1a3bb1ed4748119e4a85abd3cb5744d83da4d4bd0fbeef7c",
         urls = [
             "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip",
             "https://storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip",
         ],
-        build_file = clean_dep("//third_party:tflite_smartreply.BUILD"),
     )
 
     tf_http_archive(
         name = "tflite_ovic_testdata",
+        build_file = clean_dep("//third_party:tflite_ovic_testdata.BUILD"),
         sha256 = "a9a705d8d519220178e2e65d383fdb21da37fdb31d1e909b0a1acdac46479e9c",
+        strip_prefix = "ovic",
         urls = [
             "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/data/ovic.zip",
             "https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip",
         ],
-        build_file = clean_dep("//third_party:tflite_ovic_testdata.BUILD"),
-        strip_prefix = "ovic",
     )
 
     tf_http_archive(
         name = "build_bazel_rules_android",
         sha256 = "cd06d15dd8bb59926e4d65f9003bfc20f9da4b2519985c27e190cddc8b7a7806",
+        strip_prefix = "rules_android-0.1.1",
         urls = [
             "https://mirror.bazel.build/github.com/bazelbuild/rules_android/archive/v0.1.1.zip",
             "https://github.com/bazelbuild/rules_android/archive/v0.1.1.zip",
         ],
-        strip_prefix = "rules_android-0.1.1",
     )
 
     tf_http_archive(
         name = "tbb",
+        build_file = clean_dep("//third_party/ngraph:tbb.BUILD"),
+        sha256 = "724686f90bcda78f13b76f297d964008737ccd6399328143c1c0093e73ae6a13",
+        strip_prefix = "tbb-tbb_2018",
         urls = [
             "https://mirror.bazel.build/github.com/01org/tbb/archive/tbb_2018.zip",
             "https://github.com/01org/tbb/archive/tbb_2018.zip",
         ],
-        sha256 = "724686f90bcda78f13b76f297d964008737ccd6399328143c1c0093e73ae6a13",
-        strip_prefix = "tbb-tbb_2018",
-        build_file = clean_dep("//third_party/ngraph:tbb.BUILD"),
     )
 
     tf_http_archive(
         name = "ngraph",
+        build_file = clean_dep("//third_party/ngraph:ngraph.BUILD"),
+        sha256 = "bf9dcc88e5c66021e3aac80491a231711211540d613bf9b6bd28db3f5bb86b62",
+        strip_prefix = "ngraph-0.8.1",
         urls = [
             "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.8.1.tar.gz",
             "https://github.com/NervanaSystems/ngraph/archive/v0.8.1.tar.gz",
         ],
-        sha256 = "bf9dcc88e5c66021e3aac80491a231711211540d613bf9b6bd28db3f5bb86b62",
-        strip_prefix = "ngraph-0.8.1",
-        build_file = clean_dep("//third_party/ngraph:ngraph.BUILD"),
     )
 
     tf_http_archive(
         name = "nlohmann_json_lib",
+        build_file = clean_dep("//third_party/ngraph:nlohmann_json.BUILD"),
+        sha256 = "9f3549824af3ca7e9707a2503959886362801fb4926b869789d6929098a79e47",
+        strip_prefix = "json-3.1.1",
         urls = [
             "https://mirror.bazel.build/github.com/nlohmann/json/archive/v3.1.1.tar.gz",
             "https://github.com/nlohmann/json/archive/v3.1.1.tar.gz",
         ],
-        sha256 = "9f3549824af3ca7e9707a2503959886362801fb4926b869789d6929098a79e47",
-        strip_prefix = "json-3.1.1",
-        build_file = clean_dep("//third_party/ngraph:nlohmann_json.BUILD"),
     )
 
     tf_http_archive(
         name = "ngraph_tf",
+        build_file = clean_dep("//third_party/ngraph:ngraph_tf.BUILD"),
+        sha256 = "402f84c748c113780a60f35f39aab118435285543aee4900d712b76fbf8a21ee",
+        strip_prefix = "ngraph-tf-0.6.1",
         urls = [
             "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.6.1.tar.gz",
             "https://github.com/NervanaSystems/ngraph-tf/archive/v0.6.1.tar.gz",
         ],
-        sha256 = "402f84c748c113780a60f35f39aab118435285543aee4900d712b76fbf8a21ee",
-        strip_prefix = "ngraph-tf-0.6.1",
-        build_file = clean_dep("//third_party/ngraph:ngraph_tf.BUILD"),
     )
 
     ##############################################################################
-- 
GitLab


From d56c298f1ef14b5a738e1e0b7bbc66fcd736be3e Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Thu, 27 Sep 2018 21:57:48 -0700
Subject: [PATCH 128/570] Remove AWS, GCP, Kafka and HDFS options from
 configure. Make them default on for supported operating systems.

PiperOrigin-RevId: 214886845
---
 configure.py                                  | 12 ---
 tensorflow/BUILD                              | 96 -------------------
 tensorflow/contrib/BUILD                      | 50 +++-------
 .../core/platform/default/build_config.bzl    | 45 ++++-----
 tensorflow/tools/lib_package/BUILD            | 38 ++++----
 tensorflow/tools/pip_package/BUILD            | 27 ++----
 6 files changed, 55 insertions(+), 213 deletions(-)

diff --git a/configure.py b/configure.py
index f71caa1994..55fce8b93b 100644
--- a/configure.py
+++ b/configure.py
@@ -1488,11 +1488,7 @@ def main():
   setup_python(environ_cp)
 
   if is_windows():
-    environ_cp['TF_NEED_AWS'] = '0'
-    environ_cp['TF_NEED_GCP'] = '0'
-    environ_cp['TF_NEED_HDFS'] = '0'
     environ_cp['TF_NEED_JEMALLOC'] = '0'
-    environ_cp['TF_NEED_KAFKA'] = '0'
     environ_cp['TF_NEED_OPENCL_SYCL'] = '0'
     environ_cp['TF_NEED_COMPUTECPP'] = '0'
     environ_cp['TF_NEED_OPENCL'] = '0'
@@ -1518,14 +1514,6 @@ def main():
 
   set_build_var(environ_cp, 'TF_NEED_JEMALLOC', 'jemalloc as malloc',
                 'with_jemalloc', True)
-  set_build_var(environ_cp, 'TF_NEED_GCP', 'Google Cloud Platform',
-                'with_gcp_support', True, 'gcp')
-  set_build_var(environ_cp, 'TF_NEED_HDFS', 'Hadoop File System',
-                'with_hdfs_support', True, 'hdfs')
-  set_build_var(environ_cp, 'TF_NEED_AWS', 'Amazon AWS Platform',
-                'with_aws_support', True, 'aws')
-  set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform',
-                'with_kafka_support', True, 'kafka')
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
                 False, 'xla')
 
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 3610eea42a..5f73da68a2 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -224,60 +224,6 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
-config_setting(
-    name = "with_gcp_support",
-    define_values = {"with_gcp_support": "true"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_hdfs_support",
-    define_values = {"with_hdfs_support": "true"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_aws_support",
-    define_values = {"with_aws_support": "true"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_kafka_support",
-    define_values = {"with_kafka_support": "true"},
-    visibility = ["//visibility:public"],
-)
-
-# Crosses between platforms and file system libraries not supported on those
-# platforms due to limitations in nested select() statements.
-config_setting(
-    name = "with_gcp_support_windows_override",
-    define_values = {"with_gcp_support": "true"},
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_hdfs_support_windows_override",
-    define_values = {"with_hdfs_support": "true"},
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_aws_support_windows_override",
-    define_values = {"with_aws_support": "true"},
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_kafka_support_windows_override",
-    define_values = {"with_kafka_support": "true"},
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
 config_setting(
     name = "with_cuda_support_windows_override",
     define_values = {"using_cuda_nvcc": "true"},
@@ -285,48 +231,6 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
-config_setting(
-    name = "with_gcp_support_android_override",
-    define_values = {"with_gcp_support": "true"},
-    values = {"crosstool_top": "//external:android/crosstool"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_hdfs_support_android_override",
-    define_values = {"with_hdfs_support": "true"},
-    values = {"crosstool_top": "//external:android/crosstool"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_aws_support_android_override",
-    define_values = {"with_aws_support": "true"},
-    values = {"crosstool_top": "//external:android/crosstool"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_gcp_support_ios_override",
-    define_values = {"with_gcp_support": "true"},
-    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_hdfs_support_ios_override",
-    define_values = {"with_hdfs_support": "true"},
-    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_aws_support_ios_override",
-    define_values = {"with_aws_support": "true"},
-    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
-    visibility = ["//visibility:public"],
-)
-
 config_setting(
     name = "with_xla_support",
     define_values = {"with_xla_support": "true"},
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index ae5ca32bcf..1a9ae8ac3a 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -112,26 +112,14 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python/estimator:estimator_py",
     ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + select({
-        "//tensorflow:with_kafka_support_windows_override": [],
-        "//tensorflow:with_kafka_support": [
-            "//tensorflow/contrib/kafka",
-        ],
-        "//conditions:default": [],
-    }) + select({
-        "//tensorflow:with_aws_support_windows_override": [],
-        "//tensorflow:with_aws_support": [
-            "//tensorflow/contrib/kinesis",
-        ],
-        "//conditions:default": [],
-    }) + if_not_windows_cuda([
-        "//tensorflow/contrib/fused_conv:fused_conv_py",  # unresolved symbols, need to export more symbols
-    ]) + if_not_windows([
-    ]) + select({
         "//tensorflow:linux_s390x": [],
         "//tensorflow:windows": [],
         "//conditions:default": [
             "//tensorflow/contrib/bigtable",
             "//tensorflow/contrib/cloud:cloud_py",
+            "//tensorflow/contrib/fused_conv:fused_conv_py",  # unresolved symbols, need to export more symbols
+            "//tensorflow/contrib/kafka",
+            "//tensorflow/contrib/kinesis",
             "//tensorflow/contrib/tensorrt:init_py",
             "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
         ],
@@ -159,20 +147,14 @@ cc_library(
     ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_cuda([
         "//tensorflow/contrib/nccl:nccl_kernels",
     ]) + select({
-        "//tensorflow:with_kafka_support_windows_override": [],
-        "//tensorflow:with_kafka_support": [
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//conditions:default": [
             "//tensorflow/contrib/kafka:dataset_kernels",
-        ],
-        "//conditions:default": [],
-    }) + select({
-        "//tensorflow:with_aws_support_windows_override": [],
-        "//tensorflow:with_aws_support": [
             "//tensorflow/contrib/kinesis:dataset_kernels",
+            "//tensorflow/contrib/tensorrt:trt_engine_op_kernel",
         ],
-        "//conditions:default": [],
-    }) + if_not_windows([
-        "//tensorflow/contrib/tensorrt:trt_engine_op_kernel",
-    ]),
+    }),
 )
 
 cc_library(
@@ -198,18 +180,12 @@ cc_library(
         "//tensorflow/contrib/text:all_ops",
         "//tensorflow/contrib/tpu:all_ops",
     ] + select({
-        "//tensorflow:with_kafka_support_windows_override": [],
-        "//tensorflow:with_kafka_support": [
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//conditions:default": [
             "//tensorflow/contrib/kafka:dataset_ops_op_lib",
-        ],
-        "//conditions:default": [],
-    }) + select({
-        "//tensorflow:with_aws_support_windows_override": [],
-        "//tensorflow:with_aws_support": [
             "//tensorflow/contrib/kinesis:dataset_ops_op_lib",
+            "//tensorflow/contrib/tensorrt:trt_engine_op_op_lib",
         ],
-        "//conditions:default": [],
-    }) + if_not_windows([
-        "//tensorflow/contrib/tensorrt:trt_engine_op_op_lib",
-    ]),
+    }),
 )
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index bb841aeab7..3b14757945 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -641,54 +641,41 @@ def tf_additional_lib_deps():
 
 def tf_additional_core_deps():
     return select({
-        "//tensorflow:with_gcp_support_android_override": [],
-        "//tensorflow:with_gcp_support_ios_override": [],
-        "//tensorflow:with_gcp_support": [
+        "//tensorflow:android": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//conditions:default": [
             "//tensorflow/core/platform/cloud:gcs_file_system",
-        ],
-        "//conditions:default": [],
-    }) + select({
-        "//tensorflow:with_hdfs_support_windows_override": [],
-        "//tensorflow:with_hdfs_support_android_override": [],
-        "//tensorflow:with_hdfs_support_ios_override": [],
-        "//tensorflow:with_hdfs_support": [
-            "//tensorflow/core/platform/hadoop:hadoop_file_system",
-        ],
-        "//conditions:default": [],
-    }) + select({
-        "//tensorflow:with_aws_support_windows_override": [],
-        "//tensorflow:with_aws_support_android_override": [],
-        "//tensorflow:with_aws_support_ios_override": [],
-        "//tensorflow:with_aws_support": [
             "//tensorflow/core/platform/s3:s3_file_system",
+            "//tensorflow/core/platform/hadoop:hadoop_file_system",
         ],
-        "//conditions:default": [],
     })
 
 # TODO(jart, jhseu): Delete when GCP is default on.
 def tf_additional_cloud_op_deps():
     return select({
-        "//tensorflow:with_gcp_support_windows_override": [],
-        "//tensorflow:with_gcp_support_android_override": [],
-        "//tensorflow:with_gcp_support_ios_override": [],
-        "//tensorflow:with_gcp_support": [
+        "//tensorflow:android": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//conditions:default": [
             "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
             "//tensorflow/contrib/cloud:gcs_config_ops_op_lib",
         ],
-        "//conditions:default": [],
     })
 
 # TODO(jart, jhseu): Delete when GCP is default on.
 def tf_additional_cloud_kernel_deps():
     return select({
-        "//tensorflow:with_gcp_support_windows_override": [],
-        "//tensorflow:with_gcp_support_android_override": [],
-        "//tensorflow:with_gcp_support_ios_override": [],
-        "//tensorflow:with_gcp_support": [
+        "//tensorflow:android": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//conditions:default": [
             "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
             "//tensorflow/contrib/cloud/kernels:gcs_config_ops",
         ],
-        "//conditions:default": [],
     })
 
 def tf_lib_proto_parsing_deps():
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 095ac1f4cc..b9f4902639 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -137,16 +137,6 @@ genrule(
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
     ] + select({
-        "//tensorflow:with_aws_support": [
-            "@aws//:LICENSE",
-        ],
-        "//conditions:default": [],
-    }) + select({
-        "//tensorflow:with_gcp_support": [
-            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
-        ],
-        "//conditions:default": [],
-    }) + select({
         "//tensorflow:with_jemalloc_linux_x86_64": [
             "@jemalloc//:COPYING",
         ],
@@ -171,7 +161,14 @@ genrule(
             "@grpc//third_party/nanopb:LICENSE.txt",
             "@grpc//third_party/address_sorting:LICENSE",
         ],
-    ),
+    ) + select({
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//conditions:default": [
+            "@aws//:LICENSE",
+            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
+        ],
+    }),
     outs = ["include/tensorflow/c/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
     tools = [":concat_licenses.sh"],
@@ -205,16 +202,6 @@ genrule(
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
     ] + select({
-        "//tensorflow:with_aws_support": [
-            "@aws//:LICENSE",
-        ],
-        "//conditions:default": [],
-    }) + select({
-        "//tensorflow:with_gcp_support": [
-            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
-        ],
-        "//conditions:default": [],
-    }) + select({
         "//tensorflow:with_jemalloc_linux_x86_64": [
             "@jemalloc//:COPYING",
         ],
@@ -232,7 +219,14 @@ genrule(
     ]) + if_mkl([
         "//third_party/mkl:LICENSE",
         "//third_party/mkl_dnn:LICENSE",
-    ]),
+    ]) + select({
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//conditions:default": [
+            "@aws//:LICENSE",
+            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
+        ],
+    }),
     outs = ["include/tensorflow/jni/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
     tools = [":concat_licenses.sh"],
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index cce60ccea0..f1de22300b 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -169,17 +169,6 @@ filegroup(
         "@zlib_archive//:zlib.h",
         "@org_python_pypi_backports_weakref//:LICENSE",
     ] + select({
-        "//tensorflow:with_aws_support": [
-            "@aws//:LICENSE",
-        ],
-        "//conditions:default": [],
-    }) + select({
-        "//tensorflow:with_gcp_support": [
-            "@com_github_googleapis_googleapis//:LICENSE",
-            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
-        ],
-        "//conditions:default": [],
-    }) + select({
         "//tensorflow:with_jemalloc_linux_x86_64": [
             "@jemalloc//:COPYING",
         ],
@@ -187,11 +176,6 @@ filegroup(
             "@jemalloc//:COPYING",
         ],
         "//conditions:default": [],
-    }) + select({
-        "//tensorflow:with_kafka_support": [
-            "@kafka//:LICENSE",
-        ],
-        "//conditions:default": [],
     }) + select({
         "//tensorflow/core/kernels:xsmm": [
             "@libxsmm_archive//:LICENSE.md",
@@ -215,7 +199,16 @@ filegroup(
         "@ngraph_tf//:LICENSE",
         "@nlohmann_json_lib//:LICENSE.MIT",
         "@tbb//:LICENSE",
-    ]) + tf_additional_license_deps(),
+    ]) + tf_additional_license_deps() + select({
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//conditions:default": [
+            "@aws//:LICENSE",
+            "@com_github_googleapis_googleapis//:LICENSE",
+            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
+            "@kafka//:LICENSE",
+        ],
+    }),
 )
 
 sh_binary(
-- 
GitLab


From 6ebe9baae06c06d0a70a424a55c78f5af07b49f7 Mon Sep 17 00:00:00 2001
From: Anjali Sridhar <anjalisridhar@google.com>
Date: Thu, 27 Sep 2018 22:57:39 -0700
Subject: [PATCH 129/570] Fix error that occurs when attempting to use
 TensorFlow optimizers with Keras and DistributionStrategy

PiperOrigin-RevId: 214890580
---
 .../contrib/distribute/python/combinations.py |   3 +
 .../contrib/distribute/python/keras_test.py   | 121 ++++---
 tensorflow/python/keras/engine/training.py    |   3 +-
 .../keras/engine/training_distributed.py      | 341 +++++++++---------
 4 files changed, 240 insertions(+), 228 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 244d1fcec8..82ca041cc2 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -59,6 +59,7 @@ from tensorflow.python.training import adagrad
 from tensorflow.python.training import adam
 from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import rmsprop
 from tensorflow.python.util import tf_inspect
 
 
@@ -354,6 +355,8 @@ gradient_descent_optimizer_v1_fn = NamedObject(
     "GradientDescentV1", lambda: gradient_descent.GradientDescentOptimizer(0.2))
 adagrad_optimizer_v1_fn = NamedObject(
     "AdagradV1", lambda: adagrad.AdagradOptimizer(0.001))
+rmsprop_optimizer_v1_fn = NamedObject(
+    "RmsPropV1", lambda: rmsprop.RMSPropOptimizer(0.001))
 optimizers_v1 = [adam_optimizer_v1_fn, gradient_descent_optimizer_v1_fn,
                  adagrad_optimizer_v1_fn]
 
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index a0b8bde132..3aab2c521f 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -173,13 +173,42 @@ def batch_wrapper(dataset, batch_size, distribution):
     return dataset.batch(batch_size)
 
 
-def all_combinations():
+def get_model():
+  x = keras.layers.Input(shape=(3,), name='input')
+  y = keras.layers.Dense(4, name='dense')(x)
+  model = keras.Model(x, y)
+  return model
+
+
+def get_dataset(distribution):
+  inputs = np.zeros((10, 3), dtype=np.float32)
+  targets = np.zeros((10, 4), dtype=np.float32)
+  dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+  dataset = dataset.repeat(100)
+  dataset = batch_wrapper(dataset, 10, distribution)
+  return dataset
+
+
+strategies = [combinations.default_strategy,
+              combinations.one_device_strategy,
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.mirrored_strategy_with_two_gpus,
+              combinations.tpu_strategy_one_step]
+
+
+def strategy_combinations():
   return combinations.combine(
-      distribution=[combinations.default_strategy,
-                    combinations.one_device_strategy,
-                    combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus,
-                    combinations.tpu_strategy_one_step],
+      distribution=strategies,
+      mode=['graph'])
+
+
+def strategy_and_optimizer_combinations():
+  return combinations.combine(
+      distribution=strategies,
+      optimizer=[combinations.adagrad_optimizer_v1_fn,
+                 combinations.adam_optimizer_v1_fn,
+                 combinations.gradient_descent_optimizer_v1_fn,
+                 combinations.rmsprop_optimizer_v1_fn],
       mode=['graph'])
 
 
@@ -360,9 +389,7 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
 
   def test_calling_model_with_numpy_arrays(self):
     with self.cached_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
+      model = get_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
@@ -392,23 +419,17 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
       # with batch_size
       model.predict(inputs, batch_size=8)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(strategy_combinations())
   def test_calling_model_on_same_dataset(self, distribution):
     with self.cached_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
+      model = get_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
       metrics = ['mae', keras.metrics.CategoricalAccuracy()]
       model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.zeros((10, 4), dtype=np.float32)
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = batch_wrapper(dataset, 10, distribution)
+      dataset = get_dataset(distribution)
 
       # Call fit with validation data
       model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
@@ -461,23 +482,17 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
 
       model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(strategy_combinations())
   def test_fit_eval_and_predict_methods_on_dataset(self, distribution):
     with self.cached_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
+      model = get_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
       metrics = ['mae', keras.metrics.CategoricalAccuracy()]
       model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.zeros((10, 4), dtype=np.float32)
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = batch_wrapper(dataset, 10, distribution)
+      dataset = get_dataset(distribution)
 
       model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
       model.evaluate(dataset, steps=2, verbose=1)
@@ -486,11 +501,23 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
       model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                 validation_data=dataset, validation_steps=2)
 
+  @combinations.generate(strategy_and_optimizer_combinations())
+  def test_fit_eval_and_predict_with_optimizer(self, distribution, optimizer):
+    with self.cached_session():
+      model = get_model()
+
+      loss = 'mse'
+      model.compile(optimizer(), loss, distribute=distribution)
+
+      dataset = get_dataset(distribution)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+      model.evaluate(dataset, steps=2, verbose=1)
+      model.predict(dataset, steps=2)
+
   def test_unsupported_features(self):
     with self.cached_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
+      model = get_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
@@ -500,11 +527,7 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
 
       model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
 
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.zeros((10, 4), dtype=np.float32)
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
+      dataset = get_dataset(strategy)
 
       # Test with validation split
       with self.assertRaisesRegexp(
@@ -541,9 +564,7 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
 
   def test_calling_with_unsupported_predefined_callbacks(self):
     with self.cached_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
+      model = get_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
@@ -552,11 +573,7 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
                                                      '/device:GPU:0'])
       model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
 
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.zeros((10, 4), dtype=np.float32)
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
+      dataset = get_dataset(strategy)
 
       def schedule(_):
         return 0.001
@@ -580,9 +597,7 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
 
   def test_dataset_input_shape_validation(self):
     with self.cached_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
+      model = get_model()
 
       optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
@@ -616,17 +631,13 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
       mode=['graph']))
   def test_dataset_input_shape_fully_defined(self, distribution):
     with self.cached_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
+      model = get_model()
 
       optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
       model.compile(optimizer, loss, distribute=distribution)
 
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.zeros((10, 4), dtype=np.float32)
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = get_dataset(distribution)
       # Input shapes are not fully known. Batch dimension is unknown as we are
       # not using the drop_remainder argument.
       dataset = dataset.repeat(100).batch(10)
@@ -698,7 +709,7 @@ class LossMaskingWithDistributionStrategyTest(test.TestCase):
 class NormalizationLayerWithDistributionStrategyTest(
     test.TestCase, parameterized.TestCase):
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(strategy_combinations())
   def test_batchnorm_correctness(self, distribution):
     with self.cached_session():
       model = keras.models.Sequential()
@@ -726,7 +737,7 @@ class NormalizationLayerWithDistributionStrategyTest(
 class CorrectnessWithDistributionStrategyTest(test.TestCase,
                                               parameterized.TestCase):
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(strategy_combinations())
   def test_metric_correctness(self, distribution):
     with self.cached_session():
       keras.backend.set_image_data_format('channels_last')
@@ -756,7 +767,7 @@ class CorrectnessWithDistributionStrategyTest(test.TestCase,
       history = model.fit(x=train_dataset, epochs=1, steps_per_epoch=10)
       self.assertEqual(history.history['binary_accuracy'], [1.0])
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(strategy_combinations())
   def test_correctness(self, distribution):
     with self.cached_session():
       keras.backend.set_image_data_format('channels_last')
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 46bffd7068..5091cac836 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -851,7 +851,8 @@ class Model(Network):
     # able to clone a Dataset on multiple workers we can remove this lambda.
     result = self._distribution_strategy.distribute_dataset(lambda: x)
     iterator = result.make_initializable_iterator()
-    K.get_session().run(iterator.initializer)
+    with self._distribution_strategy.scope():
+      K.get_session().run(iterator.initializer)
 
     training_utils.validate_iterator_input(x, y, sample_weight,
                                            validation_split)
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index 1b64f904d5..a6470458d2 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -112,100 +112,99 @@ def fit_loop(
     dataset_targets = distributed_training_utils.flatten_perdevice_values(
         current_strategy, targets)
 
-  # Create a train function that is composed of all the parameters above.
-  distributed_train_function = K.Function(
-      all_inputs, all_outputs,
-      updates=all_updates,
-      name='distributed_train_function',
-      **all_session_args)
-
-  # We need to set sample_weights to None since there are sample weight
-  # placeholders that are created with default values.
-  sample_weights = [None for _ in range(len(model.outputs) *
-                                        current_strategy.num_towers)]
-  if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-    ins = dataset_inputs + dataset_targets + sample_weights + [1]
-  else:
-    ins = dataset_inputs + dataset_targets
+    # Create a train function that is composed of all the parameters above.
+    distributed_train_function = K.Function(
+        all_inputs, all_outputs,
+        updates=all_updates,
+        name='distributed_train_function',
+        **all_session_args)
+
+    # We need to set sample_weights to None since there are sample weight
+    # placeholders that are created with default values.
+    sample_weights = [None for _ in range(len(model.outputs) *
+                                          current_strategy.num_towers)]
+    if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
+      ins = dataset_inputs + dataset_targets + sample_weights + [1]
+    else:
+      ins = dataset_inputs + dataset_targets
 
-  do_validation = False
-  if validation_steps:
-    do_validation = True
+    do_validation = False
+    if validation_steps:
+      do_validation = True
 
-  # Copy the weights from the original model to each of the replicated models.
-  orig_model_weights = model.get_weights()
-  with current_strategy.scope():
+    # Copy the weights from the original model to each of the replicated models.
+    orig_model_weights = model.get_weights()
     distributed_model = current_strategy.unwrap(model._grouped_model)[0]
     distributed_training_utils.set_weights(
         current_strategy, distributed_model, orig_model_weights)
 
-  callbacks = cbks.configure_callbacks(
-      callbacks,
-      model,
-      do_validation=do_validation,
-      val_inputs=None,
-      val_targets=None,
-      epochs=epochs,
-      steps_per_epoch=steps_per_epoch,
-      verbose=verbose)
-  out_labels = model.metrics_names or []
-  callbacks.on_train_begin()
-
-  assert steps_per_epoch is not None
-
-  for epoch in range(initial_epoch, epochs):
-    # Reset stateful metrics
-    for m in model.stateful_metric_functions:
-      m.reset_states()
-    callbacks.on_epoch_begin(epoch)
-    epoch_logs = {}
-    for step_index in range(steps_per_epoch):
-      batch_logs = {'batch': step_index, 'size': 1}
-      callbacks.on_batch_begin(step_index, batch_logs)
-      try:
-        outs = distributed_train_function(ins)
-      except errors.OutOfRangeError:
-        logging.warning('Your dataset iterator ran out of data; '
-                        'interrupting training. Make sure that your dataset '
-                        'can generate at least `steps_per_epoch * epochs` '
-                        'batches (in this case, %d batches).' %
-                        steps_per_epoch * epochs)
-        break
-
-      if not isinstance(outs, list):
-        outs = [outs]
-
-      outs = _aggregate_metrics_across_towers(current_strategy.num_towers,
-                                              out_labels,
-                                              model.stateful_metric_names, outs)
-      for l, o in zip(out_labels, outs):
-        batch_logs[l] = o
-      callbacks.on_batch_end(step_index, batch_logs)
+    callbacks = cbks.configure_callbacks(
+        callbacks,
+        model,
+        do_validation=do_validation,
+        val_inputs=None,
+        val_targets=None,
+        epochs=epochs,
+        steps_per_epoch=steps_per_epoch,
+        verbose=verbose)
+    out_labels = model.metrics_names or []
+    callbacks.on_train_begin()
+
+    assert steps_per_epoch is not None
+
+    for epoch in range(initial_epoch, epochs):
+      # Reset stateful metrics
+      for m in model.stateful_metric_functions:
+        m.reset_states()
+      callbacks.on_epoch_begin(epoch)
+      epoch_logs = {}
+      for step_index in range(steps_per_epoch):
+        batch_logs = {'batch': step_index, 'size': 1}
+        callbacks.on_batch_begin(step_index, batch_logs)
+        try:
+          outs = distributed_train_function(ins)
+        except errors.OutOfRangeError:
+          logging.warning('Your dataset iterator ran out of data; '
+                          'interrupting training. Make sure that your dataset '
+                          'can generate at least `steps_per_epoch * epochs` '
+                          'batches (in this case, %d batches).' %
+                          steps_per_epoch * epochs)
+          break
+
+        if not isinstance(outs, list):
+          outs = [outs]
+
+        outs = _aggregate_metrics_across_towers(current_strategy.num_towers,
+                                                out_labels,
+                                                model.stateful_metric_names,
+                                                outs)
+        for l, o in zip(out_labels, outs):
+          batch_logs[l] = o
+        callbacks.on_batch_end(step_index, batch_logs)
+        if callbacks.model.stop_training:
+          break
+      if do_validation:
+        val_outs = test_loop(
+            model,
+            val_iterator,
+            steps=validation_steps,
+            verbose=0)
+        if not isinstance(val_outs, list):
+          val_outs = [val_outs]
+        # Same labels assumed.
+        for l, o in zip(out_labels, val_outs):
+          epoch_logs['val_' + l] = o
+
+      callbacks.on_epoch_end(epoch, epoch_logs)
       if callbacks.model.stop_training:
         break
-    if do_validation:
-      val_outs = test_loop(
-          model,
-          val_iterator,
-          steps=validation_steps,
-          verbose=0)
-      if not isinstance(val_outs, list):
-        val_outs = [val_outs]
-      # Same labels assumed.
-      for l, o in zip(out_labels, val_outs):
-        epoch_logs['val_' + l] = o
+    callbacks.on_train_end()
 
-    callbacks.on_epoch_end(epoch, epoch_logs)
-    if callbacks.model.stop_training:
-      break
-  callbacks.on_train_end()
-
-  # Copy the weights back from the replicated model to the original model.
-  with current_strategy.scope():
+    # Copy the weights back from the replicated model to the original model.
     updated_weights = current_strategy.unwrap(
         model._grouped_model)[0].get_weights()
     model.set_weights(updated_weights)
-  return model.history
+    return model.history
 
 
 def _experimental_fit_loop(
@@ -427,66 +426,65 @@ def test_loop(model, iterator, verbose=0, steps=None):
     dataset_targets = distributed_training_utils.flatten_perdevice_values(
         current_strategy, targets)
 
-  distributed_test_function = K.Function(
-      all_inputs, all_outputs,
-      updates=all_updates,
-      name='distributed_test_function',
-      **all_session_args)
-
-  # We need to set sample_weights to None since there are sample weight
-  # placeholders that are created with default values.
-  sample_weights = [None for _ in range(len(model.outputs) *
-                                        current_strategy.num_towers)]
-  if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-    ins = dataset_inputs + dataset_targets + sample_weights + [0]
-  else:
-    ins = dataset_inputs + dataset_targets
+    distributed_test_function = K.Function(
+        all_inputs, all_outputs,
+        updates=all_updates,
+        name='distributed_test_function',
+        **all_session_args)
 
-  for m in model.stateful_metric_functions:
-    m.reset_states()
-  stateful_metric_indices = [
-      i for i, name in enumerate(model.metrics_names)
-      if str(name) in model.stateful_metric_names
-  ]
+    # We need to set sample_weights to None since there are sample weight
+    # placeholders that are created with default values.
+    sample_weights = [None for _ in range(len(model.outputs) *
+                                          current_strategy.num_towers)]
+    if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
+      ins = dataset_inputs + dataset_targets + sample_weights + [0]
+    else:
+      ins = dataset_inputs + dataset_targets
 
-  outs = []
-  if verbose == 1:
-    progbar = Progbar(target=steps)
+    for m in model.stateful_metric_functions:
+      m.reset_states()
+    stateful_metric_indices = [
+        i for i, name in enumerate(model.metrics_names)
+        if str(name) in model.stateful_metric_names
+    ]
 
-  # Copy the weights from the original model to each of the replicated models.
-  orig_model_weights = model.get_weights()
-  with current_strategy.scope():
+    outs = []
+    if verbose == 1:
+      progbar = Progbar(target=steps)
+
+    # Copy the weights from the original model to each of the replicated models.
+    orig_model_weights = model.get_weights()
     distributed_model = current_strategy.unwrap(model._grouped_model)[0]
     distributed_training_utils.set_weights(
         current_strategy, distributed_model, orig_model_weights)
 
-  assert steps is not None
-  for step in range(steps):
-    batch_outs = distributed_test_function(ins)
-    batch_outs = _aggregate_metrics_across_towers(
-        current_strategy.num_towers, model.metrics_names,
-        model.stateful_metric_names, batch_outs)
-    if isinstance(batch_outs, list):
-      if step == 0:
-        outs = [0.] * len(batch_outs)
-      for i, batch_out in enumerate(batch_outs):
-        if i in stateful_metric_indices:
-          outs[i] = batch_out
-        else:
-          outs[i] += batch_out
-    else:
-      if step == 0:
-        outs.append(0.)
-      outs[0] += batch_outs
-    if verbose >= 1:
-      progbar.update(step + 1)
-  for i in range(len(outs)):
-    if i not in stateful_metric_indices:
-      outs[i] /= steps
+    assert steps is not None
+    for step in range(steps):
+      batch_outs = distributed_test_function(ins)
+      batch_outs = _aggregate_metrics_across_towers(
+          current_strategy.num_towers, model.metrics_names,
+          model.stateful_metric_names, batch_outs)
+      if isinstance(batch_outs, list):
+        if step == 0:
+          outs = [0.] * len(batch_outs)
+        for i, batch_out in enumerate(batch_outs):
+          if i in stateful_metric_indices:
+            outs[i] = batch_out
+          else:
+            outs[i] += batch_out
+      else:
+        if step == 0:
+          outs.append(0.)
+        outs[0] += batch_outs
+      if verbose >= 1:
+        progbar.update(step + 1)
+    for i in range(len(outs)):
+      if i not in stateful_metric_indices:
+        outs[i] /= steps
 
-  if len(outs) == 1:
-    return outs[0]
-  return outs
+    if len(outs) == 1:
+      return outs[0]
+    return outs
 
 
 def _experimental_test_loop(model, iterator, verbose=0, steps=None):
@@ -647,51 +645,50 @@ def predict_loop(model, iterator, verbose=0, steps=None):
     dataset_inputs = distributed_training_utils.flatten_perdevice_values(
         current_strategy, inputs)
 
-  distributed_predict_function = K.Function(
-      all_inputs, all_outputs,
-      updates=all_updates,
-      name='distributed_predict_function',
-      **all_session_args)
+    distributed_predict_function = K.Function(
+        all_inputs, all_outputs,
+        updates=all_updates,
+        name='distributed_predict_function',
+        **all_session_args)
 
-  if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-    ins = dataset_inputs + [0]
-  else:
-    ins = dataset_inputs
+    if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
+      ins = dataset_inputs + [0]
+    else:
+      ins = dataset_inputs
 
-  if verbose == 1:
-    progbar = Progbar(target=steps)
+    if verbose == 1:
+      progbar = Progbar(target=steps)
 
-  # Copy the weights from the original model to each of the replicated models.
-  orig_model_weights = model.get_weights()
-  with current_strategy.scope():
+    # Copy the weights from the original model to each of the replicated models.
+    orig_model_weights = model.get_weights()
     distributed_model = current_strategy.unwrap(model._grouped_model)[0]
     distributed_training_utils.set_weights(
         current_strategy, distributed_model, orig_model_weights)
 
-  if steps is not None:
-    # Since we do not know how many samples we will see, we cannot pre-allocate
-    # the returned Numpy arrays. Instead, we store one array per batch seen
-    # and concatenate them upon returning.
-    unconcatenated_outs = []
-    for step in range(steps):
-      batch_outs = distributed_predict_function(ins)
-      if not isinstance(batch_outs, list):
-        batch_outs = [batch_outs]
-      if step == 0:
-        for _ in batch_outs:
-          unconcatenated_outs.append([])
-      # TODO(anjalisridhar): Should combine the outputs from multiple towers
-      # correctly here.
-      for i, batch_out in enumerate(batch_outs):
-        unconcatenated_outs[i].append(batch_out)
-      if verbose >= 1:
-        progbar.update(step + 1)
-    if len(unconcatenated_outs) == 1:
-      return np.concatenate(unconcatenated_outs[0], axis=0)
-    return [
-        np.concatenate(unconcatenated_outs[i], axis=0)
-        for i in range(len(unconcatenated_outs))
-    ]
+    if steps is not None:
+      # Since we do not know how many samples we will see, we cannot
+      # pre-allocate the returned Numpy arrays. Instead, we store one array per
+      # batch seen and concatenate them upon returning.
+      unconcatenated_outs = []
+      for step in range(steps):
+        batch_outs = distributed_predict_function(ins)
+        if not isinstance(batch_outs, list):
+          batch_outs = [batch_outs]
+        if step == 0:
+          for _ in batch_outs:
+            unconcatenated_outs.append([])
+        # TODO(anjalisridhar): Should combine the outputs from multiple towers
+        # correctly here.
+        for i, batch_out in enumerate(batch_outs):
+          unconcatenated_outs[i].append(batch_out)
+        if verbose >= 1:
+          progbar.update(step + 1)
+      if len(unconcatenated_outs) == 1:
+        return np.concatenate(unconcatenated_outs[0], axis=0)
+      return [
+          np.concatenate(unconcatenated_outs[i], axis=0)
+          for i in range(len(unconcatenated_outs))
+      ]
 
 
 def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
-- 
GitLab


From fa8c1eabd06f3043be820bf476e8413818853f17 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 00:04:20 -0700
Subject: [PATCH 130/570] Internal

PiperOrigin-RevId: 214895147
---
 tensorflow/examples/android/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index f327b645f5..f5f0d7c3c8 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -68,6 +68,7 @@ android_binary(
     srcs = glob([
         "src/**/*.java",
     ]),
+    aapt_version = "aapt",
     # Package assets from assets dir as well as all model targets. Remove undesired models
     # (and corresponding Activities in source) to reduce APK size.
     assets = [
-- 
GitLab


From d0690d46466bf0393ad65544d1e8c55e948df133 Mon Sep 17 00:00:00 2001
From: EFanZh <efanzh@gmail.com>
Date: Fri, 28 Sep 2018 15:20:26 +0800
Subject: [PATCH 131/570] Fix some documentation errors

---
 tensorflow/contrib/distribute/python/mirrored_strategy.py | 5 +++--
 tensorflow/python/keras/engine/training.py                | 2 +-
 tensorflow/python/training/distribute.py                  | 6 +++---
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 504f45a695..c0861da567 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -318,12 +318,13 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
   [TensorFlow's documentation](https://www.tensorflow.org/deploy/distributed).
   The distribution strategy inherits these concepts as well and in addition to
   that we also clarify several more concepts:
-    * **In-graph replication**: the `client` creates a single `tf.Graph` that
+
+  * **In-graph replication**: the `client` creates a single `tf.Graph` that
     specifies tasks for devices on all workers. The `client` then creates a
     client session which will talk to the `master` service of a `worker`. Then
     the `master` will partition the graph and distribute the work to all
     participating workers.
-    * **Worker**: A `worker` is a TensorFlow `task` that usually maps to one
+  * **Worker**: A `worker` is a TensorFlow `task` that usually maps to one
     physical machine. We will have multiple `worker`s with different `task`
     index. They all do similar things except for one worker checkpointing model
     variables, writing summaries, etc. in addition to its ordinary work.
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 5091cac836..1bd8422658 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -2356,6 +2356,6 @@ class DistributedCallbackModel(Model):
     # Whitelisted atttributes of the model that can be accessed by the user
     # during a callback.
     if item not in ['_setattr_tracking']:
-      logging.warning('You are accessing attribute ' + item + 'of the '
+      logging.warning('You are accessing attribute ' + item + ' of the '
                       'DistributedCallbackModel that may not have been set '
                       'correctly.')
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 419a9ec12b..fd4704285c 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -631,7 +631,7 @@ class DistributionStrategy(object):
 
     Args:
       fn: function to run using this distribution strategy. The function must
-        have the following signature: def fn(context, *inputs).
+        have the following signature: `def fn(context, *inputs)`.
         `context` is an instance of `MultiStepContext` that will be passed when
         `fn` is run. `context` can be used to specify the outputs to be returned
         from `fn` by calling `context.set_last_step_output`. It can also be used
@@ -797,9 +797,9 @@ class DistributionStrategy(object):
     return merged(results)
     ```
 
-    Otherwise this returns `fn(var, *args, **kwargs)` colocated with `var`.'
+    Otherwise this returns `fn(var, *args, **kwargs)` colocated with `var`.
 
-    Neither *args nor **kwargs may contain per-device values.
+    Neither `*args` nor `**kwargs` may contain per-device values.
     If they contain mirrored values, they will be unwrapped before
     calling `fn`.
 
-- 
GitLab


From 19b2383cc0e221262be0780180558cf5bbb3e37e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 02:01:03 -0700
Subject: [PATCH 132/570] compat: Update forward compatibility horizon to
 2018-09-28

PiperOrigin-RevId: 214904795
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 88cad5d6d9..b74fce3a4c 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 9, 27)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 9, 28)
 
 
 @tf_export("compat.forward_compatible")
-- 
GitLab


From 8eb27871583d9fc61e046493acaa0df2839bc1c7 Mon Sep 17 00:00:00 2001
From: wangsiyu <siyu.wsy@gmail.com>
Date: Fri, 28 Sep 2018 18:51:34 +0800
Subject: [PATCH 133/570] remove slash

---
 tensorflow/python/ops/variables.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 69f63bc8e6..262cd61e5a 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -2401,7 +2401,8 @@ class PartitionedVariable(object):
     partition_axes = self._partition_axes()
     if len(partition_axes) > 1:
       raise NotImplementedError(
-          "Multi-axis partition assign_fn is not supported "
+          "Cannot do assign action along more than one dimension: %s.  "
+          "Multi-axis partition assign action is not supported "
           % str(partition_axes))
     partition_ix = partition_axes[0]
     size_splits_list = [
@@ -2409,7 +2410,7 @@ class PartitionedVariable(object):
     value_list = array_ops.split(
         value, size_splits_list, axis=partition_ix)
     op_list = [
-        assign_fn(var, value_list[idx], idx) \
+        assign_fn(var, value_list[idx], idx)
         for idx, var in enumerate(self._variable_list)]
     return op_list
 
-- 
GitLab


From 32627bfba19606d3c3a34f5d02ae9428675bbc42 Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Fri, 28 Sep 2018 07:28:19 -0700
Subject: [PATCH 134/570] Allow testManyCPUs to encounter non-CPU devices.

PiperOrigin-RevId: 214932861
---
 tensorflow/python/client/session_test.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 5c0c405306..347833ce8f 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -120,11 +120,17 @@ class SessionTest(test_util.TensorFlowTestCase):
       inp = constant_op.constant(10.0, name='W1')
       self.assertAllEqual(inp.eval(), 10.0)
 
-      devices = sess.list_devices()
-      self.assertEqual(2, len(devices))
-      for device in devices:
-        self.assertEqual('CPU', framework_device_lib.DeviceSpec.from_string(
-            device.name).device_type)
+      num_cpu_devices = 0
+      num_gpu_devices = 0
+      for device in sess.list_devices():
+        device_type = framework_device_lib.DeviceSpec.from_string(
+            device.name).device_type
+        if device_type == 'CPU':
+          num_cpu_devices += 1
+        elif device_type == 'GPU':
+          num_gpu_devices += 1
+      self.assertEqual(2, num_cpu_devices)
+      self.assertEqual(0, num_gpu_devices)
 
   def testPerSessionThreads(self):
     with session.Session(
-- 
GitLab


From 4e955be2ae1c920623778c15357129fea9a3bdab Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Fri, 28 Sep 2018 08:26:55 -0700
Subject: [PATCH 135/570] Fixing a couple of small bugs with the multi device
 iterator having to deal with the case when the background thread terminated
 because the iterator finished and yet some other requests were coming in.

1. The GetNextFromShard would see an empty buffer and return cancelled instead
of OutOfRange errors
2. On shutdown, we weren't calling all the pending callbacks.

Tested with runs_per_test=5000

PiperOrigin-RevId: 214939274
---
 .../kernels/data/multi_device_iterator_ops.cc | 34 +++++++++++--------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
index 5f143967d9..d909b9e9d3 100644
--- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
+++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
@@ -134,19 +134,17 @@ class MultiDeviceIterator : public ResourceBase {
     void Reset() LOCKS_EXCLUDED(mu_) {
       {
         mutex_lock l(mu_);
-        if (background_thread_finished_) {
-          return;
-        }
-
-        cancelled_ = true;
-        // Wake up the background thread.
-        for (int i = 0; i < size_; ++i) {
-          buffer_[i].cond_var.notify_all();
-        }
+        if (!background_thread_finished_) {
+          cancelled_ = true;
+          // Wake up the background thread.
+          for (int i = 0; i < size_; ++i) {
+            buffer_[i].cond_var.notify_all();
+          }
 
-        // Make sure background thread has finished first.
-        while (!background_thread_finished_) {
-          shutdown_cond_var_.wait(l);
+          // Make sure background thread has finished first.
+          while (!background_thread_finished_) {
+            shutdown_cond_var_.wait(l);
+          }
         }
       }
       RunPendingCallbacks();
@@ -182,7 +180,7 @@ class MultiDeviceIterator : public ResourceBase {
             buffer_[shard_num].cond_var.notify_all();
           }
         } else {
-          if (background_thread_finished_) {
+          if (end_of_iterator_) {
             produced_output = true;
             elem.end_of_sequence = true;
           } else {
@@ -219,8 +217,12 @@ class MultiDeviceIterator : public ResourceBase {
           while (!buffer_[i].callbacks.empty()) {
             if (buffer_[i].data.empty()) {
               HostBufferElement elem;
-              elem.status =
-                  errors::Cancelled("Cancelled and buffer not filled.");
+              if (end_of_iterator_) {
+                elem.end_of_sequence = true;
+              } else {
+                elem.status =
+                    errors::Cancelled("Cancelled and buffer not filled.");
+              }
               cancellation_elements.push_back(std::move(elem));
             } else {
               cancellation_elements.push_back(
@@ -293,6 +295,7 @@ class MultiDeviceIterator : public ResourceBase {
           {
             mutex_lock l(mu_);
             background_thread_finished_ = true;
+            end_of_iterator_ = true;
             shutdown_cond_var_.notify_all();
           }
           RunPendingCallbacks();
@@ -312,6 +315,7 @@ class MultiDeviceIterator : public ResourceBase {
     std::unique_ptr<Thread> background_thread_ GUARDED_BY(mu_);
     bool background_thread_finished_ GUARDED_BY(mu_) = false;
     bool background_thread_started_ GUARDED_BY(mu_) = false;
+    bool end_of_iterator_ GUARDED_BY(mu_) = false;
     bool cancelled_ GUARDED_BY(mu_) = false;
     condition_variable shutdown_cond_var_ GUARDED_BY(mu_);
 
-- 
GitLab


From a74a3217f7ff2dbee2fb618aa658cf666861545c Mon Sep 17 00:00:00 2001
From: Jason Zaman <jason@perfinion.com>
Date: Sat, 4 Aug 2018 14:13:00 +0800
Subject: [PATCH 136/570] Move bazel.rc to workspace root to support
 bazel-0.18.0

Bazel 0.18.0 will contain a change for which rc files it accepts.
https://github.com/bazelbuild/bazel/commit/ec83598cb6ee4136166bb562a24dc5dfa58921db
https://github.com/bazelbuild/bazel/issues/4502

Old bazel used to read %workspace%/tools/bazel.rc. New bazel will not
read that and instead will only read %workspace%/.bazelrc.

Signed-off-by: Jason Zaman <jason@perfinion.com>
---
 tools/bazel.rc => .bazelrc | 4 +++-
 .gitignore                 | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)
 rename tools/bazel.rc => .bazelrc (98%)

diff --git a/tools/bazel.rc b/.bazelrc
similarity index 98%
rename from tools/bazel.rc
rename to .bazelrc
index 3734fab715..9f09fdff97 100644
--- a/tools/bazel.rc
+++ b/.bazelrc
@@ -29,7 +29,7 @@ build:mkl -c opt
 
 # This config option is used to enable MKL-DNN open source library only,
 # without depending on MKL binary version.
-build:mkl_open_source_only --define=build_with_mkl_dnn_only=true 
+build:mkl_open_source_only --define=build_with_mkl_dnn_only=true
 build:mkl_open_source_only --define=build_with_mkl=true --define=enable_mkl=true
 
 build:download_clang --crosstool_top=@local_config_download_clang//:toolchain
@@ -84,3 +84,5 @@ build:dynamic_kernels --define=dynamic_loaded_kernels=true
 build --define=PREFIX=/usr
 build --define=LIBDIR=$(PREFIX)/lib
 build --define=INCLUDEDIR=$(PREFIX)/include
+
+# Do not commit the tf_configure.bazelrc line
diff --git a/.gitignore b/.gitignore
index 1ef4c297ee..cb65f447d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,6 @@
 .DS_Store
 .ipynb_checkpoints
 node_modules
-/.bazelrc
 /.tf_configure.bazelrc
 /bazel-*
 /bazel_pip
-- 
GitLab


From d3f6b72bc7356d5c94289e32426dc482b8ededf0 Mon Sep 17 00:00:00 2001
From: Jason Zaman <jason@perfinion.com>
Date: Sat, 4 Aug 2018 14:28:02 +0800
Subject: [PATCH 137/570] configure: use workspace-relative path to
 tf_configure_bazelrc

/.bazelrc is not gitignored anymore so this should help in case the
import line is accidentally committed. Bazel 0.18.0 will support a new
'try-import' statement that should be used once 0.18.0 has been out long
enough.

Signed-off-by: Jason Zaman <jason@perfinion.com>
---
 configure.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/configure.py b/configure.py
index 55fce8b93b..129d9c5fe7 100644
--- a/configure.py
+++ b/configure.py
@@ -257,11 +257,7 @@ def reset_tf_configure_bazelrc(workspace_path):
       if _TF_BAZELRC_FILENAME in l:
         continue
       f.write('%s\n' % l)
-    if is_windows():
-      tf_bazelrc_path = _TF_BAZELRC.replace('\\', '/')
-    else:
-      tf_bazelrc_path = _TF_BAZELRC
-    f.write('import %s\n' % tf_bazelrc_path)
+    f.write('import %%workspace%%/%s\n' % _TF_BAZELRC_FILENAME)
 
 
 def cleanup_makefile():
-- 
GitLab


From e06783e7bb80f664c7ec9be90680ac6ddcbd598f Mon Sep 17 00:00:00 2001
From: Brian Patton <bjp@google.com>
Date: Fri, 28 Sep 2018 08:38:25 -0700
Subject: [PATCH 138/570] Fix a latex render nit

PiperOrigin-RevId: 214940748
---
 tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt b/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt
index 40d7d371ca..7142a0e3f2 100644
--- a/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt
@@ -9,7 +9,7 @@ The lower regularized incomplete Gamma function is defined as:
 
 where
 
-\\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
+\\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
 
 is the lower incomplete Gamma function.
 
-- 
GitLab


From c7bb3c3d65e4e064d53630d4b524522eed6f3f44 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 28 Sep 2018 08:38:53 -0700
Subject: [PATCH 139/570] [tf.data] Move `tf.contrib.data` C++ code to a core
 "experimental" directory.

NOTE: All ops and kernels previously previously defined in
tensorflow/contrib/data have had their name prefixed with
"Experimental" to indicate that they are not (yet) stable, and thus
not subject to backwards or forwards compatibility guarantees.
PiperOrigin-RevId: 214940819
---
 tensorflow/contrib/BUILD                      |   3 -
 tensorflow/contrib/cmake/python_modules.txt   |   1 -
 tensorflow/contrib/data/BUILD                 |  38 -----
 .../contrib/data/ops/indexed_dataset_ops.cc   |  80 ---------
 .../contrib/data/python/kernel_tests/BUILD    |   3 +-
 .../kernel_tests/indexed_dataset_ops_test.py  |  12 +-
 tensorflow/contrib/data/python/ops/BUILD      |  57 +------
 .../data/python/ops/contrib_op_loader.py      |  24 ---
 .../contrib/data/python/ops/error_ops.py      |   5 +-
 .../data/python/ops/indexed_dataset_ops.py    |  25 +--
 .../contrib/data/python/ops/interleave_ops.py |  13 +-
 .../contrib/data/python/ops/optimization.py   |   5 +-
 .../data/python/ops/prefetching_ops.py        |  37 ++--
 tensorflow/contrib/data/python/ops/readers.py |   6 +-
 .../contrib/data/python/ops/threadpool.py     |   9 +-
 tensorflow/contrib/data/python/ops/unique.py  |   5 +-
 tensorflow/core/BUILD                         |   2 +
 ...pi_def_ExperimentalAssertNextDataset.pbtxt |   4 +
 .../api_def_ExperimentalCSVDataset.pbtxt      |   4 +
 ...xperimentalDirectedInterleaveDataset.pbtxt |  21 +++
 ...xperimentalFunctionBufferingResource.pbtxt |  58 +++++++
 ...ntalFunctionBufferingResourceGetNext.pbtxt |  25 +++
 ...mentalFunctionBufferingResourceReset.pbtxt |  13 ++
 ...f_ExperimentalIdentityIndexedDataset.pbtxt |   4 +
 ..._def_ExperimentalIgnoreErrorsDataset.pbtxt |   8 +
 ...pi_def_ExperimentalIndexedDatasetGet.pbtxt |   4 +
 ...xperimentalIndexedDatasetMaterialize.pbtxt |   4 +
 ...pi_def_ExperimentalIteratorGetDevice.pbtxt |   8 +
 .../api_def_ExperimentalLMDBDataset.pbtxt     |   4 +
 ...mentalMaterializedIndexDatasetHandle.pbtxt |   4 +
 ...pi_def_ExperimentalThreadPoolDataset.pbtxt |  13 ++
 ...api_def_ExperimentalThreadPoolHandle.pbtxt |  35 ++++
 .../api_def_ExperimentalUniqueDataset.pbtxt   |   8 +
 tensorflow/core/kernels/data/BUILD            |   1 +
 .../kernels/data/experimental}/BUILD          |  90 +++++-----
 .../experimental}/assert_next_dataset_op.cc   |   5 +-
 .../data/experimental}/csv_dataset_op.cc      |   3 +-
 .../directed_interleave_dataset_op.cc         |   5 +-
 .../experimental}/identity_indexed_dataset.cc |   7 +-
 .../experimental}/ignore_errors_dataset_op.cc |   6 +-
 .../data/experimental}/indexed_dataset.cc     |  14 +-
 .../data/experimental}/indexed_dataset.h      |   6 +-
 .../data/experimental}/lmdb_dataset_op.cc     |   3 +-
 .../data/experimental}/prefetching_kernels.cc |  23 +--
 .../experimental}/threadpool_dataset_op.cc    |   7 +-
 .../data/experimental}/unique_dataset_op.cc   |   7 +-
 .../ops/experimental_dataset_ops.cc}          | 161 +++++++++---------
 tensorflow/python/BUILD                       |   9 +
 tensorflow/tools/pip_package/BUILD            |   1 -
 49 files changed, 469 insertions(+), 421 deletions(-)
 delete mode 100644 tensorflow/contrib/data/ops/indexed_dataset_ops.cc
 delete mode 100644 tensorflow/contrib/data/python/ops/contrib_op_loader.py
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalAssertNextDataset.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalCSVDataset.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalDirectedInterleaveDataset.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResource.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalIdentityIndexedDataset.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalIgnoreErrorsDataset.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalIndexedDatasetGet.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalIndexedDatasetMaterialize.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalIteratorGetDevice.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalLMDBDataset.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalMaterializedIndexDatasetHandle.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalThreadPoolDataset.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalThreadPoolHandle.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ExperimentalUniqueDataset.pbtxt
 rename tensorflow/{contrib/data/kernels => core/kernels/data/experimental}/BUILD (52%)
 rename tensorflow/{contrib/data/kernels => core/kernels/data/experimental}/assert_next_dataset_op.cc (97%)
 rename tensorflow/{contrib/data/kernels => core/kernels/data/experimental}/csv_dataset_op.cc (99%)
 rename tensorflow/{contrib/data/kernels => core/kernels/data/experimental}/directed_interleave_dataset_op.cc (98%)
 rename tensorflow/{contrib/data/kernels => core/kernels/data/experimental}/identity_indexed_dataset.cc (96%)
 rename tensorflow/{contrib/data/kernels => core/kernels/data/experimental}/ignore_errors_dataset_op.cc (96%)
 rename tensorflow/{contrib/data/kernels => core/kernels/data/experimental}/indexed_dataset.cc (97%)
 rename tensorflow/{contrib/data/kernels => core/kernels/data/experimental}/indexed_dataset.h (95%)
 rename tensorflow/{contrib/data/kernels => core/kernels/data/experimental}/lmdb_dataset_op.cc (98%)
 rename tensorflow/{contrib/data/kernels => core/kernels/data/experimental}/prefetching_kernels.cc (95%)
 rename tensorflow/{contrib/data/kernels => core/kernels/data/experimental}/threadpool_dataset_op.cc (97%)
 rename tensorflow/{contrib/data/kernels => core/kernels/data/experimental}/unique_dataset_op.cc (97%)
 rename tensorflow/{contrib/data/ops/dataset_ops.cc => core/ops/experimental_dataset_ops.cc} (62%)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 1a9ae8ac3a..98dff965a9 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -132,7 +132,6 @@ cc_library(
     deps = [
         "//tensorflow/contrib/boosted_trees:boosted_trees_kernels",
         "//tensorflow/contrib/coder:all_kernels",
-        "//tensorflow/contrib/data/kernels:dataset_kernels",
         "//tensorflow/contrib/factorization/kernels:all_kernels",
         "//tensorflow/contrib/hadoop:dataset_kernels",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_kernels",
@@ -163,8 +162,6 @@ cc_library(
     deps = [
         "//tensorflow/contrib/boosted_trees:boosted_trees_ops_op_lib",
         "//tensorflow/contrib/coder:all_ops",
-        "//tensorflow/contrib/data:dataset_ops_op_lib",
-        "//tensorflow/contrib/data:indexed_dataset_ops_op_lib",
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/hadoop:dataset_ops_op_lib",
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index c0763f4c0e..2975b167ec 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -132,7 +132,6 @@ tensorflow/contrib/cudnn_rnn/python
 tensorflow/contrib/cudnn_rnn/python/layers
 tensorflow/contrib/cudnn_rnn/python/ops
 tensorflow/contrib/data
-tensorflow/contrib/data/kernels
 tensorflow/contrib/data/python
 tensorflow/contrib/data/python/kernel_tests
 tensorflow/contrib/data/python/kernel_tests/serialization
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index 9f710613dd..38f1c65a4d 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -4,17 +4,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_custom_op_library",
-    "tf_gen_op_libs",
-    "if_not_windows",
-)
-load(
-    "//tensorflow/core:platform/default/build_config_root.bzl",
-    "if_static",
-)
-
 py_library(
     name = "data",
     srcs = ["__init__.py"],
@@ -25,30 +14,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-cc_library(
-    name = "lib_proto_parsing_for_dataset_ops",
-    deps = if_not_windows(["//tensorflow/core:lib_proto_parsing"]),
-)
-
-tf_custom_op_library(
-    name = "_dataset_ops.so",
-    srcs = [
-        "ops/dataset_ops.cc",
-        "ops/indexed_dataset_ops.cc",
-    ],
-    deps = [
-        "//tensorflow/contrib/data/kernels:dataset_kernels",
-        "//tensorflow/contrib/data/kernels:indexed_dataset",
-    ] + if_static(
-        extra_deps = [":lib_proto_parsing_for_dataset_ops"],
-        otherwise = [],
-    ),
-)
-
-tf_gen_op_libs(
-    op_lib_names = [
-        "dataset_ops",
-        "indexed_dataset_ops",
-    ],
-)
diff --git a/tensorflow/contrib/data/ops/indexed_dataset_ops.cc b/tensorflow/contrib/data/ops/indexed_dataset_ops.cc
deleted file mode 100644
index cd9b7c68a0..0000000000
--- a/tensorflow/contrib/data/ops/indexed_dataset_ops.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/op.h"
-
-namespace tensorflow {
-
-REGISTER_OP("IdentityIndexedDataset")
-    .Input("size: uint64")
-    .Output("handle: variant")
-    .SetIsStateful()
-    .SetShapeFn(
-        shape_inference::ScalarShape);  // TODO(saeta): check input shapes.
-
-///////////////////////////////////////////////////////////////////////////////
-//     IndexedDataset Internals
-///////////////////////////////////////////////////////////////////////////////
-
-// Creates the handle.
-REGISTER_OP("MaterializedIndexDatasetHandle")
-    .Output("handle: resource")
-    .Attr("container: string")
-    .Attr("shared_name: string")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
-// Actually materialize the materialize handle.
-REGISTER_OP("IndexedDatasetMaterialize")
-    .Input("dataset: variant")
-    .Input("materialized: resource")
-    .SetShapeFn(shape_inference::NoOutputs);
-
-namespace {
-
-Status GetShapeFn(shape_inference::InferenceContext* c) {
-  shape_inference::ShapeHandle unused;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-  std::vector<PartialTensorShape> output_shapes;
-  TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
-  if (output_shapes.size() != c->num_outputs()) {
-    return errors::InvalidArgument(
-        "`output_shapes` must be the same length as `output_types` (",
-        output_shapes.size(), " vs. ", c->num_outputs());
-  }
-  for (size_t i = 0; i < output_shapes.size(); ++i) {
-    shape_inference::ShapeHandle output_shape_handle;
-    TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
-        output_shapes[i], &output_shape_handle));
-    c->set_output(static_cast<int>(i), output_shape_handle);
-  }
-  return Status::OK();
-}
-
-}  // namespace
-
-REGISTER_OP("IndexedDatasetGet")
-    .Input("materialized: resource")
-    .Input("index: uint64")
-    .Output("components: output_types")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(GetShapeFn)
-    .Doc(R"doc(
-Gets the element at `index` from `materialized` IndexedDataset.
-)doc");
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index ce52c990ce..21ac40eb21 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -139,12 +139,11 @@ py_test(
     name = "indexed_dataset_ops_test",
     srcs = ["indexed_dataset_ops_test.py"],
     deps = [
-        "//tensorflow/contrib/data/python/ops:contrib_op_loader",
-        "//tensorflow/contrib/data/python/ops:gen_dataset_ops",
         "//tensorflow/contrib/data/python/ops:indexed_dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py
index 9c508d686d..46a7127b52 100644
--- a/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py
@@ -19,29 +19,29 @@ from __future__ import print_function
 
 import unittest
 
-from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
-from tensorflow.contrib.data.python.ops import gen_dataset_ops
 from tensorflow.contrib.data.python.ops import indexed_dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.platform import test
 
 
 class IndexedDatasetOpsTest(test.TestCase):
 
   def testLowLevelIndexedDatasetOps(self):
-    identity = gen_dataset_ops.identity_indexed_dataset(
+    identity = ged_ops.experimental_identity_indexed_dataset(
         ops.convert_to_tensor(16, dtype=dtypes.uint64))
-    handle = gen_dataset_ops.materialized_index_dataset_handle(
+    handle = ged_ops.experimental_materialized_index_dataset_handle(
         container="",
         shared_name="",
         output_types=[dtypes.uint64],
         output_shapes=[[]])
-    materialize = gen_dataset_ops.indexed_dataset_materialize(identity, handle)
+    materialize = ged_ops.experimental_indexed_dataset_materialize(
+        identity, handle)
     index = array_ops.placeholder(dtypes.uint64)
-    get_op = gen_dataset_ops.indexed_dataset_get(
+    get_op = ged_ops.experimental_indexed_dataset_get(
         handle, index, output_types=[dtypes.uint64], output_shapes=[[]])
 
     with self.cached_session() as sess:
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index a14781cd93..5cd1ed542b 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -78,7 +78,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":batching",
-        ":gen_dataset_ops",
         ":interleave_ops",
         ":optimization",
         ":parsing_ops",
@@ -86,6 +85,7 @@ py_library(
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
@@ -148,8 +148,7 @@ py_library(
     srcs = ["error_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":contrib_op_loader",
-        ":gen_dataset_ops",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
@@ -179,12 +178,11 @@ py_library(
     srcs = ["interleave_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":contrib_op_loader",
-        ":gen_dataset_ops",
         ":random_ops",
         "//tensorflow/contrib/stateless",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:util",
@@ -199,9 +197,8 @@ py_library(
     srcs = ["optimization.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":contrib_op_loader",
-        ":gen_dataset_ops",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
@@ -304,8 +301,7 @@ py_library(
     srcs = ["threadpool.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":contrib_op_loader",
-        ":gen_dataset_ops",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
@@ -321,9 +317,8 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":contrib_op_loader",
-        ":gen_dataset_ops",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
@@ -342,47 +337,11 @@ py_library(
     ],
 )
 
-tf_gen_op_wrapper_py(
-    name = "gen_dataset_ops",
-    out = "gen_dataset_ops.py",
-    deps = [
-        "//tensorflow/contrib/data:dataset_ops_op_lib",
-        "//tensorflow/contrib/data:indexed_dataset_ops_op_lib",
-    ],
-)
-
-tf_kernel_library(
-    name = "dataset_ops_kernels",
-    deps = [
-        "//tensorflow/contrib/data/kernels:dataset_kernels",
-        "//tensorflow/core:framework",
-    ],
-    alwayslink = 1,
-)
-
-tf_custom_op_py_library(
-    name = "contrib_op_loader",
-    srcs = ["contrib_op_loader.py"],
-    dso = ["//tensorflow/contrib/data:_dataset_ops.so"],
-    kernels = [
-        ":dataset_ops_kernels",
-        "//tensorflow/contrib/data:indexed_dataset_ops_op_lib",
-        "//tensorflow/contrib/data:dataset_ops_op_lib",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":gen_dataset_ops",
-        "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:platform",
-    ],
-)
-
 py_library(
     name = "indexed_dataset_ops",
     srcs = ["indexed_dataset_ops.py"],
     deps = [
-        ":contrib_op_loader",
-        ":gen_dataset_ops",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
@@ -394,7 +353,7 @@ py_library(
     name = "prefetching_ops",
     srcs = ["prefetching_ops.py"],
     deps = [
-        ":contrib_op_loader",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
diff --git a/tensorflow/contrib/data/python/ops/contrib_op_loader.py b/tensorflow/contrib/data/python/ops/contrib_op_loader.py
deleted file mode 100644
index 8f495a9dc9..0000000000
--- a/tensorflow/contrib/data/python/ops/contrib_op_loader.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Python helper for loading contrib ops and kernels."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.util import loader
-from tensorflow.python.platform import resource_loader
-
-_dataset_ops = loader.load_op_library(
-    resource_loader.get_path_to_datafile("../../_dataset_ops.so"))
diff --git a/tensorflow/contrib/data/python/ops/error_ops.py b/tensorflow/contrib/data/python/ops/error_ops.py
index 615dbcabd4..f962e623ee 100644
--- a/tensorflow/contrib/data/python/ops/error_ops.py
+++ b/tensorflow/contrib/data/python/ops/error_ops.py
@@ -17,9 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
-from tensorflow.contrib.data.python.ops import gen_dataset_ops
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 
 
 def ignore_errors():
@@ -60,7 +59,7 @@ class _IgnoreErrorsDataset(dataset_ops.UnaryDataset):
     self._input_dataset = input_dataset
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.ignore_errors_dataset(
+    return gen_experimental_dataset_ops.experimental_ignore_errors_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
 
diff --git a/tensorflow/contrib/data/python/ops/indexed_dataset_ops.py b/tensorflow/contrib/data/python/ops/indexed_dataset_ops.py
index cc76ab0850..9c06474a2f 100644
--- a/tensorflow/contrib/data/python/ops/indexed_dataset_ops.py
+++ b/tensorflow/contrib/data/python/ops/indexed_dataset_ops.py
@@ -19,14 +19,13 @@ from __future__ import print_function
 
 import abc
 
-from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
-from tensorflow.contrib.data.python.ops import gen_dataset_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 
 
 class MaterializedIndexedDataset(object):
@@ -57,7 +56,7 @@ class MaterializedIndexedDataset(object):
       A tensor containing the values corresponding to `index`.
     """
     # TODO(saeta): nest.pack_sequence_as(...)
-    return gen_dataset_ops.indexed_dataset_get(
+    return ged_ops.experimental_indexed_dataset_get(
         self._materialized_resource,
         index,
         output_types=nest.flatten(
@@ -90,16 +89,18 @@ class IndexedDataset(dataset_ops.Dataset):
       container = ""
     if shared_name is None:
       shared_name = ""
-    materialized_resource = gen_dataset_ops.materialized_index_dataset_handle(
-        container=container,
-        shared_name=shared_name,
-        output_types=nest.flatten(
-            sparse.as_dense_types(self.output_types, self.output_classes)),
-        output_shapes=nest.flatten(
-            sparse.as_dense_types(self.output_shapes, self.output_classes)))
+    materialized_resource = (
+        ged_ops.experimental_materialized_index_dataset_handle(
+            container=container,
+            shared_name=shared_name,
+            output_types=nest.flatten(
+                sparse.as_dense_types(self.output_types, self.output_classes)),
+            output_shapes=nest.flatten(
+                sparse.as_dense_types(self.output_shapes,
+                                      self.output_classes))))
 
     with ops.colocate_with(materialized_resource):
-      materializer = gen_dataset_ops.indexed_dataset_materialize(
+      materializer = ged_ops.experimental_indexed_dataset_materialize(
           self._as_variant_tensor(), materialized_resource)
     return MaterializedIndexedDataset(materialized_resource, materializer,
                                       self.output_classes, self.output_types,
@@ -170,7 +171,7 @@ class IdentityIndexedDataset(IndexedDataset):
     return tensor_shape.scalar()
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.identity_indexed_dataset(self._size)
+    return ged_ops.experimental_identity_indexed_dataset(self._size)
 
   def _inputs(self):
     return []
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index bfa3fdf543..1ee9db1aa8 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -18,8 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import stateless
-from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
-from tensorflow.contrib.data.python.ops import gen_dataset_ops
 from tensorflow.contrib.data.python.ops import random_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
@@ -28,6 +26,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import deprecation
 
@@ -167,10 +166,12 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset):
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
-    return gen_dataset_ops.directed_interleave_dataset(
-        self._selector_input._as_variant_tensor(),
-        [data_input._as_variant_tensor() for data_input in self._data_inputs],
-        **dataset_ops.flat_structure(self))
+    return (
+        gen_experimental_dataset_ops.experimental_directed_interleave_dataset(
+            self._selector_input._as_variant_tensor(), [
+                data_input._as_variant_tensor()
+                for data_input in self._data_inputs
+            ], **dataset_ops.flat_structure(self)))
     # pylint: enable=protected-access
 
   def _inputs(self):
diff --git a/tensorflow/contrib/data/python/ops/optimization.py b/tensorflow/contrib/data/python/ops/optimization.py
index 3eb172acd5..7f5ce97228 100644
--- a/tensorflow/contrib/data/python/ops/optimization.py
+++ b/tensorflow/contrib/data/python/ops/optimization.py
@@ -17,12 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
-from tensorflow.contrib.data.python.ops import gen_dataset_ops as contrib_gen_dataset_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 
 # A constant that can be used to enable auto-tuning.
 AUTOTUNE = -1
@@ -97,7 +96,7 @@ class _AssertNextDataset(dataset_ops.UnaryDataset):
         transformations, dtype=dtypes.string, name="transformations")
 
   def _as_variant_tensor(self):
-    return contrib_gen_dataset_ops.assert_next_dataset(
+    return gen_experimental_dataset_ops.experimental_assert_next_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._transformations,
         **dataset_ops.flat_structure(self))
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 58395879e6..46f82e453a 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -19,8 +19,6 @@ from __future__ import print_function
 
 import warnings
 
-from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
-from tensorflow.contrib.data.python.ops import gen_dataset_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
@@ -32,7 +30,8 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gen_dataset_ops as core_gen_dataset_ops
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import resource_variable_ops
 
 
@@ -64,7 +63,7 @@ def function_buffering_resource(string_arg,
   """
   if shared_name is None:
     shared_name = ""
-  return gen_dataset_ops.function_buffering_resource(
+  return ged_ops.experimental_function_buffering_resource(
       string_arg=string_arg,
       target_device=target_device,
       shared_name=shared_name,
@@ -78,14 +77,14 @@ def function_buffering_resource(string_arg,
 def function_buffering_resource_get_next(function_buffer_resource,
                                          output_types,
                                          name=None):
-  return gen_dataset_ops.function_buffering_resource_get_next(
+  return ged_ops.experimental_function_buffering_resource_get_next(
       function_buffer_resource=function_buffer_resource,
       output_types=output_types,
       name=name)
 
 
 def function_buffering_resource_reset(function_buffer_resource, name=None):
-  return gen_dataset_ops.function_buffering_resource_reset(
+  return ged_ops.experimental_function_buffering_resource_reset(
       function_buffer_resource=function_buffer_resource, name=name)
 
 
@@ -136,7 +135,7 @@ class _PrefetchToDeviceIterator(object):
       ret = remote_iterator.get_next()
       return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
-    iterator_device = gen_dataset_ops.iterator_get_device(
+    iterator_device = ged_ops.experimental_iterator_get_device(
         self._input_iterator._iterator_resource)
 
     with ops.device(device):
@@ -162,10 +161,11 @@ class _PrefetchToDeviceIterator(object):
     if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
       warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
 
-    flat_ret = gen_dataset_ops.function_buffering_resource_get_next(
+    flat_ret = ged_ops.experimental_function_buffering_resource_get_next(
         self._buffering_resource,
-        output_types=nest.flatten(sparse.as_dense_types(
-            self.output_types, self.output_classes)), name=name)
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        name=name)
 
     ret = sparse.deserialize_sparse_tensors(
         nest.pack_sequence_as(self.output_types, flat_ret),
@@ -219,7 +219,7 @@ class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
                buffer_size):
     with ops.device("/device:CPU:0"):
       super(_PrefetchToDeviceEagerIterator, self).__init__(input_dataset)
-      input_iterator_handle = core_gen_dataset_ops.iterator_to_string_handle(
+      input_iterator_handle = gen_dataset_ops.iterator_to_string_handle(
           self._resource)
 
     self._device = device
@@ -238,7 +238,8 @@ class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
       self._buffering_resource = function_buffering_resource(
           f=_prefetch_fn,
           output_types=self._flat_output_types,
-          target_device=gen_dataset_ops.iterator_get_device(self._resource),
+          target_device=ged_ops.experimental_iterator_get_device(
+              self._resource),
           string_arg=input_iterator_handle,
           buffer_size=buffer_size,
           shared_name=iterator_ops._generate_shared_name(
@@ -252,7 +253,7 @@ class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
     # TODO(b/77291417): Fix
     with context.execution_mode(context.SYNC):
       with ops.device(self._device):
-        ret = gen_dataset_ops.function_buffering_resource_get_next(
+        ret = ged_ops.experimental_function_buffering_resource_get_next(
             function_buffer_resource=self._buffering_resource,
             output_types=self._flat_output_types)
       return sparse.deserialize_sparse_tensors(
@@ -409,12 +410,12 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
       """
       # pylint: disable=protected-access
       ds_variant = self._input_dataset._as_variant_tensor()
-      resource = core_gen_dataset_ops.anonymous_iterator(
+      resource = gen_dataset_ops.anonymous_iterator(
           output_types=self._flat_output_types,
           output_shapes=self._flat_output_shapes)
       with ops.control_dependencies(
-          [core_gen_dataset_ops.make_iterator(ds_variant, resource)]):
-        return core_gen_dataset_ops.iterator_to_string_handle(resource)
+          [gen_dataset_ops.make_iterator(ds_variant, resource)]):
+        return gen_dataset_ops.iterator_to_string_handle(resource)
 
     @function.Defun()
     def _remote_init_func():
@@ -463,7 +464,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
       Returns:
         Tensor constant 0
       """
-      iterator_resource = core_gen_dataset_ops.iterator_from_string_handle_v2(
+      iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
           string_handle,
           output_types=self._flat_output_types,
           output_shapes=self._flat_output_shapes)
@@ -504,7 +505,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
 
   def _as_variant_tensor(self):
     with ops.device(self._target_device):
-      return core_gen_dataset_ops.generator_dataset(
+      return gen_dataset_ops.generator_dataset(
           self._init_captured_args,
           self._next_captured_args,
           self._finalize_captured_args,
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index d9d06e2703..360971e200 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -23,7 +23,6 @@ import csv
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import gen_dataset_ops as contrib_gen_dataset_ops
 from tensorflow.contrib.data.python.ops import interleave_ops
 from tensorflow.contrib.data.python.ops import optimization
 from tensorflow.contrib.data.python.ops import parsing_ops
@@ -38,6 +37,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.util import deprecation
 
@@ -629,7 +629,7 @@ class CsvDataset(dataset_ops.DatasetSource):
 
   def _as_variant_tensor(self):
     # Constructs graph node for the dataset op.
-    return contrib_gen_dataset_ops.csv_dataset(
+    return gen_experimental_dataset_ops.experimental_csv_dataset(
         filenames=self._filenames,
         record_defaults=self._record_defaults,
         buffer_size=self._buffer_size,
@@ -1013,7 +1013,7 @@ class LMDBDataset(dataset_ops.DatasetSource):
         filenames, dtype=dtypes.string, name="filenames")
 
   def _as_variant_tensor(self):
-    return contrib_gen_dataset_ops.lmdb_dataset(
+    return gen_experimental_dataset_ops.experimental_lmdb_dataset(
         self._filenames,
         output_types=nest.flatten(self.output_types),
         output_shapes=nest.flatten(self.output_shapes))
diff --git a/tensorflow/contrib/data/python/ops/threadpool.py b/tensorflow/contrib/data/python/ops/threadpool.py
index 9d165ad52a..f73c3fd9cb 100644
--- a/tensorflow/contrib/data/python/ops/threadpool.py
+++ b/tensorflow/contrib/data/python/ops/threadpool.py
@@ -19,10 +19,9 @@ from __future__ import print_function
 
 import threading
 
-from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
-from tensorflow.contrib.data.python.ops import gen_dataset_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import resource_variable_ops
 
 _uid_counter = 0
@@ -47,7 +46,7 @@ class PrivateThreadPool(object):
     """Creates a `PrivateThreadPool` with the given number of threads."""
     if context.executing_eagerly():
       shared_name = _generate_shared_name("privatethreadpool")
-      self._resource = gen_dataset_ops.thread_pool_handle(
+      self._resource = ged_ops.experimental_thread_pool_handle(
           num_threads=num_threads,
           max_intra_op_parallelism=max_intra_op_parallelism,
           display_name=display_name,
@@ -55,7 +54,7 @@ class PrivateThreadPool(object):
       self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
           handle=self._resource, handle_device=context.context().device_name)
     else:
-      self._resource = gen_dataset_ops.thread_pool_handle(
+      self._resource = ged_ops.experimental_thread_pool_handle(
           num_threads=num_threads,
           max_intra_op_parallelism=max_intra_op_parallelism,
           display_name=display_name)
@@ -70,7 +69,7 @@ class _ThreadPoolDataset(dataset_ops.UnaryDataset):
     self._thread_pool = thread_pool
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.thread_pool_dataset(
+    return ged_ops.experimental_thread_pool_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._thread_pool._resource,  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
diff --git a/tensorflow/contrib/data/python/ops/unique.py b/tensorflow/contrib/data/python/ops/unique.py
index bad67a580d..ed363a7090 100644
--- a/tensorflow/contrib/data/python/ops/unique.py
+++ b/tensorflow/contrib/data/python/ops/unique.py
@@ -17,10 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
-from tensorflow.contrib.data.python.ops import gen_dataset_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import gen_experimental_dataset_ops
 
 
 def unique():
@@ -61,7 +60,7 @@ class _UniqueDataset(dataset_ops.UnaryDataset):
           "`tf.int32`, `tf.int64`, or `tf.string` component.")
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.unique_dataset(
+    return gen_experimental_dataset_ops.experimental_unique_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index ca247dc56b..50fe308b73 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1039,6 +1039,7 @@ tf_gen_op_libs(
         "dataset_ops",
         "decode_proto_ops",
         "encode_proto_ops",
+        "experimental_dataset_ops",
         "function_ops",
         "functional_ops",
         "image_ops",
@@ -1169,6 +1170,7 @@ cc_library(
         ":dataset_ops_op_lib",
         ":decode_proto_ops_op_lib",
         ":encode_proto_ops_op_lib",
+        ":experimental_dataset_ops_op_lib",
         ":function_ops_op_lib",
         ":functional_ops_op_lib",
         ":image_ops_op_lib",
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalAssertNextDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalAssertNextDataset.pbtxt
new file mode 100644
index 0000000000..fa8fc96bb2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalAssertNextDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalAssertNextDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalCSVDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalCSVDataset.pbtxt
new file mode 100644
index 0000000000..5fd88e7a0c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalCSVDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalCSVDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalDirectedInterleaveDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalDirectedInterleaveDataset.pbtxt
new file mode 100644
index 0000000000..ac1f9719fe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalDirectedInterleaveDataset.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "ExperimentalDirectedInterleaveDataset"
+  in_arg {
+    name: "selector_input_dataset"
+    description: <<END
+A dataset of scalar `DT_INT64` elements that determines which of the
+`N` data inputs should produce the next output element.
+END
+  }
+  in_arg {
+    name: "data_input_datasets"
+    description: <<END
+`N` datasets with the same type that will be interleaved according to
+the values of `selector_input_dataset`.
+END
+  }
+  summary: <<END
+A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResource.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResource.pbtxt
new file mode 100644
index 0000000000..66511eff60
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResource.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "ExperimentalFunctionBufferingResource"
+  in_arg {
+    name: "string_arg"
+    description: <<END
+String argument to the function call.
+END
+  }
+  in_arg {
+    name: "target_device"
+    description: <<END
+Target device to execute the function on.
+END
+  }
+  out_arg {
+    name: "resource"
+    description: <<END
+Handle to the resource created.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this resource will be shared under the given name across
+multiple sessions.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this resource is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "f"
+    description: <<END
+Function to be executed.
+END
+  }
+  attr {
+    name: "buffer_size"
+    description: <<END
+Size of the buffer.
+END
+  }
+  attr {
+    name: "output_types"
+    description: <<END
+The type list for the return values.
+END
+  }
+  summary: <<END
+Creates a resource that fills up a buffer by making function calls.
+END
+  visibility: HIDDEN
+}
+
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt
new file mode 100644
index 0000000000..bf4b66b22b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "ExperimentalFunctionBufferingResourceGetNext"
+  in_arg {
+    name: "function_buffer_resource"
+    description: <<END
+The FunctionBufferingResource handle.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A list of return values.
+END
+  }
+  attr {
+    name: "output_types"
+    description: <<END
+The type list for the return values.
+END
+  }
+  summary: <<END
+Gets the next element from a FunctionBufferingResource.
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt
new file mode 100644
index 0000000000..729718ddb3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "ExperimentalFunctionBufferingResourceReset"
+  in_arg {
+    name: "function_buffer_resource"
+    description: <<END
+The FunctionBufferingResource handle.
+END
+  }
+  summary: <<END
+Resets the FunctionBufferingResource.
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalIdentityIndexedDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalIdentityIndexedDataset.pbtxt
new file mode 100644
index 0000000000..fe266c111f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalIdentityIndexedDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalIdentityIndexedDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalIgnoreErrorsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalIgnoreErrorsDataset.pbtxt
new file mode 100644
index 0000000000..d42546516d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalIgnoreErrorsDataset.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "ExperimentalIgnoreErrorsDataset"
+  summary: <<END
+Creates a dataset that contains the elements of `input_dataset` ignoring errors.
+END
+  visibility: HIDDEN
+}
+
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalIndexedDatasetGet.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalIndexedDatasetGet.pbtxt
new file mode 100644
index 0000000000..e285f87e10
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalIndexedDatasetGet.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalIndexedDatasetGet"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalIndexedDatasetMaterialize.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalIndexedDatasetMaterialize.pbtxt
new file mode 100644
index 0000000000..60c32473b5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalIndexedDatasetMaterialize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalIndexedDatasetMaterialize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalIteratorGetDevice.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalIteratorGetDevice.pbtxt
new file mode 100644
index 0000000000..b72b229e9a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalIteratorGetDevice.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "ExperimentalIteratorGetDevice"
+  summary: <<END
+Returns the name of the device on which `resource` has been placed.
+END
+  visibility: HIDDEN
+}
+
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalLMDBDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalLMDBDataset.pbtxt
new file mode 100644
index 0000000000..b38b23a51d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalLMDBDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalLMDBDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalMaterializedIndexDatasetHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalMaterializedIndexDatasetHandle.pbtxt
new file mode 100644
index 0000000000..9676b9d284
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalMaterializedIndexDatasetHandle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalMaterializedIndexDatasetHandle"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalThreadPoolDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalThreadPoolDataset.pbtxt
new file mode 100644
index 0000000000..d73b5bfda3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalThreadPoolDataset.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "ExperimentalThreadPoolDataset"
+  in_arg {
+    name: "thread_pool"
+    description: <<END
+A resource produced by the ThreadPoolHandle op.
+END
+  }
+  summary: <<END
+Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalThreadPoolHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalThreadPoolHandle.pbtxt
new file mode 100644
index 0000000000..48bf93406c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalThreadPoolHandle.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "ExperimentalThreadPoolHandle"
+  out_arg {
+    name: "handle"
+    description: <<END
+A resource that can be consumed by one or more ExperimentalThreadPoolDataset
+ops.
+END
+  }
+  attr {
+    name: "num_threads"
+    description: <<END
+The number of threads in the thread pool.
+END
+  }
+  attr {
+    name: "max_intra_op_parallelism"
+    description: <<END
+The maximum degree of parallelism to use within operations that execute on this
+threadpool.
+END
+  }
+  attr {
+    name: "display_name"
+    description: <<END
+A human-readable name for the threads that may be visible in some
+visualizations.
+threadpool.
+END
+  }
+  summary: <<END
+Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalUniqueDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalUniqueDataset.pbtxt
new file mode 100644
index 0000000000..68ed797a0c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalUniqueDataset.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "ExperimentalUniqueDataset"
+  summary: <<END
+Creates a dataset that contains the unique elements of `input_dataset`.
+END
+  visibility: HIDDEN
+}
+
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 87efdff789..6333853cdf 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -765,6 +765,7 @@ tf_kernel_library(
         ":window_dataset_op",
         ":writer_ops",
         ":zip_dataset_op",
+        "//tensorflow/core/kernels/data/experimental:dataset_kernels",
     ],
 )
 
diff --git a/tensorflow/contrib/data/kernels/BUILD b/tensorflow/core/kernels/data/experimental/BUILD
similarity index 52%
rename from tensorflow/contrib/data/kernels/BUILD
rename to tensorflow/core/kernels/data/experimental/BUILD
index ec6cb37193..43406db3ed 100644
--- a/tensorflow/contrib/data/kernels/BUILD
+++ b/tensorflow/core/kernels/data/experimental/BUILD
@@ -1,22 +1,26 @@
 # Description:
-#   Contains kernels for datasets and iterators.
+#   Contains experimental kernels for datasets and iterators.
 package(default_visibility = ["//tensorflow:internal"])
 
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_kernel_library",
+)
+
 cc_library(
     name = "indexed_dataset_headers",
     hdrs = ["indexed_dataset.h"],
     deps = [
-        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework",
         "//third_party/eigen3",
-        "@protobuf_archive//:protobuf_headers",
     ],
 )
 
-cc_library(
+tf_kernel_library(
     name = "indexed_dataset",
     srcs = [
         "identity_indexed_dataset.cc",
@@ -24,103 +28,102 @@ cc_library(
     ],
     deps = [
         ":indexed_dataset_headers",
-        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//third_party/eigen3",
-        "@protobuf_archive//:protobuf_headers",
     ],
-    alwayslink = 1,
 )
 
-cc_library(
+tf_kernel_library(
     name = "prefetching_kernels",
     srcs = ["prefetching_kernels.cc"],
     deps = [
-        "//tensorflow/core:core_cpu_headers_lib",
-        "//tensorflow/core:framework_headers_lib",
-        "//third_party/eigen3",
-        "@protobuf_archive//:protobuf_headers",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
     ],
-    alwayslink = 1,
 )
 
-cc_library(
+tf_kernel_library(
     name = "directed_interleave_dataset_op",
     srcs = ["directed_interleave_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//third_party/eigen3",
-        "@protobuf_archive//:protobuf_headers",
     ],
-    alwayslink = 1,
 )
 
-cc_library(
+tf_kernel_library(
     name = "csv_dataset_op",
     srcs = ["csv_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:framework_headers_lib",
-        "//third_party/eigen3",
-        "@protobuf_archive//:protobuf_headers",
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
     ],
-    alwayslink = 1,
 )
 
-cc_library(
+tf_kernel_library(
     name = "ignore_errors_dataset_op",
     srcs = ["ignore_errors_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
         "//third_party/eigen3",
-        "@protobuf_archive//:protobuf_headers",
     ],
-    alwayslink = 1,
 )
 
-cc_library(
+tf_kernel_library(
     name = "lmdb_dataset_op",
     srcs = ["lmdb_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//third_party/eigen3",
         "@lmdb",
-        "@protobuf_archive//:protobuf_headers",
     ],
 )
 
-cc_library(
+tf_kernel_library(
     name = "threadpool_dataset_op",
     srcs = ["threadpool_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//third_party/eigen3",
-        "@protobuf_archive//:protobuf_headers",
     ],
-    alwayslink = 1,
 )
 
-cc_library(
+tf_kernel_library(
     name = "unique_dataset_op",
     srcs = ["unique_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//third_party/eigen3",
-        "@protobuf_archive//:protobuf_headers",
     ],
-    alwayslink = 1,
 )
 
-cc_library(
+tf_kernel_library(
     name = "assert_next_dataset_op",
     srcs = ["assert_next_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
         "//third_party/eigen3",
-        "@protobuf_archive//:protobuf_headers",
     ],
-    alwayslink = 1,
 )
 
-cc_library(
+tf_kernel_library(
     name = "dataset_kernels",
     deps = [
         ":assert_next_dataset_op",
@@ -132,8 +135,5 @@ cc_library(
         ":prefetching_kernels",
         ":threadpool_dataset_op",
         ":unique_dataset_op",
-        "//tensorflow/core:framework_headers_lib",
-        "//third_party/eigen3",
-        "@protobuf_archive//:protobuf_headers",
     ],
 )
diff --git a/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
similarity index 97%
rename from tensorflow/contrib/data/kernels/assert_next_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
index c19a609780..3511cca0f5 100644
--- a/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
@@ -147,8 +147,9 @@ class AssertNextDatasetOp : public UnaryDatasetOpKernel {
   std::vector<PartialTensorShape> output_shapes_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("AssertNextDataset").Device(DEVICE_CPU),
-                        AssertNextDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalAssertNextDataset").Device(DEVICE_CPU),
+    AssertNextDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/contrib/data/kernels/csv_dataset_op.cc b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc
similarity index 99%
rename from tensorflow/contrib/data/kernels/csv_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/csv_dataset_op.cc
index 21ec50fb6b..7451ca4cb1 100644
--- a/tensorflow/contrib/data/kernels/csv_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc
@@ -852,7 +852,8 @@ class CSVDatasetOp : public DatasetOpKernel {
 };  // class CSVDatasetOp
 
 // Register the kernel implementation for CSVDataset.
-REGISTER_KERNEL_BUILDER(Name("CSVDataset").Device(DEVICE_CPU), CSVDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("ExperimentalCSVDataset").Device(DEVICE_CPU),
+                        CSVDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc
similarity index 98%
rename from tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc
index a5321620bf..c47a9099c4 100644
--- a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc
@@ -272,8 +272,9 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("DirectedInterleaveDataset").Device(DEVICE_CPU),
-                        DirectedInterleaveDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalDirectedInterleaveDataset").Device(DEVICE_CPU),
+    DirectedInterleaveDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/contrib/data/kernels/identity_indexed_dataset.cc b/tensorflow/core/kernels/data/experimental/identity_indexed_dataset.cc
similarity index 96%
rename from tensorflow/contrib/data/kernels/identity_indexed_dataset.cc
rename to tensorflow/core/kernels/data/experimental/identity_indexed_dataset.cc
index c3cb45dbf7..2141f118ca 100644
--- a/tensorflow/contrib/data/kernels/identity_indexed_dataset.cc
+++ b/tensorflow/core/kernels/data/experimental/identity_indexed_dataset.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/data/kernels/indexed_dataset.h"
+#include "tensorflow/core/kernels/data/experimental/indexed_dataset.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -147,8 +147,9 @@ class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("IdentityIndexedDataset").Device(DEVICE_CPU),
-                        IdentityIndexedDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalIdentityIndexedDataset").Device(DEVICE_CPU),
+    IdentityIndexedDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc b/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
similarity index 96%
rename from tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
index beec344534..b34377c642 100644
--- a/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
@@ -15,7 +15,6 @@ limitations under the License.
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
 namespace data {
@@ -133,8 +132,9 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("IgnoreErrorsDataset").Device(DEVICE_CPU),
-                        IgnoreErrorsDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalIgnoreErrorsDataset").Device(DEVICE_CPU),
+    IgnoreErrorsDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/contrib/data/kernels/indexed_dataset.cc b/tensorflow/core/kernels/data/experimental/indexed_dataset.cc
similarity index 97%
rename from tensorflow/contrib/data/kernels/indexed_dataset.cc
rename to tensorflow/core/kernels/data/experimental/indexed_dataset.cc
index ced8ab0d60..75ea462f40 100644
--- a/tensorflow/contrib/data/kernels/indexed_dataset.cc
+++ b/tensorflow/core/kernels/data/experimental/indexed_dataset.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/data/kernels/indexed_dataset.h"
+#include "tensorflow/core/kernels/data/experimental/indexed_dataset.h"
 
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -361,12 +361,14 @@ class IndexedDatasetGet : public OpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(
-    Name("MaterializedIndexDatasetHandle").Device(DEVICE_CPU),
+    Name("ExperimentalMaterializedIndexDatasetHandle").Device(DEVICE_CPU),
     MaterializedHandleOp);
-REGISTER_KERNEL_BUILDER(Name("IndexedDatasetMaterialize").Device(DEVICE_CPU),
-                        MaterializeDatasetOp);
-REGISTER_KERNEL_BUILDER(Name("IndexedDatasetGet").Device(DEVICE_CPU),
-                        IndexedDatasetGet);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalIndexedDatasetMaterialize").Device(DEVICE_CPU),
+    MaterializeDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalIndexedDatasetGet").Device(DEVICE_CPU),
+    IndexedDatasetGet);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/contrib/data/kernels/indexed_dataset.h b/tensorflow/core/kernels/data/experimental/indexed_dataset.h
similarity index 95%
rename from tensorflow/contrib/data/kernels/indexed_dataset.h
rename to tensorflow/core/kernels/data/experimental/indexed_dataset.h
index 7aa2d3fdbc..27a8360cbc 100644
--- a/tensorflow/contrib/data/kernels/indexed_dataset.h
+++ b/tensorflow/core/kernels/data/experimental/indexed_dataset.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_DATA_KERNELS_INDEXED_DATASET_H_
-#define TENSORFLOW_CONTRIB_DATA_KERNELS_INDEXED_DATASET_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_INDEXED_DATASET_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_INDEXED_DATASET_H_
 
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -116,4 +116,4 @@ Status StoreIndexedDatasetInVariantTensor(IndexedDataset* dataset,
 }  // namespace data
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_DATA_KERNELS_INDEXED_DATASET_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_INDEXED_DATASET_H_
diff --git a/tensorflow/contrib/data/kernels/lmdb_dataset_op.cc b/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc
similarity index 98%
rename from tensorflow/contrib/data/kernels/lmdb_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc
index d233c1f8ec..8a88d32f0c 100644
--- a/tensorflow/contrib/data/kernels/lmdb_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc
@@ -210,7 +210,8 @@ class LMDBDatasetOp : public DatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("LMDBDataset").Device(DEVICE_CPU), LMDBDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("ExperimentalLMDBDataset").Device(DEVICE_CPU),
+                        LMDBDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/contrib/data/kernels/prefetching_kernels.cc b/tensorflow/core/kernels/data/experimental/prefetching_kernels.cc
similarity index 95%
rename from tensorflow/contrib/data/kernels/prefetching_kernels.cc
rename to tensorflow/core/kernels/data/experimental/prefetching_kernels.cc
index 96f1dd0059..2c6179d9f5 100644
--- a/tensorflow/contrib/data/kernels/prefetching_kernels.cc
+++ b/tensorflow/core/kernels/data/experimental/prefetching_kernels.cc
@@ -338,20 +338,20 @@ class FunctionBufferResourceHandleOp : public OpKernel {
   DataTypeVector output_types_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResource")
+REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResource")
                             .Device(DEVICE_CPU)
                             .HostMemory("resource")
                             .HostMemory("string_arg")
                             .HostMemory("target_device"),
                         FunctionBufferResourceHandleOp);
-REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResource")
+REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResource")
                             .Device(DEVICE_GPU)
                             .HostMemory("resource")
                             .HostMemory("string_arg")
                             .HostMemory("target_device"),
                         FunctionBufferResourceHandleOp);
 #if TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResource")
+REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResource")
                             .Device(DEVICE_SYCL)
                             .HostMemory("resource")
                             .HostMemory("string_arg")
@@ -403,16 +403,16 @@ class FunctionBufferingResourceGetNextOp : public AsyncOpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceGetNext")
+REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceGetNext")
                             .Device(DEVICE_CPU)
                             .HostMemory("function_buffer_resource"),
                         FunctionBufferingResourceGetNextOp);
-REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceGetNext")
+REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceGetNext")
                             .Device(DEVICE_GPU)
                             .HostMemory("function_buffer_resource"),
                         FunctionBufferingResourceGetNextOp);
 #if TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceGetNext")
+REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceGetNext")
                             .Device(DEVICE_SYCL)
                             .HostMemory("function_buffer_resource"),
                         FunctionBufferingResourceGetNextOp);
@@ -440,16 +440,16 @@ class FunctionBufferingResourceResetOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceReset")
+REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceReset")
                             .Device(DEVICE_CPU)
                             .HostMemory("function_buffer_resource"),
                         FunctionBufferingResourceResetOp);
-REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceReset")
+REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceReset")
                             .Device(DEVICE_GPU)
                             .HostMemory("function_buffer_resource"),
                         FunctionBufferingResourceResetOp);
 #if TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceReset")
+REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceReset")
                             .Device(DEVICE_SYCL)
                             .HostMemory("function_buffer_resource"),
                         FunctionBufferingResourceResetOp);
@@ -473,8 +473,9 @@ class IteratorGetDeviceOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("IteratorGetDevice").Device(DEVICE_CPU),
-                        IteratorGetDeviceOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalIteratorGetDevice").Device(DEVICE_CPU),
+    IteratorGetDeviceOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
similarity index 97%
rename from tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
index 30fa97a636..c80493d3a1 100644
--- a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
@@ -209,10 +209,11 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("ThreadPoolHandle").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("ExperimentalThreadPoolHandle").Device(DEVICE_CPU),
                         ThreadPoolHandleOp);
-REGISTER_KERNEL_BUILDER(Name("ThreadPoolDataset").Device(DEVICE_CPU),
-                        ThreadPoolDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalThreadPoolDataset").Device(DEVICE_CPU),
+    ThreadPoolDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/contrib/data/kernels/unique_dataset_op.cc b/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc
similarity index 97%
rename from tensorflow/contrib/data/kernels/unique_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/unique_dataset_op.cc
index 57fc5697a4..cd612e0eb2 100644
--- a/tensorflow/contrib/data/kernels/unique_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc
@@ -199,8 +199,9 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel {
             HANDLE_TYPE(DT_INT64);
             HANDLE_TYPE(DT_STRING);
             default:
-              LOG(FATAL) << "UniqueDataset unhandled data type: "
-                         << DataTypeString(lhs.dtype());
+              DCHECK(false) << "UniqueDataset unhandled data type: "
+                            << DataTypeString(lhs.dtype());
+              return false;
           }
         }
       };
@@ -215,7 +216,7 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("UniqueDataset").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("ExperimentalUniqueDataset").Device(DEVICE_CPU),
                         UniqueDatasetOp);
 
 }  // namespace
diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc
similarity index 62%
rename from tensorflow/contrib/data/ops/dataset_ops.cc
rename to tensorflow/core/ops/experimental_dataset_ops.cc
index d1a771f005..f6bd5dce26 100644
--- a/tensorflow/contrib/data/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/experimental_dataset_ops.cc
@@ -17,24 +17,16 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER_OP("DirectedInterleaveDataset")
+REGISTER_OP("ExperimentalDirectedInterleaveDataset")
     .Input("selector_input_dataset: variant")
     .Input("data_input_datasets: N * variant")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("N: int >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
-
-selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines
-  which of the `N` data inputs should produce the next output element.
-data_input_datasets: `N` datasets with the same type that will be interleaved
-  according to the values of `selector_input_dataset`.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("CSVDataset")
+REGISTER_OP("ExperimentalCSVDataset")
     .Input("filenames: string")
     .Input("compression_type: string")
     .Input("buffer_size: int64")
@@ -76,35 +68,26 @@ REGISTER_OP("CSVDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("IgnoreErrorsDataset")
+REGISTER_OP("ExperimentalIgnoreErrorsDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that contains the elements of `input_dataset` ignoring errors.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("UniqueDataset")
+REGISTER_OP("ExperimentalUniqueDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that contains the unique elements of `input_dataset`.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("IteratorGetDevice")
+REGISTER_OP("ExperimentalIteratorGetDevice")
     .Input("resource: resource")
     .Output("device: string")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Returns the name of the device on which `resource` has been placed.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("FunctionBufferingResource")
+REGISTER_OP("ExperimentalFunctionBufferingResource")
     .Input("string_arg: string")
     .Input("target_device: string")
     .Output("resource: resource")
@@ -113,77 +96,36 @@ REGISTER_OP("FunctionBufferingResource")
     .Attr("f: func")
     .Attr("buffer_size: int")
     .Attr("output_types: list(type)")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Creates a resource that fills up a buffer by making function calls.
-
-string_arg: String argument to the function call.
-target_device: Target device to execute the function on.
-resource: Handle to the resource created.
-f: Function to be executed.
-buffer_size: Size of the buffer.
-container: If non-empty, this resource is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this resource will be shared under the given name
-  across multiple sessions.
-output_types: The type list for the return values.
-)doc");
-
-REGISTER_OP("FunctionBufferingResourceGetNext")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("ExperimentalFunctionBufferingResourceGetNext")
     .Input("function_buffer_resource: resource")
     .Attr("output_types: list(type)")
     .Output("output: output_types")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Gets the next element from a FunctionBufferingResource.
+    .SetShapeFn(shape_inference::UnknownShape);
 
-function_buffer_resource: The FunctionBufferingResource handle.
-output: A list of return values.
-output_types: The type list for the return values.
-)doc");
-
-REGISTER_OP("FunctionBufferingResourceReset")
+REGISTER_OP("ExperimentalFunctionBufferingResourceReset")
     .Input("function_buffer_resource: resource")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Resets the FunctionBufferingResource.
-
-function_buffer_resource: The FunctionBufferingResource handle.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
-REGISTER_OP("ThreadPoolDataset")
+REGISTER_OP("ExperimentalThreadPoolDataset")
     .Input("input_dataset: variant")
     .Input("thread_pool: resource")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that uses a custom thread pool to compute `input_dataset`.
-
-handle: A resource produced by the ThreadPoolHandle op.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("ThreadPoolHandle")
+REGISTER_OP("ExperimentalThreadPoolHandle")
     .Output("handle: resource")
     .SetShapeFn(shape_inference::ScalarShape)
     .Attr("num_threads: int")
     .Attr("max_intra_op_parallelism: int = 1")
     .Attr("display_name: string")
     .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .Doc(R"doc(
-Creates a custom thread pool with the given number of threads.
-
-handle: A resource that can be consumed by one or more ThreadPoolDataset ops.
-num_threads: The number of threads in the thread pool.
-max_intra_op_parallelism: The maximum degree of parallelism to use within
-  operations that execute on this threadpool.
-display_name: A human-readable name for the threads that may be visible in
-  some visualizations.
-)doc");
-
-REGISTER_OP("AssertNextDataset")
+    .Attr("shared_name: string = ''");
+
+REGISTER_OP("ExperimentalAssertNextDataset")
     .Input("input_dataset: variant")
     .Input("transformations: string")
     .Output("handle: variant")
@@ -196,7 +138,7 @@ REGISTER_OP("AssertNextDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("LMDBDataset")
+REGISTER_OP("ExperimentalLMDBDataset")
     .Input("filenames: string")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
@@ -205,4 +147,61 @@ REGISTER_OP("LMDBDataset")
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("ExperimentalIdentityIndexedDataset")
+    .Input("size: uint64")
+    .Output("handle: variant")
+    .SetIsStateful()
+    .SetShapeFn(
+        shape_inference::ScalarShape);  // TODO(saeta): check input shapes.
+
+///////////////////////////////////////////////////////////////////////////////
+//     IndexedDataset Internals
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates the handle.
+REGISTER_OP("ExperimentalMaterializedIndexDatasetHandle")
+    .Output("handle: resource")
+    .Attr("container: string")
+    .Attr("shared_name: string")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+// Actually materialize the materialize handle.
+REGISTER_OP("ExperimentalIndexedDatasetMaterialize")
+    .Input("dataset: variant")
+    .Input("materialized: resource")
+    .SetShapeFn(shape_inference::NoOutputs);
+
+namespace {
+
+Status GetShapeFn(shape_inference::InferenceContext* c) {
+  shape_inference::ShapeHandle unused;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+  std::vector<PartialTensorShape> output_shapes;
+  TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+  if (output_shapes.size() != c->num_outputs()) {
+    return errors::InvalidArgument(
+        "`output_shapes` must be the same length as `output_types` (",
+        output_shapes.size(), " vs. ", c->num_outputs());
+  }
+  for (size_t i = 0; i < output_shapes.size(); ++i) {
+    shape_inference::ShapeHandle output_shape_handle;
+    TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+        output_shapes[i], &output_shape_handle));
+    c->set_output(static_cast<int>(i), output_shape_handle);
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+REGISTER_OP("ExperimentalIndexedDatasetGet")
+    .Input("materialized: resource")
+    .Input("index: uint64")
+    .Output("components: output_types")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(GetShapeFn);
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 410b3a553a..91cafea042 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1638,6 +1638,15 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "experimental_dataset_ops_gen",
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow:__subpackages__",
+        "//tensorflow/python/kernel_tests:__pkg__",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "image_ops_gen",
     visibility = ["//learning/brain/python/ops:__pkg__"],
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index f1de22300b..7d925a8fef 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -67,7 +67,6 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/data/python/kernel_tests/serialization:dataset_serialization_test_base",
     "//tensorflow/contrib/data/python/kernel_tests:stats_dataset_test_base",
     "//tensorflow/contrib/data/python/kernel_tests:test_utils",
-    "//tensorflow/contrib/data/python/ops:contrib_op_loader",
     "//tensorflow/contrib/eager/python/examples:examples_pip",
     "//tensorflow/contrib/eager/python:evaluator",
     "//tensorflow/contrib/gan:gan",
-- 
GitLab


From 9ef0ec921cc6de670fd2fdba1be49e0eca2a1043 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 08:47:34 -0700
Subject: [PATCH 140/570] internal change only

PiperOrigin-RevId: 214941829
---
 tensorflow/contrib/tpu/profiler/op_profile.proto | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/op_profile.proto b/tensorflow/contrib/tpu/profiler/op_profile.proto
index b25d06dda8..292108f949 100644
--- a/tensorflow/contrib/tpu/profiler/op_profile.proto
+++ b/tensorflow/contrib/tpu/profiler/op_profile.proto
@@ -66,8 +66,8 @@ message Metrics {
   //  - it does not reveal the peak core FLOPS of the hardware
   double flops = 2;
 
-  // The VMEM bandwidth used to load operands from HBM, as a fraction of
-  // thereotical VMEM bandwidth on the specific hardware.
+  // The memory bandwidth used to load operands, as a fraction of
+  // thereotical memory bandwidth on the specific hardware.
   double memory_bandwidth = 3;
 
   double raw_time = 11;   // Elapsed core-time in picoseconds.
-- 
GitLab


From 35459cbaa0f654393b242c5357f6939b05267ab8 Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Fri, 28 Sep 2018 08:56:06 -0700
Subject: [PATCH 141/570] Build TF with XLA support by default.

Building binaries with XLA support does not enable it by default, it
simply makes it accessible via default binary builds.

PiperOrigin-RevId: 214942824
---
 configure.py                                      | 4 ++--
 tensorflow/tools/ci_build/builds/run_pip_tests.sh | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/configure.py b/configure.py
index 55fce8b93b..9899ae10e8 100644
--- a/configure.py
+++ b/configure.py
@@ -1504,6 +1504,7 @@ def main():
   if is_macos():
     environ_cp['TF_NEED_JEMALLOC'] = '0'
     environ_cp['TF_NEED_TENSORRT'] = '0'
+    environ_cp['TF_ENABLE_XLA'] = '0'
 
   # The numpy package on ppc64le uses OpenBLAS which has multi-threading
   # issues that lead to incorrect answers.  Set OMP_NUM_THREADS=1 at
@@ -1515,7 +1516,7 @@ def main():
   set_build_var(environ_cp, 'TF_NEED_JEMALLOC', 'jemalloc as malloc',
                 'with_jemalloc', True)
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
-                False, 'xla')
+                True, 'xla')
 
 
   set_action_env_var(environ_cp, 'TF_NEED_OPENCL_SYCL', 'OpenCL SYCL', False)
@@ -1624,4 +1625,3 @@ def main():
 
 if __name__ == '__main__':
   main()
-
diff --git a/tensorflow/tools/ci_build/builds/run_pip_tests.sh b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
index 17198a6560..7d5cf3f843 100755
--- a/tensorflow/tools/ci_build/builds/run_pip_tests.sh
+++ b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
@@ -111,7 +111,6 @@ bazel clean
 # virtualenv.
 export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
-export TF_ENABLE_XLA=0
 
 # Obtain the path to Python binary
 if [[ ${IS_VIRTUALENV} == "1" ]]; then
-- 
GitLab


From 97498f64ef097096b756c6b262f3ae38965e8685 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 28 Sep 2018 09:17:51 -0700
Subject: [PATCH 142/570] [TF:XLA] Add comment explaining why there is no
 PrimitiveTypeToDataType function.

PiperOrigin-RevId: 214945748
---
 tensorflow/compiler/tf2xla/type_util.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensorflow/compiler/tf2xla/type_util.h b/tensorflow/compiler/tf2xla/type_util.h
index bda667eb1f..6354216eee 100644
--- a/tensorflow/compiler/tf2xla/type_util.h
+++ b/tensorflow/compiler/tf2xla/type_util.h
@@ -25,6 +25,14 @@ namespace tensorflow {
 // Converts a Tensorflow DataType to an XLA PrimitiveType.
 Status DataTypeToPrimitiveType(DataType data_type, xla::PrimitiveType* type);
 
+// N.B.: there is intentionally no function to convert an XLA PrimitiveType to
+// a TensorFlow DataType. The mapping from TF types to XLA types is not
+// one-to-one: for example, both DT_INT8 and DT_QINT8 map to xla::S8. So the
+// inverse would not be a well-defined function. If you find that you want the
+// inverse mapping, then most likely you should be preserving the original
+// TensorFlow type, rather than trying to convert an XLA type into a TensorFlow
+// type.
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_TYPE_UTIL_H_
-- 
GitLab


From f4014108a310928cd897085a8bc7d757c641a1c3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 09:21:32 -0700
Subject: [PATCH 143/570] Update ops-related pbtxt files.

PiperOrigin-RevId: 214946257
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 415 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 415 ++++++++++++++++++
 2 files changed, 830 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 32ce31cf23..43c14d83b5 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -21531,6 +21531,421 @@ op {
     }
   }
 }
+op {
+  name: "ExperimentalAssertNextDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "transformations"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalCSVDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "header"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "field_delim"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "use_quote_delim"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "na_value"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "select_cols"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "output_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalDirectedInterleaveDataset"
+  input_arg {
+    name: "selector_input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "data_input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalFunctionBufferingResource"
+  input_arg {
+    name: "string_arg"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "target_device"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "buffer_size"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalFunctionBufferingResourceGetNext"
+  input_arg {
+    name: "function_buffer_resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalFunctionBufferingResourceReset"
+  input_arg {
+    name: "function_buffer_resource"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIdentityIndexedDataset"
+  input_arg {
+    name: "size"
+    type: DT_UINT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIgnoreErrorsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalIndexedDatasetGet"
+  input_arg {
+    name: "materialized"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "index"
+    type: DT_UINT64
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIndexedDatasetMaterialize"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "materialized"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIteratorGetDevice"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "device"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalLMDBDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalMaterializedIndexDatasetHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalThreadPoolDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "thread_pool"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalThreadPoolHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "num_threads"
+    type: "int"
+  }
+  attr {
+    name: "max_intra_op_parallelism"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "display_name"
+    type: "string"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalUniqueDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "Expm1"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 02a7f8d717..abee803889 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -10038,6 +10038,421 @@ op {
     }
   }
 }
+op {
+  name: "ExperimentalAssertNextDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "transformations"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalCSVDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "header"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "field_delim"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "use_quote_delim"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "na_value"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "select_cols"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "output_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalDirectedInterleaveDataset"
+  input_arg {
+    name: "selector_input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "data_input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalFunctionBufferingResource"
+  input_arg {
+    name: "string_arg"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "target_device"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "buffer_size"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalFunctionBufferingResourceGetNext"
+  input_arg {
+    name: "function_buffer_resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalFunctionBufferingResourceReset"
+  input_arg {
+    name: "function_buffer_resource"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIdentityIndexedDataset"
+  input_arg {
+    name: "size"
+    type: DT_UINT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIgnoreErrorsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalIndexedDatasetGet"
+  input_arg {
+    name: "materialized"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "index"
+    type: DT_UINT64
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIndexedDatasetMaterialize"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "materialized"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIteratorGetDevice"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "device"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalLMDBDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalMaterializedIndexDatasetHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalThreadPoolDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "thread_pool"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalThreadPoolHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "num_threads"
+    type: "int"
+  }
+  attr {
+    name: "max_intra_op_parallelism"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "display_name"
+    type: "string"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalUniqueDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "Expm1"
   input_arg {
-- 
GitLab


From 4eb53d3e5f7bec3c757a06d186ff31fe52083e6d Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Fri, 28 Sep 2018 09:27:29 -0700
Subject: [PATCH 144/570] Simplify eager/graph Layer.losses conditionals

Fixes an issue where losses created while executing eagerly were returned as unevaluated lambdas in a defun.

Lazily evaluates Layer losses by default when possible. Even when graph building this is generally a better thing to do (e.g. losses called in a while_loop).

Allows calls to Layer.add_loss when executing eagerly, but only for losses which are not conditional on inputs (no activity regularizers).

PiperOrigin-RevId: 214947108
---
 tensorflow/python/keras/engine/base_layer.py  | 157 ++++++++----------
 .../keras/engine/training_eager_test.py       |  14 ++
 .../python/keras/engine/training_test.py      |  12 ++
 tensorflow/python/layers/base.py              |  16 +-
 .../python/layers/convolutional_test.py       |  36 ++--
 tensorflow/python/layers/core_test.py         |   6 +-
 6 files changed, 140 insertions(+), 101 deletions(-)

diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index e98b131ae6..a75ce30d31 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import collections as collections_lib
 import enum  # pylint: disable=g-bad-import-order
+import functools
 import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
 
 import numpy as np
@@ -160,9 +161,13 @@ class Layer(checkpointable.CheckpointableBase):
     self._trainable_weights = []
     self._non_trainable_weights = []
     self._updates = []
-    # When executing eagerly, _losses is a list of zero-argument lambdas which
-    # return tensors. When using graph execution, _losses is a list of ops.
+    # A list of zero-argument lambdas which return Tensors, used for variable
+    # regularizers.
+    self._callable_losses = []
+    # A list of Tensors containing activity regularizers and losses manually
+    # added through `add_loss`. Empty when executing eagerly.
     self._losses = []
+    self._in_call = False  # Flag for error checking in add_loss
     self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
     self._call_fn_args = function_utils.fn_args(self.call)
     self._compute_previous_mask = ('mask' in self._call_fn_args or
@@ -359,20 +364,20 @@ class Layer(checkpointable.CheckpointableBase):
   def losses(self):
     """Losses which are associated with this `Layer`.
 
-    Note that when executing eagerly, getting this property evaluates
-    regularizers. When using graph execution, variable regularization ops have
-    already been created and are simply returned here.
+    Variable regularization tensors are created when this property is accessed,
+    so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+    propagate gradients back to the corresponding variables.
 
     Returns:
       A list of tensors.
     """
-    if context.executing_eagerly():
-      # _losses may only contain variable regularization losses when executing
-      # eagerly, and they have been saved as lambdas to be executed when
-      # requested.
-      return [regularizer() for regularizer in self._losses]
-    else:
-      return self._losses
+    collected_losses = []
+    collected_losses.extend(self._losses)
+    for regularizer in self._callable_losses:
+      loss_tensor = regularizer()
+      if loss_tensor is not None:
+        collected_losses.append(loss_tensor)
+    return collected_losses
 
   @doc_controls.for_subclass_implementers
   def add_loss(self, losses, inputs=None):
@@ -393,7 +398,9 @@ class Layer(checkpointable.CheckpointableBase):
     from `Layer.call()`).
 
     Arguments:
-      losses: Loss tensor, or list/tuple of tensors.
+      losses: Loss tensor, or list/tuple of tensors. Rather than tensors, losses
+        may also be zero-argument callables which create a loss tensor. Only
+        callable losses are supported when executing eagerly.
       inputs: If anything other than None is passed, it signals the losses
         are conditional on some of the layer's inputs,
         and thus they should only be run where these inputs are available.
@@ -403,29 +410,45 @@ class Layer(checkpointable.CheckpointableBase):
         (e.g. weight regularization losses).
 
     Raises:
-      RuntimeError: If called in Eager mode.
+      RuntimeError: If called in Eager mode with a `Tensor` rather than a
+        callable, or if `inputs` is not None.
     """
-    if context.executing_eagerly():
-      # TODO(fchollet): it should be possible (and highly desirable) to support
-      # `add_loss` in eager mode. This allows great convenience and flexibility
-      # in defining custom losses on the fly (e.g. in VAEs).
-      # Simply appending the loss value to `self._losses`
-      # is the correct behavior.
-      # The only caveat is that we need to force the user to only call
-      # `add_loss` from inside a model or Layer's `call` method
-      # (otherwise the loss computation cannot be backproped through).
-      raise RuntimeError('Layer.add_loss not supported in Eager mode.')
-
+    executing_eagerly = context.executing_eagerly()
+    if executing_eagerly:
+      if inputs is not None:
+        raise RuntimeError(
+            'Activity regularization (via the "inputs" argument to '
+            'Layer.add_loss) is not supported when executing eagerly. Consider '
+            'returning activity regularization losses from a Model\'s call() '
+            'method.')
+      if getattr(self, '_in_call', False):
+        # TODO(psv): Support activity regularization and a way to reset losses.
+        raise RuntimeError(
+            'Adding losses inside a Layer\'s call() method is not currently '
+            'supported when executing eagerly. Please file a feature request '
+            'if you need this limitation lifted.')
     losses = generic_utils.to_list(losses)
-    losses = [ops.convert_to_tensor(loss, dtype=backend.floatx())
-              if not tensor_util.is_tensor(loss) else loss for loss in losses]
-    self._losses += losses
-    if inputs is None:
-      for loss in losses:
-        loss._unconditional_loss = True  # pylint: disable=protected-access
-    else:
-      for loss in losses:
-        loss._unconditional_loss = False  # pylint: disable=protected-access
+
+    def _tag_unconditional(loss):
+      if callable(loss):
+        loss = loss()
+      if loss is None:
+        return None  # Will be filtered out when computing the .losses property
+      if not tensor_util.is_tensor(loss):
+        loss = ops.convert_to_tensor(loss, dtype=backend.floatx())
+      loss._unconditional_loss = (inputs is None)  # pylint: disable=protected-access
+      return loss
+
+    for loss in losses:
+      if callable(loss):
+        self._callable_losses.append(
+            functools.partial(_tag_unconditional, loss))
+      else:
+        if executing_eagerly:
+          raise RuntimeError(
+              'Layer.add_loss only supported for zero-argument lambdas when '
+              'executing eagerly.')
+        self._losses.append(_tag_unconditional(loss))
 
   def get_losses_for(self, inputs):
     """Retrieves losses relevant to a specific set of inputs.
@@ -599,56 +622,20 @@ class Layer(checkpointable.CheckpointableBase):
     return variable
 
   def _handle_weight_regularization(self, name, variable, regularizer):
-    # `init_graph` should point to the graph in which variable initialization
-    # will occur; it should be None if and only if initialization will take
-    # place in the eager context.
-    init_graph = None
-    if not context.executing_eagerly():
-      default_graph = ops.get_default_graph()
-      if default_graph.building_function:
-        with ops.init_scope():
-          # Retrieve the variables from the graph into which variables
-          # will be lifted; if initialization ops will be lifted into
-          # the eager context, then there is nothing to retrieve, since variable
-          # collections are not supported when eager execution is enabled.
-          if not context.executing_eagerly():
-            init_graph = ops.get_default_graph()
-      else:
-        # Initialization ops will not be lifted out of the default graph.
-        init_graph = default_graph
-
-    if init_graph is not None:  # pylint: disable=protected-access
-      # The variable was created and initialized in a graph.
-      if regularizer:
-        if isinstance(variable, tf_variables.PartitionedVariable):
-          for v in variable:
-            with ops.colocate_with(v.op):
-              with ops.name_scope(name + '/Regularizer'):
-                regularization = regularizer(v)
-            if regularization is not None:
-              self.add_loss(regularization)
-        else:
-          with ops.colocate_with(variable.op):
-            with ops.name_scope(name + '/Regularizer'):
-              regularization = regularizer(variable)
-          if regularization is not None:
-            self.add_loss(regularization)
-    elif regularizer:  # initialization took place in an eager context
-      if isinstance(variable, tf_variables.PartitionedVariable):
-        raise RuntimeError(
-            'Partitioned variable regularization is not yet '
-            'supported when executing eagerly. File a feature request'
-            'if this is important to you.')
-      # Save a zero-argument lambda which runs the regularizer on the
-      # variable, to be executed when `Layer.losses` is requested.
-      # This makes losses responsive to variable updates when executing
-      # eagerly.
-      #
-      # TODO(akshayka): Do the same for graphs as well, so that losses
-      # collected in a while_loop can be run outside its control flow
-      # context and so that losses won't be swallowed up by graph functions
-      # (i.e., `.losses()` should always create regularizers).
-      self._losses.append(lambda: regularizer(variable))
+    """Create lambdas which compute regularization losses."""
+
+    def _loss_for_variable(v):
+      """Creates a regularization loss `Tensor` for variable `v`."""
+      with ops.colocate_with(v):
+        with ops.name_scope(name + '/Regularizer'):
+          regularization = regularizer(v)
+      return regularization
+
+    if isinstance(variable, tf_variables.PartitionedVariable):
+      for v in variable:
+        self.add_loss(functools.partial(_loss_for_variable, v))
+    else:
+      self.add_loss(functools.partial(_loss_for_variable, variable))
 
   def _handle_activity_regularization(self, inputs, outputs):
     # Apply activity regularization.
@@ -766,7 +753,9 @@ class Layer(checkpointable.CheckpointableBase):
         self._assert_input_compatibility(inputs)
 
       if not in_deferred_mode:
+        self._in_call = True
         outputs = self.call(inputs, *args, **kwargs)
+        self._in_call = False
         if outputs is None:
           raise ValueError('A layer\'s `call` method should return a Tensor '
                            'or a list of Tensors, not None (layer: ' +
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index db7ccb181f..1f5176c4d7 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -192,6 +192,20 @@ class CorrectnessTest(test.TestCase):
     history = model.fit(iterator, epochs=1, steps_per_epoch=10)
     self.assertEqual(np.around(history.history['loss'][-1], decimals=4), 0.6173)
 
+  def test_no_loss_in_call(self):
+
+    class HasLoss(keras.layers.Layer):
+
+      def call(self, x):
+        self.add_loss(x)
+        return x
+
+    layer = HasLoss()
+    with self.assertRaises(RuntimeError):
+      layer(1.)
+
+    with ops.Graph().as_default():
+      layer(1.)
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 30be4131a4..54ad74c08b 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -27,6 +27,7 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
@@ -2427,6 +2428,17 @@ class TestTrainingWithMetrics(test.TestCase):
       scores = model.train_on_batch(x, y, sample_weight=w)
       self.assertArrayNear(scores, [0.2, 0.8, 0.8], 0.1)
 
+  def test_losses_in_defun(self):
+    with context.eager_mode():
+      layer = keras.layers.Dense(1, kernel_regularizer='l1')
+      layer(array_ops.ones([1, 10]))
+
+      @function.defun
+      def get_losses():
+        return layer.losses
+
+      self.assertAllEqual(self.evaluate(layer.losses),
+                          self.evaluate(get_losses()))
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 3ba880d7a1..e399ece232 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -131,10 +131,20 @@ class Layer(base_layer.Layer):
 
   def add_loss(self, losses, inputs=None):
     previous_losses_length = len(self._losses)
+    previous_callable_losses_length = len(self._callable_losses)
     super(Layer, self).add_loss(losses, inputs=inputs)
-    # TODO(fchollet): deprecate collection below.
-    new_losses = self._losses[previous_losses_length:]
-    _add_elements_to_collection(new_losses, ops.GraphKeys.REGULARIZATION_LOSSES)
+    if not context.executing_eagerly():
+      # TODO(fchollet): deprecate collection below.
+      new_losses = self._losses[previous_losses_length:]
+      new_callable_losses = self._callable_losses[
+          previous_callable_losses_length:]
+      for regularizer in new_callable_losses:
+        loss_tensor = regularizer()
+        if loss_tensor is not None:
+          new_losses.append(loss_tensor)
+      _add_elements_to_collection(
+          new_losses,
+          ops.GraphKeys.REGULARIZATION_LOSSES)
 
   def _name_scope(self):
     """Determines op naming for the Layer."""
diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py
index d61d3b6dba..257fa27156 100644
--- a/tensorflow/python/layers/convolutional_test.py
+++ b/tensorflow/python/layers/convolutional_test.py
@@ -207,7 +207,8 @@ class ConvTest(test.TestCase):
     layer.apply(images)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertListEqual(layer.losses, loss_keys)
+    self.evaluate([v.initializer for v in layer.variables])
+    self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
   def testConv2DBiasRegularizer(self):
     height, width = 7, 9
@@ -217,7 +218,8 @@ class ConvTest(test.TestCase):
     layer.apply(images)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertListEqual(layer.losses, loss_keys)
+    self.evaluate([v.initializer for v in layer.variables])
+    self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
   def testConv2DNoBias(self):
     height, width = 7, 9
@@ -445,7 +447,8 @@ class SeparableConv1DTest(test.TestCase):
     layer.apply(data)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertEqual(layer.losses, loss_keys)
+    self.evaluate([v.initializer for v in layer.variables])
+    self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
   def testSeparableConv1DPointwiseRegularizer(self):
     length = 9
@@ -455,7 +458,8 @@ class SeparableConv1DTest(test.TestCase):
     layer.apply(data)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertEqual(layer.losses, loss_keys)
+    self.evaluate([v.initializer for v in layer.variables])
+    self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
   def testSeparableConv1DBiasRegularizer(self):
     length = 9
@@ -465,7 +469,8 @@ class SeparableConv1DTest(test.TestCase):
     layer.apply(data)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertEqual(layer.losses, loss_keys)
+    self.evaluate([v.initializer for v in layer.variables])
+    self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
   def testSeparableConv1DNoBias(self):
     length = 9
@@ -682,7 +687,8 @@ class SeparableConv2DTest(test.TestCase):
     layer.apply(images)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertListEqual(layer.losses, loss_keys)
+    self.evaluate([v.initializer for v in layer.variables])
+    self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
   def testSeparableConv2DPointwiseRegularizer(self):
     height, width = 7, 9
@@ -692,7 +698,8 @@ class SeparableConv2DTest(test.TestCase):
     layer.apply(images)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertListEqual(layer.losses, loss_keys)
+    self.evaluate([v.initializer for v in layer.variables])
+    self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
   def testSeparableConv2DBiasRegularizer(self):
     height, width = 7, 9
@@ -702,7 +709,8 @@ class SeparableConv2DTest(test.TestCase):
     layer.apply(images)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertListEqual(layer.losses, loss_keys)
+    self.evaluate([v.initializer for v in layer.variables])
+    self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
   def testSeparableConv2DNoBias(self):
     height, width = 7, 9
@@ -839,7 +847,8 @@ class Conv2DTransposeTest(test.TestCase):
     layer.apply(images)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertListEqual(layer.losses, loss_keys)
+    self.evaluate([v.initializer for v in layer.variables])
+    self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
   def testConv2DTransposeBiasRegularizer(self):
     height, width = 7, 9
@@ -849,7 +858,8 @@ class Conv2DTransposeTest(test.TestCase):
     layer.apply(images)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertListEqual(layer.losses, loss_keys)
+    self.evaluate([v.initializer for v in layer.variables])
+    self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
   def testConv2DTransposeNoBias(self):
     height, width = 7, 9
@@ -1017,7 +1027,8 @@ class Conv3DTransposeTest(test.TestCase):
     layer.apply(volumes)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertListEqual(layer.losses, loss_keys)
+    self.evaluate([v.initializer for v in layer.variables])
+    self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
   def testConv3DTransposeBiasRegularizer(self):
     depth, height, width = 5, 7, 9
@@ -1027,7 +1038,8 @@ class Conv3DTransposeTest(test.TestCase):
     layer.apply(volumes)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertListEqual(layer.losses, loss_keys)
+    self.evaluate([v.initializer for v in layer.variables])
+    self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
   def testConv3DTransposeNoBias(self):
     depth, height, width = 5, 7, 9
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index 46009a30ac..d26f3f4789 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -197,7 +197,8 @@ class DenseTest(test.TestCase):
     _ = dense(inputs)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertListEqual(dense.losses, loss_keys)
+    self.evaluate([v.initializer for v in dense.variables])
+    self.assertAllEqual(self.evaluate(dense.losses), self.evaluate(loss_keys))
 
   def testKernelRegularizerWithReuse(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
@@ -218,7 +219,8 @@ class DenseTest(test.TestCase):
     _ = dense(inputs)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
-    self.assertListEqual(dense.losses, loss_keys)
+    self.evaluate([v.initializer for v in dense.variables])
+    self.assertAllEqual(self.evaluate(dense.losses), self.evaluate(loss_keys))
 
   def testFunctionalDense(self):
     with self.cached_session():
-- 
GitLab


From fe0140fcfc33f109191cf0ebe423aed28ec67bb6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 09:47:00 -0700
Subject: [PATCH 145/570] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 214949709

---
 tensorflow/go/op/wrappers.go | 3192 +++++++++++++++++-----------------
 1 file changed, 1596 insertions(+), 1596 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 2f297d5161..b4d4db3e4d 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -3742,27 +3742,6 @@ func BoostedTreesMakeStatsSummary(scope *Scope, node_ids tf.Output, gradients tf
 	return op.Output(0)
 }
 
-// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
-//
-// Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
-//
-// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
-// layer.
-func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesGetEnsembleStates",
-		Input: []tf.Input{
-			tree_ensemble_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
 // Creates a tree ensemble model and returns a handle to it.
 //
 // Arguments:
@@ -4059,168 +4038,291 @@ func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
-type ResourceStridedSliceAssignAttr func(optionalAttr)
+// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
+type LogUniformCandidateSamplerAttr func(optionalAttr)
 
-// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
+// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
 // If not specified, defaults to 0
-func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
+func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["begin_mask"] = value
+		m["seed"] = value
 	}
 }
 
-// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
 // If not specified, defaults to 0
-func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["end_mask"] = value
+		m["seed2"] = value
 	}
 }
 
-// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
+// Generates labels for candidate sampling with a log-uniform distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LogUniformCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
+type UniformCandidateSamplerAttr func(optionalAttr)
+
+// UniformCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
 // If not specified, defaults to 0
-func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+func UniformCandidateSamplerSeed(value int64) UniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
+		m["seed"] = value
 	}
 }
 
-// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// UniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
 // If not specified, defaults to 0
-func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+		m["seed2"] = value
 	}
 }
 
-// Assign `value` to the sliced l-value reference of `ref`.
+// Generates labels for candidate sampling with a uniform distribution.
 //
-// The values of `value` are assigned to the positions in the variable
-// `ref` that are selected by the slice parameters. The slice parameters
-// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// NOTE this op currently does not support broadcasting and so `value`'s
-// shape must be exactly the shape produced by the slice of `ref`.
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-// Returns the created operation.
-func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...UniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceStridedSliceAssign",
+		Type: "UniformCandidateSampler",
 		Input: []tf.Input{
-			ref, begin, end, strides, value,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ArgMaxAttr is an optional argument to ArgMax.
-type ArgMaxAttr func(optionalAttr)
+// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
+type GenerateVocabRemappingAttr func(optionalAttr)
 
-// ArgMaxOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
+//
+// value: Number of entries in the old vocab file to consider.  If -1,
+// use the entire old vocabulary.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["old_vocab_size"] = value
 	}
 }
 
-// Returns the index with the largest value across dimensions of a tensor.
+// Given a path to new and old vocabulary files, returns a remapping Tensor of
 //
-// Note that in case of ties the identity of the return value is not guaranteed.
+// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
+// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
+// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
+// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+// default value of -1.
+//
+// `num_vocab_offset` enables
+// use in the partitioned variable case, and should generally be set through
+// examining partitioning info.  The format of the files should be a text file,
+// with each line containing a single entity within the vocabulary.
+//
+// For example, with `new_vocab_file` a text file containing each of the following
+// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
+// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
+// `[0, -1, 2]`.
+//
+// The op also returns a count of how many entries in the new vocabulary
+// were present in the old vocabulary, which is used to calculate the number of
+// values to initialize in a weight matrix remapping
+//
+// This functionality can be used to remap both row vocabularies (typically,
+// features) and column vocabularies (typically, classes) from TensorFlow
+// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
+// corresponding to div-partitioned variables.  Moreover, the underlying remapping
+// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
+// use the corresponding index_table_from_file() as the FeatureColumn framework
+// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
 //
 // Arguments:
+//	new_vocab_file: Path to the new vocab file.
+//	old_vocab_file: Path to the old vocab file.
+//	new_vocab_offset: How many entries into the new vocab file to start reading.
+//	num_new_vocab: Number of entries in the new vocab file to remap.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+// Returns A Tensor of length num_new_vocab where the element at index i
+// is equal to the old ID that maps to the new ID i.  This element is -1 for any
+// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
+func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "GenerateVocabRemapping",
 		Input: []tf.Input{
-			input, dimension,
+			new_vocab_file, old_vocab_file,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns which elements of x are finite.
-//
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+// Broadcasts a tensor value to one or more other devices.
+func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "IsFinite",
+		Type: "CollectiveBcastSend",
 		Input: []tf.Input{
-			x,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
-
-// MatMulTransposeA sets the optional transpose_a attribute to value.
-//
-// value: If true, "a" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
+// Mutually reduces multiple tensors of identical type and shape.
+func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64) (data tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MatMulTransposeB sets the optional transpose_b attribute to value.
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "merge_op": merge_op, "final_op": final_op, "subdiv_offsets": subdiv_offsets}
+	opspec := tf.OpSpec{
+		Type: "CollectiveReduce",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AbortAttr is an optional argument to Abort.
+type AbortAttr func(optionalAttr)
+
+// AbortErrorMsg sets the optional error_msg attribute to value.
 //
-// value: If true, "b" is transposed before multiplication.
+// value: A string which is the message associated with the exception.
+// If not specified, defaults to ""
+func AbortErrorMsg(value string) AbortAttr {
+	return func(m optionalAttr) {
+		m["error_msg"] = value
+	}
+}
+
+// AbortExitWithoutError sets the optional exit_without_error attribute to value.
 // If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
+func AbortExitWithoutError(value bool) AbortAttr {
 	return func(m optionalAttr) {
-		m["transpose_b"] = value
+		m["exit_without_error"] = value
 	}
 }
 
-// Multiply the matrix "a" by the matrix "b".
+// Raise a exception to abort the process when called.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// If exit_without_error is true, the process will exit normally,
+// otherwise it will exit with a SIGABORT signal.
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+// Returns nothing but an exception.
+//
+// Returns the created operation.
+func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4229,258 +4331,163 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatMul",
-		Input: []tf.Input{
-			a, b,
-		},
+		Type: "Abort",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Selects elements from `x` or `y`, depending on `condition`.
-//
-// The `x`, and `y` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `x` and `y` are scalars.
-// If `x` and `y` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `x`, or must have
-// the same shape as `x`.
-//
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `x` (if true) or `y` (if false).
-//
-// If `condition` is a vector and `x` and `y` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `x` and `y`.
-// If `condition` has the same shape as `x` and `y`, then it chooses which
-// element to copy from `x` and `y`.
-//
-// For example:
-//
-// ```python
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e)  # => [[1, 6], [7, 4]]
-//
-//
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
+// Forwards the input to the output.
 //
-// ```
+// This operator represents the loop termination condition used by the
+// "pivot" switches of a loop.
 //
 // Arguments:
+//	input: A boolean scalar, representing the branch predicate of the Switch op.
 //
-//	x: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `x` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	y: = A `Tensor` with the same type and shape as `x`.
-//
-// Returns = A `Tensor` with the same type and shape as `x` and `y`.
-func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
+// Returns The same tensor as `input`.
+func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Select",
+		Type: "LoopCond",
 		Input: []tf.Input{
-			condition, x, y,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of x OR y element-wise.
+// Returns a tensor of zeros with the same shape and type as x.
 //
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	x: a tensor of type T.
+//
+// Returns a tensor of the same shape and type as x but filled with zeros.
+func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalOr",
+		Type: "ZerosLike",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
-//
-// The regularized incomplete beta integral is defined as:
-//
-//
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
-//
-// where
-//
-//
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
-//
-//
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+// Returns a copy of the input tensor.
+func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Betainc",
+		Type: "Snapshot",
 		Input: []tf.Input{
-			a, b, x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
+
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Identity",
-		Input: []tf.Input{
-			input,
-		},
+}
+
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
-//
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Atan2",
-		Input: []tf.Input{
-			y, x,
-		},
+}
+
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that passes a sliding window over `input_dataset`.
-//
-// Arguments:
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Assign `value` to the sliced l-value reference of `ref`.
 //
-//	window_size: A scalar representing the number of elements in the
-// sliding window.
-//	window_shift: A scalar representing the steps moving the sliding window
-// forward in one iteration. It must be positive.
-//	window_stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
 //
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
 //
-func SlideDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SlideDataset",
+		Type: "ResourceStridedSliceAssign",
 		Input: []tf.Input{
-			input_dataset, window_size, window_shift, window_stride,
+			ref, begin, end, strides, value,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// EditDistanceAttr is an optional argument to EditDistance.
-type EditDistanceAttr func(optionalAttr)
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
 
-// EditDistanceNormalize sets the optional normalize attribute to value.
-//
-// value: boolean (if true, edit distances are normalized by length of truth).
-//
-// The output is:
-// If not specified, defaults to true
-func EditDistanceNormalize(value bool) EditDistanceAttr {
+// ArgMaxOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
 	return func(m optionalAttr) {
-		m["normalize"] = value
+		m["output_type"] = value
 	}
 }
 
-// Computes the (possibly normalized) Levenshtein Edit Distance.
-//
-// The inputs are variable-length sequences provided by SparseTensors
-//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
-// and
-//   (truth_indices, truth_values, truth_shape).
+// Returns the index with the largest value across dimensions of a tensor.
 //
-// The inputs are:
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
-//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
-// This is an N x R int64 matrix.
-//	hypothesis_values: The values of the hypothesis list SparseTensor.
-// This is an N-length vector.
-//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
-// This is an R-length vector.
-//	truth_indices: The indices of the truth list SparseTensor.
-// This is an M x R int64 matrix.
-//	truth_values: The values of the truth list SparseTensor.
-// This is an M-length vector.
-//	truth_shape: truth indices, vector.
-//
-// Returns A dense float tensor with rank R - 1.
-//
-// For the example input:
-//
-//     // hypothesis represents a 2x1 matrix with variable-length values:
-//     //   (0,0) = ["a"]
-//     //   (1,0) = ["b"]
-//     hypothesis_indices = [[0, 0, 0],
-//                           [1, 0, 0]]
-//     hypothesis_values = ["a", "b"]
-//     hypothesis_shape = [2, 1, 1]
 //
-//     // truth represents a 2x2 matrix with variable-length values:
-//     //   (0,0) = []
-//     //   (0,1) = ["a"]
-//     //   (1,0) = ["b", "c"]
-//     //   (1,1) = ["a"]
-//     truth_indices = [[0, 1, 0],
-//                      [1, 0, 0],
-//                      [1, 0, 1],
-//                      [1, 1, 0]]
-//     truth_values = ["a", "b", "c", "a"]
-//     truth_shape = [2, 2, 2]
-//     normalize = true
-//
-// The output will be:
-//
-//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
-//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
-//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
-func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4489,9 +4496,9 @@ func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EditDistance",
+		Type: "ArgMax",
 		Input: []tf.Input{
-			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
+			input, dimension,
 		},
 		Attrs: attrs,
 	}
@@ -4499,69 +4506,69 @@ func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
-type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+// Returns which elements of x are finite.
+//
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsFinite",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
+
+// MatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
+// MatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+// value: If true, "b" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeB(value bool) MatMulAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["transpose_b"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the input.
+// Multiply the matrix "a" by the matrix "b".
 //
-// Arguments:
-//	input_sizes: An integer vector representing the shape of `input`, based
-// on `data_format`.  For example, if `data_format` is 'NHWC' then
-//  `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
 //
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
+		Type: "MatMul",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			a, b,
 		},
 		Attrs: attrs,
 	}
@@ -4569,47 +4576,79 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil
 	return op.Output(0)
 }
 
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
-
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
-	return func(m optionalAttr) {
-		m["tolerance"] = value
-	}
-}
-
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+// Selects elements from `x` or `y`, depending on `condition`.
+//
+// The `x`, and `y` tensors must all have the same shape, and the
+// output will also have that shape.
+//
+// The `condition` tensor must be a scalar if `x` and `y` are scalars.
+// If `x` and `y` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `x`, or must have
+// the same shape as `x`.
+//
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `x` (if true) or `y` (if false).
+//
+// If `condition` is a vector and `x` and `y` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `x` and `y`.
+// If `condition` has the same shape as `x` and `y`, then it chooses which
+// element to copy from `x` and `y`.
+//
+// For example:
+//
+// ```python
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e)  # => [[1, 6], [7, 4]]
+//
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
+// ```
+//
+// Arguments:
+//
+//	x: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `x` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	y: = A `Tensor` with the same type and shape as `x`.
+//
+// Returns = A `Tensor` with the same type and shape as `x` and `y`.
+func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
+		Type: "Select",
 		Input: []tf.Input{
-			x, y,
+			condition, x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x / y element-wise.
+// Returns the truth value of x OR y element-wise.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Div",
+		Type: "LogicalOr",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -4618,65 +4657,92 @@ func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Returns x * y element-wise.
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 //
-// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The regularized incomplete beta integral is defined as:
+//
+//
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+//
+// where
+//
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Mul",
+		Type: "Betainc",
 		Input: []tf.Input{
-			x, y,
+			a, b, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
-
-// BiasAddDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Identity",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Adds `bias` to `value`.
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
 //
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atan2",
+		Input: []tf.Input{
+			y, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that passes a sliding window over `input_dataset`.
 //
 // Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+//	window_size: A scalar representing the number of elements in the
+// sliding window.
+//	window_shift: A scalar representing the steps moving the sliding window
+// forward in one iteration. It must be positive.
+//	window_stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
+//
+//
+func SlideDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BiasAdd",
+		Type: "SlideDataset",
 		Input: []tf.Input{
-			value, bias,
+			input_dataset, window_size, window_shift, window_stride,
 		},
 		Attrs: attrs,
 	}
@@ -4684,41 +4750,74 @@ func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddA
 	return op.Output(0)
 }
 
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
+// EditDistanceAttr is an optional argument to EditDistance.
+type EditDistanceAttr func(optionalAttr)
 
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+// EditDistanceNormalize sets the optional normalize attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+// value: boolean (if true, edit distances are normalized by length of truth).
+//
+// The output is:
+// If not specified, defaults to true
+func EditDistanceNormalize(value bool) EditDistanceAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["normalize"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
+// Computes the (possibly normalized) Levenshtein Edit Distance.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// The inputs are variable-length sequences provided by SparseTensors
+//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
+// and
+//   (truth_indices, truth_values, truth_shape).
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// The inputs are:
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
+// This is an N x R int64 matrix.
+//	hypothesis_values: The values of the hypothesis list SparseTensor.
+// This is an N-length vector.
+//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
+// This is an R-length vector.
+//	truth_indices: The indices of the truth list SparseTensor.
+// This is an M x R int64 matrix.
+//	truth_values: The values of the truth list SparseTensor.
+// This is an M-length vector.
+//	truth_shape: truth indices, vector.
+//
+// Returns A dense float tensor with rank R - 1.
+//
+// For the example input:
+//
+//     // hypothesis represents a 2x1 matrix with variable-length values:
+//     //   (0,0) = ["a"]
+//     //   (1,0) = ["b"]
+//     hypothesis_indices = [[0, 0, 0],
+//                           [1, 0, 0]]
+//     hypothesis_values = ["a", "b"]
+//     hypothesis_shape = [2, 1, 1]
+//
+//     // truth represents a 2x2 matrix with variable-length values:
+//     //   (0,0) = []
+//     //   (0,1) = ["a"]
+//     //   (1,0) = ["b", "c"]
+//     //   (1,1) = ["a"]
+//     truth_indices = [[0, 1, 0],
+//                      [1, 0, 0],
+//                      [1, 0, 1],
+//                      [1, 1, 0]]
+//     truth_values = ["a", "b", "c", "a"]
+//     truth_shape = [2, 2, 2]
+//     normalize = true
+//
+// The output will be:
+//
+//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
+//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
+//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
+func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4727,146 +4826,183 @@ func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
+		Type: "EditDistance",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
-type AllCandidateSamplerAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
+type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
 
-// AllCandidateSamplerSeed sets the optional seed attribute to value.
+// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["data_format"] = value
 	}
 }
 
-// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dilations"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// Computes the gradients of depthwise convolution with respect to the input.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
+//	input_sizes: An integer vector representing the shape of `input`, based
+// on `data_format`.  For example, if `data_format` is 'NHWC' then
+//  `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns 4-D with shape according to `data_format`.  For example, if
+// `data_format` is 'NHWC', output shape is `[batch, in_height,
+// in_width, in_channels]`.  Gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
+		Type: "DepthwiseConv2dNativeBackpropInput",
 		Input: []tf.Input{
-			true_classes,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddV2",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns an element-wise indication of the sign of a number.
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
+
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+	return func(m optionalAttr) {
+		m["tolerance"] = value
+	}
+}
+
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ApproximateEqual",
+		Input: []tf.Input{
+			x, y,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x / y element-wise.
 //
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Div",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x * y element-wise.
 //
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sign",
+		Type: "Mul",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ArgMinAttr is an optional argument to ArgMin.
-type ArgMinAttr func(optionalAttr)
+// BiasAddAttr is an optional argument to BiasAdd.
+type BiasAddAttr func(optionalAttr)
 
-// ArgMinOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMinOutputType(value tf.DataType) ArgMinAttr {
+// BiasAddDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddDataFormat(value string) BiasAddAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["data_format"] = value
 	}
 }
 
-// Returns the index with the smallest value across dimensions of a tensor.
+// Adds `bias` to `value`.
 //
-// Note that in case of ties the identity of the return value is not guaranteed.
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
 // Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4875,9 +5011,9 @@ func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMin",
+		Type: "BiasAdd",
 		Input: []tf.Input{
-			input, dimension,
+			value, bias,
 		},
 		Attrs: attrs,
 	}
@@ -4885,33 +5021,52 @@ func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
+
+// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// output range specified with 'requested_output_min' and 'requested_output_max'.
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
 //
-// Arguments:
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	requested_output_min: The float value that the minimum quantized output value represents.
-//	requested_output_max: The float value that the maximum quantized output value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
-// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
-func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Requantize",
+		Type: "SparseReduceSumSparse",
 		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
@@ -4919,95 +5074,277 @@ func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
+// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
+type AllCandidateSamplerAttr func(optionalAttr)
 
-// PreventGradientMessage sets the optional message attribute to value.
+// AllCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["message"] = value
+		m["seed"] = value
 	}
 }
 
-// An identity op that triggers an error if a gradient is requested.
+// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	input: any tensor.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
 //
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PreventGradient",
+		Type: "AllCandidateSampler",
 		Input: []tf.Input{
-			input,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns x + y element-wise.
+//
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asin",
+		Type: "AddV2",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// For example:
-//
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+// Returns an element-wise indication of the sign of a number.
 //
-// tf.sparse_segment_sum_with_num_segments(
-//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
-// # => [[0 0 0 0]
-// #     [0 0 0 0]
-// #     [0 0 0 0]]
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
 //
-// tf.sparse_segment_sum_with_num_segments(c,
-//                                         tf.constant([0, 1]),
-//                                         tf.constant([0, 2],
-//                                         num_segments=4))
-// # => [[ 1  2  3  4]
-// #     [ 0  0  0  0]
-// #     [-1 -2 -3 -4]
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sign",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ArgMinAttr is an optional argument to ArgMin.
+type ArgMinAttr func(optionalAttr)
+
+// ArgMinOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMinOutputType(value tf.DataType) ArgMinAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
+	}
+}
+
+// Returns the index with the smallest value across dimensions of a tensor.
+//
+// Note that in case of ties the identity of the return value is not guaranteed.
+//
+// Arguments:
+//
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ArgMin",
+		Input: []tf.Input{
+			input, dimension,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+//
+// output range specified with 'requested_output_min' and 'requested_output_max'.
+//
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	requested_output_min: The float value that the minimum quantized output value represents.
+//	requested_output_max: The float value that the maximum quantized output value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
+func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "Requantize",
+		Input: []tf.Input{
+			input, input_min, input_max, requested_output_min, requested_output_max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
+
+// PreventGradientMessage sets the optional message attribute to value.
+//
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
+	return func(m optionalAttr) {
+		m["message"] = value
+	}
+}
+
+// An identity op that triggers an error if a gradient is requested.
+//
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to lookup the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
+//
+// Arguments:
+//	input: any tensor.
+//
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "PreventGradient",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Asin",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sum along sparse segments of a tensor.
+//
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// For example:
+//
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
+//
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
 // #     [ 0  0  0  0]]
 // ```
 //
@@ -10182,23 +10519,6 @@ func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...Ass
 	return scope.AddOperation(opspec)
 }
 
-// Broadcasts a tensor value to one or more other devices.
-func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
-	opspec := tf.OpSpec{
-		Type: "CollectiveBcastSend",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Split a `SparseTensor` into `num_split` tensors along one dimension.
 //
 // If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
@@ -10776,23 +11096,6 @@ func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, update
 	return scope.AddOperation(opspec)
 }
 
-// Mutually reduces multiple tensors of identical type and shape.
-func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64) (data tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "merge_op": merge_op, "final_op": final_op, "subdiv_offsets": subdiv_offsets}
-	opspec := tf.OpSpec{
-		Type: "CollectiveReduce",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Updates the tree ensemble by either adding a layer to the last tree being grown
 //
 // or by starting a new tree.
@@ -11671,6 +11974,49 @@ func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.
 	return scope.AddOperation(opspec)
 }
 
+// Exits the current frame to its parent frame.
+//
+// Exit makes its input `data` available to the parent frame.
+//
+// Arguments:
+//	data: The tensor to be made available to the parent frame.
+//
+// Returns The same tensor as `data`.
+func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Exit",
+		Input: []tf.Input{
+			data,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Produce a string tensor that encodes the state of a Reader.
+//
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderSerializeStateV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
 type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
@@ -11804,68 +12150,6 @@ func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (o
 	return op.Output(0)
 }
 
-// StringSplitV2Attr is an optional argument to StringSplitV2.
-type StringSplitV2Attr func(optionalAttr)
-
-// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
-//
-// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
-// If not specified, defaults to -1
-func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
-	return func(m optionalAttr) {
-		m["maxsplit"] = value
-	}
-}
-
-// Split elements of `source` based on `sep` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `source` based on `sep` and return a `SparseTensor`
-// containing the split tokens. Empty tokens are ignored.
-//
-// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
-// then the output will be
-// ```
-// st.indices = [0, 0;
-//               0, 1;
-//               1, 0;
-//               1, 1;
-//               1, 2]
-// st.shape = [2, 3]
-// st.values = ['hello', 'world', 'a', 'b', 'c']
-// ```
-//
-// If `sep` is given, consecutive delimiters are not grouped together and are
-// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
-// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
-// string, consecutive whitespace are regarded as a single separator, and the
-// result will contain no empty strings at the startor end if the string has
-// leading or trailing whitespace.
-//
-// Note that the above mentioned behavior matches python's str.split.
-//
-// Arguments:
-//	input: `1-D` string `Tensor`, the strings to split.
-//	sep: `0-D` string `Tensor`, the delimiter character.
-func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringSplitV2",
-		Input: []tf.Input{
-			input, sep,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
 // MaxPoolAttr is an optional argument to MaxPool.
 type MaxPoolAttr func(optionalAttr)
 
@@ -12435,21 +12719,6 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 	return op.Output(0)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Softsign",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Creates a TensorList which, when stacked, has the value of `tensor`.
 //
 // Each tensor in the result list corresponds to one row of the input tensor.
@@ -12470,81 +12739,6 @@ func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Outpu
 	return op.Output(0)
 }
 
-// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
-type GenerateVocabRemappingAttr func(optionalAttr)
-
-// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
-//
-// value: Number of entries in the old vocab file to consider.  If -1,
-// use the entire old vocabulary.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
-	return func(m optionalAttr) {
-		m["old_vocab_size"] = value
-	}
-}
-
-// Given a path to new and old vocabulary files, returns a remapping Tensor of
-//
-// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
-// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
-// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
-// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
-// default value of -1.
-//
-// `num_vocab_offset` enables
-// use in the partitioned variable case, and should generally be set through
-// examining partitioning info.  The format of the files should be a text file,
-// with each line containing a single entity within the vocabulary.
-//
-// For example, with `new_vocab_file` a text file containing each of the following
-// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
-// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
-// `[0, -1, 2]`.
-//
-// The op also returns a count of how many entries in the new vocabulary
-// were present in the old vocabulary, which is used to calculate the number of
-// values to initialize in a weight matrix remapping
-//
-// This functionality can be used to remap both row vocabularies (typically,
-// features) and column vocabularies (typically, classes) from TensorFlow
-// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
-// corresponding to div-partitioned variables.  Moreover, the underlying remapping
-// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
-// use the corresponding index_table_from_file() as the FeatureColumn framework
-// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
-//
-// Arguments:
-//	new_vocab_file: Path to the new vocab file.
-//	old_vocab_file: Path to the old vocab file.
-//	new_vocab_offset: How many entries into the new vocab file to start reading.
-//	num_new_vocab: Number of entries in the new vocab file to remap.
-//
-// Returns A Tensor of length num_new_vocab where the element at index i
-// is equal to the old ID that maps to the new ID i.  This element is -1 for any
-// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
-func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "GenerateVocabRemapping",
-		Input: []tf.Input{
-			new_vocab_file, old_vocab_file,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // Assigns sparse updates to the variable referenced by `resource`.
 //
 // This operation computes
@@ -13547,6 +13741,27 @@ func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAtt
 	return op.Output(0)
 }
 
+// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//
+// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
+// layer.
+func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesGetEnsembleStates",
+		Input: []tf.Input{
+			tree_ensemble_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
 // ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
 type ResourceApplyPowerSignAttr func(optionalAttr)
 
@@ -16327,79 +16542,6 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 	return op.Output(0)
 }
 
-// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
-type LogUniformCandidateSamplerAttr func(optionalAttr)
-
-// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a log-uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LogUniformCandidateSampler",
-		Input: []tf.Input{
-			true_classes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
 // Returns the max of x and y (i.e. x > y ? x : y) element-wise.
 //
 // *NOTE*: `Maximum` supports broadcasting. More about broadcasting
@@ -19444,31 +19586,6 @@ func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output
 	return op.Output(0)
 }
 
-// Read an element from the TensorArray into output `value`.
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
-//
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
-//
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
-		Input: []tf.Input{
-			handle, index, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // QuantizeV2Attr is an optional argument to QuantizeV2.
 type QuantizeV2Attr func(optionalAttr)
 
@@ -20866,6 +20983,201 @@ func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (ou
 	return op.Output(0)
 }
 
+// EnterAttr is an optional argument to Enter.
+type EnterAttr func(optionalAttr)
+
+// EnterIsConstant sets the optional is_constant attribute to value.
+//
+// value: If true, the output is constant within the child frame.
+// If not specified, defaults to false
+func EnterIsConstant(value bool) EnterAttr {
+	return func(m optionalAttr) {
+		m["is_constant"] = value
+	}
+}
+
+// EnterParallelIterations sets the optional parallel_iterations attribute to value.
+//
+// value: The number of iterations allowed to run in parallel.
+// If not specified, defaults to 10
+func EnterParallelIterations(value int64) EnterAttr {
+	return func(m optionalAttr) {
+		m["parallel_iterations"] = value
+	}
+}
+
+// Creates or finds a child frame, and makes `data` available to the child frame.
+//
+// This op is used together with `Exit` to create loops in the graph.
+// The unique `frame_name` is used by the `Executor` to identify frames. If
+// `is_constant` is true, `output` is a constant in the child frame; otherwise
+// it may be changed in the child frame. At most `parallel_iterations` iterations
+// are run in parallel in the child frame.
+//
+// Arguments:
+//	data: The tensor to be made available to the child frame.
+//	frame_name: The name of the child frame.
+//
+// Returns The same tensor as `data`.
+func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"frame_name": frame_name}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Enter",
+		Input: []tf.Input{
+			data,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Add all input tensors element wise.
+//
+// Arguments:
+//	inputs: Must all be the same size and shape.
+func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AddN",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TryRpcAttr is an optional argument to TryRpc.
+type TryRpcAttr func(optionalAttr)
+
+// TryRpcProtocol sets the optional protocol attribute to value.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func TryRpcProtocol(value string) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// TryRpcFailFast sets the optional fail_fast attribute to value.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func TryRpcFailFast(value bool) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func TryRpcTimeoutInMs(value int64) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// Unlike the standard `Rpc` op, if the connection fails or the remote worker
+// returns an error status, this op does **not** reraise the exception.
+// Instead, the `status_code` and `status_message` entry for the corresponding RPC
+// call is set with the error returned from the RPC call.  The `response` tensor
+// will contain valid response values for those minibatch entries whose RPCs did
+// not fail; the rest of the entries will have empty strings.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
+// returned from the RPC calls.
+func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TryRpc",
+		Input: []tf.Input{
+			address, method, request,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // Delete the tensor specified by its handle in the session.
 //
 // Arguments:
@@ -21612,29 +21924,6 @@ func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Outp
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Forwards the input to the output.
-//
-// This operator represents the loop termination condition used by the
-// "pivot" switches of a loop.
-//
-// Arguments:
-//	input: A boolean scalar, representing the branch predicate of the Switch op.
-//
-// Returns The same tensor as `input`.
-func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LoopCond",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes the sum along segments of a tensor.
 //
 // Read
@@ -24163,6 +24452,31 @@ func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr
 	return op.Output(0)
 }
 
+// Read an element from the TensorArray into output `value`.
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayReadV3",
+		Input: []tf.Input{
+			handle, index, flow_in,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the gradient for the tanh of `x` wrt its input.
 //
 // Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
@@ -27849,178 +28163,6 @@ func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Outpu
 	return op.Output(0)
 }
 
-// EncodeProtoAttr is an optional argument to EncodeProto.
-type EncodeProtoAttr func(optionalAttr)
-
-// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
-// If not specified, defaults to "local://"
-func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
-	return func(m optionalAttr) {
-		m["descriptor_source"] = value
-	}
-}
-
-// The op serializes protobuf messages provided in the input tensors.
-//
-// The types of the tensors in `values` must match the schema for the
-// fields specified in `field_names`. All the tensors in `values` must
-// have a common shape prefix, *batch_shape*.
-//
-// The `sizes` tensor specifies repeat counts for each field.  The repeat
-// count (last dimension) of a each tensor in `values` must be greater
-// than or equal to corresponding repeat count in `sizes`.
-//
-// A `message_type` name must be provided to give context for the field
-// names. The actual message descriptor can be looked up either in the
-// linked-in descriptor pool or a filename provided by the caller using
-// the `descriptor_source` attribute.
-//
-// The `descriptor_source` attribute selects a source of protocol
-// descriptors to consult when looking up `message_type`. This may be a
-// filename containing a serialized `FileDescriptorSet` message,
-// or the special value `local://`, in which case only descriptors linked
-// into the code will be searched; the filename can be on any filesystem
-// accessible to TensorFlow.
-//
-// You can build a `descriptor_source` file using the `--descriptor_set_out`
-// and `--include_imports` options to the protocol compiler `protoc`.
-//
-// The `local://` database only covers descriptors linked into the
-// code via C++ libraries, not Python imports. You can link in a proto descriptor
-// by creating a cc_library target with alwayslink=1.
-//
-// There are a few special cases in the value mapping:
-//
-// Submessage and group fields must be pre-serialized as TensorFlow strings.
-//
-// TensorFlow lacks support for unsigned int64s, so they must be
-// represented as `tf.int64` with the same twos-complement bit pattern
-// (the obvious way).
-//
-// Unsigned int32 values can be represented exactly with `tf.int64`, or
-// with sign wrapping if the input is of type `tf.int32`.
-//
-// Arguments:
-//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
-//	values: List of tensors containing values for the corresponding field.
-//	field_names: List of strings containing proto field names.
-//	message_type: Name of the proto message type to decode.
-//
-// Returns Tensor of serialized protos with shape `batch_shape`.
-func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "EncodeProto",
-		Input: []tf.Input{
-			sizes, tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a TensorArray for storing the gradients of values in the given handle.
-//
-// If the given TensorArray gradient already exists, returns a reference to it.
-//
-// Locks the size of the original TensorArray by disabling its dynamic size flag.
-//
-// **A note about the input flow_in:**
-//
-// The handle flow_in forces the execution of the gradient lookup to occur
-// only after certain other operations have occurred.  For example, when
-// the forward TensorArray is dynamically sized, writes to this TensorArray
-// may resize the object.  The gradient TensorArray is statically sized based
-// on the size of the forward TensorArray when this operation executes.
-// Furthermore, the size of the forward TensorArray is frozen by this call.
-// As a result, the flow is used to ensure that the call to generate the gradient
-// TensorArray only happens after all writes are executed.
-//
-// In the case of dynamically sized TensorArrays, gradient computation should
-// only be performed on read operations that have themselves been chained via
-// flow to occur only after all writes have executed. That way the final size
-// of the forward TensorArray is known when this operation is called.
-//
-// **A note about the source attribute:**
-//
-// TensorArray gradient calls use an accumulator TensorArray object.  If
-// multiple gradients are calculated and run in the same session, the multiple
-// gradient nodes may accidentally flow through the same accumulator TensorArray.
-// This double counts and generally breaks the TensorArray gradient flow.
-//
-// The solution is to identify which gradient call this particular
-// TensorArray gradient is being called in.  This is performed by identifying
-// a unique string (e.g. "gradients", "gradients_1", ...) from the input
-// gradient Tensor's name.  This string is used as a suffix when creating
-// the TensorArray gradient object here (the attribute `source`).
-//
-// The attribute `source` is added as a suffix to the forward TensorArray's
-// name when performing the creation / lookup, so that each separate gradient
-// calculation gets its own TensorArray accumulator.
-//
-// Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"source": source}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV3",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Creates a dataset that splits a SparseTensor into elements row-wise.
-func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorSliceDataset",
-		Input: []tf.Input{
-			indices, values, dense_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x / y element-wise for real types.
-//
-// If `x` and `y` are reals, this will return the floating-point division.
-//
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RealDiv",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 //     Adds v into specified rows of x.
 //
 //     Computes y = x; y[i, :] += v; return y.
@@ -28316,65 +28458,314 @@ func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...Sta
 	return op.Output(0)
 }
 
-// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
-func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ConcatenateDataset",
-		Input: []tf.Input{
-			input_dataset, another_dataset,
-		},
-		Attrs: attrs,
+// StringSplitV2Attr is an optional argument to StringSplitV2.
+type StringSplitV2Attr func(optionalAttr)
+
+// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
+//
+// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
+// If not specified, defaults to -1
+func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
+	return func(m optionalAttr) {
+		m["maxsplit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Adds a value to the current value of a variable.
+// Split elements of `source` based on `sep` into a `SparseTensor`.
 //
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the incremented value or a subsequent newer one.
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `source` based on `sep` and return a `SparseTensor`
+// containing the split tokens. Empty tokens are ignored.
 //
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
+// then the output will be
+// ```
+// st.indices = [0, 0;
+//               0, 1;
+//               1, 0;
+//               1, 1;
+//               1, 2]
+// st.shape = [2, 3]
+// st.values = ['hello', 'world', 'a', 'b', 'c']
+// ```
 //
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+// If `sep` is given, consecutive delimiters are not grouped together and are
+// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
+// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
+// string, consecutive whitespace are regarded as a single separator, and the
+// result will contain no empty strings at the startor end if the string has
+// leading or trailing whitespace.
+//
+// Note that the above mentioned behavior matches python's str.split.
+//
+// Arguments:
+//	input: `1-D` string `Tensor`, the strings to split.
+//	sep: `0-D` string `Tensor`, the delimiter character.
+func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
+		Type: "StringSplitV2",
 		Input: []tf.Input{
-			resource, value,
+			input, sep,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Records the latency of producing `input_dataset` elements in a StatsAggregator.
-func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "LatencyStatsDataset",
+		Type: "Softsign",
 		Input: []tf.Input{
-			input_dataset, tag,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MapSizeAttr is an optional argument to MapSize.
-type MapSizeAttr func(optionalAttr)
+// EncodeProtoAttr is an optional argument to EncodeProto.
+type EncodeProtoAttr func(optionalAttr)
+
+// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
+// If not specified, defaults to "local://"
+func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
+	return func(m optionalAttr) {
+		m["descriptor_source"] = value
+	}
+}
+
+// The op serializes protobuf messages provided in the input tensors.
+//
+// The types of the tensors in `values` must match the schema for the
+// fields specified in `field_names`. All the tensors in `values` must
+// have a common shape prefix, *batch_shape*.
+//
+// The `sizes` tensor specifies repeat counts for each field.  The repeat
+// count (last dimension) of a each tensor in `values` must be greater
+// than or equal to corresponding repeat count in `sizes`.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// There are a few special cases in the value mapping:
+//
+// Submessage and group fields must be pre-serialized as TensorFlow strings.
+//
+// TensorFlow lacks support for unsigned int64s, so they must be
+// represented as `tf.int64` with the same twos-complement bit pattern
+// (the obvious way).
+//
+// Unsigned int32 values can be represented exactly with `tf.int64`, or
+// with sign wrapping if the input is of type `tf.int32`.
+//
+// Arguments:
+//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+//	values: List of tensors containing values for the corresponding field.
+//	field_names: List of strings containing proto field names.
+//	message_type: Name of the proto message type to decode.
+//
+// Returns Tensor of serialized protos with shape `batch_shape`.
+func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeProto",
+		Input: []tf.Input{
+			sizes, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a TensorArray for storing the gradients of values in the given handle.
+//
+// If the given TensorArray gradient already exists, returns a reference to it.
+//
+// Locks the size of the original TensorArray by disabling its dynamic size flag.
+//
+// **A note about the input flow_in:**
+//
+// The handle flow_in forces the execution of the gradient lookup to occur
+// only after certain other operations have occurred.  For example, when
+// the forward TensorArray is dynamically sized, writes to this TensorArray
+// may resize the object.  The gradient TensorArray is statically sized based
+// on the size of the forward TensorArray when this operation executes.
+// Furthermore, the size of the forward TensorArray is frozen by this call.
+// As a result, the flow is used to ensure that the call to generate the gradient
+// TensorArray only happens after all writes are executed.
+//
+// In the case of dynamically sized TensorArrays, gradient computation should
+// only be performed on read operations that have themselves been chained via
+// flow to occur only after all writes have executed. That way the final size
+// of the forward TensorArray is known when this operation is called.
+//
+// **A note about the source attribute:**
+//
+// TensorArray gradient calls use an accumulator TensorArray object.  If
+// multiple gradients are calculated and run in the same session, the multiple
+// gradient nodes may accidentally flow through the same accumulator TensorArray.
+// This double counts and generally breaks the TensorArray gradient flow.
+//
+// The solution is to identify which gradient call this particular
+// TensorArray gradient is being called in.  This is performed by identifying
+// a unique string (e.g. "gradients", "gradients_1", ...) from the input
+// gradient Tensor's name.  This string is used as a suffix when creating
+// the TensorArray gradient object here (the attribute `source`).
+//
+// The attribute `source` is added as a suffix to the forward TensorArray's
+// name when performing the creation / lookup, so that each separate gradient
+// calculation gets its own TensorArray accumulator.
+//
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"source": source}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGradV3",
+		Input: []tf.Input{
+			handle, flow_in,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Creates a dataset that splits a SparseTensor into elements row-wise.
+func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorSliceDataset",
+		Input: []tf.Input{
+			indices, values, dense_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x / y element-wise for real types.
+//
+// If `x` and `y` are reals, this will return the floating-point division.
+//
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RealDiv",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
+func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ConcatenateDataset",
+		Input: []tf.Input{
+			input_dataset, another_dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds a value to the current value of a variable.
+//
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the incremented value or a subsequent newer one.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
+//
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AssignAddVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Records the latency of producing `input_dataset` elements in a StatsAggregator.
+func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "LatencyStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MapSizeAttr is an optional argument to MapSize.
+type MapSizeAttr func(optionalAttr)
 
 // MapSizeCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
@@ -32542,602 +32933,211 @@ func CudnnRNNParamsToCanonicalSeed2(value int64) CudnnRNNParamsToCanonicalAttr {
 // Retrieves a set of weights from the opaque params buffer that can be saved and
 // restored in a way compatible with future runs.
 //
-// Note that the params buffer may not be compatible across different GPUs. So any
-// save and restoration should be converted to and from the canonical weights and
-// biases.
-//
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// num_params: number of parameter sets for all layers.
-//     Each layer may contain multiple parameter sets, with each set consisting of
-//     a weight matrix and a bias vector.
-// weights: the canonical form of weights that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// biases: the canonical form of biases that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//     The actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//     dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-func CudnnRNNParamsToCanonical(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, params tf.Output, num_params int64, optional ...CudnnRNNParamsToCanonicalAttr) (weights []tf.Output, biases []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_params": num_params}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CudnnRNNParamsToCanonical",
-		Input: []tf.Input{
-			num_layers, num_units, input_size, params,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if weights, idx, err = makeOutputList(op, idx, "weights"); err != nil {
-		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
-		return
-	}
-	if biases, idx, err = makeOutputList(op, idx, "biases"); err != nil {
-		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
-		return
-	}
-	return weights, biases
-}
-
-// UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
-type UniformCandidateSamplerAttr func(optionalAttr)
-
-// UniformCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func UniformCandidateSamplerSeed(value int64) UniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// UniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...UniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UniformCandidateSampler",
-		Input: []tf.Input{
-			true_classes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// CTCLossAttr is an optional argument to CTCLoss.
-type CTCLossAttr func(optionalAttr)
-
-// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
-//
-// value: Scalar, if true then repeated labels are
-// collapsed prior to the CTC calculation.
-// If not specified, defaults to false
-func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["preprocess_collapse_repeated"] = value
-	}
-}
-
-// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
-//
-// value: Scalar.  If set to false, *during* CTC calculation
-// repeated non-blank labels will not be merged and are interpreted as
-// individual labels.  This is a simplified version of CTC.
-// If not specified, defaults to true
-func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["ctc_merge_repeated"] = value
-	}
-}
-
-// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
-//
-// value: Scalar. If set to true, during CTC
-// calculation, items that have longer output sequences than input sequences
-// are skipped: they don't contribute to the loss term and have zero-gradient.
-// If not specified, defaults to false
-func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["ignore_longer_outputs_than_inputs"] = value
-	}
-}
-
-// Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
-//
-// the gradient.  This class performs the softmax operation for you, so inputs
-// should be e.g. linear projections of outputs by an LSTM.
-//
-// Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	labels_indices: The indices of a `SparseTensor<int32, 2>`.
-// `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
-// `(batch b, time t)`.
-//	labels_values: The values (labels) associated with the given batch and time.
-//	sequence_length: A vector containing sequence lengths (batch).
-//
-// Returns A vector (batch) containing log-probabilities.The gradient of `loss`.  3-D, shape:
-// `(max_time x batch_size x num_classes)`.
-func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_values tf.Output, sequence_length tf.Output, optional ...CTCLossAttr) (loss tf.Output, gradient tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CTCLoss",
-		Input: []tf.Input{
-			inputs, labels_indices, labels_values, sequence_length,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder.
-type CTCGreedyDecoderAttr func(optionalAttr)
-
-// CTCGreedyDecoderMergeRepeated sets the optional merge_repeated attribute to value.
-//
-// value: If True, merge repeated classes in output.
-// If not specified, defaults to false
-func CTCGreedyDecoderMergeRepeated(value bool) CTCGreedyDecoderAttr {
-	return func(m optionalAttr) {
-		m["merge_repeated"] = value
-	}
-}
-
-// Performs greedy decoding on the logits given in inputs.
-//
-// A note about the attribute merge_repeated: if enabled, when
-// consecutive logits' maximum indices are the same, only the first of
-// these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
-// becomes "A B B" if merge_repeated = True and "A B B B B" if
-// merge_repeated = False.
-//
-// Regardless of the value of merge_repeated, if the maximum index of a given
-// time and batch corresponds to the blank, index `(num_classes - 1)`, no new
-// element is emitted.
-//
-// Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch_size)`.
-//
-// Returns Indices matrix, size `(total_decoded_outputs x 2)`,
-// of a `SparseTensor<int64, 2>`.  The rows store: [batch, time].Values vector, size: `(total_decoded_outputs)`,
-// of a `SparseTensor<int64, 2>`.  The vector stores the decoded classes.Shape vector, size `(2)`, of the decoded SparseTensor.
-// Values are: `[batch_size, max_decoded_length]`.Matrix, size `(batch_size x 1)`, containing sequence
-// log-probabilities.
-func CTCGreedyDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, optional ...CTCGreedyDecoderAttr) (decoded_indices tf.Output, decoded_values tf.Output, decoded_shape tf.Output, log_probability tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CTCGreedyDecoder",
-		Input: []tf.Input{
-			inputs, sequence_length,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
-}
-
-// Forwards `data` to the output port determined by `pred`.
-//
-// If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
-// the data goes to `output_false`.
-//
-// See also `RefSwitch` and `Merge`.
-//
-// Arguments:
-//	data: The tensor to be forwarded to the appropriate output.
-//	pred: A scalar that specifies which output port will receive data.
-//
-// Returns If `pred` is false, data will be forwarded to this output.If `pred` is true, data will be forwarded to this output.
-func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Output, output_true tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Switch",
-		Input: []tf.Input{
-			data, pred,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Add all input tensors element wise.
-//
-// Arguments:
-//	inputs: Must all be the same size and shape.
-func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddN",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TryRpcAttr is an optional argument to TryRpc.
-type TryRpcAttr func(optionalAttr)
-
-// TryRpcProtocol sets the optional protocol attribute to value.
-//
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func TryRpcProtocol(value string) TryRpcAttr {
-	return func(m optionalAttr) {
-		m["protocol"] = value
-	}
-}
-
-// TryRpcFailFast sets the optional fail_fast attribute to value.
-//
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func TryRpcFailFast(value bool) TryRpcAttr {
-	return func(m optionalAttr) {
-		m["fail_fast"] = value
-	}
-}
-
-// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
-//
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
-// If not specified, defaults to 0
-func TryRpcTimeoutInMs(value int64) TryRpcAttr {
-	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
-	}
-}
-
-// Perform batches of RPC requests.
-//
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
-//
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
-//
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
-//
-// ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
-// ```
-//
-// then call this op with arguments:
-//
-// ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
-// ```
-//
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
-//
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
-//
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
-//
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
-//
-// Unlike the standard `Rpc` op, if the connection fails or the remote worker
-// returns an error status, this op does **not** reraise the exception.
-// Instead, the `status_code` and `status_message` entry for the corresponding RPC
-// call is set with the error returned from the RPC call.  The `response` tensor
-// will contain valid response values for those minibatch entries whose RPCs did
-// not fail; the rest of the entries will have empty strings.
-//
-// Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
-//
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
-// returned from the RPC calls.
-func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
+//
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     The actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNParamsToCanonical(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, params tf.Output, num_params int64, optional ...CudnnRNNParamsToCanonicalAttr) (weights []tf.Output, biases []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_params": num_params}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TryRpc",
+		Type: "CudnnRNNParamsToCanonical",
 		Input: []tf.Input{
-			address, method, request,
+			num_layers, num_units, input_size, params,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if weights, idx, err = makeOutputList(op, idx, "weights"); err != nil {
+		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
+		return
+	}
+	if biases, idx, err = makeOutputList(op, idx, "biases"); err != nil {
+		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
+		return
+	}
+	return weights, biases
 }
 
-// EnterAttr is an optional argument to Enter.
-type EnterAttr func(optionalAttr)
+// CTCLossAttr is an optional argument to CTCLoss.
+type CTCLossAttr func(optionalAttr)
 
-// EnterIsConstant sets the optional is_constant attribute to value.
+// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
 //
-// value: If true, the output is constant within the child frame.
+// value: Scalar, if true then repeated labels are
+// collapsed prior to the CTC calculation.
 // If not specified, defaults to false
-func EnterIsConstant(value bool) EnterAttr {
+func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
 	return func(m optionalAttr) {
-		m["is_constant"] = value
+		m["preprocess_collapse_repeated"] = value
 	}
 }
 
-// EnterParallelIterations sets the optional parallel_iterations attribute to value.
+// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
 //
-// value: The number of iterations allowed to run in parallel.
-// If not specified, defaults to 10
-func EnterParallelIterations(value int64) EnterAttr {
+// value: Scalar.  If set to false, *during* CTC calculation
+// repeated non-blank labels will not be merged and are interpreted as
+// individual labels.  This is a simplified version of CTC.
+// If not specified, defaults to true
+func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
 	return func(m optionalAttr) {
-		m["parallel_iterations"] = value
+		m["ctc_merge_repeated"] = value
 	}
 }
 
-// Creates or finds a child frame, and makes `data` available to the child frame.
+// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
 //
-// This op is used together with `Exit` to create loops in the graph.
-// The unique `frame_name` is used by the `Executor` to identify frames. If
-// `is_constant` is true, `output` is a constant in the child frame; otherwise
-// it may be changed in the child frame. At most `parallel_iterations` iterations
-// are run in parallel in the child frame.
+// value: Scalar. If set to true, during CTC
+// calculation, items that have longer output sequences than input sequences
+// are skipped: they don't contribute to the loss term and have zero-gradient.
+// If not specified, defaults to false
+func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ignore_longer_outputs_than_inputs"] = value
+	}
+}
+
+// Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
+//
+// the gradient.  This class performs the softmax operation for you, so inputs
+// should be e.g. linear projections of outputs by an LSTM.
 //
 // Arguments:
-//	data: The tensor to be made available to the child frame.
-//	frame_name: The name of the child frame.
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	labels_indices: The indices of a `SparseTensor<int32, 2>`.
+// `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
+// `(batch b, time t)`.
+//	labels_values: The values (labels) associated with the given batch and time.
+//	sequence_length: A vector containing sequence lengths (batch).
 //
-// Returns The same tensor as `data`.
-func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
+// Returns A vector (batch) containing log-probabilities.The gradient of `loss`.  3-D, shape:
+// `(max_time x batch_size x num_classes)`.
+func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_values tf.Output, sequence_length tf.Output, optional ...CTCLossAttr) (loss tf.Output, gradient tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"frame_name": frame_name}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Enter",
+		Type: "CTCLoss",
 		Input: []tf.Input{
-			data,
+			inputs, labels_indices, labels_values, sequence_length,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
-//
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder.
+type CTCGreedyDecoderAttr func(optionalAttr)
+
+// CTCGreedyDecoderMergeRepeated sets the optional merge_repeated attribute to value.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
+// value: If True, merge repeated classes in output.
+// If not specified, defaults to false
+func CTCGreedyDecoderMergeRepeated(value bool) CTCGreedyDecoderAttr {
+	return func(m optionalAttr) {
+		m["merge_repeated"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Exits the current frame to its parent frame.
+// Performs greedy decoding on the logits given in inputs.
 //
-// Exit makes its input `data` available to the parent frame.
+// A note about the attribute merge_repeated: if enabled, when
+// consecutive logits' maximum indices are the same, only the first of
+// these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
+// becomes "A B B" if merge_repeated = True and "A B B B B" if
+// merge_repeated = False.
+//
+// Regardless of the value of merge_repeated, if the maximum index of a given
+// time and batch corresponds to the blank, index `(num_classes - 1)`, no new
+// element is emitted.
 //
 // Arguments:
-//	data: The tensor to be made available to the parent frame.
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch_size)`.
 //
-// Returns The same tensor as `data`.
-func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns Indices matrix, size `(total_decoded_outputs x 2)`,
+// of a `SparseTensor<int64, 2>`.  The rows store: [batch, time].Values vector, size: `(total_decoded_outputs)`,
+// of a `SparseTensor<int64, 2>`.  The vector stores the decoded classes.Shape vector, size `(2)`, of the decoded SparseTensor.
+// Values are: `[batch_size, max_decoded_length]`.Matrix, size `(batch_size x 1)`, containing sequence
+// log-probabilities.
+func CTCGreedyDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, optional ...CTCGreedyDecoderAttr) (decoded_indices tf.Output, decoded_values tf.Output, decoded_shape tf.Output, log_probability tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Exit",
-		Input: []tf.Input{
-			data,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns a copy of the input tensor.
-func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Snapshot",
+		Type: "CTCGreedyDecoder",
 		Input: []tf.Input{
-			input,
+			inputs, sequence_length,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Returns a tensor of zeros with the same shape and type as x.
+// Forwards `data` to the output port determined by `pred`.
+//
+// If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
+// the data goes to `output_false`.
+//
+// See also `RefSwitch` and `Merge`.
 //
 // Arguments:
-//	x: a tensor of type T.
+//	data: The tensor to be forwarded to the appropriate output.
+//	pred: A scalar that specifies which output port will receive data.
 //
-// Returns a tensor of the same shape and type as x but filled with zeros.
-func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns If `pred` is false, data will be forwarded to this output.If `pred` is true, data will be forwarded to this output.
+func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Output, output_true tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ZerosLike",
+		Type: "Switch",
 		Input: []tf.Input{
-			x,
+			data, pred,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AbortAttr is an optional argument to Abort.
-type AbortAttr func(optionalAttr)
-
-// AbortErrorMsg sets the optional error_msg attribute to value.
-//
-// value: A string which is the message associated with the exception.
-// If not specified, defaults to ""
-func AbortErrorMsg(value string) AbortAttr {
-	return func(m optionalAttr) {
-		m["error_msg"] = value
-	}
-}
-
-// AbortExitWithoutError sets the optional exit_without_error attribute to value.
-// If not specified, defaults to false
-func AbortExitWithoutError(value bool) AbortAttr {
-	return func(m optionalAttr) {
-		m["exit_without_error"] = value
-	}
-}
-
-// Raise a exception to abort the process when called.
-//
-// If exit_without_error is true, the process will exit normally,
-// otherwise it will exit with a SIGABORT signal.
-//
-// Returns nothing but an exception.
-//
-// Returns the created operation.
-func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Abort",
-
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
-- 
GitLab


From 7052b44b032a35edb10893ce08993a54e2a76e1d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 09:55:33 -0700
Subject: [PATCH 146/570] Roll-forward of CL 214320700: Split up SPARSE_DEPS,
 adding each individual dependency only to the sparse operators that need it.

Automated rollback of commit 120620caf23a044b8aa2db6ba5984384ec936009

PiperOrigin-RevId: 214950946
---
 tensorflow/core/kernels/BUILD | 43 ++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 0b8e9ec527..30171708c1 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4049,11 +4049,6 @@ cc_library(
 )
 
 SPARSE_DEPS = [
-    ":bounds_check",
-    ":cwise_op",
-    ":fill_functor",
-    ":scatter_functor",
-    "//third_party/eigen3",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:sparse_ops_op_lib",
@@ -4086,7 +4081,9 @@ tf_kernel_library(
 tf_kernel_library(
     name = "sparse_cross_op",
     prefix = "sparse_cross_op",
-    deps = SPARSE_DEPS,
+    deps = SPARSE_DEPS + [
+        "//third_party/eigen3",
+    ],
 )
 
 tf_kernel_library(
@@ -4098,13 +4095,19 @@ tf_kernel_library(
 tf_kernel_library(
     name = "sparse_dense_binary_op_shared",
     prefix = "sparse_dense_binary_op_shared",
-    deps = SPARSE_DEPS,
+    deps = SPARSE_DEPS + [
+        ":cwise_op",
+        "//third_party/eigen3",
+    ],
 )
 
 tf_kernel_library(
     name = "sparse_sparse_binary_op_shared",
     prefix = "sparse_sparse_binary_op_shared",
-    deps = SPARSE_DEPS,
+    deps = SPARSE_DEPS + [
+        ":cwise_op",
+        "//third_party/eigen3",
+    ],
 )
 
 tf_kernel_library(
@@ -4136,7 +4139,9 @@ tf_kernel_library(
 tf_kernel_library(
     name = "sparse_softmax",
     prefix = "sparse_softmax",
-    deps = SPARSE_DEPS,
+    deps = SPARSE_DEPS + [
+        "//third_party/eigen3",
+    ],
 )
 
 tf_kernel_library(
@@ -4148,25 +4153,37 @@ tf_kernel_library(
 tf_kernel_library(
     name = "sparse_tensor_dense_add_op",
     prefix = "sparse_tensor_dense_add_op",
-    deps = SPARSE_DEPS,
+    deps = SPARSE_DEPS + [
+        ":scatter_functor",
+        "//third_party/eigen3",
+    ],
 )
 
 tf_kernel_library(
     name = "sparse_tensor_dense_matmul_op",
     prefix = "sparse_tensor_dense_matmul_op",
-    deps = SPARSE_DEPS,
+    deps = SPARSE_DEPS + [
+        ":bounds_check",
+        ":fill_functor",
+        "//third_party/eigen3",
+    ],
 )
 
 tf_kernel_library(
     name = "sparse_to_dense_op",
     prefix = "sparse_to_dense_op",
-    deps = SPARSE_DEPS,
+    deps = SPARSE_DEPS + [
+        "//third_party/eigen3",
+    ],
 )
 
 tf_kernel_library(
     name = "sparse_xent_op",
     prefix = "sparse_xent_op",
-    deps = SPARSE_DEPS,
+    deps = SPARSE_DEPS + [
+        ":bounds_check",
+        "//third_party/eigen3",
+    ],
 )
 
 tf_kernel_library(
-- 
GitLab


From b47f0b1b0ac8047d53a824f4ca82a12387a16e4d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 10:05:17 -0700
Subject: [PATCH 147/570] Updating the V2 variables API for boosted_trees.

PiperOrigin-RevId: 214952666
---
 .../dnn_tree_combined_estimator_test.py       |  2 +-
 .../python/training/functions/gbdt_batch.py   | 18 +++++++-------
 .../training/functions/gbdt_batch_test.py     | 24 +++++++++----------
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
index 6b6fe9663a..83a8dee632 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
@@ -190,7 +190,7 @@ class CoreDNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
     est.train(input_fn=_train_input_fn, steps=1000)
     # 10 steps for dnn + 3 for 1 tree of depth 3 + 1 after the tree finished
     # + 1 for resource variables.
-    self._assert_checkpoint(est.model_dir, global_step=15)
+    self._assert_checkpoint(est.model_dir, global_step=14)
     res = est.evaluate(input_fn=_eval_input_fn, steps=1)
     self.assertLess(0.5, res["auc"])
     est.predict(input_fn=_eval_input_fn)
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index c7eb2493a8..8531e97f90 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -402,13 +402,13 @@ class GradientBoostedDecisionTreeModel(object):
     self._feature_columns = feature_columns
     self._learner_config_serialized = learner_config.SerializeToString()
     self._num_quantiles = num_quantiles
-    self._max_tree_depth = variables.Variable(
+    self._max_tree_depth = variables.VariableV1(
         initial_value=self._learner_config.constraints.max_tree_depth)
-    self._attempted_trees = variables.Variable(
+    self._attempted_trees = variables.VariableV1(
         initial_value=array_ops.zeros([], dtypes.int64),
         trainable=False,
         name="attempted_trees")
-    self._finalized_trees = variables.Variable(
+    self._finalized_trees = variables.VariableV1(
         initial_value=array_ops.zeros([], dtypes.int64),
         trainable=False,
         name="finalized_trees")
@@ -770,28 +770,28 @@ class GradientBoostedDecisionTreeModel(object):
         fc_name_idx += 1
 
       # Create ensemble stats variables.
-      num_layer_examples = variables.Variable(
+      num_layer_examples = variables.VariableV1(
           initial_value=array_ops.zeros([], dtypes.int64),
           name="num_layer_examples",
           trainable=False)
-      num_layer_steps = variables.Variable(
+      num_layer_steps = variables.VariableV1(
           initial_value=array_ops.zeros([], dtypes.int64),
           name="num_layer_steps",
           trainable=False)
-      num_layers = variables.Variable(
+      num_layers = variables.VariableV1(
           initial_value=array_ops.zeros([], dtypes.int64),
           name="num_layers",
           trainable=False)
-      active_tree = variables.Variable(
+      active_tree = variables.VariableV1(
           initial_value=array_ops.zeros([], dtypes.int64),
           name="active_tree",
           trainable=False)
-      active_layer = variables.Variable(
+      active_layer = variables.VariableV1(
           initial_value=array_ops.zeros([], dtypes.int64),
           name="active_layer",
           trainable=False)
       # Variable that becomes false once bias centering is done.
-      continue_centering = variables.Variable(
+      continue_centering = variables.VariableV1(
           initial_value=self._center_bias,
           name="continue_centering",
           trainable=False)
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 9d9941f696..6d20a2e7f4 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -239,7 +239,7 @@ class GbdtTest(test_util.TensorFlowTestCase):
       predictions = array_ops.constant(
           [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
       partition_ids = array_ops.zeros([4], dtypes.int32)
-      ensemble_stamp = variables.Variable(
+      ensemble_stamp = variables.VariableV1(
           initial_value=0,
           name="ensemble_stamp",
           trainable=False,
@@ -503,7 +503,7 @@ class GbdtTest(test_util.TensorFlowTestCase):
       predictions = array_ops.constant(
           [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
       partition_ids = array_ops.zeros([4], dtypes.int32)
-      ensemble_stamp = variables.Variable(
+      ensemble_stamp = variables.VariableV1(
           initial_value=0,
           name="ensemble_stamp",
           trainable=False,
@@ -607,7 +607,7 @@ class GbdtTest(test_util.TensorFlowTestCase):
       predictions = array_ops.constant(
           [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
       partition_ids = array_ops.zeros([4], dtypes.int32)
-      ensemble_stamp = variables.Variable(
+      ensemble_stamp = variables.VariableV1(
           initial_value=0,
           name="ensemble_stamp",
           trainable=False,
@@ -711,7 +711,7 @@ class GbdtTest(test_util.TensorFlowTestCase):
       predictions = array_ops.constant(
           [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
       partition_ids = array_ops.zeros([4], dtypes.int32)
-      ensemble_stamp = variables.Variable(
+      ensemble_stamp = variables.VariableV1(
           initial_value=0,
           name="ensemble_stamp",
           trainable=False,
@@ -783,7 +783,7 @@ class GbdtTest(test_util.TensorFlowTestCase):
       predictions = array_ops.constant(
           [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
       partition_ids = array_ops.zeros([4], dtypes.int32)
-      ensemble_stamp = variables.Variable(
+      ensemble_stamp = variables.VariableV1(
           initial_value=0,
           name="ensemble_stamp",
           trainable=False,
@@ -847,7 +847,7 @@ class GbdtTest(test_util.TensorFlowTestCase):
       predictions = array_ops.constant(
           [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
       partition_ids = array_ops.zeros([4], dtypes.int32)
-      ensemble_stamp = variables.Variable(
+      ensemble_stamp = variables.VariableV1(
           initial_value=0,
           name="ensemble_stamp",
           trainable=False,
@@ -1090,7 +1090,7 @@ class GbdtTest(test_util.TensorFlowTestCase):
       weights = array_ops.ones([batch_size, 1], dtypes.float32)
 
       partition_ids = array_ops.zeros([batch_size], dtypes.int32)
-      ensemble_stamp = variables.Variable(
+      ensemble_stamp = variables.VariableV1(
           initial_value=0,
           name="ensemble_stamp",
           trainable=False,
@@ -1194,7 +1194,7 @@ class GbdtTest(test_util.TensorFlowTestCase):
       weights = array_ops.ones([batch_size, 1], dtypes.float32)
 
       partition_ids = array_ops.zeros([batch_size], dtypes.int32)
-      ensemble_stamp = variables.Variable(
+      ensemble_stamp = variables.VariableV1(
           initial_value=0,
           name="ensemble_stamp",
           trainable=False,
@@ -1299,7 +1299,7 @@ class GbdtTest(test_util.TensorFlowTestCase):
       weights = array_ops.ones([batch_size, 1], dtypes.float32)
 
       partition_ids = array_ops.zeros([batch_size], dtypes.int32)
-      ensemble_stamp = variables.Variable(
+      ensemble_stamp = variables.VariableV1(
           initial_value=0,
           name="ensemble_stamp",
           trainable=False,
@@ -1405,7 +1405,7 @@ class GbdtTest(test_util.TensorFlowTestCase):
       predictions = array_ops.constant(
           [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
       partition_ids = array_ops.zeros([4], dtypes.int32)
-      ensemble_stamp = variables.Variable(
+      ensemble_stamp = variables.VariableV1(
           initial_value=0,
           name="ensemble_stamp",
           trainable=False,
@@ -1524,7 +1524,7 @@ class GbdtTest(test_util.TensorFlowTestCase):
       predictions = array_ops.constant(
           [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
       partition_ids = array_ops.zeros([4], dtypes.int32)
-      ensemble_stamp = variables.Variable(
+      ensemble_stamp = variables.VariableV1(
           initial_value=0,
           name="ensemble_stamp",
           trainable=False,
@@ -1656,7 +1656,7 @@ class GbdtTest(test_util.TensorFlowTestCase):
       predictions = array_ops.constant(
           [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
       partition_ids = array_ops.zeros([4], dtypes.int32)
-      ensemble_stamp = variables.Variable(
+      ensemble_stamp = variables.VariableV1(
           initial_value=0,
           name="ensemble_stamp",
           trainable=False,
-- 
GitLab


From 301e3043e67493ce3777d2b36b43d0210f7b920c Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Fri, 28 Sep 2018 10:25:42 -0700
Subject: [PATCH 148/570] Disable auto_shard for MirroredStrategy by default.
 We will re-enable it when it is more robust.

PiperOrigin-RevId: 214956066
---
 tensorflow/contrib/distribute/README.md                  | 3 ++-
 .../contrib/distribute/python/mirrored_strategy.py       | 8 ++++++--
 tensorflow/contrib/distribute/python/values.py           | 9 ++++++---
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 91a27f97b7..2e025765e4 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -231,7 +231,8 @@ The same `input_fn` will be used for all workers if you use
 important to shuffle your dataset in your `input_fn`.
 
 `MirroredStrategy` will insert a `tf.dataset.Dataset.shard` call in you
-`input_fn`. As a result, each worker gets a fraction of your input data.
+`input_fn` if `auto_shard_dataset` is set to `True`. As a result, each worker
+gets a fraction of your input data.
 
 ### Performance Tips
 
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 504f45a695..93d42e09a2 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -347,6 +347,8 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
       set, the `configure` method will try to find the best one.
     prefetch_on_device: optional boolean to specify whether to prefetch input
       data to devices.
+    auto_shard_dataset: whether to auto-shard the dataset when there are
+      multiple workers.
   """
 
   def __init__(self,
@@ -354,11 +356,13 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
                num_gpus=None,
                num_gpus_per_worker=None,
                cross_tower_ops=None,
-               prefetch_on_device=None):
+               prefetch_on_device=None,
+               auto_shard_dataset=False):
     super(MirroredStrategy, self).__init__()
 
     self._cross_tower_ops = cross_tower_ops
     self._prefetch_on_device = prefetch_on_device
+    self._auto_shard_dataset = auto_shard_dataset
     # Rememeber num GPUs which might be needed by `configure` method.
     if num_gpus is not None and num_gpus_per_worker is not None:
       raise ValueError(
@@ -477,7 +481,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
     if self._cluster_spec:
       return values.MultiWorkerDataset(
           partial(self._call_dataset_fn, dataset_fn), self._worker_device_map,
-          self._prefetch_on_device)
+          self._prefetch_on_device, self._auto_shard_dataset)
     else:
       return values.PerDeviceDataset(
           self._call_dataset_fn(dataset_fn),
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index cce41e7717..327775a729 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -814,7 +814,8 @@ class MultiWorkerDataset(object):
   eager mode.
   """
 
-  def __init__(self, dataset_fn, worker_device_map, prefetch_on_device=None):
+  def __init__(self, dataset_fn, worker_device_map, prefetch_on_device=None,
+               auto_shard=False):
     """Initialize the MultiWorkerDataset object.
 
     Args:
@@ -822,6 +823,7 @@ class MultiWorkerDataset(object):
       worker_device_map: a dict mapping from each worker to a list of devices
         that belong to this worker.
       prefetch_on_device: whether to prefetch to devices.
+      auto_shard: whether to auto-shard the dataset.
     """
     self._worker_device_map = worker_device_map
     self._datasets = {}
@@ -831,8 +833,9 @@ class MultiWorkerDataset(object):
         six.iteritems(worker_device_map)):
       with ops.device(worker):
         worker_input = dataset_fn()
-        worker_input = input_ops.auto_shard_dataset(
-            worker_input, len(worker_device_map), i)
+        if auto_shard:
+          worker_input = input_ops.auto_shard_dataset(
+              worker_input, len(worker_device_map), i)
         self._datasets[worker] = PerDeviceDataset(
             worker_input,
             worker_devices,
-- 
GitLab


From 0e926947421cc47546efb7f7e2dd8505fbe0ac45 Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Fri, 28 Sep 2018 10:56:02 -0700
Subject: [PATCH 149/570] [tf.data] Throws appropriate error while trying to
 checkpoint input pipeline with associated stats_aggregator.

PiperOrigin-RevId: 214961678
---
 .../serialization/stats_dataset_serialization_test.py | 11 +++++++++++
 .../core/kernels/data/stats_aggregator_dataset_op.cc  | 10 ++++------
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/stats_dataset_serialization_test.py b/tensorflow/contrib/data/python/kernel_tests/serialization/stats_dataset_serialization_test.py
index 14cd3e9c4a..a10f85263a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/stats_dataset_serialization_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/serialization/stats_dataset_serialization_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import stats_ops
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
@@ -90,6 +91,16 @@ class StatsDatasetSerializationTest(
         lambda: self._build_dataset_multiple_tags(num_outputs, tag1, tag2),
         None, num_outputs)
 
+  def _build_dataset_stats_aggregator(self):
+    stats_aggregator = stats_ops.StatsAggregator()
+    return dataset_ops.Dataset.range(10).apply(
+        stats_ops.set_stats_aggregator(stats_aggregator))
+
+  def test_set_stats_aggregator_not_support_checkpointing(self):
+    with self.assertRaisesRegexp(errors.UnimplementedError,
+                                 "does not support checkpointing"):
+      self.run_core_tests(self._build_dataset_stats_aggregator, None, 10)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
index 7e528a71be..c8abfb9eb5 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
@@ -118,16 +118,14 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
 
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
-        mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        return Status::OK();
+        return errors::Unimplemented(dataset()->DebugString(),
+                                     " does not support checkpointing");
       }
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
-        mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        return Status::OK();
+        return errors::Unimplemented(dataset()->DebugString(),
+                                     " does not support checkpointing");
       }
 
      private:
-- 
GitLab


From d644fa0542a5a9995512674c7ac708468941fe28 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Fri, 28 Sep 2018 11:10:45 -0700
Subject: [PATCH 150/570] [tf.data] Referencing an internal issue.

PiperOrigin-RevId: 214964640
---
 tensorflow/core/kernels/data/map_and_batch_dataset_op.cc       | 2 ++
 tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc | 3 +++
 tensorflow/core/kernels/data/parallel_map_iterator.cc          | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index 2bbf4af664..b4c7f9e510 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -37,6 +37,8 @@ namespace {
 // See documentation in ../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
+// TODO(b/116852688): Make coordination between the performance model and this
+// transformation more robust.
 class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit MapAndBatchDatasetOp(OpKernelConstruction* ctx)
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 2e6e0465f7..2bb38bf0b9 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -1084,6 +1084,9 @@ REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDataset").Device(DEVICE_CPU),
 // The above design choices were made with automated optimizations in mind,
 // isolating the degree of parallelism as the single tunable knob of this
 // implementation.
+//
+// TODO(b/116852688): Make coordination between the performance model and this
+// transformation more robust.
 class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
  public:
   explicit ParallelInterleaveDatasetV2Op(OpKernelConstruction* ctx)
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index ee20249bfe..8393024c51 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -27,6 +27,8 @@ namespace tensorflow {
 namespace data {
 namespace {
 
+// TODO(b/116852688): Make coordination between the performance model and this
+// transformation more robust.
 class ParallelMapIterator : public DatasetBaseIterator {
  public:
   explicit ParallelMapIterator(
-- 
GitLab


From 1a834d3aa84ba47afe39c22fffd60d03ca151d30 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Fri, 28 Sep 2018 11:12:42 -0700
Subject: [PATCH 151/570] Remove @{} api_links and ban "@{}" from python and md
 files.

PiperOrigin-RevId: 214964988
---
 tensorflow/contrib/data/python/ops/optimization.py     |  2 +-
 .../python/collective_all_reduce_strategy.py           |  2 +-
 tensorflow/contrib/tpu/python/tpu/tpu_estimator.py     |  4 ++--
 tensorflow/python/distribute/distribute_coordinator.py |  4 ++--
 tensorflow/python/distribute/estimator_training.py     |  2 +-
 tensorflow/python/estimator/estimator.py               | 10 +++++-----
 tensorflow/python/ops/rnn_cell_impl.py                 |  2 +-
 7 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/tensorflow/contrib/data/python/ops/optimization.py b/tensorflow/contrib/data/python/ops/optimization.py
index 7f5ce97228..30348ede36 100644
--- a/tensorflow/contrib/data/python/ops/optimization.py
+++ b/tensorflow/contrib/data/python/ops/optimization.py
@@ -53,7 +53,7 @@ def model():
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
index c900b41e14..9809204f8f 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
@@ -216,7 +216,7 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     """Configures the object.
 
     Args:
-      session_config: a @{tf.ConfigProto}
+      session_config: a `tf.ConfigProto`
       cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
         cluster configurations.
       task_type: the current task type, such as "worker".
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 23c54511ca..764d85877a 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -231,7 +231,7 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
   `metric_fn` runs on CPU to generate metrics and `tensors` represents the
   `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
   To be precise, TPU evaluation expects a slightly different signature from the
-  @{tf.estimator.Estimator}. While `EstimatorSpec.eval_metric_ops` expects a
+  `tf.estimator.Estimator`. While `EstimatorSpec.eval_metric_ops` expects a
   dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
   The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
   `tensors` usually specify the model logits, which are transferred back from
@@ -254,7 +254,7 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
   sending tensors from TPU to CPU. To reduce the overhead, try reducing the
   size of the tensors. The `tensors` are concatenated along their major (batch)
   dimension, and so must be >= rank 1. The `host_call` is useful for writing
-  summaries with @{tf.contrib.summary.create_file_writer}.
+  summaries with `tf.contrib.summary.create_file_writer`.
   """
 
   def __new__(cls,
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
index bd3562f1ff..b9b77d4a5b 100644
--- a/tensorflow/python/distribute/distribute_coordinator.py
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -126,7 +126,7 @@ class _WorkerContext(object):
         replicated training.
       task_id: an integer indicating id of the corresponding task. It can be
         None if it is local training or in-graph replicated training.
-      session_config: an optional @{tf.ConfigProto} object.
+      session_config: an optional `tf.ConfigProto` object.
       rpc_layer: optional string specifying the RPC protocol for communication
         with worker masters. If None or empty, hosts in the `cluster_spec` will
         be used directly.
@@ -685,7 +685,7 @@ def run_distribute_coordinator(worker_fn,
       in a cluster. If not set or empty, fall back to local training.
     task_type: the current task type, optional if this is a client.
     task_id: the current task id, optional if this is a client.
-    session_config: an optional @{tf.ConfigProto} object which will be passed
+    session_config: an optional `tf.ConfigProto` object which will be passed
       to `strategy`'s `configure` method and used to create a session.
     rpc_layer: optional string, the protocol for RPC, e.g. "grpc".
 
diff --git a/tensorflow/python/distribute/estimator_training.py b/tensorflow/python/distribute/estimator_training.py
index 8daa34c885..0289689134 100644
--- a/tensorflow/python/distribute/estimator_training.py
+++ b/tensorflow/python/distribute/estimator_training.py
@@ -62,7 +62,7 @@ def _get_global_id(cluster_spec, task_type, task_id, chief_task_type):
 
   # Sort task names in cluster by "chief"/"master", "evaluator", "worker"
   # and "ps". More details can be found at the documentation of
-  # @{tf.estimator.RunConfig.global_id_in_cluster}.
+  # `tf.estimator.RunConfig.global_id_in_cluster`.
   task_type_ordered_list = []
   if chief_task_type in cluster_spec.jobs:
     task_type_ordered_list = [chief_task_type]
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 827b405e51..b933cedb99 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -144,7 +144,7 @@ class Estimator(object):
           * `labels`: This is the second item returned from the `input_fn`
                  passed to `train`, `evaluate`, and `predict`. This should be a
                  single `tf.Tensor` or `dict` of same (for multi-head models).
-                 If mode is @{tf.estimator.ModeKeys.PREDICT}, `labels=None` will
+                 If mode is `tf.estimator.ModeKeys.PREDICT`, `labels=None` will
                  be passed. If the `model_fn`'s signature does not accept
                  `mode`, the `model_fn` must still be able to handle
                  `labels=None`.
@@ -803,9 +803,9 @@ class Estimator(object):
     those features and labels, and restores the given checkpoint
     (or, lacking that, the most recent checkpoint) into the graph.
     Only one of the modes is used for saving variables to the `SavedModel`
-    (order of preference: @{tf.estimator.ModeKeys#TRAIN$TRAIN},
-    @{tf.estimator.ModeKeys#EVAL$EVAL}, then
-    @{tf.estimator.ModeKeys#PREDICT$PREDICT}), such that up to three
+    (order of preference: `tf.estimator.ModeKeys.TRAIN`,
+    `tf.estimator.ModeKeys.EVAL`, then
+    `tf.estimator.ModeKeys.PREDICT`), such that up to three
     `tf.MetaGraphDefs` are saved with a single set of variables in a single
     `SavedModel` directory.
 
@@ -1101,7 +1101,7 @@ class Estimator(object):
     """Creates the global step tensor in graph.
 
     The global step tensor must be an integer type with name 'global_step' and
-    be added to the collection @{tf.GraphKeys#GLOBAL_STEP$GLOBAL_STEP}.
+    be added to the collection `tf.GraphKeys.GLOBAL_STEP`.
 
     Args:
       graph: The graph in which to create the global step tensor.
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 43cca1a498..c2751e529a 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -611,7 +611,7 @@ class LSTMStateTuple(_LSTMStateTuple):
 # TODO(scottzhu): Stop exporting this class in TF 2.0.
 @tf_export("nn.rnn_cell.BasicLSTMCell")
 class BasicLSTMCell(LayerRNNCell):
-  """DEPRECATED: Please use @{tf.nn.rnn_cell.LSTMCell} instead.
+  """DEPRECATED: Please use `tf.nn.rnn_cell.LSTMCell` instead.
 
   Basic LSTM recurrent network cell.
 
-- 
GitLab


From 90aa10fcf5c80591b31988754e6221d6c2b8bbd0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 11:28:35 -0700
Subject: [PATCH 152/570] internal change only

PiperOrigin-RevId: 214967868
---
 tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index b498599962..8e6e9aa0cd 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -156,8 +156,7 @@ bool NewSession(const string& service_addr,
           channel_args));
   NewProfileSessionResponse new_session_response;
   TF_QCHECK_OK(FromGrpcStatus(
-      stub->NewSession(&context, new_session_request, &new_session_response)))
-      << new_session_response.error_message();
+      stub->NewSession(&context, new_session_request, &new_session_response)));
 
   std::cout << "Profile session succeed for host(s):"
             << str_util::Join(hostnames, ",") << std::endl;
-- 
GitLab


From e00954e8626c74b263b90527e0c020cfd64136b2 Mon Sep 17 00:00:00 2001
From: Jianwei Xie <xiejw@google.com>
Date: Fri, 28 Sep 2018 12:08:42 -0700
Subject: [PATCH 153/570] Puts the keras optimizer weights on device.

PiperOrigin-RevId: 214974535
---
 .../contrib/tpu/python/tpu/keras_support.py   | 11 +++-
 .../tpu/python/tpu/keras_tpu_variables.py     | 53 +++++++++++++++++++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index 956d0142a3..696656e840 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -959,7 +959,16 @@ class TPUFunction(object):
 
       # Compute our outfeed depending on the execution mode
       if is_training:
-        self._cloned_model._make_train_function()
+        if not isinstance(self._cloned_optimizer, keras_optimizers.TFOptimizer):
+          # For Keras optimizer, we try to place the variable weights on the TPU
+          # device. Keras creates optimizer variables (e.g. momentum values for
+          # the Momentum optimizer) when _make_train_function is invoked.
+          with keras_tpu_variables.replicated_variable_for_optimizer(
+              self._tpu_assignment.num_towers):
+            self._cloned_model._make_train_function()
+        else:
+          self._cloned_model._make_train_function()
+
         self._outfeed_spec = [
             tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
             for tensor in self._cloned_model.train_function.outputs
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
index 170977d8ab..598da7418e 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
@@ -25,10 +25,15 @@ from __future__ import print_function
 
 import contextlib
 
+import numpy as np
+
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_resource_variable_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 
 
@@ -285,3 +290,51 @@ def replicated_scope(num_replicas):
 
   return variable_scope.variable_scope(
       "", custom_getter=_replicated_variable_getter)
+
+
+@contextlib.contextmanager
+def replicated_variable_for_optimizer(num_replicas):
+  """Context manager for optimizer weights. Overrides K.variable."""
+  if num_replicas == 1:
+    yield
+    return
+
+  try:
+    old_v = backend.variable
+
+    def opt_variable(value, dtype=None, name=None, constraint=None):
+      """Instantiates a variable and returns it."""
+      if dtype is None:
+        dtype = backend.floatx()
+
+      variables = []
+      for i in range(num_replicas):
+        # Keras holds the variables in optimizer class instance , so the name
+        # does not matter here. ResourceVariable constructor will find a unique
+        # name (including name=None) for each replica.
+        with ops.device("device:TPU:{}".format(i)):
+          v = resource_variable_ops.ResourceVariable(
+              value,
+              dtype=dtypes_module.as_dtype(dtype),
+              name=name,
+              constraint=constraint)
+          variables.append(v)
+      name = "replicate_{}_{}".format("variable" if name is None else name,
+                                      ops.uid())
+      v = ReplicatedVariable(name, variables)
+
+      # pylint: disable=protected-access
+
+      if isinstance(value, np.ndarray):
+        v._keras_shape = value.shape
+      elif hasattr(value, "shape"):
+        v._keras_shape = backend.int_shape(value)
+      v._uses_learning_phase = False
+      backend.track_variable(v)
+      return v
+
+    backend.variable = opt_variable
+    yield
+
+  finally:
+    backend.variable = old_v
-- 
GitLab


From b5feceb9058e06eac3de86ec45c44f5637054855 Mon Sep 17 00:00:00 2001
From: "Xiaoming (Jason) Cui" <xiaoming.cui@intel.com>
Date: Tue, 25 Sep 2018 00:42:42 -0700
Subject: [PATCH 154/570] Added the feature to disable MKL support of
 TensorFlow by environmental variable TF_DISABLE_MKL=1

---
 .../core/common_runtime/mkl_cpu_allocator.h   | 54 +++++++++++++------
 .../core/common_runtime/process_util.cc       |  5 ++
 .../core/common_runtime/threadpool_device.cc  |  4 ++
 tensorflow/core/graph/mkl_layout_pass.cc      |  5 ++
 .../core/graph/mkl_tfconversion_pass.cc       |  5 ++
 tensorflow/core/util/util.cc                  | 20 +++++++
 tensorflow/core/util/util.h                   |  5 ++
 7 files changed, 81 insertions(+), 17 deletions(-)

diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 429b19599b..516138d28d 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/util/util.h"
 #include "tensorflow/core/platform/numa.h"
 
 #ifndef INTEL_MKL_DNN_ONLY
@@ -163,6 +164,12 @@ class MklCPUAllocator : public Allocator {
   }
 
   Status Initialize() {
+    if (DisableMKL()) {
+        VLOG(1) << "TF-MKL: Disabling pool allocator";
+        tf_disable_pool_allocator_flag_ = true;
+        return Status::OK();
+    }
+
     VLOG(2) << "MklCPUAllocator: In MklCPUAllocator";
 
     // Set upper bound on memory allocation to physical RAM available on the
@@ -217,6 +224,10 @@ class MklCPUAllocator : public Allocator {
   inline string Name() override { return kName; }
 
   inline void* AllocateRaw(size_t alignment, size_t num_bytes) override {
+    if (tf_disable_pool_allocator_flag_) {
+      return port::AlignedMalloc(num_bytes, alignment);
+    }
+
     // If the allocation size is less than threshold, call small allocator,
     // otherwise call large-size allocator (BFC). We found that BFC allocator
     // does not deliver good performance for small allocations when
@@ -227,6 +238,10 @@ class MklCPUAllocator : public Allocator {
   }
 
   inline void DeallocateRaw(void* ptr) override {
+    if (tf_disable_pool_allocator_flag_) {
+      port::AlignedFree(ptr);
+      return;
+    }
     // Check if ptr is for "small" allocation. If it is, then call Free
     // directly. Otherwise, call BFC to handle free.
     if (small_size_allocator_->IsSmallSizeAllocation(ptr)) {
@@ -237,26 +252,30 @@ class MklCPUAllocator : public Allocator {
   }
 
   void GetStats(AllocatorStats* stats) override {
-    AllocatorStats l_stats, s_stats;
-    small_size_allocator_->GetStats(&s_stats);
-    large_size_allocator_->GetStats(&l_stats);
-
-    // Combine statistics from small-size and large-size allocator.
-    stats->num_allocs = l_stats.num_allocs + s_stats.num_allocs;
-    stats->bytes_in_use = l_stats.bytes_in_use + s_stats.bytes_in_use;
-    stats->max_bytes_in_use =
-        l_stats.max_bytes_in_use + s_stats.max_bytes_in_use;
-
-    // Since small-size allocations go to MklSmallSizeAllocator,
-    // max_alloc_size from large_size_allocator would be the maximum
-    // size allocated by MklCPUAllocator.
-    stats->max_alloc_size = l_stats.max_alloc_size;
-    stats->bytes_limit = std::max(s_stats.bytes_limit, l_stats.bytes_limit);
+    if (!tf_disable_pool_allocator_flag_) {
+      AllocatorStats l_stats, s_stats;
+      small_size_allocator_->GetStats(&s_stats);
+      large_size_allocator_->GetStats(&l_stats);
+
+      // Combine statistics from small-size and large-size allocator.
+      stats->num_allocs = l_stats.num_allocs + s_stats.num_allocs;
+      stats->bytes_in_use = l_stats.bytes_in_use + s_stats.bytes_in_use;
+      stats->max_bytes_in_use =
+          l_stats.max_bytes_in_use + s_stats.max_bytes_in_use;
+
+      // Since small-size allocations go to MklSmallSizeAllocator,
+      // max_alloc_size from large_size_allocator would be the maximum
+      // size allocated by MklCPUAllocator.
+      stats->max_alloc_size = l_stats.max_alloc_size;
+      stats->bytes_limit = std::max(s_stats.bytes_limit, l_stats.bytes_limit);
+    }
   }
 
   void ClearStats() override {
-    small_size_allocator_->ClearStats();
-    large_size_allocator_->ClearStats();
+    if (!tf_disable_pool_allocator_flag_) {
+      small_size_allocator_->ClearStats();
+      large_size_allocator_->ClearStats();
+    }
   }
 
  private:
@@ -295,6 +314,7 @@ class MklCPUAllocator : public Allocator {
   // The alignment that we need for the allocations
   static constexpr const size_t kAlignment = 64;
 
+  bool tf_disable_pool_allocator_flag_ = false;
   Allocator* large_size_allocator_;              // owned by this class
   MklSmallSizeAllocator* small_size_allocator_;  // owned by this class.
 
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index a5d31b75c7..60fa601907 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/util.h"
 
 namespace tensorflow {
 
@@ -56,6 +57,10 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
   const int32 inter_op = options.config.inter_op_parallelism_threads();
   if (inter_op != 0) return inter_op;
 #ifdef INTEL_MKL
+  // Early return if MKL is disabled
+  if (DisableMKL())
+    return port::NumSchedulableCPUs();
+
   // MKL library executes ops in parallel using OMP threads
   // Set inter_op conservatively to avoid thread oversubscription that could
   // lead to severe perf degradations and OMP resource exhaustion
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 8587d1783a..29c01d7f72 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/util/util.h"
 
 #ifdef INTEL_MKL
 #ifdef _OPENMP
@@ -49,6 +50,9 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
       allocator_(allocator),
       scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {
 #ifdef INTEL_MKL
+  // Eearly return when MKL is disabled
+  if (DisableMKL())
+    return;
 #ifdef _OPENMP
   const char* user_omp_threads = getenv("OMP_NUM_THREADS");
   if (user_omp_threads == nullptr) {
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 06d3fefef1..7394b1cddf 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/util/util.h"
 
 #include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/graph/mkl_layout_pass.h"
@@ -4511,6 +4512,10 @@ Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
   if (options.graph == nullptr && options.partition_graphs == nullptr) {
     return Status::OK();
   }
+  if (DisableMKL()) {
+    VLOG(2) << "TF-MKL: Disabling MKL";
+    return Status::OK();
+  }
 
   auto process_graph = [&](std::unique_ptr<Graph>* g) {
     // Get the ownership of a graph
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index 8c5ffd71a3..6804ab84ce 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/util.h"
 
 #include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/graph/mkl_tfconversion_pass.h"
@@ -424,6 +425,10 @@ Status MklToTfConversionPass::Run(const GraphOptimizationPassOptions& options) {
   if (options.graph == nullptr && options.partition_graphs == nullptr) {
     return Status::OK();
   }
+  if (DisableMKL()) {
+    VLOG(2) << "TF-MKL: Disabling MKL";
+    return Status::OK();
+  }
 
   auto process_graph = [&](std::unique_ptr<Graph>* g) {
     // Get the ownership of graph
diff --git a/tensorflow/core/util/util.cc b/tensorflow/core/util/util.cc
index 1e5a9c5712..44d5becb9c 100644
--- a/tensorflow/core/util/util.cc
+++ b/tensorflow/core/util/util.cc
@@ -120,4 +120,24 @@ string SliceDebugString(const TensorShape& shape, const int64 flat) {
   return result;
 }
 
+#ifdef INTEL_MKL
+bool DisableMKL() {
+  enum MklStatus {
+    MKL_DEFAULT = 0,
+    MKL_ON = 1,
+    MKL_OFF = 2
+  };
+  static MklStatus status = MKL_DEFAULT;
+  if (status == MKL_DEFAULT) {
+    char* tf_disable_mkl = getenv("TF_DISABLE_MKL");
+    if ((tf_disable_mkl != NULL) && (std::stoi(tf_disable_mkl) == 1)) {
+      VLOG(2) << "TF-MKL: Disabling MKL";
+      status = MKL_OFF;
+    } else {
+      status = MKL_ON;
+    }
+  }
+  return status == MKL_OFF ? true : false;
+}
+#endif
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/util.h b/tensorflow/core/util/util.h
index 93dfd51ab5..ba90ad52c2 100644
--- a/tensorflow/core/util/util.h
+++ b/tensorflow/core/util/util.h
@@ -56,6 +56,11 @@ string PrintMemory(const char* ptr, size_t n);
 // "tensor", "tensor[i]", "tensor[i, j]", etc.
 string SliceDebugString(const TensorShape& shape, const int64 flat);
 
+// disable MKL in runtime
+#ifdef INTEL_MKL
+bool DisableMKL();
+#endif
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_UTIL_UTIL_H_
-- 
GitLab


From 5e66d25666aad9fa76ed8cc0d2b162db76ea0cc8 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Fri, 28 Sep 2018 12:46:10 -0700
Subject: [PATCH 155/570] Add flag for enabling while_v2. Add a single test
 flag for enabling v2 control flow in tests since we do not plan to support v2
 ops with legacy control flow. We have 2 test decorators now:
 @with_control_flow_v2: Enables all tests in a class to run with v2 control
 flow. @disable_control_flow_v2: Disables a test function from running in v2.
 I have removed the skiptests to avoid setup/teardown overheads. Enable tests
 in control_flow_ops_py_test that run with control_flow_v2.

PiperOrigin-RevId: 214980108
---
 tensorflow/python/BUILD                       |   1 +
 tensorflow/python/framework/test_util.py      |  84 ++++++--
 tensorflow/python/kernel_tests/BUILD          |   3 +-
 .../kernel_tests/control_flow_ops_py_test.py  | 180 +++++++++---------
 tensorflow/python/ops/control_flow_ops.py     |  16 ++
 tensorflow/python/ops/while_v2.py             |   4 +
 6 files changed, 187 insertions(+), 101 deletions(-)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 91cafea042..9275ad767e 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2017,6 +2017,7 @@ py_library(
         ":array_ops",
         ":cond_v2_impl",
         ":constant_op",
+        ":control_flow_ops",
         ":control_flow_util",
         ":framework_ops",
         ":function_def_to_graph",
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index cd0b03be43..6673bc5561 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -24,8 +24,8 @@ from collections import OrderedDict
 import contextlib
 import gc
 import itertools
-import os
 import math
+import os
 import random
 import re
 import tempfile
@@ -402,11 +402,14 @@ def with_c_shapes(cls):
   return cls
 
 
-def enable_cond_v2(fn):
-  """Decorator for enabling CondV2 on a test.
+def enable_control_flow_v2(fn):
+  """Decorator for enabling CondV2 and WhileV2 on a test.
 
-  Note this enables using CondV2 after running the test class's setup/teardown
-  methods.
+  Note this enables using CondV2 and WhileV2 after running the test class's
+  setup/teardown methods.
+
+  In addition to this, callers must import the while_v2 module in order to set
+  the _while_v2 module in control_flow_ops.
 
   Args:
     fn: the function to be wrapped
@@ -416,21 +419,56 @@ def enable_cond_v2(fn):
   """
 
   def wrapper(*args, **kwargs):
-    prev_value = control_flow_ops.ENABLE_COND_V2
+    enable_cond_v2_old = control_flow_ops.ENABLE_COND_V2
+    enable_while_v2_old = control_flow_ops.ENABLE_WHILE_V2
     control_flow_ops.ENABLE_COND_V2 = True
+    control_flow_ops.ENABLE_WHILE_V2 = True
     try:
       fn(*args, **kwargs)
     finally:
-      control_flow_ops.ENABLE_COND_V2 = prev_value
+      control_flow_ops.ENABLE_COND_V2 = enable_cond_v2_old
+      control_flow_ops.ENABLE_WHILE_V2 = enable_while_v2_old
 
   return wrapper
 
 
-def with_cond_v2(cls):
-  """Adds methods that call original methods but with CondV2 enabled.
+def with_control_flow_v2(cls):
+  """Adds methods that call original methods with WhileV2 and CondV2 enabled.
 
-  Note this enables CondV2 in new methods after running the test class's
-  setup method.
+  Note this enables CondV2 and WhileV2 in new methods after running the test
+  class's setup method.
+
+  In addition to this, callers must import the while_v2 module in order to set
+  the _while_v2 module in control_flow_ops.
+
+  If a test function has _disable_control_flow_v2 attr set to True (using the
+  @disable_control_flow_v2 decorator), the v2 function is not generated for it.
+
+  Example:
+
+  @test_util.with_control_flow_v2
+  class ControlFlowTest(test.TestCase):
+
+    def testEnabledForV2(self):
+      ...
+
+    @test_util.disable_control_flow_v2("b/xyzabc")
+    def testDisabledForV2(self):
+      ...
+
+  Generated class:
+  class ControlFlowTest(test.TestCase):
+
+    def testEnabledForV2(self):
+      ...
+
+    def testEnabledForV2WithControlFlowV2(self):
+      // Enable V2 flags.
+      testEnabledForV2(self)
+      // Restore V2 flags.
+
+    def testDisabledForV2(self):
+      ...
 
   Args:
     cls: class to decorate
@@ -438,15 +476,33 @@ def with_cond_v2(cls):
   Returns:
     cls with new test methods added
   """
-  if control_flow_ops.ENABLE_COND_V2:
+  if control_flow_ops.ENABLE_WHILE_V2 and control_flow_ops.ENABLE_COND_V2:
     return cls
 
   for name, value in cls.__dict__.copy().items():
-    if callable(value) and name.startswith("test"):
-      setattr(cls, name + "WithCondV2", enable_cond_v2(value))
+    if (callable(value) and name.startswith("test") and
+        not getattr(value, "_disable_control_flow_v2", False)):
+      setattr(cls, name + "WithControlFlowV2", enable_control_flow_v2(value))
   return cls
 
 
+def disable_control_flow_v2(unused_msg):
+  """Decorator for a function in a with_control_flow_v2 enabled test class.
+
+  Blocks the function from being run with v2 control flow ops.
+
+  Args:
+    unused_msg: Reason for disabling.
+
+  Returns:
+    The wrapped function with _disable_control_flow_v2 attr set to True.
+  """
+  def wrapper(func):
+    func._disable_control_flow_v2 = True
+    return func
+  return wrapper
+
+
 def assert_no_new_pyobjects_executing_eagerly(f):
   """Decorator for asserting that no new Python objects persist after a test.
 
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 280c18ec00..65b9e04ed9 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1480,7 +1480,7 @@ cuda_py_test(
     name = "control_flow_ops_py_test",
     # TODO(b/70473603): change this back to "small" once the C API is
     # permanently enabled
-    size = "medium",
+    size = "large",
     srcs = ["control_flow_ops_py_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1512,6 +1512,7 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python:while_v2",
     ],
 )
 
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 083de84775..d91a848e01 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -23,7 +23,6 @@ from __future__ import print_function
 import collections
 import math
 import time
-import unittest
 
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -63,6 +62,7 @@ from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.ops import while_v2  # pylint: disable=unused-import
 # pylint: disable=unused-import
 import tensorflow.python.ops.tensor_array_grad
 # pylint: enable=unused-import
@@ -125,7 +125,7 @@ def isum(s, maximum_iterations=None):
   return r_s
 
 
-@test_util.with_cond_v2
+@test_util.with_control_flow_v2
 class ControlFlowTest(test.TestCase):
 
   def testRefIdentity(self):
@@ -332,10 +332,8 @@ class ControlFlowTest(test.TestCase):
       with self.assertRaisesOpError("has inputs from different frames"):
         res.eval(feed_dict={data: 1.0})
 
+  @test_util.disable_control_flow_v2("b/113294340")
   def testCondBool(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113296297")
-
     values = constant_op.constant(10)
     fn1 = lambda: math_ops.add(values, 1)
     fn2 = lambda: math_ops.subtract(values, 1)
@@ -366,6 +364,7 @@ class ControlFlowTest(test.TestCase):
                                          "has been marked as not fetchable"):
               sess.run(t, feed_dict={x: 3})
 
+  @test_util.disable_control_flow_v2("Not relevant")
   def testFeedable(self):
     with self.cached_session() as sess:
       c = constant_op.constant(2)
@@ -383,10 +382,8 @@ class ControlFlowTest(test.TestCase):
             with self.assertRaisesRegexp(ValueError, "may not be fed"):
               sess.run(r, feed_dict={t: 3})
 
+  @test_util.disable_control_flow_v2("b/113296180 (IndexedSlices)")
   def testCondIndexedSlices(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113296180")
-
     with self.cached_session():
       values = constant_op.constant(10)
       indices = constant_op.constant(0)
@@ -401,10 +398,8 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
 
+  @test_util.disable_control_flow_v2("b/113296161 (SparseTensors)")
   def testCondSparseTensor(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113296161 (SparseTensors)")
-
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
       indices = constant_op.constant(
@@ -435,10 +430,8 @@ class ControlFlowTest(test.TestCase):
 
       self.assertEqual(1.0, control_flow_ops.cond(rv, case, lambda: t).eval())
 
+  @test_util.disable_control_flow_v2("b/113293074")
   def testCondIndexedSlicesDifferentTypes(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113293074")
-
     with self.cached_session():
       values = constant_op.constant(10)
       i_32 = ops.convert_to_tensor(0, name="one", dtype=dtypes.int32)
@@ -510,10 +503,8 @@ class ControlFlowTest(test.TestCase):
       result = r.eval()
     self.assertAllEqual(12, result)
 
+  @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
   def testCond_4(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113324949 (ref vars)")
-
     with self.cached_session():
       v1 = variables.Variable(7)
       v2 = variables.Variable(7)
@@ -587,10 +578,8 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(constant_op.constant(False), true_fn, false_fn)
       self.assertAllEqual([2.0], r.eval())
 
+  @test_util.disable_control_flow_v2("b/79881896 (control deps)")
   def testCondWithControl(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/79881896")
-
     with self.cached_session():
       control_holder = array_ops.placeholder(dtypes.float32, shape=())
       a = constant_op.constant(3)
@@ -629,10 +618,9 @@ class ControlFlowTest(test.TestCase):
       merged_op = control_flow_ops.merge([assign_v, orig_v])
       self.assertAllEqual([1.0], sess.run(merged_op.output))
 
+  @test_util.disable_control_flow_v2(
+      "b/112477618 (Operation returned from cond)")
   def testCondSwitchIdentity(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/112477618 (Operation returned from cond)")
-
     # Make sure the recv identity is not removed by optimization.
     with session.Session(config=opt_cfg()) as sess:
       pred = constant_op.constant(True)
@@ -646,10 +634,9 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
       sess.run(r)
 
+  @test_util.disable_control_flow_v2(
+      "b/112477618 (Operation returned from cond)")
   def testCondRecvIdentity(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/112477618 (Operation returned from cond)")
-
     # Make sure the switch identity is not removed by optimization.
     with session.Session(config=opt_cfg()) as sess:
       with ops.device(test.gpu_device_name()):
@@ -665,10 +652,8 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
       sess.run(r)
 
+  @test_util.disable_control_flow_v2("b/113346829 (gpu failure)")
   def testCondGrad_1(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113346829 (gpu failure)")
-
     graph = ops.Graph()
     with graph.as_default():
       x = constant_op.constant(10.0, name="x")
@@ -694,10 +679,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(42.0, grad.eval(feed_dict={c: 1}))
       self.assertAllEqual(3.0, grad.eval(feed_dict={c: 3}))
 
+  @test_util.disable_control_flow_v2(
+      "b/110550782 (gradient w.r.t external variable)")
   def testCondGrad_3(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/110550782 (gradient w.r.t external variable)")
-
     with self.cached_session():
       c = array_ops.placeholder(dtypes.int32, shape=[])
       ox = constant_op.constant(10.0)
@@ -729,10 +713,8 @@ class ControlFlowTest(test.TestCase):
       result = gradients_impl.gradients(z, x)[0]
       self.assertEqual(1.0, result.eval())
 
+  @test_util.disable_control_flow_v2("b/113327884")
   def testCondGrad_Gather(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113327884")
-
     with self.cached_session() as sess:
       v1 = variables.Variable([1.0, 42.0])
       c = array_ops.placeholder(dtypes.int32, shape=[])
@@ -756,6 +738,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(dense_gv, [0.0, 2.0])
 
   # Microbenchmark: 256,000 iterations/s.
+  @test_util.disable_control_flow_v2("b/116630618 (Times out)")
   def testWhile_1(self):
     with self.cached_session():
       n = constant_op.constant(0)
@@ -764,6 +747,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
       self.assertEqual(10000, r.eval())
 
+  @test_util.disable_control_flow_v2("b/79881896 (control deps)")
   def testWhileExternalControlDependencies(self):
     with self.cached_session():
       v = variables.Variable(0.0)
@@ -779,6 +763,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(result.eval(), 2)
       self.assertAllEqual(v.eval(), 1.0)
 
+  @test_util.disable_control_flow_v2("b/79881896 (control deps)")
   def testWhileExternalControlDependenciesNoInput(self):
     with self.cached_session():
       v = variables.Variable(0.0)
@@ -794,6 +779,7 @@ class ControlFlowTest(test.TestCase):
       result.eval()
       self.assertAllEqual(v.eval(), 1.0)
 
+  @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileWithRefs_1(self):
     with self.cached_session() as sess:
       x = variables.VariableV1(0)._ref()  # pylint: disable=protected-access
@@ -824,18 +810,22 @@ class ControlFlowTest(test.TestCase):
       r = isum(s)
       self.assertAllEqual(45, r.eval())
 
+  @test_util.disable_control_flow_v2("b/115776323 (max_iters)")
   def testWhileWithMaximumIterations(self):
     with self.cached_session():
       s = constant_op.constant([1, 2, 3, 4, 5])
       r = isum(s, maximum_iterations=3)
       self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], r.eval())
 
+  @test_util.disable_control_flow_v2("b/116339888 (non-tensor loop var)")
   def testWhileWithMaximumIterationsAndSingleArgument(self):
     with self.cached_session():
       r = control_flow_ops.while_loop(
           lambda i: i < 3, lambda i: i + 1, [0], maximum_iterations=1)
       self.assertEqual(1, r.eval())
 
+  @test_util.disable_control_flow_v2(
+      "b/116248044 (nested), b/115920078 (gradients)")
   def testSingleNestedMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -861,6 +851,7 @@ class ControlFlowTest(test.TestCase):
     # Should execute without issue.
     self.assertEqual(3, self.evaluate(loop_execute))
 
+  @test_util.disable_control_flow_v2("b/116248044 (nested while_loop)")
   def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -904,10 +895,8 @@ class ControlFlowTest(test.TestCase):
         r"context '.*' \(currently defined in '.*'\)"):
       _ = gradients_impl.gradients(loop_with_maxiter, v)
 
+  @test_util.disable_control_flow_v2("b/115776323 (max_iters)")
   def testInvalidMaximumIterationsFromSiblingContextWhileLoopInXLAContext(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113294340 (enable while_v2)")
-
     v = constant_op.constant(1.0)
 
     def create_while_loop():
@@ -939,6 +928,8 @@ class ControlFlowTest(test.TestCase):
         r"while loop context '' \(currently defined in 'cond/.+'\)"):
       _ = gradients_impl.gradients(loop, v)
 
+  @test_util.disable_control_flow_v2(
+      "b/116248044 (nesting), b/115776323 (max_iters)")
   def testNestedWhileLoopWithMaxItersFromOuterContextInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -1048,6 +1039,7 @@ class ControlFlowTest(test.TestCase):
       result = r[3].eval()
     self.assertAllEqual(42, result)
 
+  @test_util.disable_control_flow_v2("b/116283162 (shape_invariants)")
   def testWhile_5(self):
     with self.cached_session():
 
@@ -1072,6 +1064,7 @@ class ControlFlowTest(test.TestCase):
       result = r[2].eval()
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
+  @test_util.disable_control_flow_v2("b/116338794 (buffer_reuse)")
   def testBufferForwarding(self):
     run_options = config_pb2.RunOptions(
         trace_level=config_pb2.RunOptions.FULL_TRACE)
@@ -1122,6 +1115,7 @@ class ControlFlowTest(test.TestCase):
     self._testWhile_Gpu_1(use_gpu=False)
     self._testWhile_Gpu_1(use_gpu=True)
 
+  @test_util.disable_control_flow_v2("b/116283162 (shape_invariants)")
   def testWhileShape(self):
     with self.cached_session():
       i = constant_op.constant(0)
@@ -1139,6 +1133,7 @@ class ControlFlowTest(test.TestCase):
       r = r[1] * array_ops.ones([8, 8])
       self.assertAllEqual(np.ones((8, 8)), r.eval())
 
+  @test_util.disable_control_flow_v2("b/116339888 (non-tensor loop var)")
   def testWhileWithNonTensorInput_Scalar(self):
     with self.cached_session():
       n = 0
@@ -1147,6 +1142,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
       self.assertEqual(10000, r.eval())
 
+  @test_util.disable_control_flow_v2("b/116339888 (non-tensor loop var)")
   def testWhileWithNonTensorInput_Vector(self):
     with self.cached_session():
       n = np.array([0])  # Note, [0] would not work here; that is a list
@@ -1155,6 +1151,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
       self.assertEqual([10000], r.eval())
 
+  @test_util.disable_control_flow_v2("b/116283162 (shape_invariants)")
   def testWhileShapeInference(self):
     with self.cached_session():
       i = constant_op.constant(0)
@@ -1169,7 +1166,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(
           c, b, [i, m],
           [i.get_shape(), tensor_shape.TensorShape([None, 2])])
-      self.assertTrue(r[1].get_shape()[0].value is None)
+      self.assertIsNone(r[1].get_shape()[0].value)
       self.assertEqual(r[1].get_shape()[1], tensor_shape.Dimension(2))
 
       with self.assertRaisesRegexp(
@@ -1180,6 +1177,7 @@ class ControlFlowTest(test.TestCase):
           r"tf.while_loop to specify a less-specific shape."):
         r = control_flow_ops.while_loop(c, b, [i, m])
 
+  @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
   def testWhileShapeInferenceSparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -1211,6 +1209,7 @@ class ControlFlowTest(test.TestCase):
             c, b, [i, x],
             [i.get_shape(), tensor_shape.TensorShape([5])])
 
+  @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
   def testWhileShapeInferenceIndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([[2.0, 4.0], [3.0, 5.0]], name="values")
@@ -1265,6 +1264,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [n])
       self.assertEqual(225, r.eval())
 
+  @test_util.disable_control_flow_v2("b/116248044 (nested while)")
   def testNestedWhile_1(self):
     self._testNestedWhile_1(use_gpu=False)
     self._testNestedWhile_1(use_gpu=True)
@@ -1297,6 +1297,7 @@ class ControlFlowTest(test.TestCase):
           outer_c, outer_b, [s0], parallel_iterations=1)
       self.assertEqual(1048576.0, r.eval())
 
+  @test_util.disable_control_flow_v2("b/116248044 (nested while)")
   def testNestedWhile_2(self):
     self._testNestedWhile_2(use_gpu=False)
     self._testNestedWhile_2(use_gpu=True)
@@ -1350,6 +1351,7 @@ class ControlFlowTest(test.TestCase):
             lambda x: x < 10, lambda x: x + array_ops.identity(c), [x0])
       self.assertEqual(10, sess.run(r, {b: True}))
 
+  @test_util.disable_control_flow_v2("b/79881896 (control_deps)")
   def testWhileWithControl_5(self):
     with self.cached_session() as sess:
       b = array_ops.placeholder(dtypes.bool)
@@ -1364,9 +1366,6 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(10, sess.run(r, {b: True}))
 
   def testWhileCondWithControl(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113294377 (unknown shape)")
-
     # Ensure that no control edges by an outer control dependency context are
     # added to nodes inside cond/while contexts.
     with self.cached_session() as sess:
@@ -1380,10 +1379,8 @@ class ControlFlowTest(test.TestCase):
                                            (constant_op.constant(5),))
       self.assertEqual(0, sess.run(loop))
 
+  @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
   def testWhileCondWithControl_1(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113324949 (ref vars)")
-
     with self.cached_session():
       v = variable_scope.get_variable(
           "v", [], initializer=init_ops.constant_initializer(2))
@@ -1405,9 +1402,8 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(4, r.eval())
       self.assertAllClose(65536.0, v.eval())
 
+  @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
   def testWhileCondExitControl(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113294340 (enable while_v2)")
 
     with self.cached_session():
       v = variables.Variable(1)
@@ -1432,8 +1428,6 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(99, v.eval())
 
   def testCondWhile_1(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113294340 (enable while_v2)")
 
     with self.cached_session():
       n = ops.convert_to_tensor(0, name="n")
@@ -1445,8 +1439,6 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(10, r.eval())
 
   def testCondWhile_2(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113294340 (enable while_v2)")
 
     with self.cached_session():
       n = ops.convert_to_tensor(0)
@@ -1458,9 +1450,6 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(10, r.eval())
 
   def _testCondWhile_3(self, use_gpu):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113294340 (enable while_v2)")
-
     with self.test_session(use_gpu=use_gpu) as sess:
       p = array_ops.placeholder(dtypes.bool)
       n = constant_op.constant(0.0)
@@ -1477,18 +1466,17 @@ class ControlFlowTest(test.TestCase):
                                 lambda: control_flow_ops.while_loop(c, b, [n]),
                                 lambda: math_ops.multiply(n, 2.0))
       r1 = gradients_impl.gradients(r, [n])
-      self.assertEqual(10, sess.run(r, {p: True}))
+      self.assertEqual(10., sess.run(r, {p: True}))
       self.assertEqual([1.0], sess.run(r1, {p: True}))
       self.assertEqual(0.0, sess.run(r, {p: False}))
       self.assertEqual([2.0], sess.run(r1, {p: False}))
 
+  @test_util.disable_control_flow_v2("b/116743589")
   def testCondWhile_3(self):
     self._testCondWhile_3(use_gpu=False)
     self._testCondWhile_3(use_gpu=True)
 
   def testWhileCond_1(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113294377 (unknown shape)")
 
     with self.cached_session():
       i = ops.convert_to_tensor(0, name="i")
@@ -1505,8 +1493,6 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(10, r.eval())
 
   def testWhileCond_2(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113294377 (unknown shape)")
 
     with self.cached_session():
       n = ops.convert_to_tensor(0, name="n")
@@ -1516,8 +1502,6 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(10, r.eval())
 
   def testWhileCond_3(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113294377 (unknown shape)")
 
     with self.cached_session():
       n = ops.convert_to_tensor(0)
@@ -1532,6 +1516,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(10, r.eval())
 
   # NOTE: It is ok to have parallel_iterations > 1
+  @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileUpdateVariable_1(self):
     with self.cached_session():
       select = variables.Variable([3.0, 4.0, 5.0])
@@ -1554,6 +1539,7 @@ class ControlFlowTest(test.TestCase):
       result = select.eval()
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
+  @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileUpdateVariable_2(self):
     with self.cached_session():
       select1 = variables.Variable([3.0, 4.0, 5.0])
@@ -1580,6 +1566,7 @@ class ControlFlowTest(test.TestCase):
       result2 = select2.eval()
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result2)
 
+  @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileUpdateVariable_3(self):
     with self.cached_session():
       select = variables.Variable([3.0, 4.0, 5.0])
@@ -1601,7 +1588,7 @@ class ControlFlowTest(test.TestCase):
       result = r[1].eval()
     self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
-  # b/24814703
+  @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileUpdateVariable_4(self):
     with self.cached_session():
       var_a = variables.Variable(0, name="a")
@@ -1629,7 +1616,7 @@ class ControlFlowTest(test.TestCase):
       lpa.eval()  # Run the loop
       self.assertEqual(10, var_b.eval())
 
-  # b/24736492
+  @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileUpdateVariable_5(self):
     with self.cached_session():
       # Create some variables.
@@ -1659,7 +1646,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(10, var_a.eval())
       self.assertEqual(10, var_b.eval())
 
-  # b/24814668
+  @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileUpdateVariable_6(self):
     with self.cached_session():
       # Create some variables.
@@ -1689,6 +1676,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(55, var_b.eval())
       self.assertEqual(10, var_a.eval())
 
+  @test_util.disable_control_flow_v2("b/116742472 (resource accumulator)")
   def testWhileQueue_1(self):
     with self.cached_session():
       q = data_flow_ops.FIFOQueue(-1, dtypes.int32)
@@ -1707,6 +1695,7 @@ class ControlFlowTest(test.TestCase):
       for i in xrange(10):
         self.assertEqual([i], q.dequeue().eval())
 
+  @test_util.disable_control_flow_v2("b/116283162 (shape_invariants)")
   def testWhileStack_1(self):
     with self.cached_session():
       s = gen_data_flow_ops.stack_v2(-1, dtypes.int32, stack_name="foo")
@@ -1775,6 +1764,7 @@ class ControlFlowTest(test.TestCase):
     with self.session(graph=graph) as sess:
       self.assertAllClose(1024.0, sess.run(r))
 
+  @test_util.disable_control_flow_v2("b/116351701 (colocation)")
   def testWhileGrad_ColocateGradients(self):
     self._testWhileGrad_ColocateGradients(colocate=False)
     self._testWhileGrad_ColocateGradients(colocate=True)
@@ -1790,6 +1780,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllClose(1024.0, r.eval())
 
+  @test_util.disable_control_flow_v2("b/116283162 (shape_invariants)")
   def testWhileGrad_Shape(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32, shape=[None])
@@ -1861,8 +1852,6 @@ class ControlFlowTest(test.TestCase):
     self._testWhileGrad_Mul(use_gpu=True, p_iters=10)
 
   def _testNestedWhileCondWhileGrad(self, use_gpu):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113294377 (unknown shape)")
 
     with self.test_session(use_gpu=use_gpu):
       v = constant_op.constant(1.0)
@@ -1885,10 +1874,12 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllClose(512.0, r.eval())
 
+  @test_util.disable_control_flow_v2("b/116248044 (nested while)")
   def testNestedWhileCondWhileGrad(self):
     self._testNestedWhileCondWhileGrad(use_gpu=False)
     self._testNestedWhileCondWhileGrad(use_gpu=True)
 
+  @test_util.disable_control_flow_v2("b/116823782")
   def testWhileGrad_Variable(self):
     with self.cached_session():
       a = variables.Variable(3.0)
@@ -1902,8 +1893,6 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(216.0, r[0].eval())
 
   def testWhileGradInCond(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/110550782 (gradient w.r.t external variable)")
 
     with self.cached_session():
       n = ops.convert_to_tensor(1.0, name="n")
@@ -1919,6 +1908,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(math_ops.less(1, 2), fn1, lambda: x)
       self.assertAllClose(9.0, r.eval(feed_dict={x: 1.0}))
 
+  @test_util.disable_control_flow_v2("b/116340060")
   def testGradInWhileWrtInitialLoopVal(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32, shape=(), name="x")
@@ -1936,6 +1926,7 @@ class ControlFlowTest(test.TestCase):
           "loop invariants or wrt the input parameters to the loop body."):
         control_flow_ops.while_loop(lambda i, x: i < 3, body, [0, y])
 
+  @test_util.disable_control_flow_v2("b/116248044 (nested while)")
   def testWhileGradInWhile(self):
     with self.cached_session():
       n = ops.convert_to_tensor(1.0, name="n")
@@ -1952,9 +1943,8 @@ class ControlFlowTest(test.TestCase):
                                       [tensor_shape.unknown_shape()])
       self.assertAllClose(9.0, r.eval(feed_dict={x: 1.0}))
 
+  @test_util.disable_control_flow_v2("b/116248044 (nested while)")
   def testCondGradInNestedWhiles(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113346829 (gpu failure)")
 
     def outer_body(i, x):
       _, x = control_flow_ops.while_loop(
@@ -1972,6 +1962,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(i_val, 3)
       self.assertAllClose(x_val, 1.0)
 
+  @test_util.disable_control_flow_v2("b/116255781 (flat_args)")
   def testWhile_NestedInput(self):
     with self.cached_session() as sess:
       named = collections.namedtuple("named", ("a", "b"))
@@ -1999,6 +1990,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual([100.0, 1.0, 102.0, 3.0, 4.0 + 100 * 2.0],
                        sess.run(r_flattened))
 
+  @test_util.disable_control_flow_v2("b/116255781(flat_args)")
   def testWhile_NestedBadArityFails(self):
     with self.cached_session():
       named = collections.namedtuple("named", ("a", "b"))
@@ -2057,6 +2049,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients([rx], x)
       self.assertAllClose(1024.0, r[0].eval())
 
+  @test_util.disable_control_flow_v2("b/116355153 (back_prop flag)")
   def testWhileGrad_NoGradient(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -2067,6 +2060,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)
       self.assertAllClose(1.0, r[0].eval())
 
+  @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileGrad_NoDependency(self):
     with self.cached_session() as sess:
       variable = variables.Variable(array_ops.ones([2, 3]))
@@ -2180,10 +2174,12 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllClose(8.0, r.eval())
 
+  @test_util.disable_control_flow_v2("b/116248044 (nested)")
   def testNestedWhileGrad_Simple(self):
     self._testNestedWhileGrad_Simple(use_gpu=False)
     self._testNestedWhileGrad_Simple(use_gpu=True)
 
+  @test_util.disable_control_flow_v2("b/116248044 (nested)")
   def testNestedWhileGrad_SerialInner(self):
     with self.cached_session():
       v = constant_op.constant(1.0)
@@ -2207,6 +2203,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllClose(256.0, r.eval())
 
+  @test_util.disable_control_flow_v2("b/116248044 (nested)")
   def testNestedWhileGrad_ParallelInner(self):
     with self.cached_session():
       v = constant_op.constant(1.0)
@@ -2230,6 +2227,8 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllClose(512.0, r.eval())
 
+  @test_util.disable_control_flow_v2(
+      "Nested loops and TensorArrays not supported")
   def testNestedWhileGrad_ParallelIterations(self):
     # Make sure the stack pushes and pops of an inner loop are executed in
     # the sequential order of the iterations of its outer loop.
@@ -2268,13 +2267,12 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllClose(1024.0, r.eval())
 
+  @test_util.disable_control_flow_v2("b/116272044 (cond_in_while)")
   def testWhileCondGrad_Simple(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113294377 (unknown shape)")
-
     self._testWhileCondGrad_Simple(use_gpu=False)
     self._testWhileCondGrad_Simple(use_gpu=True)
 
+  @test_util.disable_control_flow_v2("b/116272044 (cond_in_while)")
   def testWhileCondGrad_UnknownShape(self):
     with self.cached_session() as sess:
       v = array_ops.placeholder(dtypes.float32)
@@ -2292,6 +2290,7 @@ class ControlFlowTest(test.TestCase):
       r = sess.run(r, feed_dict={v: 2.0})
       self.assertAllClose(1024.0, r)
 
+  @test_util.disable_control_flow_v2("b/116283162 (shape_invariants)")
   def testWhileGrad_Concat(self):
     with self.cached_session() as sess:
       x = variable_scope.get_variable("x", initializer=[[1., 2.]])
@@ -2315,6 +2314,7 @@ class ControlFlowTest(test.TestCase):
       sess.run(op)
       self.assertAllClose([[0.98000002, 1.98000002]], sess.run(x))
 
+  @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileWithRefsWithGradients_1(self):
     with self.cached_session() as sess:
       x = variables.VariableV1(0.)._ref()  # pylint: disable=protected-access
@@ -2343,6 +2343,7 @@ class ControlFlowTest(test.TestCase):
     self.assertEqual(0, value_x)
     self.assertEqual(73, value_x_grad)
 
+  @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
   def testWhileGrad_IndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -2364,6 +2365,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r.values, values)[0]
       self.assertAllClose(np.array([1024.0, 1024.0]), r.eval())
 
+  @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
   def testWhileGrad_SparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -2386,6 +2388,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r.values, values)[0]
       self.assertAllClose(np.array([1024.0, 1024.0]), r.eval())
 
+  @test_util.disable_control_flow_v2("b/115920078 (gradients)")
   def testCallGradInLoop(self):
     with self.cached_session() as sess:
       i0 = constant_op.constant(0)
@@ -2405,6 +2408,8 @@ class ControlFlowTest(test.TestCase):
           c, b, [i0, constant_op.constant(0.0)])
       self.assertAllClose(600.0, sess.run(output_grad)[1])
 
+  @test_util.disable_control_flow_v2(
+      "b/116255781 (flat_args), b/115660901 (TensorArray)")
   def testWhileAndTensorArray(self):
     with self.cached_session() as sess:
       param = constant_op.constant(2.0)
@@ -2509,6 +2514,7 @@ class ControlFlowTest(test.TestCase):
       all_ops = x.graph.get_operations()
       self.assertFalse(any([name in op.name for op in all_ops]))
 
+  @test_util.disable_control_flow_v2("b/116255781 (flat args)")
   def testWhileGradGradFail(self):
     theta = variables.Variable(initial_value=1.)
 
@@ -2538,6 +2544,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, y)[0]
       self.assertEqual(388.0, r.eval())
 
+  @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileGradientWithNontrainablePath1(self):
     q = variables.Variable([7., 8.])
 
@@ -2555,6 +2562,7 @@ class ControlFlowTest(test.TestCase):
       sess.run(q.initializer)
       self.assertAllClose([0., 0.], sess.run(dy_dq))
 
+  @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileGradientWithNontrainablePath2(self):
     q = variables.Variable([7., 8.])
 
@@ -2572,6 +2580,7 @@ class ControlFlowTest(test.TestCase):
       sess.run(q.initializer)
       self.assertAllClose([1., 1.], sess.run(dy_dq))
 
+  @test_util.disable_control_flow_v2("b/115920078 (gradients)")
   def testIssue16504(self):
     c = constant_op.constant(np.arange(100), dtype=dtypes.float32)
     w = variables.Variable(
@@ -2595,6 +2604,7 @@ class ControlFlowTest(test.TestCase):
     grad, = gradients_impl.gradients(w, c)
     self.assertIsNotNone(grad)
 
+  @test_util.disable_control_flow_v2("b/116270461 (resource)")
   def testStopGradMultiFlows(self):
     with self.cached_session():
 
@@ -2653,10 +2663,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(4.0, i.eval(feed_dict={d: 1}))
       self.assertAllClose(2.0 * math.sqrt(2), i.eval(feed_dict={d: 2}))
 
+  @test_util.disable_control_flow_v2(
+      "b/112477618 (Operation returned from cond)")
   def testCase(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/112477618 (Operation returned from cond)")
-
     with self.cached_session():
       x = constant_op.constant(1)
       y = constant_op.constant(2)
@@ -2708,10 +2717,9 @@ class ControlFlowTest(test.TestCase):
 
       self.assertAllEqual(r6.eval(), 0)
 
+  @test_util.disable_control_flow_v2(
+      "b/112477618 (Operation returned from cond)")
   def testCaseSideEffects(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/112477618 (Operation returned from cond)")
-
     with self.cached_session() as sess:
       v0 = variables.Variable(-1)
       v1 = variables.Variable(-1)
@@ -2746,10 +2754,8 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(0, r0.eval())
       self.assertAllEqual(sess.run([v0, v1, v2]), [0, -1, -1])
 
+  @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
   def testOneOpCond(self):
-    if control_flow_ops.ENABLE_COND_V2:
-      return unittest.skip("b/113324949 (ref vars)")
-
     with self.cached_session():
       v = variables.Variable(0)
       c = ops.convert_to_tensor(0)
@@ -3031,9 +3037,11 @@ class ControlFlowTest(test.TestCase):
 
       r = gradients_impl.gradients(r, x)[0]
       self.assertEqual(r.eval(), 524288.0)
-      self.assertEqual(
-          len([op for op in x.graph.get_operations() if op.type == "StackV2"]),
-          1)
+      # while_v2 does not have stacks.
+      if not control_flow_ops.ENABLE_WHILE_V2:
+        self.assertEqual(
+            len([op for op in x.graph.get_operations() if op.type == "StackV2"
+                ]), 1)
 
 
 class ControlFlowContextCheckTest(test.TestCase):
@@ -3393,7 +3401,7 @@ class WhileOpBenchmark(test.Benchmark):
         name="unroll_same_device", iters=iters, wall_time=duration)
 
 
-@test_util.with_cond_v2
+@test_util.with_control_flow_v2
 class EagerTest(test.TestCase):
 
   def testCond(self):
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 87f8bd85a5..9d7d31df22 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -60,8 +60,17 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.tf_export import tf_export
 
+# The while_v2 module.
+_while_v2 = None
 
 ENABLE_COND_V2 = os.getenv("TF_ENABLE_COND_V2", "0") != "0"
+# Note: Setting this to True is not sufficient to switch to the v2 while_loop.
+# Users must also import the while_v2 module to set the _while_v2 module
+# variable above. We do this to avoid a circular dependency:
+# control_flow_ops -> while_v2 -> gradients_impl -> control_flow_ops
+# A ValueError is raised in tf.while_loop if this is set to True and the
+# `_while_v2` module is not set.
+ENABLE_WHILE_V2 = os.getenv("TF_ENABLE_WHILE_V2", "0") != "0"
 
 
 # We override the 'tuple' for a control flow op, so we keep python's
@@ -3211,6 +3220,13 @@ def while_loop(cond,
   ```
 
   """
+  if ENABLE_WHILE_V2 and not context.executing_eagerly():
+    if not _while_v2:
+      raise ValueError("The while_v2 module is not set. Did you forget to "
+                       "import tensorflow.python.ops."
+                       "while_v2?")
+    return _while_v2.while_loop(cond, body, loop_vars, name)
+
   with ops.name_scope(name, "while", loop_vars):
     if not loop_vars:
       raise ValueError("No loop variables provided")
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index 875be31602..6791e1cd61 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -24,6 +24,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import sys
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import function
@@ -33,6 +34,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import cond_v2_impl as cond_v2
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gradients_impl
@@ -41,6 +43,8 @@ from tensorflow.python.util import nest
 
 # pylint: disable=protected-access
 
+control_flow_ops._while_v2 = sys.modules[__name__]
+
 # TODO(b/79881896): Handle external control dependencies. tf.while_loop allows
 # control dependencies on external nodes with at least 1 output.
 # Another idea is to create const nodes outside the loop and add control edges
-- 
GitLab


From 6d02ee8e581bf5211f362b80175122e3782fb37a Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Fri, 28 Sep 2018 12:49:38 -0700
Subject: [PATCH 156/570] Simplify batch_dot logic

Remove dead logical branch.

PiperOrigin-RevId: 214980627
---
 tensorflow/python/keras/backend.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 4589c821e5..584facc859 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -1511,12 +1511,8 @@ def batch_dot(x, y, axes=None):
       out = math_ops.reduce_sum(
           math_ops.multiply(array_ops.transpose(x, [1, 0]), y), axes[1])
   else:
-    if axes is not None:
-      adj_x = None if axes[0] == ndim(x) - 1 else True
-      adj_y = True if axes[1] == ndim(y) - 1 else None
-    else:
-      adj_x = None
-      adj_y = None
+    adj_x = None if axes[0] == ndim(x) - 1 else True
+    adj_y = True if axes[1] == ndim(y) - 1 else None
     out = math_ops.matmul(x, y, adjoint_a=adj_x, adjoint_b=adj_y)
   if diff:
     if x_ndim > y_ndim:
-- 
GitLab


From 00c503b85c2d4b6ab44305e94d66237925eed6bf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 13:25:02 -0700
Subject: [PATCH 157/570] Cleanup

PiperOrigin-RevId: 214985873
---
 tensorflow/contrib/decision_trees/proto/BUILD | 1 -
 tensorflow/contrib/training/BUILD             | 1 -
 2 files changed, 2 deletions(-)

diff --git a/tensorflow/contrib/decision_trees/proto/BUILD b/tensorflow/contrib/decision_trees/proto/BUILD
index 3b50a48336..06940a90d5 100644
--- a/tensorflow/contrib/decision_trees/proto/BUILD
+++ b/tensorflow/contrib/decision_trees/proto/BUILD
@@ -17,7 +17,6 @@ tf_proto_library(
     name = "generic_tree_model",
     srcs = ["generic_tree_model.proto"],
     cc_api_version = 2,
-    java_api_version = 2,
     visibility = ["//visibility:public"],
 )
 
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index ddf8365d61..b565ebd073 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -313,6 +313,5 @@ tf_proto_library(
     name = "protos_all",
     srcs = glob(["**/*.proto"]),
     cc_api_version = 2,
-    java_api_version = 2,
     visibility = ["//visibility:public"],
 )
-- 
GitLab


From c30e729f8f830ea2da46eaa7a5354395c5119def Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 13:27:41 -0700
Subject: [PATCH 158/570] Internal change

PiperOrigin-RevId: 214986255
---
 tensorflow/contrib/lite/java/aar_with_jni.bzl | 53 ++++++++++---------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/tensorflow/contrib/lite/java/aar_with_jni.bzl b/tensorflow/contrib/lite/java/aar_with_jni.bzl
index db837cf29e..9d2aead266 100644
--- a/tensorflow/contrib/lite/java/aar_with_jni.bzl
+++ b/tensorflow/contrib/lite/java/aar_with_jni.bzl
@@ -3,12 +3,12 @@
 load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
 
 def aar_with_jni(name, android_library):
-  # Generate dummy AndroidManifest.xml for dummy apk usage
-  # (dummy apk is generated by <name>_dummy_app_for_so target below)
-  native.genrule(
-      name = name + "_binary_manifest_generator",
-      outs = [name + "_generated_AndroidManifest.xml"],
-      cmd = """
+    # Generate dummy AndroidManifest.xml for dummy apk usage
+    # (dummy apk is generated by <name>_dummy_app_for_so target below)
+    native.genrule(
+        name = name + "_binary_manifest_generator",
+        outs = [name + "_generated_AndroidManifest.xml"],
+        cmd = """
 cat > $(OUTS) <<EOF
 <manifest
   xmlns:android="http://schemas.android.com/apk/res/android"
@@ -17,27 +17,28 @@ cat > $(OUTS) <<EOF
 </manifest>
 EOF
 """,
-  )
+    )
 
-  # Generate dummy apk including .so files and later we extract out
-  # .so files and throw away the apk.
-  android_binary(
-      name = name + "_dummy_app_for_so",
-      manifest = name + "_generated_AndroidManifest.xml",
-      custom_package = "dummy.package.for.so",
-      deps = [android_library],
-      # In some platforms we don't have an Android SDK/NDK and this target
-      # can't be built. We need to prevent the build system from trying to
-      # use the target in that case.
-      tags = ["manual"],
-  )
+    # Generate dummy apk including .so files and later we extract out
+    # .so files and throw away the apk.
+    android_binary(
+        name = name + "_dummy_app_for_so",
+        aapt_version = "aapt",
+        manifest = name + "_generated_AndroidManifest.xml",
+        custom_package = "dummy.package.for.so",
+        deps = [android_library],
+        # In some platforms we don't have an Android SDK/NDK and this target
+        # can't be built. We need to prevent the build system from trying to
+        # use the target in that case.
+        tags = ["manual"],
+    )
 
-  native.genrule(
-      name = name,
-      srcs = [android_library + ".aar", name + "_dummy_app_for_so_unsigned.apk"],
-      outs = [name + ".aar"],
-      tags = ["manual"],
-      cmd = """
+    native.genrule(
+        name = name,
+        srcs = [android_library + ".aar", name + "_dummy_app_for_so_unsigned.apk"],
+        outs = [name + ".aar"],
+        tags = ["manual"],
+        cmd = """
 cp $(location {}.aar) $(location :{}.aar)
 chmod +w $(location :{}.aar)
 origdir=$$PWD
@@ -46,4 +47,4 @@ unzip $$origdir/$(location :{}_dummy_app_for_so_unsigned.apk) "lib/*"
 cp -r lib jni
 zip -r $$origdir/$(location :{}.aar) jni/*/*.so
 """.format(android_library, name, name, name, name),
-  )
+    )
-- 
GitLab


From 19143aa0e2ac3cdf0d6826e7e1d00cd864080394 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 13:31:01 -0700
Subject: [PATCH 159/570] Internal change.

PiperOrigin-RevId: 214986756
---
 tensorflow/core/kernels/BUILD        |  4 +++-
 tensorflow/python/kernel_tests/BUILD | 22 +++++++++++++++-------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 30171708c1..9439ab332c 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -1197,8 +1197,10 @@ tf_cc_test(
 
 tf_cc_test(
     name = "example_parsing_ops_test",
-    size = "large",
+    size = "medium",
     srcs = ["example_parsing_ops_test.cc"],
+    shard_count = 4,
+    tags = ["optonly"],
     deps = [
         ":example_parsing_ops",
         ":ops_testutil",
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 65b9e04ed9..9490746fd9 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2359,7 +2359,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "transpose_op_test",
-    size = "large",
+    size = "medium",
     srcs = ["transpose_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2367,10 +2367,11 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
-    shard_count = 2,
+    shard_count = 4,
     tags = [
         "no_gpu",
         "no_oss",
+        "optonly",  # times out
     ],
 )
 
@@ -2489,6 +2490,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    shard_count = 2,
     tags = [
         "optonly",  # flaky timeouts unless optimized
     ],
@@ -2509,7 +2511,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "conv_ops_test",
-    size = "large",
+    size = "medium",
     srcs = ["conv_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2528,6 +2530,9 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 4,
+    tags = [
+        "optonly",  # times out
+    ],
 )
 
 cuda_py_test(
@@ -2587,7 +2592,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "fft_ops_test",
-    size = "large",
+    size = "medium",
     srcs = ["fft_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2597,7 +2602,8 @@ cuda_py_test(
         "//tensorflow/python:spectral_ops",
         "//tensorflow/python:spectral_ops_test_util",
     ],
-    shard_count = 3,
+    shard_count = 4,
+    tags = ["optonly"],
 )
 
 cuda_py_test(
@@ -2662,7 +2668,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "scatter_ops_test",
-    size = "large",  # NOTE: This is not run by default.
+    size = "medium",  # NOTE: This is not run by default.
     srcs = ["scatter_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2671,11 +2677,13 @@ cuda_py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
     ],
+    shard_count = 2,
+    tags = ["optonly"],
 )
 
 cuda_py_test(
     name = "slice_op_test",
-    size = "large",
+    size = "medium",
     srcs = ["slice_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
-- 
GitLab


From 64be2ecc07c698df05d88051ec42a0409d1a9863 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 13:36:18 -0700
Subject: [PATCH 160/570] Do not pass in the bazel default toolchain via
 extra_toolchains.

Without this the default toolchain is used for a subset of the build and the
tests do not actually run on GPUs.

This uncovered a setup problem in the Docker image that needed fixing.

PiperOrigin-RevId: 214987676
---
 .../Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04      |  2 +-
 third_party/gpus/crosstool/BUILD.tpl               | 14 ++++++++++++++
 third_party/toolchains/BUILD                       |  4 +---
 .../ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD          |  2 +-
 .../preconfig/ubuntu14.04/gcc-nvcc/BUILD           | 14 ++++++++++++++
 5 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
index a30858db82..dd8d705331 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
@@ -26,7 +26,7 @@ ENV NVIDIA_VISIBLE_DEVICES all
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=9.0"
 ENV NCCL_VERSION 2.2.13
-ENV CUDNN_VERSION 7.2.1.38
+ENV CUDNN_VERSION 7.1.4.18
 
 # TODO(b/110903506): /usr/loca/cuda/lib64/stubs should not be needed in
 # LD_LIBRARY_PATH. The stubs/libcuda.so is not meant to used at runtime. The
diff --git a/third_party/gpus/crosstool/BUILD.tpl b/third_party/gpus/crosstool/BUILD.tpl
index f638756d23..c8812fab33 100644
--- a/third_party/gpus/crosstool/BUILD.tpl
+++ b/third_party/gpus/crosstool/BUILD.tpl
@@ -2,6 +2,20 @@ licenses(["restricted"])
 
 package(default_visibility = ["//visibility:public"])
 
+toolchain(
+    name = "toolchain-linux-x86_64",
+    exec_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    toolchain = ":cc-compiler-local",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
 cc_toolchain_suite(
     name = "toolchain",
     toolchains = {
diff --git a/third_party/toolchains/BUILD b/third_party/toolchains/BUILD
index 7256a7d96e..bcbc4dda11 100644
--- a/third_party/toolchains/BUILD
+++ b/third_party/toolchains/BUILD
@@ -26,12 +26,10 @@ platform(
     constraint_values = [
         "@bazel_tools//platforms:x86_64",
         "@bazel_tools//platforms:linux",
-        "@bazel_tools//tools/cpp:clang",
-        "@bazel_toolchains//constraints:xenial",
     ],
     remote_execution_properties = """
         properties: {
             name: "container-image"
-            value:"docker://gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04@sha256:06b585f42eed3b2030e9566b8f88f48d7472fa0f47e59765bc115376c8801bdf"
+            value:"docker://gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04@sha256:e5099ff15650986e268a43ee99e2d2b7ffe2459b8b6935385078d1d3b2ed4d02"
         }""",
 )
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
index 2d3e41127d..05abcb56d8 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
@@ -1253,7 +1253,7 @@ genrule(
         "cuda/lib/libcupti.so.9.0",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.2.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.1.4" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
    """,
 )
 
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/BUILD
index a56b4513fb..6442e7628a 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/BUILD
@@ -2,6 +2,20 @@ licenses(["restricted"])
 
 package(default_visibility = ["//visibility:public"])
 
+toolchain(
+    name = "toolchain-linux-x86_64",
+    exec_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    toolchain = ":cc-compiler-local",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
 cc_toolchain_suite(
     name = "toolchain",
     toolchains = {
-- 
GitLab


From 1724d155f00b49bc817189247cbfb0df2092a9da Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Fri, 28 Sep 2018 13:50:12 -0700
Subject: [PATCH 161/570] Automated rollback of commit
 7f1d70d97f543d69a9f02cd6df0964f22f9278f3

PiperOrigin-RevId: 214989908
---
 tensorflow/contrib/distribute/python/BUILD    |  28 ++-
 .../distribute/python/metrics_v1_test.py      |   3 +-
 .../distribute/python/minimize_loss_test.py   |  26 +-
 .../distribute/python/mirrored_strategy.py    |   3 +-
 .../python/mirrored_strategy_multigpu_test.py |  12 +-
 .../contrib/distribute/python/monitor.py      |   1 -
 .../distribute/python/optimizer_v2_test.py    |   8 +-
 .../distribute/python/prefetching_ops_v2.py   | 232 ++++++++++++++++++
 .../python/prefetching_ops_v2_test.py         |  90 +++++++
 .../contrib/distribute/python/step_fn.py      |   7 +-
 .../contrib/distribute/python/step_fn_test.py |   1 -
 .../contrib/distribute/python/values.py       |  51 ++--
 .../contrib/distribute/python/values_test.py  |  23 +-
 13 files changed, 396 insertions(+), 89 deletions(-)
 create mode 100644 tensorflow/contrib/distribute/python/prefetching_ops_v2.py
 create mode 100644 tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index e329b964c4..422983dbef 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -22,6 +22,7 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":input_ops",
+        ":prefetching_ops_v2",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:device_util",
@@ -29,7 +30,6 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:multi_device_iterator_ops",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/training/checkpointable:base",
         "@six_archive//:six",
@@ -647,6 +647,32 @@ cuda_py_test(
     ],
 )
 
+py_library(
+    name = "prefetching_ops_v2",
+    srcs = ["prefetching_ops_v2.py"],
+    deps = [
+        "//tensorflow/contrib/data/python/ops:prefetching_ops",
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+cuda_py_test(
+    name = "prefetching_ops_v2_test",
+    srcs = ["prefetching_ops_v2_test.py"],
+    additional_deps = [
+        ":prefetching_ops_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
+
 py_library(
     name = "input_ops",
     srcs = ["input_ops.py"],
diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py
index f7773aff4f..8163494c8e 100644
--- a/tensorflow/contrib/distribute/python/metrics_v1_test.py
+++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py
@@ -86,11 +86,10 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
   def _test_metric(self, distribution, dataset_fn, metric_fn, expected_fn):
     with ops.Graph().as_default(), distribution.scope():
       iterator = distribution.distribute_dataset(
-          dataset_fn).make_initializable_iterator()
+          dataset_fn).make_one_shot_iterator()
       value, update = distribution.call_for_each_tower(
           metric_fn, iterator.get_next())
       update = distribution.group(update)
-      self.evaluate(iterator.initializer)
       self.evaluate(variables.local_variables_initializer())
       # TODO(josh11b): Once we switch to using a global batch size for input,
       # replace "distribution.num_towers" with "1".
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index d082d5c419..ba147e7824 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -41,14 +41,6 @@ from tensorflow.python.ops.losses import losses_impl
 
 class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
-  def _get_iterator(self, ds):
-    if context.executing_eagerly():
-      iterator = ds.make_one_shot_iterator()
-    else:
-      iterator = ds.make_initializable_iterator()
-      self.evaluate(iterator.initializer)
-    return iterator
-
   @combinations.generate(
       combinations.times(
           combinations.distributions_and_v1_optimizers(),
@@ -70,7 +62,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
             distribution.call_for_each_tower(
                 model_fn, *inputs, run_concurrently=layer.built))
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return distribution.run_steps_on_dataset(
@@ -106,7 +99,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return distribution.group(
@@ -165,7 +159,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
             distribution.call_for_each_tower(
                 model_fn, *inputs, run_concurrently=layer.built))
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return distribution.run_steps_on_dataset(
@@ -249,7 +244,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS)
         return control_flow_ops.group(fetches)
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return distribution.run_steps_on_dataset(
@@ -342,7 +338,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
             distribution.call_for_each_tower(
                 model_fn, x, y, run_concurrently=False))
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return distribution.run_steps_on_dataset(
@@ -435,7 +432,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
             output=loss)
         return distribution.group(train_op)
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         initial_loss = lambda: constant_op.constant(1e7)
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 93d42e09a2..4d7516063c 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -484,8 +484,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
           self._prefetch_on_device, self._auto_shard_dataset)
     else:
       return values.PerDeviceDataset(
-          self._call_dataset_fn(dataset_fn),
-          self._devices,
+          self._call_dataset_fn(dataset_fn), self._devices,
           self._prefetch_on_device)
 
   # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 04c712ce1d..f51e543624 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -300,15 +300,9 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     dist = mirrored_strategy.MirroredStrategy(
         ["/device:GPU:0", "/device:CPU:0"])
-    ds = dist.distribute_dataset(
-        lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10))
-    if context.executing_eagerly():
-      iterator = ds.make_one_shot_iterator()
-    else:
-      iterator = ds.make_initializable_iterator()
-      self.evaluate([iterator.initializer])
-
-    features = iterator.get_next()
+    features = dist.distribute_dataset(
+        lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)
+    ).make_one_shot_iterator().get_next()
 
     with dist.scope():
       result = dist.call_for_each_tower(
diff --git a/tensorflow/contrib/distribute/python/monitor.py b/tensorflow/contrib/distribute/python/monitor.py
index 17b7ab74f6..7644acedc9 100644
--- a/tensorflow/contrib/distribute/python/monitor.py
+++ b/tensorflow/contrib/distribute/python/monitor.py
@@ -51,7 +51,6 @@ class Monitor(object):
     else:
       if session is None:
         raise ValueError("Should provide a `session` in Graph mode.")
-      session.run(step_callable._iterator.initializer)  # pylint: disable=protected-access
       self._run_step = session.make_callable(step_callable())
       session.run(variables.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
index 3064433129..6e9ba37a19 100644
--- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -42,11 +42,8 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      ds = distribution.distribute_dataset(dataset_fn)
-      if context.executing_eagerly():
-        iterator = ds.make_one_shot_iterator()
-      else:
-        iterator = ds.make_initializable_iterator()
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return control_flow_ops.group(distribution.unwrap(
@@ -55,7 +52,6 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
 
       if not context.executing_eagerly():
         with self.cached_session() as sess:
-          sess.run(iterator.initializer)
           run_step = sess.make_callable(run_step())
         self.evaluate(variables.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
new file mode 100644
index 0000000000..8d949943b7
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
@@ -0,0 +1,232 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Extension of prefetching_ops to support more than one device."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.contrib.data.python.ops import prefetching_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.util import nest as data_nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
+from tensorflow.python.util import nest
+
+
+# pylint: disable=protected-access
+class _PrefetchToDeviceIterator(object):
+  """A replacement for `tf.data.Iterator` that prefetches to another device.
+
+  Args:
+    input_dataset: The input dataset.
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    devices: Devices on which to prefetch.
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be shared
+      under the given name across multiple sessions that share the same devices
+      (e.g. when using a remote server). Only used if one_shot is False.
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               one_shot,
+               devices,
+               buffer_size,
+               shared_name=None):
+    self._input_dataset = input_dataset
+    self._get_next_call_count = 0
+    self._one_shot = one_shot
+    if shared_name is None:
+      shared_name = ""
+    self._devices = devices
+
+    if self._one_shot:
+      self._input_iterator = input_dataset.make_one_shot_iterator()
+    else:
+      self._input_iterator = iterator_ops.Iterator.from_structure(
+          self._input_dataset.output_types, self._input_dataset.output_shapes,
+          shared_name, self._input_dataset.output_classes)
+    input_iterator_handle = self._input_iterator.string_handle()
+
+    @function.Defun(dtypes.string)
+    def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, self._input_iterator.output_types,
+          self._input_iterator.output_shapes,
+          self._input_iterator.output_classes)
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+
+    target_device = ged_ops.experimental_iterator_get_device(
+        self._input_iterator._iterator_resource)
+    self._buffering_resources = []
+    for device in nest.flatten(self._devices):
+      with ops.device(device):
+        buffer_resource_handle = prefetching_ops.function_buffering_resource(
+            f=_prefetch_fn,
+            output_types=data_nest.flatten(
+                sparse.as_dense_types(self._input_dataset.output_types,
+                                      self._input_dataset.output_classes)),
+            target_device=target_device,
+            string_arg=input_iterator_handle,
+            buffer_size=buffer_size,
+            shared_name=shared_name)
+        self._buffering_resources.append(buffer_resource_handle)
+
+    if not self._one_shot:
+      reset_ops = []
+      for buffer_resource in self._buffering_resources:
+        reset_ops.append(
+            ged_ops.experimental_function_buffering_resource_reset(
+                buffer_resource))
+      with ops.control_dependencies(reset_ops):
+        self._initializer = self._input_iterator.make_initializer(
+            self._input_dataset)
+
+  def get_next(self, name=None):
+    """See `tf.data.Iterator.get_next`."""
+    self._get_next_call_count += 1
+    if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
+      warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
+
+    flat_result = []
+    # TODO(priyag): This will fail if the input size (typically number of
+    # batches) is not divisible by number of devices.
+    # How do we handle that more gracefully / let the user know?
+    for buffer_resource in self._buffering_resources:
+      flat_ret = ged_ops.experimental_function_buffering_resource_get_next(
+          buffer_resource,
+          output_types=data_nest.flatten(
+              sparse.as_dense_types(self.output_types, self.output_classes)),
+          name=name)
+
+      ret = sparse.deserialize_sparse_tensors(
+          data_nest.pack_sequence_as(self.output_types, flat_ret),
+          self.output_types, self.output_shapes, self.output_classes)
+
+      for tensor, shape in zip(
+          data_nest.flatten(ret), data_nest.flatten(self.output_shapes)):
+        if isinstance(tensor, ops.Tensor):
+          tensor.set_shape(shape)
+      flat_result.append(ret)
+
+    return nest.pack_sequence_as(self._devices, flat_result)
+
+  @property
+  def initializer(self):
+    if self._one_shot:
+      raise NotImplementedError("Can't initialize a one_shot_iterator")
+    return self._initializer
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+# pylint: enable=protected-access
+
+
+class _PrefetchToDeviceDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` whose iterator prefetches elements to other device(s)."""
+
+  def __init__(self, input_dataset, devices, buffer_size):
+    super(_PrefetchToDeviceDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._devices = devices
+    self._buffer_size = buffer_size if buffer_size is not None else 1
+
+  def make_one_shot_iterator(self):
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=True,
+        devices=self._devices,
+        buffer_size=self._buffer_size)
+
+  def make_initializable_iterator(self, shared_name=None):
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "make_initializable_iterator is not supported when eager "
+          "execution is enabled.")
+
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=False,
+        devices=self._devices,
+        buffer_size=self._buffer_size,
+        shared_name=shared_name)
+
+  def _as_variant_tensor(self):
+    # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
+    # transformation methods is called.
+    # TODO(mrry): Investigate support for chaining further transformations after
+    # the prefetch, including GPU support.
+    raise NotImplementedError("`prefetch_to_devices()` must be the last "
+                              "transformation in a dataset pipeline.")
+
+  # TODO(priyag): Fix the output types, shapes and classes to match the result
+  # of get_next (which has the additional nesting layer of devices now).
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+
+def prefetch_to_devices(devices, buffer_size=None):
+  """A transformation that prefetches dataset values to the given `devices`.
+
+  NOTE: Although the transformation creates a `tf.data.Dataset`, the
+  transformation must be the final `Dataset` in the input pipeline.
+
+  Args:
+    devices: A nested structure of devices on which to prefetch the data. It can
+      be a single device name, or a tuple or list of device names.
+    buffer_size: (Optional.) The number of elements to buffer on each device.
+      Defaults to an automatically chosen value.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    return _PrefetchToDeviceDataset(dataset, devices, buffer_size)
+
+  return _apply_fn
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
new file mode 100644
index 0000000000..16799104e8
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
@@ -0,0 +1,90 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for prefetching_ops_v2."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import prefetching_ops_v2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class PrefetchingOpsV2Test(test.TestCase):
+
+  def testPrefetchToOneDevice(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops_v2.prefetch_to_devices("/gpu:0"))
+
+    iterator = device_dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToTwoDevicesInAList(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops_v2.prefetch_to_devices(["/cpu:0", "/gpu:0"]))
+
+    iterator = device_dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    output = []
+    # TODO(rohanj): Modify test to go till the end of the dataset when we
+    # switch to MultiDeviceIterator.
+    with self.cached_session() as sess:
+      for _ in range(4):
+        result = sess.run(next_element)
+        self.assertEqual(2, len(result))
+        output.extend(result)
+      self.assertEquals(set(range(8)), set(output))
+
+  def testPrefetchToTwoDevicesWithReinit(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops_v2.prefetch_to_devices(["/cpu:0", "/gpu:0"]))
+
+    iterator = device_dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    # TODO(rohanj): Modify test to go till the end of the dataset when we
+    # switch to MultiDeviceIterator.
+    with self.cached_session() as sess:
+      sess.run(iterator.initializer)
+      for _ in range(4):
+        sess.run(next_element)
+      sess.run(iterator.initializer)
+      for _ in range(4):
+        sess.run(next_element)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
index 23bf36184f..1b5a4f64e5 100644
--- a/tensorflow/contrib/distribute/python/step_fn.py
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import backprop
-from tensorflow.python.eager import context
 from tensorflow.python.training import optimizer as optimizer_lib
 
 
@@ -51,11 +50,7 @@ class StandardInputStep(Step):
   def __init__(self, dataset_fn, distribution):
     super(StandardInputStep, self).__init__(distribution)
     self._distributed_input = distribution.distribute_dataset(dataset_fn)
-    if context.executing_eagerly():
-      self._iterator = self._distributed_input.make_one_shot_iterator()
-    else:
-      # TODO(priyag): Expose initializer via some initializer property.
-      self._iterator = self._distributed_input.make_initializable_iterator()
+    self._iterator = self._distributed_input.make_one_shot_iterator()
 
 
 class StandardSingleLossStep(StandardInputStep):
diff --git a/tensorflow/contrib/distribute/python/step_fn_test.py b/tensorflow/contrib/distribute/python/step_fn_test.py
index 1ff9b9ceec..f1ada49fa3 100644
--- a/tensorflow/contrib/distribute/python/step_fn_test.py
+++ b/tensorflow/contrib/distribute/python/step_fn_test.py
@@ -50,7 +50,6 @@ class SingleLossStepTest(test.TestCase, parameterized.TestCase):
         run_step = single_loss_step
       else:
         with self.cached_session() as sess:
-          sess.run(single_loss_step._iterator.initializer)
           run_step = sess.make_callable(single_loss_step())
       self.evaluate(variables.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 327775a729..4955ded4d5 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -26,7 +26,7 @@ import weakref
 import six
 
 from tensorflow.contrib.distribute.python import input_ops
-from tensorflow.python.data.ops import multi_device_iterator_ops
+from tensorflow.contrib.distribute.python import prefetching_ops_v2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
@@ -683,7 +683,7 @@ class PerDeviceDataIterator(object):
   def get_next(self, name=None):
     """Scatter the input across devices."""
     if self._prefetch_on_device:
-      data_list = self._iterator.get_next()
+      data_list = self._iterator.get_next(name=name)
       index = dict(zip(self._devices, data_list))
     else:
       batch = self._iterator.get_next(name=name)
@@ -703,24 +703,21 @@ class PerDeviceDataIterator(object):
 class PerDeviceDataset(object):
   """Like `tf.data.Dataset` split devices, producing `PerDevice` data."""
 
-  def __init__(
-      self,
-      dataset,
-      devices,
-      prefetch_on_device=None,
-  ):
+  def __init__(self, dataset, devices, prefetch_on_device=None):
     self._devices = devices
 
     # Default to using prefetching in graph mode, unless specified.
-    # TODO(rohanj): Enable prefetching in eager mode.
+    # TODO(priyag): Enable prefetching in eager mode.
     self._prefetch_on_device = prefetch_on_device
     if self._prefetch_on_device is None:
       self._prefetch_on_device = not context.executing_eagerly()
     assert not (self._prefetch_on_device and context.executing_eagerly()), (
         "Prefetching is only supported in graph mode currently")
 
-    self._dataset = dataset
-    if not self._prefetch_on_device:
+    if self._prefetch_on_device:
+      self._dataset = dataset.apply(
+          prefetching_ops_v2.prefetch_to_devices(self._devices))
+    else:
       # TODO(priyag): If dropping remainder is not appropriate, find another
       # approach to distributing the dataset when not possible to divide evenly.
       # Possibly not an issue when we start using PartitionedDataset.
@@ -728,33 +725,15 @@ class PerDeviceDataset(object):
 
   def make_one_shot_iterator(self):
     """Get a one time use iterator for the distributed PerDeviceDataset."""
-    # Graph mode prefetching with one shot iterator is disabled.
-    if not context.executing_eagerly():
-      raise ValueError("Cannot create a one shot iterator. Please use "
-                       "`make_initializable_iterator()` instead.")
-    # Eager mode prefetching would error out in constructor. Only remaining
-    # cases are non-prefetching eager / graph mode. We delegate to
-    # PerDeviceDataIterator to handle them.
     dataset_iterator = self._dataset.make_one_shot_iterator()
-    return PerDeviceDataIterator(
-        dataset_iterator, self._devices, prefetch_on_device=False)
+    return PerDeviceDataIterator(dataset_iterator, self._devices,
+                                 self._prefetch_on_device)
 
   def make_initializable_iterator(self):
     """Get an initializable iterator for the distributed PerDeviceDataset."""
-    # Eager mode generates already initialized iterators. Hence we cannot create
-    # an initializable iterator.
-    if context.executing_eagerly():
-      raise ValueError("Cannot create initializable iterator in Eager mode. "
-                       "Please use `make_one_shot_iterator` instead.")
-    if self._prefetch_on_device:
-      dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
-          self._dataset, self._devices)
-    else:
-      dataset_iterator = self._dataset.make_initializable_iterator()
-    return PerDeviceDataIterator(
-        dataset_iterator,
-        self._devices,
-        prefetch_on_device=self._prefetch_on_device)
+    dataset_iterator = self._dataset.make_initializable_iterator()
+    return PerDeviceDataIterator(dataset_iterator, self._devices,
+                                 self._prefetch_on_device)
 
 
 class MultiWorkerDataIterator(object):
@@ -837,9 +816,7 @@ class MultiWorkerDataset(object):
           worker_input = input_ops.auto_shard_dataset(
               worker_input, len(worker_device_map), i)
         self._datasets[worker] = PerDeviceDataset(
-            worker_input,
-            worker_devices,
-            prefetch_on_device=prefetch_on_device)
+            worker_input, worker_devices, prefetch_on_device=prefetch_on_device)
 
   def make_one_shot_iterator(self):
     iterators = {}
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 002d61f46e..ae3e134333 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -349,11 +349,7 @@ class PerDeviceDatasetTest(test.TestCase):
   def _test_iterator_no_prefetch(self, devices, dataset, expected_values):
     per_device_dataset = values.PerDeviceDataset(
         dataset, devices, prefetch_on_device=False)
-    if context.executing_eagerly():
-      iterator = per_device_dataset.make_one_shot_iterator()
-    else:
-      iterator = per_device_dataset.make_initializable_iterator()
-      self.evaluate([iterator.initializer])
+    iterator = per_device_dataset.make_one_shot_iterator()
 
     for expected_value in expected_values:
       next_element = iterator.get_next()
@@ -370,14 +366,21 @@ class PerDeviceDatasetTest(test.TestCase):
     if not context.executing_eagerly():
       per_device_dataset = values.PerDeviceDataset(
           dataset, devices, prefetch_on_device=True)
-      iterator = per_device_dataset.make_initializable_iterator()
-      self.evaluate([iterator.initializer])
+      iterator = per_device_dataset.make_one_shot_iterator()
 
+      # With prefetching, we cannot guarantee which input ends up on which
+      # device, so we verify that the complete set seen on all devices is
+      # correct, and equal numbers are distributed to each device.
+      combined_actual = []
+      combined_expected = []
       for expected_value in expected_values:
         next_element = iterator.get_next()
-        computed_value = self.evaluate(
-            [values.select_device(d, next_element) for d in devices])
-        self.assertEqual(expected_value, computed_value)
+        combined_actual.extend(
+            self.evaluate(
+                [values.select_device(d, next_element) for d in devices]))
+        combined_expected.extend(expected_value)
+
+      self.assertEqual(set(combined_expected), set(combined_actual))
 
       with self.assertRaises(errors.OutOfRangeError):
         next_element = iterator.get_next()
-- 
GitLab


From f83da5b0aa37ba55c1b2eaa093e6d043b73f5982 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 14:08:25 -0700
Subject: [PATCH 162/570] Introduce the abstraction of RunHandler which each
 DirectSession can use for the duration of a single RunInternal() call from
 RunHandlerPool. It is used for running inter-op closures with a global
 scheduler (which in the future) to improve both median and tail latency (for
 use-cases like CPU inference). In the case that global pools aren't used,
 this change should be a no-op.

PiperOrigin-RevId: 214992852
---
 tensorflow/core/BUILD                         |  16 ++
 .../core/common_runtime/direct_session.cc     |  49 +++-
 .../core/common_runtime/direct_session.h      |   3 +
 .../common_runtime/direct_session_test.cc     |  28 ++
 tensorflow/core/framework/run_handler.cc      | 249 ++++++++++++++++++
 tensorflow/core/framework/run_handler.h       |  95 +++++++
 tensorflow/core/framework/run_handler_util.cc |  57 ++++
 tensorflow/core/framework/run_handler_util.h  |  43 +++
 .../core/framework/run_handler_util_test.cc   |  93 +++++++
 tensorflow/core/protobuf/config.proto         |   5 +
 ...ensorflow.-run-options.-experimental.pbtxt |   6 +
 .../golden/v1/tensorflow.-run-options.pbtxt   |   6 +
 ...ensorflow.-run-options.-experimental.pbtxt |   6 +
 .../golden/v2/tensorflow.-run-options.pbtxt   |   6 +
 14 files changed, 656 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/core/framework/run_handler.cc
 create mode 100644 tensorflow/core/framework/run_handler.h
 create mode 100644 tensorflow/core/framework/run_handler_util.cc
 create mode 100644 tensorflow/core/framework/run_handler_util.h
 create mode 100644 tensorflow/core/framework/run_handler_util_test.cc

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 50fe308b73..7da4b9fbd0 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2486,6 +2486,8 @@ FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [
     "framework/op_segment.h",
     "framework/rendezvous.h",  # only needed for tests
     "framework/resource_var.h",
+    "framework/run_handler.h",
+    "framework/run_handler_util.h",
     "framework/tensor_reference.h",
     "framework/tracking_allocator.h",  # only needed for tests
     "framework/unique_tensor_references.h",
@@ -2972,6 +2974,7 @@ tf_cuda_library(
         ":core_cpu_internal",
         ":device_tracer",
         ":framework",
+        ":framework_internal",
         ":graph",
         ":lib",
         ":lib_internal",
@@ -4119,6 +4122,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "framework_run_handler_util_test",
+    size = "small",
+    srcs = ["framework/run_handler_util_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":framework_internal",
+        ":lib",
+        ":test",
+        ":test_main",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "common_runtime_direct_session_test",
     size = "small",
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 841181f8c3..458e133b68 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/run_handler.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -244,6 +245,21 @@ void DirectSession::SchedClosure(thread::ThreadPool* pool,
 #endif  // __ANDROID__
 }
 
+static RunHandlerPool* GetOrCreateRunHandlerPool(
+    const SessionOptions& options) {
+  static RunHandlerPool* pool =
+      new RunHandlerPool(NumInterOpThreadsFromSessionOptions(options));
+  return pool;
+}
+
+bool DirectSession::ShouldUseRunHandlerPool() const {
+  if (options_.config.session_inter_op_thread_pool_size() > 0 ||
+      options_.config.use_per_session_threads()) {
+    return false;
+  }
+  return true;
+}
+
 DirectSession::DirectSession(const SessionOptions& options,
                              const DeviceMgr* device_mgr,
                              DirectSessionFactory* const factory)
@@ -582,16 +598,37 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
     }
   }
 
-  Executor::Args::Runner default_runner = [this,
-                                           pool](Executor::Args::Closure c) {
-    SchedClosure(pool, std::move(c));
-  };
+  std::unique_ptr<RunHandler> handler;
+  if (ShouldUseRunHandlerPool() &&
+      run_options.experimental().use_run_handler_pool()) {
+    // Non-null only when a global inter-op pool is used.
+    VLOG(1) << "Using RunHandler to scheduler inter-op closures.";
+    handler = GetOrCreateRunHandlerPool(options_)->Get();
+  }
+  auto* handler_ptr = handler.get();
+
+  Executor::Args::Runner default_runner = nullptr;
+
+  if (pool == nullptr) {
+    default_runner = [](Executor::Args::Closure c) { c(); };
+  } else if (handler_ptr != nullptr) {
+    default_runner = [handler_ptr](Executor::Args::Closure c) {
+      handler_ptr->ScheduleInterOpClosure(std::move(c));
+    };
+  } else {
+    default_runner = [this, pool](Executor::Args::Closure c) {
+      SchedClosure(pool, std::move(c));
+    };
+  }
+
   for (const auto& item : executors_and_keys->items) {
-    // TODO(zhengxq): support partial run.
-    // TODO(zhengxq): if the device picks its own threadpool, we need to assign
+    // TODO(azaks): support partial run.
+    // TODO(azaks): if the device picks its own threadpool, we need to assign
     //     less threads to the main compute pool by default.
     thread::ThreadPool* device_thread_pool =
         item.device->tensorflow_device_thread_pool();
+    // TODO(crk): Investigate usage of RunHandlerPool when using device specific
+    // thread pool(s).
     if (!device_thread_pool) {
       args.runner = default_runner;
     } else {
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 4a6a921ea7..3a168bbe3f 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -247,6 +247,9 @@ class DirectSession : public Session {
                                    ExecutorsAndKeys* executors_and_keys,
                                    RunMetadata* run_metadata);
 
+  // Returns whether inter-op execution uses a global pool.
+  bool ShouldUseRunHandlerPool() const;
+
   ::tensorflow::Status ExtendLocked(const GraphDef& graph)
       EXCLUSIVE_LOCKS_REQUIRED(graph_state_lock_);
 
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index 65e816c202..e3e431f800 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -625,6 +625,34 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetworkWithOpts_Callable) {
   EXPECT_EQ(run_metadata.step_stats().dev_stats_size(), 2);
 }
 
+TEST_F(DirectSessionMinusAXTest, UseRunHandlerPool) {
+  Initialize({3, 2, -1, 0});
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def_));
+  std::vector<std::pair<string, Tensor>> inputs;
+
+  // Request two targets: one fetch output and one non-fetched output.
+  std::vector<string> output_names = {y_ + ":0"};
+  std::vector<string> target_nodes = {y_neg_};
+  std::vector<Tensor> outputs;
+
+  // Prepares RunOptions and RunMetadata
+  RunOptions run_options;
+  run_options.mutable_experimental()->set_use_run_handler_pool(true);
+
+  Status s = session->Run(run_options, inputs, output_names, target_nodes,
+                          &outputs, nullptr);
+  TF_ASSERT_OK(s);
+
+  ASSERT_EQ(1, outputs.size());
+  // The first output should be initialized and have the correct
+  // output.
+  auto mat = outputs[0].matrix<float>();
+  ASSERT_TRUE(outputs[0].IsInitialized());
+  EXPECT_FLOAT_EQ(5.0, mat(0, 0));
+}
+
 TEST(DirectSessionTest, KeepsStateAcrossRunsOfSession) {
   GraphDef def;
   Graph g(OpRegistry::Global());
diff --git a/tensorflow/core/framework/run_handler.cc b/tensorflow/core/framework/run_handler.cc
new file mode 100644
index 0000000000..0c4007eafc
--- /dev/null
+++ b/tensorflow/core/framework/run_handler.cc
@@ -0,0 +1,249 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/run_handler.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/run_handler_util.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+
+// Contains the concrete implementation of the RunHandler.
+// Externally visible RunHandler class simply forwards the work to this one.
+class RunHandler::Impl {
+ public:
+  explicit Impl(RunHandlerPool::Impl* pool_impl) : pool_impl_(pool_impl) {
+    Reset();
+  }
+
+  ~Impl() {}
+
+  void set_inter_op_scheduling_range(std::uint_fast32_t start,
+                                     std::uint_fast32_t limit) {
+    inter_op_scheduling_range_.store(EncodePartition(start, limit),
+                                     std::memory_order_release);
+  }
+
+  std::uint_fast32_t inter_op_scheduling_range() const {
+    return inter_op_scheduling_range_.load(std::memory_order_acquire);
+  }
+
+  // Stores now time (in microseconds) since unix epoch when the handler is
+  // requested via RunHandlerPool::Get().
+  uint64 start_time_us() const { return start_time_us_; }
+
+  void ScheduleInterOpClosure(std::function<void()> fn);
+
+  void Reset();
+
+  RunHandlerPool::Impl* pool_impl() { return pool_impl_; }
+
+ private:
+  // Encoding/decoding logic for storing [start, limit) into a single
+  // uint_fast32_t int. We assume that pool_num_threads < (1 << 16).
+  const int kMaxPartitionBits = 16;
+  const int kMaxThreads = 1 << kMaxPartitionBits;
+
+  std::uint_fast32_t EncodePartition(std::uint_fast32_t start,
+                                     std::uint_fast32_t limit) {
+    return (start << kMaxPartitionBits) | limit;
+  }
+
+  void DecodePartition(std::uint_fast32_t val, std::uint_fast32_t* start,
+                       std::uint_fast32_t* limit) {
+    *limit = val & (kMaxThreads - 1);
+    val >>= kMaxPartitionBits;
+    *start = val;
+  }
+
+  std::atomic_uint_fast32_t inter_op_scheduling_range_;
+  RunHandlerPool::Impl* pool_impl_;  // NOT OWNED.
+  uint64 start_time_us_;
+};
+
+// Contains shared state across all run handlers present in the pool. Also
+// responsible for pool management decisions.
+// This class is thread safe.
+class RunHandlerPool::Impl {
+ public:
+  explicit Impl(int num_inter_op_threads)
+      : max_handlers_(128),
+        inter_op_thread_pool_(new thread::ThreadPool(
+            Env::Default(), ThreadOptions(), "inter_op", num_inter_op_threads)),
+        iterations_(0) {
+    VLOG(1) << "Creating a RunHandlerPool with max handlers: " << max_handlers_;
+    for (int i = 0; i < max_handlers_; ++i) {
+      handlers_.emplace_back(new RunHandler::Impl(this));
+      free_handlers_.push_back(handlers_.back().get());
+    }
+  }
+
+  ~Impl() {
+    // Sanity check that all handlers have been returned back to the pool before
+    // destruction.
+    DCHECK_EQ(handlers_.size(), max_handlers_);
+    DCHECK_EQ(free_handlers_.size(), handlers_.size());
+    DCHECK_EQ(sorted_active_handlers_.size(), 0);
+  }
+
+  thread::ThreadPool* inter_op_thread_pool() const {
+    return inter_op_thread_pool_.get();
+  }
+
+  std::unique_ptr<RunHandler> Get() LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    while (free_handlers_.empty()) {
+      one_handler_free_.wait(l);
+    }
+    // Remove the last entry from free_handlers_ and add to the end of
+    // sorted_active_handlers_.
+    auto* handler_impl = free_handlers_.back();
+    handler_impl->Reset();
+    // Sortedness isn't violated if we simply add at the end of the list, since
+    // handlers are expected to be obtained in increasing order of time.
+    sorted_active_handlers_.push_back(handler_impl);
+    DCHECK_LE(sorted_active_handlers_.size(), max_handlers_);
+    free_handlers_.pop_back();
+
+    RecomputePoolStatsLocked();
+    return WrapUnique<RunHandler>(new RunHandler(handler_impl));
+  }
+
+  void ReleaseHandler(RunHandler::Impl* handler) LOCKS_EXCLUDED(mu_) {
+    {
+      mutex_lock l(mu_);
+      DCHECK_GT(sorted_active_handlers_.size(), 0);
+
+      uint64 now = tensorflow::Env::Default()->NowMicros();
+      double elapsed = (now - handler->start_time_us()) / 1000.0;
+      time_hist_.Add(elapsed);
+
+      // Erase from and update sorted_active_handlers_. Add it to the end of
+      // free_handlers_.
+      auto iter = std::find(sorted_active_handlers_.begin(),
+                            sorted_active_handlers_.end(), handler);
+      DCHECK(iter != sorted_active_handlers_.end())
+          << "Unexpected handler: " << handler
+          << " is being requested for release";
+
+      // Remove this handler from this list and add it to the list of free
+      // handlers.
+      sorted_active_handlers_.erase(iter);
+      free_handlers_.push_back(handler);
+      DCHECK_LE(free_handlers_.size(), max_handlers_);
+
+      RecomputePoolStatsLocked();
+    }
+    one_handler_free_.notify_one();
+  }
+
+ private:
+  void RecomputePoolStatsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Maximum number of handlers pre-created during pool construction time. The
+  // number has been chosen expecting each handler might at least want 1
+  // inter-op thread for execution (during compute intensive workloads like
+  // inference).
+  const int max_handlers_;
+
+  // Thread safe part.
+  const std::unique_ptr<thread::ThreadPool> inter_op_thread_pool_;
+
+  // Thread compatible part used only by lock under RunHandlerPool.
+  // Handlers are sorted by start time.
+  std::vector<RunHandler::Impl*> sorted_active_handlers_ GUARDED_BY(mu_);
+  std::vector<RunHandler::Impl*> free_handlers_ GUARDED_BY(mu_);
+  std::vector<std::unique_ptr<RunHandler::Impl>> handlers_ GUARDED_BY(mu_);
+  // Histogram of elapsed runtime of every handler (in ms).
+  histogram::Histogram time_hist_ GUARDED_BY(mu_);
+  std::vector<std::uint_fast32_t> inter_op_start_ GUARDED_BY(mu_);
+  std::vector<std::uint_fast32_t> inter_op_limit_ GUARDED_BY(mu_);
+  int64 iterations_ GUARDED_BY(mu_);
+  condition_variable one_handler_free_;
+  mutex mu_;
+};
+
+void RunHandlerPool::Impl::RecomputePoolStatsLocked() {
+  int num_active_requests = sorted_active_handlers_.size();
+  if (num_active_requests == 0) return;
+
+  int num_threads = inter_op_thread_pool_->NumThreads();
+
+  inter_op_start_.resize(num_active_requests);
+  inter_op_limit_.resize(num_active_requests);
+
+  const int kMinThreadsPerRequest = 3;
+  ComputeInterOpSchedulingRanges(num_active_requests, num_threads,
+                                 kMinThreadsPerRequest, &inter_op_start_,
+                                 &inter_op_limit_);
+
+  for (int i = 0; i < num_active_requests; ++i) {
+    sorted_active_handlers_[i]->set_inter_op_scheduling_range(
+        inter_op_start_[i], inter_op_limit_[i]);
+  }
+
+  if (iterations_++ % 5000 == 0 && VLOG_IS_ON(1)) {
+    VLOG(1) << "Printing time histogram: " << time_hist_.ToString();
+    VLOG(1) << "Active session runs: " << num_active_requests;
+    uint64 now = tensorflow::Env::Default()->NowMicros();
+    string ranges_str = "";
+    string times_str = "";
+    for (int i = 0; i < num_active_requests; ++i) {
+      if (i > 0) {
+        times_str += " ";
+        ranges_str += " ";
+      }
+
+      times_str += strings::StrCat(
+          (now - sorted_active_handlers_[i]->start_time_us()) / 1000.0, " ms.");
+      ranges_str += strings::StrCat("[", inter_op_start_[i], ", ",
+                                    inter_op_limit_[i], ")");
+    }
+    VLOG(1) << "Elapsed times are: " << times_str;
+    VLOG(1) << "Ranges are: " << ranges_str;
+  }
+}
+
+void RunHandler::Impl::ScheduleInterOpClosure(std::function<void()> fn) {
+  std::uint_fast32_t start = 0, limit = 0;
+  DecodePartition(inter_op_scheduling_range(), &start, &limit);
+  pool_impl_->inter_op_thread_pool()->Schedule(std::move(fn));
+}
+
+void RunHandler::Impl::Reset() {
+  set_inter_op_scheduling_range(
+      0, pool_impl_->inter_op_thread_pool()->NumThreads());
+  start_time_us_ = tensorflow::Env::Default()->NowMicros();
+}
+
+RunHandlerPool::RunHandlerPool(int num_inter_op_threads)
+    : impl_(new Impl(num_inter_op_threads)) {}
+
+RunHandlerPool::~RunHandlerPool() {}
+
+std::unique_ptr<RunHandler> RunHandlerPool::Get() { return impl_->Get(); }
+
+RunHandler::RunHandler(Impl* impl) : impl_(impl) {}
+
+void RunHandler::ScheduleInterOpClosure(std::function<void()> fn) {
+  impl_->ScheduleInterOpClosure(std::move(fn));
+}
+
+RunHandler::~RunHandler() { impl_->pool_impl()->ReleaseHandler(impl_); }
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/run_handler.h b/tensorflow/core/framework/run_handler.h
new file mode 100644
index 0000000000..72fa6301b4
--- /dev/null
+++ b/tensorflow/core/framework/run_handler.h
@@ -0,0 +1,95 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_H_
+#define TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_H_
+
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+class RunHandler;
+
+// RunHandlerPool is a fixed size pool of pre-allocated RunHandlers
+// that can be used for tracking inter-op work for a given Session::Run().
+// RunHandler(s) in the pool are initially 'inactive'. A RunHandler becomes
+// 'active' when its unique_ptr is returned by Get() and is being used by a
+// client. It becomes 'inactive' once more when its unique_ptr gets destroyed.
+//
+// Expected usage:
+//
+// * Create a single RunHandlerPool (say run_handler_pool_).
+//
+// * When a Session::Run() is invoked, obtain a handler by:
+// auto handler = run_handler_pool_->Get();
+//
+// * Use handler for scheduling all inter-op work by:
+// handler->ScheduleInterOpClosure(closure);
+//
+// This class is thread safe.
+class RunHandlerPool {
+ public:
+  explicit RunHandlerPool(int num_inter_op_threads);
+  ~RunHandlerPool();
+
+  // Returns an inactive RunHandler from the pool.
+  //
+  // RunHandlers in RunHandlerPool are initially 'inactive'.
+  // A RunHandler becomes 'active' when its unique_ptr its returned by Get()
+  // and is being used by a client.  It becomes 'inactive' once more when the
+  // unique_ptr is destroyed.
+  //
+  // Will block unless there is an inactive handler.
+  std::unique_ptr<RunHandler> Get();
+
+ private:
+  class Impl;
+  friend class RunHandler;
+
+  std::unique_ptr<Impl> impl_;
+};
+
+// RunHandler can be used to schedule inter-op closures to run on a global pool
+// shared across all Session::Run(s).
+//
+// It can only be created via RunHandlerPool::Get().
+//
+// This class can be used instead of directly scheduling closures on a global
+// pool since it maintains a global view across all sessions and optimizes pool
+// scheduling to improve (median and tail) latency.
+//
+// This class is thread safe.
+class RunHandler {
+ public:
+  void ScheduleInterOpClosure(std::function<void()> fn);
+
+  ~RunHandler();
+
+ private:
+  class Impl;
+  friend class RunHandlerPool::Impl;
+
+  explicit RunHandler(Impl* impl);
+
+  Impl* impl_;  // NOT OWNED.
+};
+
+}  // end namespace tensorflow.
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_H_
diff --git a/tensorflow/core/framework/run_handler_util.cc b/tensorflow/core/framework/run_handler_util.cc
new file mode 100644
index 0000000000..3087998c69
--- /dev/null
+++ b/tensorflow/core/framework/run_handler_util.cc
@@ -0,0 +1,57 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/run_handler_util.h"
+
+#include <algorithm>
+#include <cmath>
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+void ComputeInterOpSchedulingRanges(int num_active_requests, int num_threads,
+                                    int min_threads_per_request,
+                                    std::vector<std::uint_fast32_t>* start_vec,
+                                    std::vector<std::uint_fast32_t>* end_vec) {
+  // Each request is expected to have weight W[i] = num_active_requests - i.
+  // Therefore, total_weight = sum of all request weights.
+  float total_weight = 0.5f * num_active_requests * (num_active_requests + 1);
+  float demand_factor = static_cast<float>(num_threads) / total_weight;
+  float last_cumulative_weight = 0.0;
+  min_threads_per_request = std::max(1, min_threads_per_request);
+  for (int i = 0; i != num_active_requests; i++) {
+    float cumulative_weight =
+        static_cast<float>(i + 1) *
+        (num_active_requests - static_cast<float>(i) * 0.5f);
+    float weight = cumulative_weight - last_cumulative_weight;
+    // Quantize thread_demand by rounding up, and also satisfying
+    // `min_threads_per_request` constraint.
+    // Note: We subtract a small epsilon (0.00001) to prevent ceil(..) from
+    // rounding weights like 4.0 to 5.
+    int demand =
+        std::max(min_threads_per_request,
+                 static_cast<int>(ceil(weight * demand_factor - 0.00001f)));
+    // For the quantized range [start, end); compute the floor of real start,
+    // and expand downwards from there with length `demand` and adjust for
+    // boundary conditions.
+    int start = last_cumulative_weight * demand_factor;
+    int end = std::min(num_threads, start + demand);
+    start = std::max(0, std::min(start, end - demand));
+    start_vec->at(i) = start;
+    end_vec->at(i) = end;
+    last_cumulative_weight = cumulative_weight;
+  }
+}
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/run_handler_util.h b/tensorflow/core/framework/run_handler_util.h
new file mode 100644
index 0000000000..c0c36aeccb
--- /dev/null
+++ b/tensorflow/core/framework/run_handler_util.h
@@ -0,0 +1,43 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_
+
+#include <cstdint>
+#include <vector>
+
+namespace tensorflow {
+
+// Assign thread ranges to requests.
+// Requests are numbered 0...num_active_requests-1, and
+// threads are numbered 0...num_threads-1.
+// On return, the range start_vec->at(i)...end_vec->at(i)-1
+// indicates the subrange of the threads available to request i.
+// The ranges given to different requests may overlap.
+// Lower numbered requests will tend to be assigned more threads.
+// Thus, a client might associate older requests with lower
+// array indices so they receive access to more threads.
+// However, the routine ensures that each request is given access
+// to at least min(min_threads_per_request, num_threads)  threads.
+// Every thread will be assigned to at least one request range,
+// assuming there is at least one request.
+void ComputeInterOpSchedulingRanges(int num_active_requests, int num_threads,
+                                    int min_threads_per_request,
+                                    std::vector<std::uint_fast32_t>* start_vec,
+                                    std::vector<std::uint_fast32_t>* end_vec);
+
+}  // end namespace tensorflow
+#endif  // TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_
diff --git a/tensorflow/core/framework/run_handler_util_test.cc b/tensorflow/core/framework/run_handler_util_test.cc
new file mode 100644
index 0000000000..a1928c132b
--- /dev/null
+++ b/tensorflow/core/framework/run_handler_util_test.cc
@@ -0,0 +1,93 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/run_handler_util.h"
+
+#include <vector>
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+namespace tensorflow {
+namespace {
+
+void VerifyFunction(int num_active_requests, int num_threads,
+                    int min_threads_per_request, bool print_stats = false) {
+  if (print_stats) {
+    LOG(INFO) << "Test case# num_active_requests: " << num_active_requests
+              << " num_threads: " << num_threads
+              << " min_threads: " << min_threads_per_request;
+  }
+  std::vector<std::uint_fast32_t> start(num_active_requests);
+  std::vector<std::uint_fast32_t> end(num_active_requests);
+
+  ComputeInterOpSchedulingRanges(num_active_requests, num_threads,
+                                 min_threads_per_request, &start, &end);
+  string range_str = "";
+  for (int i = 0; i < num_active_requests; ++i) {
+    if (i > 0) range_str += " ";
+    range_str += strings::StrCat("[", start[i], ", ", end[i], ")");
+
+    ASSERT_GE(start[i], 0) << range_str;
+    ASSERT_LE(end[i], num_threads) << range_str;
+    if (i > 0) {
+      // Due to linearly decreasing demand, #threads(i - 1) >= #threads(i)
+      ASSERT_GE(end[i - 1] - start[i - 1], end[i] - start[i]) << range_str;
+      // No missing threads.
+      ASSERT_GE(end[i - 1], start[i]) << range_str;
+    }
+    // Each interval is at least of size 'min_threads_per_request'.
+    ASSERT_GE((end[i] - start[i]), min_threads_per_request) << range_str;
+    // Verify that assigned (quantized) threads is not overly estimated
+    // from real demand, when the demand is high (>=
+    // min_threads_per_request).
+    float entry_weight = num_active_requests - i;
+    float total_weight = 0.5f * num_active_requests * (num_active_requests + 1);
+    float thread_demand = (entry_weight * num_threads) / total_weight;
+    if (thread_demand > min_threads_per_request) {
+      // We expect some over-estimation of threads due to quantization,
+      // but we hope it's not more than 1 extra thread.
+      ASSERT_NEAR(end[i] - start[i], thread_demand, 1.0)
+          << "Ranges: " << range_str << " thread_demand: " << thread_demand
+          << " i: " << i;
+    }
+  }
+  ASSERT_EQ(end[num_active_requests - 1], num_threads);
+  ASSERT_EQ(start[0], 0);
+  if (print_stats) {
+    LOG(INFO) << "Assigned ranges: " << range_str;
+  }
+}
+
+TEST(RunHandlerUtilTest, TestComputeInterOpSchedulingRanges) {
+  const int kMinThreadsPerRequestBound = 12;
+  const int kMaxActiveRequests = 128;
+  const int kMaxThreads = 128;
+
+  for (int min_threads_per_request = 1;
+       min_threads_per_request <= kMinThreadsPerRequestBound;
+       ++min_threads_per_request) {
+    for (int num_active_requests = 1; num_active_requests <= kMaxActiveRequests;
+         ++num_active_requests) {
+      for (int num_threads = min_threads_per_request;
+           num_threads <= kMaxThreads; ++num_threads) {
+        VerifyFunction(num_active_requests, num_threads,
+                       min_threads_per_request);
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 85cd02350a..104ab039cb 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -453,6 +453,11 @@ message RunOptions {
     // same group_key value (in a distributed computation where tasks
     // run disjoint graphs).
     int64 collective_graph_key = 1;
+    // If true, then operations (using the inter-op pool) across all
+    // session::run() calls will be centrally scheduled, optimizing for (median
+    // and tail) latency.
+    // Consider using this option for CPU-bound workloads like inference.
+    bool use_run_handler_pool = 2;
   };
 
   Experimental experimental = 8;
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
index 537e73aa89..47b5b56faf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
@@ -8,5 +8,11 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_INT64
     }
+    field {
+      name: "use_run_handler_pool"
+      number: 2
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
index cec04a2bf0..c0c2e7b9f8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
@@ -55,6 +55,12 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_INT64
       }
+      field {
+        name: "use_run_handler_pool"
+        number: 2
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
     }
     enum_type {
       name: "TraceLevel"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
index 537e73aa89..47b5b56faf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
@@ -8,5 +8,11 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_INT64
     }
+    field {
+      name: "use_run_handler_pool"
+      number: 2
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
index cec04a2bf0..c0c2e7b9f8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
@@ -55,6 +55,12 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_INT64
       }
+      field {
+        name: "use_run_handler_pool"
+        number: 2
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
     }
     enum_type {
       name: "TraceLevel"
-- 
GitLab


From 17d73444f332490c733d37063710e72dc69d1141 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 14:10:19 -0700
Subject: [PATCH 163/570] Update hooks for distributed jobs with a master node,
 to ensure that summaries are written at the correct interval for jobs with
 long-running evaluations.

PiperOrigin-RevId: 214993119
---
 tensorflow/python/estimator/estimator.py      | 34 ++++++-
 tensorflow/python/estimator/estimator_test.py | 94 +++++++++++++++++++
 2 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index b933cedb99..34faf03bb0 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -1414,6 +1414,36 @@ class Estimator(object):
         # It is expected to have one CheckpointSaverHook. If multiple, we pick
         # up the first one to add listener.
         saver_hooks[0]._listeners.extend(saving_listeners)  # pylint: disable=protected-access
+
+    # Add summary hooks to worker 0 if we are running with a master, to ensure
+    # that summaries are written at correct intervals even with long-running
+    # evaluations.
+    save_summary_steps = self._config.save_summary_steps
+    log_step_count_steps = self._config.log_step_count_steps
+    if (self._config.cluster_spec and self._config.cluster_spec.jobs and
+        (run_config.TaskType.MASTER in self._config.cluster_spec.jobs)):
+      # Update config values to prevent the default hooks from being created on
+      # the master or other workers.
+      save_summary_steps = 0
+      log_step_count_steps = None
+
+      if (self._config.task_type == run_config.TaskType.WORKER and
+          self._config.task_id == 0):
+        if (self._config.save_summary_steps and
+            self._config.save_summary_steps > 0):
+          worker_hooks.append(
+              training.SummarySaverHook(
+                  save_steps=self._config.save_summary_steps,
+                  output_dir=self._config.model_dir,
+                  scaffold=estimator_spec.scaffold))
+
+        if (self._config.log_step_count_steps and
+            self._config.log_step_count_steps > 0):
+          worker_hooks.append(
+              training.StepCounterHook(
+                  every_n_steps=self._config.log_step_count_steps,
+                  output_dir=self._config.model_dir))
+
     with training.MonitoredTrainingSession(
         master=self._config.master,
         is_chief=self._config.is_chief,
@@ -1423,9 +1453,9 @@ class Estimator(object):
         chief_only_hooks=(
             tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks)),
         save_checkpoint_secs=0,  # Saving is handled by a hook.
-        save_summaries_steps=self._config.save_summary_steps,
+        save_summaries_steps=save_summary_steps,
         config=self._session_config,
-        log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
+        log_step_count_steps=log_step_count_steps) as mon_sess:
       loss = None
       while not mon_sess.should_stop():
         _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index bc2504ca19..246dfb1a4b 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import functools
 import glob
+import json
 import os
 import tempfile
 
@@ -969,6 +970,99 @@ class EstimatorTrainTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'train_and_evaluate'):
       est.train(dummy_input_fn, steps=1)
 
+  def test_master_distributed_hooks(self):
+    tf_config = json.dumps({
+        'cluster': {
+            run_config.TaskType.PS: ['localhost:1234'],
+            run_config.TaskType.WORKER: ['localhost:1235'],
+            run_config.TaskType.MASTER: ['localhost:1236']
+        },
+        'task': {
+            'type': run_config.TaskType.MASTER,
+            'index': 0
+        }
+    })
+    with test.mock.patch.dict('os.environ', {'TF_CONFIG': tf_config}):
+      est = estimator.Estimator(
+          model_fn=model_fn_global_step_incrementer,
+          config=run_config.RunConfig())
+
+    with test.mock.patch.object(training,
+                                'MonitoredTrainingSession') as mock_sess:
+      est.train(dummy_input_fn, steps=1)
+      self.assertFalse(
+          any(
+              isinstance(hook, basic_session_run_hooks.SummarySaverHook)
+              for hook in mock_sess.call_args[1]['hooks']))
+      self.assertFalse(
+          any(
+              isinstance(hook, basic_session_run_hooks.StepCounterHook)
+              for hook in mock_sess.call_args[1]['hooks']))
+      self.assertEqual(0, mock_sess.call_args[1]['save_summaries_steps'])
+      self.assertIsNone(mock_sess.call_args[1]['log_step_count_steps'])
+
+  def test_master_distributed_hooks_for_worker_0(self):
+    tf_config = json.dumps({
+        'cluster': {
+            run_config.TaskType.PS: ['localhost:1234'],
+            run_config.TaskType.WORKER: ['localhost:1235'],
+            run_config.TaskType.MASTER: ['localhost:1236']
+        },
+        'task': {
+            'type': run_config.TaskType.WORKER,
+            'index': 0
+        }
+    })
+    with test.mock.patch.dict('os.environ', {'TF_CONFIG': tf_config}):
+      est = estimator.Estimator(
+          model_fn=model_fn_global_step_incrementer,
+          config=run_config.RunConfig())
+
+    with test.mock.patch.object(training,
+                                'MonitoredTrainingSession') as mock_sess:
+      est.train(dummy_input_fn, steps=1)
+      self.assertTrue(
+          any(
+              isinstance(hook, basic_session_run_hooks.SummarySaverHook)
+              for hook in mock_sess.call_args[1]['hooks']))
+      self.assertTrue(
+          any(
+              isinstance(hook, basic_session_run_hooks.StepCounterHook)
+              for hook in mock_sess.call_args[1]['hooks']))
+      self.assertEqual(0, mock_sess.call_args[1]['save_summaries_steps'])
+      self.assertIsNone(mock_sess.call_args[1]['log_step_count_steps'])
+
+  def test_master_distributed_hooks_for_worker_nonzero(self):
+    tf_config = json.dumps({
+        'cluster': {
+            run_config.TaskType.PS: ['localhost:1234'],
+            run_config.TaskType.WORKER: ['localhost:1235', 'localhost:1237'],
+            run_config.TaskType.MASTER: ['localhost:1236']
+        },
+        'task': {
+            'type': run_config.TaskType.WORKER,
+            'index': 1
+        }
+    })
+    with test.mock.patch.dict('os.environ', {'TF_CONFIG': tf_config}):
+      est = estimator.Estimator(
+          model_fn=model_fn_global_step_incrementer,
+          config=run_config.RunConfig())
+
+    with test.mock.patch.object(training,
+                                'MonitoredTrainingSession') as mock_sess:
+      est.train(dummy_input_fn, steps=1)
+      self.assertFalse(
+          any(
+              isinstance(hook, basic_session_run_hooks.SummarySaverHook)
+              for hook in mock_sess.call_args[1]['hooks']))
+      self.assertFalse(
+          any(
+              isinstance(hook, basic_session_run_hooks.StepCounterHook)
+              for hook in mock_sess.call_args[1]['hooks']))
+      self.assertEqual(0, mock_sess.call_args[1]['save_summaries_steps'])
+      self.assertIsNone(mock_sess.call_args[1]['log_step_count_steps'])
+
 
 def _model_fn_with_eval_metric_ops(features, labels, mode, params):
   _, _ = features, labels
-- 
GitLab


From 5863cad53afad2fcc5d8a8dac7c2cf88e0e8ebb9 Mon Sep 17 00:00:00 2001
From: Ayush Dubey <ayushd@google.com>
Date: Fri, 28 Sep 2018 14:36:16 -0700
Subject: [PATCH 164/570] Copy shape into CollectiveParams only once per
 CollectiveReduce kernel.

PiperOrigin-RevId: 214997213
---
 tensorflow/core/kernels/collective_ops.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/collective_ops.cc b/tensorflow/core/kernels/collective_ops.cc
index fa959b5a0e..82e2913b64 100644
--- a/tensorflow/core/kernels/collective_ops.cc
+++ b/tensorflow/core/kernels/collective_ops.cc
@@ -132,7 +132,6 @@ class CollectiveReduceOpKernel : public CollectiveOpKernel {
             "Failed to get CollectiveExecutor from OpKernelContext for Op ",
             col_params_.name),
         done);
-    col_params_.instance.shape = c->input(0).shape();
     // Allocate output on the first pass through this function.  This must be
     // done immediately, while we're still in the executor thread.  Otherwise
     // the memory is not guaranteed to be unused by any concurrently executing
@@ -144,6 +143,7 @@ class CollectiveReduceOpKernel : public CollectiveOpKernel {
                            c->forward_input_or_allocate_output(
                                {0}, 0, c->input(0).shape(), &output),
                            done);
+      col_params_.instance.shape = c->input(0).shape();
     }
     if (!CanProceedWithCompute(c, col_exec, done)) return;
     auto actual_done = [c, col_exec, done](const Status& s) {
-- 
GitLab


From dee0481c07ed952d01b12704c89e50869a383c68 Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Fri, 28 Sep 2018 15:07:29 -0700
Subject: [PATCH 165/570] Adding FeatureColumn V2 support for linear canned
 estimators.

Since we now have support for FeatureColumnV2 for both DNN and Linear models,
adding tests for the combined canned estimators as well.

PiperOrigin-RevId: 215002573
---
 tensorflow/python/estimator/BUILD             |   1 +
 tensorflow/python/estimator/canned/dnn.py     |   3 -
 .../canned/dnn_linear_combined_test.py        | 268 ++++++++++++++----
 tensorflow/python/estimator/canned/linear.py  |  83 ++++--
 .../python/estimator/canned/linear_test.py    | 138 ++++++++-
 .../estimator/canned/linear_testing_utils.py  | 184 +++++++-----
 tensorflow/python/feature_column/BUILD        |   2 +-
 .../feature_column/feature_column_v2.py       | 100 ++++---
 .../feature_column/feature_column_v2_test.py  |   4 +-
 9 files changed, 579 insertions(+), 204 deletions(-)

diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index ba1b7ec2b5..1c4c5951df 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -344,6 +344,7 @@ py_test(
         ":pandas_io",
         ":prediction_keys",
         "//tensorflow:tensorflow_py_no_contrib",
+        "@absl_py//absl/testing:parameterized",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index 97971f9561..a6c2aaa7d9 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -131,9 +131,7 @@ class _DNNModel(training.Model):
                name=None,
                **kwargs):
     super(_DNNModel, self).__init__(name=name, **kwargs)
-    self._is_v2 = False
     if feature_column_v2.is_feature_column_v2(feature_columns):
-      self._is_v2 = True
       self._input_layer = feature_column_v2.FeatureLayer(
           feature_columns=feature_columns,
           name='input_layer',
@@ -190,7 +188,6 @@ class _DNNModel(training.Model):
           _scope=logits_scope)
       self._add_layer(self._logits_layer, logits_scope.name)
       self._logits_scope_name = logits_scope.name
-    self._logits_layer._use_resource_variables = False  # pylint: disable=protected-access
     self._input_layer_partitioner = input_layer_partitioner
 
   def call(self, features, mode):
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined_test.py b/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
index d16318659b..ae968e717a 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import shutil
 import tempfile
 
+from absl.testing import parameterized
 import numpy as np
 import six
 
@@ -35,6 +36,7 @@ from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.estimator.inputs import pandas_io
 from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_v2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import nn
@@ -119,7 +121,16 @@ class LinearOnlyRegressorPartitionerTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearRegressorPartitionerTest.__init__(
-        self, _linear_regressor_fn)
+        self, _linear_regressor_fn, fc_lib=feature_column)
+
+
+class LinearOnlyRegressorPartitionerV2Test(
+    linear_testing_utils.BaseLinearRegressorPartitionerTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorPartitionerTest.__init__(
+        self, _linear_regressor_fn, fc_lib=feature_column_v2)
 
 
 class LinearOnlyRegressorEvaluationTest(
@@ -128,7 +139,16 @@ class LinearOnlyRegressorEvaluationTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
-        self, _linear_regressor_fn)
+        self, _linear_regressor_fn, fc_lib=feature_column)
+
+
+class LinearOnlyRegressorEvaluationV2Test(
+    linear_testing_utils.BaseLinearRegressorEvaluationTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
+        self, _linear_regressor_fn, fc_lib=feature_column_v2)
 
 
 class LinearOnlyRegressorPredictTest(
@@ -137,7 +157,16 @@ class LinearOnlyRegressorPredictTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
-        self, _linear_regressor_fn)
+        self, _linear_regressor_fn, fc_lib=feature_column)
+
+
+class LinearOnlyRegressorPredictV2Test(
+    linear_testing_utils.BaseLinearRegressorPredictTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
+        self, _linear_regressor_fn, fc_lib=feature_column_v2)
 
 
 class LinearOnlyRegressorIntegrationTest(
@@ -146,7 +175,16 @@ class LinearOnlyRegressorIntegrationTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearRegressorIntegrationTest.__init__(
-        self, _linear_regressor_fn)
+        self, _linear_regressor_fn, fc_lib=feature_column)
+
+
+class LinearOnlyRegressorIntegrationV2Test(
+    linear_testing_utils.BaseLinearRegressorIntegrationTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorIntegrationTest.__init__(
+        self, _linear_regressor_fn, fc_lib=feature_column_v2)
 
 
 class LinearOnlyRegressorTrainingTest(
@@ -155,7 +193,16 @@ class LinearOnlyRegressorTrainingTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
-        self, _linear_regressor_fn)
+        self, _linear_regressor_fn, fc_lib=feature_column)
+
+
+class LinearOnlyRegressorTrainingV2Test(
+    linear_testing_utils.BaseLinearRegressorTrainingTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
+        self, _linear_regressor_fn, fc_lib=feature_column_v2)
 
 
 def _linear_classifier_fn(feature_columns,
@@ -185,7 +232,18 @@ class LinearOnlyClassifierTrainingTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearClassifierTrainingTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn)
+        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
+
+
+class LinearOnlyClassifierTrainingV2Test(
+    linear_testing_utils.BaseLinearClassifierTrainingTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearClassifierTrainingTest.__init__(
+        self,
+        linear_classifier_fn=_linear_classifier_fn,
+        fc_lib=feature_column_v2)
 
 
 class LinearOnlyClassifierClassesEvaluationTest(
@@ -194,7 +252,18 @@ class LinearOnlyClassifierClassesEvaluationTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearClassifierEvaluationTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn)
+        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
+
+
+class LinearOnlyClassifierClassesEvaluationV2Test(
+    linear_testing_utils.BaseLinearClassifierEvaluationTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearClassifierEvaluationTest.__init__(
+        self,
+        linear_classifier_fn=_linear_classifier_fn,
+        fc_lib=feature_column_v2)
 
 
 class LinearOnlyClassifierPredictTest(
@@ -203,7 +272,18 @@ class LinearOnlyClassifierPredictTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearClassifierPredictTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn)
+        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
+
+
+class LinearOnlyClassifierPredictV2Test(
+    linear_testing_utils.BaseLinearClassifierPredictTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearClassifierPredictTest.__init__(
+        self,
+        linear_classifier_fn=_linear_classifier_fn,
+        fc_lib=feature_column_v2)
 
 
 class LinearOnlyClassifierIntegrationTest(
@@ -212,9 +292,21 @@ class LinearOnlyClassifierIntegrationTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearClassifierIntegrationTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn)
+        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
+
+
+class LinearOnlyClassifierIntegrationV2Test(
+    linear_testing_utils.BaseLinearClassifierIntegrationTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearClassifierIntegrationTest.__init__(
+        self,
+        linear_classifier_fn=_linear_classifier_fn,
+        fc_lib=feature_column_v2)
 
 
+@parameterized.parameters((feature_column,), (feature_column_v2,))
 class DNNLinearCombinedRegressorIntegrationTest(test.TestCase):
 
   def setUp(self):
@@ -225,13 +317,15 @@ class DNNLinearCombinedRegressorIntegrationTest(test.TestCase):
       writer_cache.FileWriterCache.clear()
       shutil.rmtree(self._model_dir)
 
-  def _test_complete_flow(
-      self, train_input_fn, eval_input_fn, predict_input_fn, input_dimension,
-      label_dimension, batch_size):
+  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
+                          input_dimension, label_dimension, batch_size,
+                          fc_impl):
     linear_feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))]
+        fc_impl.numeric_column('x', shape=(input_dimension,))
+    ]
     dnn_feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))]
+        fc_impl.numeric_column('x', shape=(input_dimension,))
+    ]
     feature_columns = linear_feature_columns + dnn_feature_columns
     est = dnn_linear_combined.DNNLinearCombinedRegressor(
         linear_feature_columns=linear_feature_columns,
@@ -257,14 +351,14 @@ class DNNLinearCombinedRegressorIntegrationTest(test.TestCase):
     self.assertAllEqual((batch_size, label_dimension), predictions.shape)
 
     # EXPORT
-    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    feature_spec = fc_impl.make_parse_example_spec(feature_columns)
     serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
         feature_spec)
     export_dir = est.export_savedmodel(tempfile.mkdtemp(),
                                        serving_input_receiver_fn)
     self.assertTrue(gfile.Exists(export_dir))
 
-  def test_numpy_input_fn(self):
+  def test_numpy_input_fn(self, fc_impl):
     """Tests complete flow with numpy_input_fn."""
     label_dimension = 2
     batch_size = 10
@@ -293,9 +387,10 @@ class DNNLinearCombinedRegressorIntegrationTest(test.TestCase):
         predict_input_fn=predict_input_fn,
         input_dimension=label_dimension,
         label_dimension=label_dimension,
-        batch_size=batch_size)
+        batch_size=batch_size,
+        fc_impl=fc_impl)
 
-  def test_pandas_input_fn(self):
+  def test_pandas_input_fn(self, fc_impl):
     """Tests complete flow with pandas_input_fn."""
     if not HAS_PANDAS:
       return
@@ -326,9 +421,10 @@ class DNNLinearCombinedRegressorIntegrationTest(test.TestCase):
         predict_input_fn=predict_input_fn,
         input_dimension=label_dimension,
         label_dimension=label_dimension,
-        batch_size=batch_size)
+        batch_size=batch_size,
+        fc_impl=fc_impl)
 
-  def test_input_fn_from_parse_example(self):
+  def test_input_fn_from_parse_example(self, fc_impl):
     """Tests complete flow with input_fn constructed from parse_example."""
     label_dimension = 2
     batch_size = 10
@@ -376,7 +472,8 @@ class DNNLinearCombinedRegressorIntegrationTest(test.TestCase):
         predict_input_fn=_predict_input_fn,
         input_dimension=label_dimension,
         label_dimension=label_dimension,
-        batch_size=batch_size)
+        batch_size=batch_size,
+        fc_impl=fc_impl)
 
 
 # A function to mimic dnn-classifier init reuse same tests.
@@ -407,7 +504,16 @@ class DNNOnlyClassifierEvaluateTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     dnn_testing_utils.BaseDNNClassifierEvaluateTest.__init__(
-        self, _dnn_classifier_fn)
+        self, _dnn_classifier_fn, fc_impl=feature_column)
+
+
+class DNNOnlyClassifierEvaluateV2Test(
+    dnn_testing_utils.BaseDNNClassifierEvaluateTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNClassifierEvaluateTest.__init__(
+        self, _dnn_classifier_fn, fc_impl=feature_column_v2)
 
 
 class DNNOnlyClassifierPredictTest(
@@ -416,7 +522,16 @@ class DNNOnlyClassifierPredictTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     dnn_testing_utils.BaseDNNClassifierPredictTest.__init__(
-        self, _dnn_classifier_fn)
+        self, _dnn_classifier_fn, fc_impl=feature_column)
+
+
+class DNNOnlyClassifierPredictV2Test(
+    dnn_testing_utils.BaseDNNClassifierPredictTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNClassifierPredictTest.__init__(
+        self, _dnn_classifier_fn, fc_impl=feature_column_v2)
 
 
 class DNNOnlyClassifierTrainTest(
@@ -425,7 +540,16 @@ class DNNOnlyClassifierTrainTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     dnn_testing_utils.BaseDNNClassifierTrainTest.__init__(
-        self, _dnn_classifier_fn)
+        self, _dnn_classifier_fn, fc_impl=feature_column)
+
+
+class DNNOnlyClassifierTrainV2Test(dnn_testing_utils.BaseDNNClassifierTrainTest,
+                                   test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNClassifierTrainTest.__init__(
+        self, _dnn_classifier_fn, fc_impl=feature_column_v2)
 
 
 # A function to mimic dnn-regressor init reuse same tests.
@@ -454,7 +578,16 @@ class DNNOnlyRegressorEvaluateTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     dnn_testing_utils.BaseDNNRegressorEvaluateTest.__init__(
-        self, _dnn_regressor_fn)
+        self, _dnn_regressor_fn, fc_impl=feature_column)
+
+
+class DNNOnlyRegressorEvaluateV2Test(
+    dnn_testing_utils.BaseDNNRegressorEvaluateTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNRegressorEvaluateTest.__init__(
+        self, _dnn_regressor_fn, fc_impl=feature_column_v2)
 
 
 class DNNOnlyRegressorPredictTest(
@@ -463,7 +596,16 @@ class DNNOnlyRegressorPredictTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     dnn_testing_utils.BaseDNNRegressorPredictTest.__init__(
-        self, _dnn_regressor_fn)
+        self, _dnn_regressor_fn, fc_impl=feature_column)
+
+
+class DNNOnlyRegressorPredictV2Test(
+    dnn_testing_utils.BaseDNNRegressorPredictTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNRegressorPredictTest.__init__(
+        self, _dnn_regressor_fn, fc_impl=feature_column_v2)
 
 
 class DNNOnlyRegressorTrainTest(
@@ -472,9 +614,19 @@ class DNNOnlyRegressorTrainTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     dnn_testing_utils.BaseDNNRegressorTrainTest.__init__(
-        self, _dnn_regressor_fn)
+        self, _dnn_regressor_fn, fc_impl=feature_column)
 
 
+class DNNOnlyRegressorTrainV2Test(dnn_testing_utils.BaseDNNRegressorTrainTest,
+                                  test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNRegressorTrainTest.__init__(
+        self, _dnn_regressor_fn, fc_impl=feature_column_v2)
+
+
+@parameterized.parameters((feature_column,), (feature_column_v2,))
 class DNNLinearCombinedClassifierIntegrationTest(test.TestCase):
 
   def setUp(self):
@@ -488,13 +640,14 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase):
   def _as_label(self, data_in_float):
     return np.rint(data_in_float).astype(np.int64)
 
-  def _test_complete_flow(
-      self, train_input_fn, eval_input_fn, predict_input_fn, input_dimension,
-      n_classes, batch_size):
+  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
+                          input_dimension, n_classes, batch_size, fc_impl):
     linear_feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))]
+        fc_impl.numeric_column('x', shape=(input_dimension,))
+    ]
     dnn_feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))]
+        fc_impl.numeric_column('x', shape=(input_dimension,))
+    ]
     feature_columns = linear_feature_columns + dnn_feature_columns
     est = dnn_linear_combined.DNNLinearCombinedClassifier(
         linear_feature_columns=linear_feature_columns,
@@ -520,14 +673,14 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase):
     self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
 
     # EXPORT
-    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    feature_spec = fc_impl.make_parse_example_spec(feature_columns)
     serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
         feature_spec)
     export_dir = est.export_savedmodel(tempfile.mkdtemp(),
                                        serving_input_receiver_fn)
     self.assertTrue(gfile.Exists(export_dir))
 
-  def test_numpy_input_fn(self):
+  def test_numpy_input_fn(self, fc_impl):
     """Tests complete flow with numpy_input_fn."""
     n_classes = 3
     input_dimension = 2
@@ -559,9 +712,10 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase):
         predict_input_fn=predict_input_fn,
         input_dimension=input_dimension,
         n_classes=n_classes,
-        batch_size=batch_size)
+        batch_size=batch_size,
+        fc_impl=fc_impl)
 
-  def test_pandas_input_fn(self):
+  def test_pandas_input_fn(self, fc_impl):
     """Tests complete flow with pandas_input_fn."""
     if not HAS_PANDAS:
       return
@@ -593,9 +747,10 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase):
         predict_input_fn=predict_input_fn,
         input_dimension=input_dimension,
         n_classes=n_classes,
-        batch_size=batch_size)
+        batch_size=batch_size,
+        fc_impl=fc_impl)
 
-  def test_input_fn_from_parse_example(self):
+  def test_input_fn_from_parse_example(self, fc_impl):
     """Tests complete flow with input_fn constructed from parse_example."""
     input_dimension = 2
     n_classes = 3
@@ -647,9 +802,11 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase):
         predict_input_fn=_predict_input_fn,
         input_dimension=input_dimension,
         n_classes=n_classes,
-        batch_size=batch_size)
+        batch_size=batch_size,
+        fc_impl=fc_impl)
 
 
+@parameterized.parameters((feature_column,), (feature_column_v2,))
 class DNNLinearCombinedTests(test.TestCase):
 
   def setUp(self):
@@ -681,9 +838,9 @@ class DNNLinearCombinedTests(test.TestCase):
 
     return optimizer_mock
 
-  def test_train_op_calls_both_dnn_and_linear(self):
+  def test_train_op_calls_both_dnn_and_linear(self, fc_impl):
     opt = gradient_descent.GradientDescentOptimizer(1.)
-    x_column = feature_column.numeric_column('x')
+    x_column = fc_impl.numeric_column('x')
     input_fn = numpy_io.numpy_input_fn(
         x={'x': np.array([[0.], [1.]])},
         y=np.array([[0.], [1.]]),
@@ -708,7 +865,7 @@ class DNNLinearCombinedTests(test.TestCase):
                      checkpoint_utils.load_variable(
                          self._model_dir, 'dnn_called'))
 
-  def test_dnn_and_linear_logits_are_added(self):
+  def test_dnn_and_linear_logits_are_added(self, fc_impl):
     with ops.Graph().as_default():
       variables_lib.Variable([[1.0]], name='linear/linear_model/x/weights')
       variables_lib.Variable([2.0], name='linear/linear_model/bias_weights')
@@ -719,7 +876,7 @@ class DNNLinearCombinedTests(test.TestCase):
       variables_lib.Variable(1, name='global_step', dtype=dtypes.int64)
       linear_testing_utils.save_variables_to_ckpt(self._model_dir)
 
-    x_column = feature_column.numeric_column('x')
+    x_column = fc_impl.numeric_column('x')
     est = dnn_linear_combined.DNNLinearCombinedRegressor(
         linear_feature_columns=[x_column],
         dnn_hidden_units=[1],
@@ -737,6 +894,7 @@ class DNNLinearCombinedTests(test.TestCase):
         next(est.predict(input_fn=input_fn)))
 
 
+@parameterized.parameters((feature_column,), (feature_column_v2,))
 class DNNLinearCombinedWarmStartingTest(test.TestCase):
 
   def setUp(self):
@@ -758,11 +916,11 @@ class DNNLinearCombinedWarmStartingTest(test.TestCase):
     writer_cache.FileWriterCache.clear()
     shutil.rmtree(self._ckpt_and_vocab_dir)
 
-  def test_classifier_basic_warm_starting(self):
+  def test_classifier_basic_warm_starting(self, fc_impl):
     """Tests correctness of DNNLinearCombinedClassifier default warm-start."""
-    age = feature_column.numeric_column('age')
-    city = feature_column.embedding_column(
-        feature_column.categorical_column_with_vocabulary_list(
+    age = fc_impl.numeric_column('age')
+    city = fc_impl.embedding_column(
+        fc_impl.categorical_column_with_vocabulary_list(
             'city', vocabulary_list=['Mountain View', 'Palo Alto']),
         dimension=5)
 
@@ -798,11 +956,11 @@ class DNNLinearCombinedWarmStartingTest(test.TestCase):
           dnn_lc_classifier.get_variable_value(variable_name),
           warm_started_dnn_lc_classifier.get_variable_value(variable_name))
 
-  def test_regressor_basic_warm_starting(self):
+  def test_regressor_basic_warm_starting(self, fc_impl):
     """Tests correctness of DNNLinearCombinedRegressor default warm-start."""
-    age = feature_column.numeric_column('age')
-    city = feature_column.embedding_column(
-        feature_column.categorical_column_with_vocabulary_list(
+    age = fc_impl.numeric_column('age')
+    city = fc_impl.embedding_column(
+        fc_impl.categorical_column_with_vocabulary_list(
             'city', vocabulary_list=['Mountain View', 'Palo Alto']),
         dimension=5)
 
@@ -836,11 +994,11 @@ class DNNLinearCombinedWarmStartingTest(test.TestCase):
           dnn_lc_regressor.get_variable_value(variable_name),
           warm_started_dnn_lc_regressor.get_variable_value(variable_name))
 
-  def test_warm_starting_selective_variables(self):
+  def test_warm_starting_selective_variables(self, fc_impl):
     """Tests selecting variables to warm-start."""
-    age = feature_column.numeric_column('age')
-    city = feature_column.embedding_column(
-        feature_column.categorical_column_with_vocabulary_list(
+    age = fc_impl.numeric_column('age')
+    city = fc_impl.embedding_column(
+        fc_impl.categorical_column_with_vocabulary_list(
             'city', vocabulary_list=['Mountain View', 'Palo Alto']),
         dimension=5)
 
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index 115dd18518..8b96284bd3 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -25,14 +25,18 @@ import six
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import optimizers
-from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variable_ops
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import ftrl
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import estimator_export
 
 
@@ -46,23 +50,42 @@ def _get_default_optimizer(feature_columns):
   return ftrl.FtrlOptimizer(learning_rate=learning_rate)
 
 
-def _compute_fraction_of_zero(cols_to_vars):
-  """Given a linear cols_to_vars dict, compute the fraction of zero weights.
+def _get_expanded_variable_list(var_list):
+  """Given a list of variables, expands them if they are partitioned.
 
   Args:
-    cols_to_vars: A dictionary mapping FeatureColumns to lists of tf.Variables
-      like one returned from feature_column_lib.linear_model.
+    var_list: A list of variables.
+
+  Returns:
+    A list of variables where each partitioned variable is expanded to its
+    components.
+  """
+  returned_list = []
+  for variable in var_list:
+    if (isinstance(variable, variable_ops.Variable) or
+        resource_variable_ops.is_resource_variable(variable)):
+      returned_list.append(variable)  # Single variable case.
+    else:  # Must be a PartitionedVariable, so convert into a list.
+      returned_list.extend(list(variable))
+  return returned_list
+
+
+# TODO(rohanj): Consider making this a public utility method.
+def _compute_fraction_of_zero(variables):
+  """Given a linear variables list, compute the fraction of zero weights.
+
+  Args:
+    variables: A list or list of list of variables
 
   Returns:
     The fraction of zeros (sparsity) in the linear model.
   """
   all_weight_vars = []
-  for var_or_var_list in cols_to_vars.values():
+  for var_or_var_list in variables:
+    var_list = nest.flatten(var_or_var_list)
     # Skip empty-lists associated with columns that created no Variables.
-    if var_or_var_list:
-      all_weight_vars += [
-          array_ops.reshape(var, [-1]) for var in var_or_var_list
-      ]
+    if var_list:
+      all_weight_vars += [array_ops.reshape(var, [-1]) for var in var_list]
   return nn.zero_fraction(array_ops.concat(all_weight_vars, axis=0))
 
 
@@ -92,14 +115,36 @@ def _linear_logit_fn_builder(units, feature_columns, sparse_combiner='sum'):
     Returns:
       A `Tensor` representing the logits.
     """
-    cols_to_vars = {}
-    logits = feature_column_lib.linear_model(
-        features=features,
-        feature_columns=feature_columns,
-        units=units,
-        sparse_combiner=sparse_combiner,
-        cols_to_vars=cols_to_vars)
-    bias = cols_to_vars.pop('bias')
+    if feature_column_v2.is_feature_column_v2(feature_columns):
+      shared_state_manager = feature_column_v2.SharedEmbeddingStateManager()
+      linear_model = feature_column_v2.LinearModel(
+          feature_columns=feature_columns,
+          units=units,
+          sparse_combiner=sparse_combiner,
+          shared_state_manager=shared_state_manager)
+      logits = linear_model(features)
+      bias = linear_model.bias_variable
+
+      # We'd like to get all the non-bias variables associated with this
+      # LinearModel. This includes the shared embedding variables as well.
+      variables = linear_model.variables
+      variables.remove(bias)
+      variables.extend(shared_state_manager.variables)
+
+      # Expand (potential) Partitioned variables
+      bias = _get_expanded_variable_list([bias])
+      variables = _get_expanded_variable_list(variables)
+    else:
+      linear_model = feature_column._LinearModel(  # pylint: disable=protected-access
+          feature_columns=feature_columns,
+          units=units,
+          sparse_combiner=sparse_combiner,
+          name='linear_model')
+      logits = linear_model(features)
+      cols_to_vars = linear_model.cols_to_vars()
+      bias = cols_to_vars.pop('bias')
+      variables = cols_to_vars.values()
+
     if units > 1:
       summary.histogram('bias', bias)
     else:
@@ -107,7 +152,7 @@ def _linear_logit_fn_builder(units, feature_columns, sparse_combiner='sum'):
       # so we should provide a scalar summary.
       summary.scalar('bias', bias[0][0])
     summary.scalar('fraction_of_zero_weights',
-                   _compute_fraction_of_zero(cols_to_vars))
+                   _compute_fraction_of_zero(variables))
     return logits
 
   return linear_logit_fn
diff --git a/tensorflow/python/estimator/canned/linear_test.py b/tensorflow/python/estimator/canned/linear_test.py
index 59a230417d..3e6da5de22 100644
--- a/tensorflow/python/estimator/canned/linear_test.py
+++ b/tensorflow/python/estimator/canned/linear_test.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 from tensorflow.python.estimator.canned import linear
 from tensorflow.python.estimator.canned import linear_testing_utils
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_v2
 from tensorflow.python.platform import test
 
 
@@ -40,7 +42,16 @@ class LinearRegressorPartitionerTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearRegressorPartitionerTest.__init__(
-        self, _linear_regressor_fn)
+        self, _linear_regressor_fn, fc_lib=feature_column)
+
+
+class LinearRegressorPartitionerV2Test(
+    linear_testing_utils.BaseLinearRegressorPartitionerTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorPartitionerTest.__init__(
+        self, _linear_regressor_fn, fc_lib=feature_column_v2)
 
 
 class LinearRegressorEvaluationTest(
@@ -49,7 +60,16 @@ class LinearRegressorEvaluationTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
-        self, _linear_regressor_fn)
+        self, _linear_regressor_fn, fc_lib=feature_column)
+
+
+class LinearRegressorEvaluationV2Test(
+    linear_testing_utils.BaseLinearRegressorEvaluationTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
+        self, _linear_regressor_fn, fc_lib=feature_column_v2)
 
 
 class LinearRegressorPredictTest(
@@ -58,7 +78,16 @@ class LinearRegressorPredictTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
-        self, _linear_regressor_fn)
+        self, _linear_regressor_fn, fc_lib=feature_column)
+
+
+class LinearRegressorPredictV2Test(
+    linear_testing_utils.BaseLinearRegressorPredictTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
+        self, _linear_regressor_fn, fc_lib=feature_column_v2)
 
 
 class LinearRegressorIntegrationTest(
@@ -67,7 +96,16 @@ class LinearRegressorIntegrationTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearRegressorIntegrationTest.__init__(
-        self, _linear_regressor_fn)
+        self, _linear_regressor_fn, fc_lib=feature_column)
+
+
+class LinearRegressorIntegrationV2Test(
+    linear_testing_utils.BaseLinearRegressorIntegrationTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorIntegrationTest.__init__(
+        self, _linear_regressor_fn, fc_lib=feature_column_v2)
 
 
 class LinearRegressorTrainingTest(
@@ -76,19 +114,37 @@ class LinearRegressorTrainingTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
-        self, _linear_regressor_fn)
+        self, _linear_regressor_fn, fc_lib=feature_column)
 
 
-# Tests for Linear Classifier.
+class LinearRegressorTrainingV2Test(
+    linear_testing_utils.BaseLinearRegressorTrainingTest, test.TestCase):
 
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
+        self, _linear_regressor_fn, fc_lib=feature_column_v2)
 
+
+# Tests for Linear Classifier.
 class LinearClassifierTrainingTest(
     linear_testing_utils.BaseLinearClassifierTrainingTest, test.TestCase):
 
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearClassifierTrainingTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn)
+        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
+
+
+class LinearClassifierTrainingV2Test(
+    linear_testing_utils.BaseLinearClassifierTrainingTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearClassifierTrainingTest.__init__(
+        self,
+        linear_classifier_fn=_linear_classifier_fn,
+        fc_lib=feature_column_v2)
 
 
 class LinearClassifierEvaluationTest(
@@ -97,7 +153,18 @@ class LinearClassifierEvaluationTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearClassifierEvaluationTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn)
+        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
+
+
+class LinearClassifierEvaluationV2Test(
+    linear_testing_utils.BaseLinearClassifierEvaluationTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearClassifierEvaluationTest.__init__(
+        self,
+        linear_classifier_fn=_linear_classifier_fn,
+        fc_lib=feature_column_v2)
 
 
 class LinearClassifierPredictTest(
@@ -106,7 +173,18 @@ class LinearClassifierPredictTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearClassifierPredictTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn)
+        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
+
+
+class LinearClassifierPredictV2Test(
+    linear_testing_utils.BaseLinearClassifierPredictTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearClassifierPredictTest.__init__(
+        self,
+        linear_classifier_fn=_linear_classifier_fn,
+        fc_lib=feature_column_v2)
 
 
 class LinearClassifierIntegrationTest(
@@ -115,7 +193,18 @@ class LinearClassifierIntegrationTest(
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearClassifierIntegrationTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn)
+        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
+
+
+class LinearClassifierIntegrationV2Test(
+    linear_testing_utils.BaseLinearClassifierIntegrationTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearClassifierIntegrationTest.__init__(
+        self,
+        linear_classifier_fn=_linear_classifier_fn,
+        fc_lib=feature_column_v2)
 
 
 # Tests for Linear logit_fn.
@@ -124,7 +213,17 @@ class LinearLogitFnTest(linear_testing_utils.BaseLinearLogitFnTest,
 
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearLogitFnTest.__init__(self)
+    linear_testing_utils.BaseLinearLogitFnTest.__init__(
+        self, fc_lib=feature_column)
+
+
+class LinearLogitFnV2Test(linear_testing_utils.BaseLinearLogitFnTest,
+                          test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearLogitFnTest.__init__(
+        self, fc_lib=feature_column_v2)
 
 
 # Tests for warm-starting with Linear logit_fn.
@@ -134,7 +233,22 @@ class LinearWarmStartingTest(linear_testing_utils.BaseLinearWarmStartingTest,
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     test.TestCase.__init__(self, methodName)
     linear_testing_utils.BaseLinearWarmStartingTest.__init__(
-        self, _linear_classifier_fn, _linear_regressor_fn)
+        self,
+        _linear_classifier_fn,
+        _linear_regressor_fn,
+        fc_lib=feature_column)
+
+
+class LinearWarmStartingV2Test(linear_testing_utils.BaseLinearWarmStartingTest,
+                               test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearWarmStartingTest.__init__(
+        self,
+        _linear_classifier_fn,
+        _linear_regressor_fn,
+        fc_lib=feature_column_v2)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/estimator/canned/linear_testing_utils.py b/tensorflow/python/estimator/canned/linear_testing_utils.py
index 65cdd50061..827352a70b 100644
--- a/tensorflow/python/estimator/canned/linear_testing_utils.py
+++ b/tensorflow/python/estimator/canned/linear_testing_utils.py
@@ -37,7 +37,8 @@ from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.estimator.inputs import pandas_io
-from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_v2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -152,8 +153,9 @@ class CheckPartitionerVarHook(session_run_hook.SessionRunHook):
 
 class BaseLinearRegressorPartitionerTest(object):
 
-  def __init__(self, linear_regressor_fn):
+  def __init__(self, linear_regressor_fn, fc_lib=feature_column):
     self._linear_regressor_fn = linear_regressor_fn
+    self._fc_lib = fc_lib
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
@@ -173,7 +175,7 @@ class BaseLinearRegressorPartitionerTest(object):
       return [partitions, 1] if shape[0] == x_dim else [1]
 
     regressor = self._linear_regressor_fn(
-        feature_columns=(feature_column_lib.categorical_column_with_hash_bucket(
+        feature_columns=(self._fc_lib.categorical_column_with_hash_bucket(
             'language', hash_bucket_size=x_dim),),
         partitioner=_partitioner,
         model_dir=self._model_dir)
@@ -209,9 +211,8 @@ class BaseLinearRegressorPartitionerTest(object):
         '_get_replica_device_setter',
         return_value=lambda _: '/cpu:0'):
       linear_regressor = self._linear_regressor_fn(
-          feature_columns=(
-              feature_column_lib.categorical_column_with_hash_bucket(
-                  'language', hash_bucket_size=x_dim),),
+          feature_columns=(self._fc_lib.categorical_column_with_hash_bucket(
+              'language', hash_bucket_size=x_dim),),
           config=FakeRunConfig(),
           model_dir=self._model_dir)
 
@@ -232,8 +233,9 @@ class BaseLinearRegressorPartitionerTest(object):
 # TODO(b/36813849): Add tests with dynamic shape inputs using placeholders.
 class BaseLinearRegressorEvaluationTest(object):
 
-  def __init__(self, linear_regressor_fn):
+  def __init__(self, linear_regressor_fn, fc_lib=feature_column):
     self._linear_regressor_fn = linear_regressor_fn
+    self._fc_lib = fc_lib
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
@@ -252,7 +254,7 @@ class BaseLinearRegressorEvaluationTest(object):
       save_variables_to_ckpt(self._model_dir)
 
     linear_regressor = self._linear_regressor_fn(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         model_dir=self._model_dir)
     eval_metrics = linear_regressor.evaluate(
         input_fn=lambda: ({'age': ((1,),)}, ((10.,),)), steps=1)
@@ -276,7 +278,7 @@ class BaseLinearRegressorEvaluationTest(object):
       save_variables_to_ckpt(self._model_dir)
 
     linear_regressor = self._linear_regressor_fn(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         model_dir=self._model_dir)
     eval_metrics = linear_regressor.evaluate(
         input_fn=lambda: ({'age': ((1,), (1,))}, ((10.,), (10.,))), steps=1)
@@ -308,7 +310,7 @@ class BaseLinearRegressorEvaluationTest(object):
       return features, labels
 
     linear_regressor = self._linear_regressor_fn(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         weight_column='weights',
         model_dir=self._model_dir)
     eval_metrics = linear_regressor.evaluate(input_fn=_input_fn, steps=1)
@@ -336,8 +338,7 @@ class BaseLinearRegressorEvaluationTest(object):
       save_variables_to_ckpt(self._model_dir)
 
     linear_regressor = self._linear_regressor_fn(
-        feature_columns=(feature_column_lib.numeric_column(
-            'age', shape=(x_dim,)),),
+        feature_columns=(self._fc_lib.numeric_column('age', shape=(x_dim,)),),
         label_dimension=label_dim,
         model_dir=self._model_dir)
     input_fn = numpy_io.numpy_input_fn(
@@ -374,8 +375,8 @@ class BaseLinearRegressorEvaluationTest(object):
 
     batch_size = 2
     feature_columns = [
-        feature_column_lib.numeric_column('age'),
-        feature_column_lib.numeric_column('height')
+        self._fc_lib.numeric_column('age'),
+        self._fc_lib.numeric_column('height')
     ]
     input_fn = numpy_io.numpy_input_fn(
         x={'age': np.array([20, 40]),
@@ -402,8 +403,9 @@ class BaseLinearRegressorEvaluationTest(object):
 
 class BaseLinearRegressorPredictTest(object):
 
-  def __init__(self, linear_regressor_fn):
+  def __init__(self, linear_regressor_fn, fc_lib=feature_column):
     self._linear_regressor_fn = linear_regressor_fn
+    self._fc_lib = fc_lib
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
@@ -422,7 +424,7 @@ class BaseLinearRegressorPredictTest(object):
       save_variables_to_ckpt(self._model_dir)
 
     linear_regressor = self._linear_regressor_fn(
-        feature_columns=(feature_column_lib.numeric_column('x'),),
+        feature_columns=(self._fc_lib.numeric_column('x'),),
         model_dir=self._model_dir)
 
     predict_input_fn = numpy_io.numpy_input_fn(
@@ -441,7 +443,7 @@ class BaseLinearRegressorPredictTest(object):
     batch_size = 2
     label_dimension = 3
     x_dim = 4
-    feature_columns = (feature_column_lib.numeric_column('x', shape=(x_dim,)),)
+    feature_columns = (self._fc_lib.numeric_column('x', shape=(x_dim,)),)
     with ops.Graph().as_default():
       variables_lib.Variable(  # shape=[x_dim, label_dimension]
           [[1., 2., 3.], [2., 3., 4.], [3., 4., 5.], [4., 5., 6.]],
@@ -479,8 +481,8 @@ class BaseLinearRegressorPredictTest(object):
       save_variables_to_ckpt(self._model_dir)
 
     linear_regressor = self._linear_regressor_fn(
-        feature_columns=(feature_column_lib.numeric_column('x0'),
-                         feature_column_lib.numeric_column('x1')),
+        feature_columns=(self._fc_lib.numeric_column('x0'),
+                         self._fc_lib.numeric_column('x1')),
         model_dir=self._model_dir)
 
     predict_input_fn = numpy_io.numpy_input_fn(
@@ -515,9 +517,8 @@ class BaseLinearRegressorPredictTest(object):
               dense_shape=[2, 2]),
       })
 
-    feature_columns = (
-        feature_column_lib.categorical_column_with_vocabulary_list(
-            'language', vocabulary_list=['a', 'b', 'c']),)
+    feature_columns = (self._fc_lib.categorical_column_with_vocabulary_list(
+        'language', vocabulary_list=['a', 'b', 'c']),)
 
     # Check prediction for each sparse_combiner.
     # With sparse_combiner = 'sum', we have
@@ -561,8 +562,9 @@ class BaseLinearRegressorPredictTest(object):
 
 class BaseLinearRegressorIntegrationTest(object):
 
-  def __init__(self, linear_regressor_fn):
+  def __init__(self, linear_regressor_fn, fc_lib=feature_column):
     self._linear_regressor_fn = linear_regressor_fn
+    self._fc_lib = fc_lib
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
@@ -575,7 +577,7 @@ class BaseLinearRegressorIntegrationTest(object):
   def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
                           input_dimension, label_dimension, prediction_length):
     feature_columns = [
-        feature_column_lib.numeric_column('x', shape=(input_dimension,))
+        self._fc_lib.numeric_column('x', shape=(input_dimension,))
     ]
     est = self._linear_regressor_fn(
         feature_columns=feature_columns,
@@ -597,7 +599,7 @@ class BaseLinearRegressorIntegrationTest(object):
     self.assertAllEqual((prediction_length, label_dimension), predictions.shape)
 
     # EXPORT
-    feature_spec = feature_column_lib.make_parse_example_spec(feature_columns)
+    feature_spec = self._fc_lib.make_parse_example_spec(feature_columns)
     serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
         feature_spec)
     export_dir = est.export_savedmodel(tempfile.mkdtemp(),
@@ -729,8 +731,9 @@ class BaseLinearRegressorIntegrationTest(object):
 
 class BaseLinearRegressorTrainingTest(object):
 
-  def __init__(self, linear_regressor_fn):
+  def __init__(self, linear_regressor_fn, fc_lib=feature_column):
     self._linear_regressor_fn = linear_regressor_fn
+    self._fc_lib = fc_lib
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
@@ -808,7 +811,7 @@ class BaseLinearRegressorTrainingTest(object):
     label = 5.
     age = 17
     linear_regressor = self._linear_regressor_fn(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         model_dir=self._model_dir)
 
     # Train for a few steps, and validate final checkpoint.
@@ -820,7 +823,7 @@ class BaseLinearRegressorTrainingTest(object):
   def testTrainWithOneDimLabel(self):
     label_dimension = 1
     batch_size = 20
-    feature_columns = [feature_column_lib.numeric_column('age', shape=(1,))]
+    feature_columns = [self._fc_lib.numeric_column('age', shape=(1,))]
     est = self._linear_regressor_fn(
         feature_columns=feature_columns,
         label_dimension=label_dimension,
@@ -840,7 +843,7 @@ class BaseLinearRegressorTrainingTest(object):
   def testTrainWithOneDimWeight(self):
     label_dimension = 1
     batch_size = 20
-    feature_columns = [feature_column_lib.numeric_column('age', shape=(1,))]
+    feature_columns = [self._fc_lib.numeric_column('age', shape=(1,))]
     est = self._linear_regressor_fn(
         feature_columns=feature_columns,
         label_dimension=label_dimension,
@@ -867,7 +870,7 @@ class BaseLinearRegressorTrainingTest(object):
     # loss = (logits - label)^2 = (0 - 5.)^2 = 25.
     mock_optimizer = self._mock_optimizer(expected_loss=25.)
     linear_regressor = self._linear_regressor_fn(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         model_dir=self._model_dir,
         optimizer=mock_optimizer)
     self.assertEqual(0, mock_optimizer.minimize.call_count)
@@ -900,7 +903,7 @@ class BaseLinearRegressorTrainingTest(object):
     # loss = (logits - label)^2 = (175 - 5)^2 = 28900
     mock_optimizer = self._mock_optimizer(expected_loss=28900.)
     linear_regressor = self._linear_regressor_fn(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         model_dir=self._model_dir,
         optimizer=mock_optimizer)
     self.assertEqual(0, mock_optimizer.minimize.call_count)
@@ -935,7 +938,7 @@ class BaseLinearRegressorTrainingTest(object):
     # loss = sum(logits - label)^2 = (175 - 5)^2 + (155 - 3)^2 = 52004
     mock_optimizer = self._mock_optimizer(expected_loss=52004.)
     linear_regressor = self._linear_regressor_fn(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         model_dir=self._model_dir,
         optimizer=mock_optimizer)
     self.assertEqual(0, mock_optimizer.minimize.call_count)
@@ -954,8 +957,9 @@ class BaseLinearRegressorTrainingTest(object):
 
 class BaseLinearClassifierTrainingTest(object):
 
-  def __init__(self, linear_classifier_fn):
+  def __init__(self, linear_classifier_fn, fc_lib=feature_column):
     self._linear_classifier_fn = linear_classifier_fn
+    self._fc_lib = fc_lib
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
@@ -1031,7 +1035,7 @@ class BaseLinearClassifierTrainingTest(object):
     label = 0
     age = 17
     est = linear.LinearClassifier(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         n_classes=n_classes,
         model_dir=self._model_dir)
 
@@ -1051,7 +1055,7 @@ class BaseLinearClassifierTrainingTest(object):
     batch_size = 20
 
     est = linear.LinearClassifier(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         n_classes=n_classes,
         model_dir=self._model_dir)
     data_rank_1 = np.array([0, 1])
@@ -1078,7 +1082,7 @@ class BaseLinearClassifierTrainingTest(object):
     batch_size = 20
 
     est = linear.LinearClassifier(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         n_classes=n_classes,
         model_dir=self._model_dir)
     data_rank_1 = np.array([0, 1])
@@ -1103,7 +1107,7 @@ class BaseLinearClassifierTrainingTest(object):
     batch_size = 20
 
     est = linear.LinearClassifier(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         weight_column='w',
         n_classes=n_classes,
         model_dir=self._model_dir)
@@ -1129,7 +1133,7 @@ class BaseLinearClassifierTrainingTest(object):
     batch_size = 20
 
     est = linear.LinearClassifier(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         weight_column='w',
         n_classes=n_classes,
         model_dir=self._model_dir)
@@ -1166,7 +1170,7 @@ class BaseLinearClassifierTrainingTest(object):
         expected_loss=-1 * math.log(1.0/n_classes))
 
     est = linear.LinearClassifier(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         n_classes=n_classes,
         optimizer=mock_optimizer,
         model_dir=self._model_dir)
@@ -1229,7 +1233,7 @@ class BaseLinearClassifierTrainingTest(object):
     mock_optimizer = self._mock_optimizer(expected_loss=expected_loss)
 
     est = linear.LinearClassifier(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         n_classes=n_classes,
         optimizer=mock_optimizer,
         model_dir=self._model_dir)
@@ -1277,7 +1281,7 @@ class BaseLinearClassifierTrainingTest(object):
     mock_optimizer = self._mock_optimizer(expected_loss=1.1132617)
 
     est = linear.LinearClassifier(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         n_classes=n_classes,
         optimizer=mock_optimizer,
         model_dir=self._model_dir)
@@ -1341,7 +1345,7 @@ class BaseLinearClassifierTrainingTest(object):
     mock_optimizer = self._mock_optimizer(expected_loss=expected_loss)
 
     est = linear.LinearClassifier(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         n_classes=n_classes,
         optimizer=mock_optimizer,
         model_dir=self._model_dir)
@@ -1368,8 +1372,9 @@ class BaseLinearClassifierTrainingTest(object):
 
 class BaseLinearClassifierEvaluationTest(object):
 
-  def __init__(self, linear_classifier_fn):
+  def __init__(self, linear_classifier_fn, fc_lib=feature_column):
     self._linear_classifier_fn = linear_classifier_fn
+    self._fc_lib = fc_lib
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
@@ -1398,7 +1403,7 @@ class BaseLinearClassifierEvaluationTest(object):
       save_variables_to_ckpt(self._model_dir)
 
     est = self._linear_classifier_fn(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         n_classes=n_classes,
         model_dir=self._model_dir)
     eval_metrics = est.evaluate(
@@ -1464,7 +1469,7 @@ class BaseLinearClassifierEvaluationTest(object):
       save_variables_to_ckpt(self._model_dir)
 
     est = self._linear_classifier_fn(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         n_classes=n_classes,
         model_dir=self._model_dir)
     eval_metrics = est.evaluate(
@@ -1540,7 +1545,7 @@ class BaseLinearClassifierEvaluationTest(object):
       save_variables_to_ckpt(self._model_dir)
 
     est = self._linear_classifier_fn(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         n_classes=n_classes,
         weight_column='w',
         model_dir=self._model_dir)
@@ -1605,8 +1610,9 @@ class BaseLinearClassifierEvaluationTest(object):
 
 class BaseLinearClassifierPredictTest(object):
 
-  def __init__(self, linear_classifier_fn):
+  def __init__(self, linear_classifier_fn, fc_lib=feature_column):
     self._linear_classifier_fn = linear_classifier_fn
+    self._fc_lib = fc_lib
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
@@ -1634,7 +1640,7 @@ class BaseLinearClassifierPredictTest(object):
       save_variables_to_ckpt(self._model_dir)
 
     est = self._linear_classifier_fn(
-        feature_columns=(feature_column_lib.numeric_column('age'),),
+        feature_columns=(self._fc_lib.numeric_column('age'),),
         label_vocabulary=label_vocabulary,
         n_classes=n_classes,
         model_dir=self._model_dir)
@@ -1730,9 +1736,8 @@ class BaseLinearClassifierPredictTest(object):
               dense_shape=[2, 2]),
       })
 
-    feature_columns = (
-        feature_column_lib.categorical_column_with_vocabulary_list(
-            'language', vocabulary_list=['a', 'b', 'c']),)
+    feature_columns = (self._fc_lib.categorical_column_with_vocabulary_list(
+        'language', vocabulary_list=['a', 'b', 'c']),)
 
     # Check prediction for each sparse_combiner.
     # With sparse_combiner = 'sum', we have
@@ -1776,8 +1781,9 @@ class BaseLinearClassifierPredictTest(object):
 
 class BaseLinearClassifierIntegrationTest(object):
 
-  def __init__(self, linear_classifier_fn):
+  def __init__(self, linear_classifier_fn, fc_lib=feature_column):
     self._linear_classifier_fn = linear_classifier_fn
+    self._fc_lib = fc_lib
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
@@ -1789,7 +1795,7 @@ class BaseLinearClassifierIntegrationTest(object):
   def _test_complete_flow(self, n_classes, train_input_fn, eval_input_fn,
                           predict_input_fn, input_dimension, prediction_length):
     feature_columns = [
-        feature_column_lib.numeric_column('x', shape=(input_dimension,))
+        self._fc_lib.numeric_column('x', shape=(input_dimension,))
     ]
     est = self._linear_classifier_fn(
         feature_columns=feature_columns,
@@ -1811,7 +1817,7 @@ class BaseLinearClassifierIntegrationTest(object):
     self.assertAllEqual((prediction_length, 1), predictions.shape)
 
     # EXPORT
-    feature_spec = feature_column_lib.make_parse_example_spec(feature_columns)
+    feature_spec = self._fc_lib.make_parse_example_spec(feature_columns)
     serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
         feature_spec)
     export_dir = est.export_savedmodel(tempfile.mkdtemp(),
@@ -1961,9 +1967,12 @@ class BaseLinearClassifierIntegrationTest(object):
 
 class BaseLinearLogitFnTest(object):
 
+  def __init__(self, fc_lib=feature_column):
+    self._fc_lib = fc_lib
+
   def test_basic_logit_correctness(self):
     """linear_logit_fn simply wraps feature_column_lib.linear_model."""
-    age = feature_column_lib.numeric_column('age')
+    age = self._fc_lib.numeric_column('age')
     with ops.Graph().as_default():
       logit_fn = linear._linear_logit_fn_builder(units=2, feature_columns=[age])
       logits = logit_fn(features={'age': [[23.], [31.]]})
@@ -1983,12 +1992,14 @@ class BaseLinearLogitFnTest(object):
 
   def test_compute_fraction_of_zero(self):
     """Tests the calculation of sparsity."""
-    age = feature_column_lib.numeric_column('age')
-    occupation = feature_column_lib.categorical_column_with_hash_bucket(
+    if self._fc_lib != feature_column:
+      return
+    age = feature_column.numeric_column('age')
+    occupation = feature_column.categorical_column_with_hash_bucket(
         'occupation', hash_bucket_size=5)
     with ops.Graph().as_default():
       cols_to_vars = {}
-      feature_column_lib.linear_model(
+      feature_column.linear_model(
           features={
               'age': [[23.], [31.]],
               'occupation': [['doctor'], ['engineer']]
@@ -1997,7 +2008,42 @@ class BaseLinearLogitFnTest(object):
           units=3,
           cols_to_vars=cols_to_vars)
       cols_to_vars.pop('bias')
-      fraction_zero = linear._compute_fraction_of_zero(cols_to_vars)
+      fraction_zero = linear._compute_fraction_of_zero(cols_to_vars.values())
+      age_var = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
+                                   'linear_model/age')[0]
+      with tf_session.Session() as sess:
+        sess.run([variables_lib.global_variables_initializer()])
+        # Upon initialization, all variables will be zero.
+        self.assertAllClose(1, fraction_zero.eval())
+
+        sess.run(age_var.assign([[2.0, 0.0, -1.0]]))
+        # 1 of the 3 age weights are zero, and all of the 15 (5 hash buckets
+        # x 3-dim output) are zero.
+        self.assertAllClose(16. / 18., fraction_zero.eval())
+
+  def test_compute_fraction_of_zero_v2(self):
+    """Tests the calculation of sparsity."""
+    if self._fc_lib != feature_column_v2:
+      return
+
+    age = feature_column_v2.numeric_column('age')
+    occupation = feature_column_v2.categorical_column_with_hash_bucket(
+        'occupation', hash_bucket_size=5)
+    shared_state_manager = feature_column_v2.SharedEmbeddingStateManager()
+    with ops.Graph().as_default():
+      model = feature_column_v2.LinearModel(
+          feature_columns=[age, occupation],
+          units=3,
+          shared_state_manager=shared_state_manager)
+      features = {
+          'age': [[23.], [31.]],
+          'occupation': [['doctor'], ['engineer']]
+      }
+      model(features)
+      variables = model.variables
+      variables.remove(model.bias_variable)
+      variables.extend(shared_state_manager.variables)
+      fraction_zero = linear._compute_fraction_of_zero(variables)
       age_var = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
                                    'linear_model/age')[0]
       with tf_session.Session() as sess:
@@ -2013,9 +2059,13 @@ class BaseLinearLogitFnTest(object):
 
 class BaseLinearWarmStartingTest(object):
 
-  def __init__(self, _linear_classifier_fn, _linear_regressor_fn):
+  def __init__(self,
+               _linear_classifier_fn,
+               _linear_regressor_fn,
+               fc_lib=feature_column):
     self._linear_classifier_fn = _linear_classifier_fn
     self._linear_regressor_fn = _linear_regressor_fn
+    self._fc_lib = fc_lib
 
   def setUp(self):
     # Create a directory to save our old checkpoint and vocabularies to.
@@ -2039,7 +2089,7 @@ class BaseLinearWarmStartingTest(object):
 
   def test_classifier_basic_warm_starting(self):
     """Tests correctness of LinearClassifier default warm-start."""
-    age = feature_column_lib.numeric_column('age')
+    age = self._fc_lib.numeric_column('age')
 
     # Create a LinearClassifier and train to save a checkpoint.
     linear_classifier = self._linear_classifier_fn(
@@ -2066,7 +2116,7 @@ class BaseLinearWarmStartingTest(object):
 
   def test_regressor_basic_warm_starting(self):
     """Tests correctness of LinearRegressor default warm-start."""
-    age = feature_column_lib.numeric_column('age')
+    age = self._fc_lib.numeric_column('age')
 
     # Create a LinearRegressor and train to save a checkpoint.
     linear_regressor = self._linear_regressor_fn(
@@ -2091,7 +2141,7 @@ class BaseLinearWarmStartingTest(object):
 
   def test_warm_starting_selective_variables(self):
     """Tests selecting variables to warm-start."""
-    age = feature_column_lib.numeric_column('age')
+    age = self._fc_lib.numeric_column('age')
 
     # Create a LinearClassifier and train to save a checkpoint.
     linear_classifier = self._linear_classifier_fn(
@@ -2128,7 +2178,7 @@ class BaseLinearWarmStartingTest(object):
     vocab_file = os.path.join(self._ckpt_and_vocab_dir, 'occupation_vocab')
     with open(vocab_file, 'w') as f:
       f.write('\n'.join(vocab_list))
-    occupation = feature_column_lib.categorical_column_with_vocabulary_file(
+    occupation = self._fc_lib.categorical_column_with_vocabulary_file(
         'occupation',
         vocabulary_file=vocab_file,
         vocabulary_size=len(vocab_list))
@@ -2152,7 +2202,7 @@ class BaseLinearWarmStartingTest(object):
                                   'new_occupation_vocab')
     with open(new_vocab_file, 'w') as f:
       f.write('\n'.join(new_vocab_list))
-    new_occupation = feature_column_lib.categorical_column_with_vocabulary_file(
+    new_occupation = self._fc_lib.categorical_column_with_vocabulary_file(
         'occupation',
         vocabulary_file=new_vocab_file,
         vocabulary_size=len(new_vocab_list))
@@ -2205,7 +2255,7 @@ class BaseLinearWarmStartingTest(object):
 
   def test_warm_starting_with_naming_change(self):
     """Tests warm-starting with a Tensor name remapping."""
-    age_in_years = feature_column_lib.numeric_column('age_in_years')
+    age_in_years = self._fc_lib.numeric_column('age_in_years')
 
     # Create a LinearClassifier and train to save a checkpoint.
     linear_classifier = self._linear_classifier_fn(
@@ -2219,7 +2269,7 @@ class BaseLinearWarmStartingTest(object):
     # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
     # accumulator values that change).
     warm_started_linear_classifier = self._linear_classifier_fn(
-        feature_columns=[feature_column_lib.numeric_column('age')],
+        feature_columns=[self._fc_lib.numeric_column('age')],
         n_classes=4,
         optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
         # The 'age' variable correspond to the 'age_in_years' variable in the
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index 5800b693b4..ac53a84eef 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -156,7 +156,7 @@ py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:numpy_io",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index a8d5bfb437..b79373c475 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -271,6 +271,7 @@ class _StateManagerImpl(StateManager):
         dtype=dtype,
         initializer=initializer,
         trainable=self._trainable and trainable,
+        use_resource=True,
         # TODO(rohanj): Get rid of this hack once we have a mechanism for
         # specifying a default partitioner for an entire layer. In that case,
         # the default getter for Layers should work.
@@ -383,8 +384,8 @@ class FeatureLayer(Layer):
       if isinstance(column, SharedEmbeddingColumn):
         column.create_state(self._shared_state_manager)
       else:
-        with variable_scope.variable_scope(None, default_name=self.name):
-          with variable_scope.variable_scope(None, default_name=column.name):
+        with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
+          with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
             column.create_state(self._state_manager)
       super(FeatureLayer, self).build(None)
 
@@ -414,19 +415,20 @@ class FeatureLayer(Layer):
     output_tensors = []
     ordered_columns = []
     for column in sorted(self._feature_columns, key=lambda x: x.name):
-      ordered_columns.append(column)
-      if isinstance(column, SharedEmbeddingColumn):
-        tensor = column.get_dense_tensor(transformation_cache,
-                                         self._shared_state_manager)
-      else:
-        tensor = column.get_dense_tensor(transformation_cache,
-                                         self._state_manager)
-      num_elements = column.variable_shape.num_elements()
-      batch_size = array_ops.shape(tensor)[0]
-      tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
-      output_tensors.append(tensor)
-      if cols_to_output_tensors is not None:
-        cols_to_output_tensors[column] = tensor
+      with ops.name_scope(column.name):
+        ordered_columns.append(column)
+        if isinstance(column, SharedEmbeddingColumn):
+          tensor = column.get_dense_tensor(transformation_cache,
+                                           self._shared_state_manager)
+        else:
+          tensor = column.get_dense_tensor(transformation_cache,
+                                           self._state_manager)
+        num_elements = column.variable_shape.num_elements()
+        batch_size = array_ops.shape(tensor)[0]
+        tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
+        output_tensors.append(tensor)
+        if cols_to_output_tensors is not None:
+          cols_to_output_tensors[column] = tensor
 
     _verify_static_batch_size_equality(output_tensors, ordered_columns)
     return array_ops.concat(output_tensors, 1)
@@ -601,6 +603,7 @@ class LinearModel(Layer):
           shape=[self._units],
           initializer=init_ops.zeros_initializer(),
           trainable=self.trainable,
+          use_resource=True,
           # TODO(rohanj): Get rid of this hack once we have a mechanism for
           # specifying a default partitioner for an entire layer. In that case,
           # the default getter for Layers should work.
@@ -627,36 +630,41 @@ class LinearModel(Layer):
     if not isinstance(features, dict):
       raise ValueError('We expected a dictionary here. Instead we got: ',
                        features)
-    transformation_cache = FeatureTransformationCache(features)
-    weighted_sums = []
-    for column in self._feature_columns:
-      with ops.name_scope(column.name):
-        # All the weights used in the linear model are owned by the state
-        # manager associated with this Linear Model.
-        weight_var = self._state_manager.get_variable(column, 'weights')
-
-        # The embedding weights for the SharedEmbeddingColumn are owned by
-        # the shared_state_manager and so we need to pass that in while
-        # creating the weighted sum. For all other columns, the state is owned
-        # by the Linear Model's state manager.
-        if isinstance(column, SharedEmbeddingColumn):
-          state_manager = self._shared_state_manager
-        else:
-          state_manager = self._state_manager
-        weighted_sum = _create_weighted_sum(
-            column=column,
-            transformation_cache=transformation_cache,
-            state_manager=state_manager,
-            sparse_combiner=self._sparse_combiner,
-            weight_var=weight_var)
-        weighted_sums.append(weighted_sum)
-
-    _verify_static_batch_size_equality(weighted_sums, self._feature_columns)
-    predictions_no_bias = math_ops.add_n(
-        weighted_sums, name='weighted_sum_no_bias')
-    predictions = nn_ops.bias_add(
-        predictions_no_bias, self._bias_variable, name='weighted_sum')
-    return predictions
+    with ops.name_scope(self.name):
+      transformation_cache = FeatureTransformationCache(features)
+      weighted_sums = []
+      for column in self._feature_columns:
+        with ops.name_scope(column.name):
+          # All the weights used in the linear model are owned by the state
+          # manager associated with this Linear Model.
+          weight_var = self._state_manager.get_variable(column, 'weights')
+
+          # The embedding weights for the SharedEmbeddingColumn are owned by
+          # the shared_state_manager and so we need to pass that in while
+          # creating the weighted sum. For all other columns, the state is owned
+          # by the Linear Model's state manager.
+          if isinstance(column, SharedEmbeddingColumn):
+            state_manager = self._shared_state_manager
+          else:
+            state_manager = self._state_manager
+          weighted_sum = _create_weighted_sum(
+              column=column,
+              transformation_cache=transformation_cache,
+              state_manager=state_manager,
+              sparse_combiner=self._sparse_combiner,
+              weight_var=weight_var)
+          weighted_sums.append(weighted_sum)
+
+      _verify_static_batch_size_equality(weighted_sums, self._feature_columns)
+      predictions_no_bias = math_ops.add_n(
+          weighted_sums, name='weighted_sum_no_bias')
+      predictions = nn_ops.bias_add(
+          predictions_no_bias, self._bias_variable, name='weighted_sum')
+      return predictions
+
+  @property
+  def bias_variable(self):
+    return self._bias_variable
 
 
 def _transform_features(features, feature_columns, state_manager):
@@ -2605,6 +2613,7 @@ class SharedEmbeddingStateManager(Layer):
           dtype=dtype,
           trainable=self.trainable and trainable,
           initializer=initializer,
+          use_resource=True,
           # TODO(rohanj): Get rid of this hack once we have a mechanism for
           # specifying a default partitioner for an entire layer. In that case,
           # the default getter for Layers should work.
@@ -3279,6 +3288,7 @@ def _safe_embedding_lookup_sparse(embedding_weights,
     raise ValueError('Missing embedding_weights %s.' % embedding_weights)
 
   dtype = sparse_weights.dtype if sparse_weights is not None else None
+  # TODO(rohanj): Look into removing this convert_to_tensor call.
   embedding_weights = [
       ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
   ]
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index a13a5010e1..d3787146ed 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -5170,8 +5170,8 @@ class WeightedCategoricalColumnTest(test.TestCase):
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(
-          ValueError, r'Dimensions.*are not compatible'):
+      with self.assertRaisesRegexp(ValueError,
+                                   r'Dimensions.*are not compatible'):
         model = fc.LinearModel((column,))
         model({
             'ids':
-- 
GitLab


From 2f559f2d5f75cf80183ae0d855110809404019f7 Mon Sep 17 00:00:00 2001
From: Tong Shen <endlessroad@google.com>
Date: Fri, 28 Sep 2018 15:14:43 -0700
Subject: [PATCH 166/570] Handle noinline gradient function in control flow
 functionalization.

PiperOrigin-RevId: 215003704
---
 .../tf2xla/functionalize_control_flow.cc      | 84 +++++++++++--------
 tensorflow/compiler/tf2xla/tf2xla_util.cc     | 30 +++++--
 tensorflow/compiler/tf2xla/tf2xla_util.h      | 51 +++++++----
 tensorflow/core/framework/function.cc         |  8 ++
 tensorflow/core/framework/function.h          |  5 ++
 5 files changed, 121 insertions(+), 57 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 2d45507796..36c6f5d316 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -92,13 +92,51 @@ Status FunctionalizeControlFlowForFunction(
   });
   const FunctionBody* body = flr->GetFunctionBody(handle);
 
+  // Call graph optimizer. The most important optimization we need is constant
+  // folding, which will replace ops like Shape/BroadcastGradientArgs with
+  // constant shape input. Without this optimization, those ops might become
+  // dynamic input for then/else body function and XLA will complain that input
+  // is not compile time constant. We enable function inlining as well, because
+  // otherwise we won't be able to infer shape for any node depending on
+  // function call nodes.
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile(
+        absl::StrCat("functionalize_control_flow_before_opt_", func_name),
+        *body->graph, fld);
+  }
+  // Optimizer accepts std::unique_ptr<Graph>* as input and might change
+  // underlying pointer, thus we create a new Graph and copy from body->graph.
+  std::unique_ptr<Graph> optimized_graph(new Graph(fld));
+  CopyGraph(*body->graph, optimized_graph.get());
+  OptimizerOptions opts;
+  opts.set_opt_level(OptimizerOptions::L0);
+  opts.set_do_function_inlining(true);
+  opts.set_do_constant_folding(true);
+  GraphOptimizer optimizer(opts);
+  auto cf_consider_fn = [](const Node* n) {
+    // Skip SymbolicGradient op when doing constant folding.
+    // Enabling SymbolicGradient op in constant folding requires
+    // flr->device() to be non-null, and here we have not constructed
+    // proper Device object yet (it will be constructed in XlaCompiler).
+    return n->type_string() != FunctionLibraryDefinition::kGradientOp;
+  };
+  optimizer.Optimize(flr, flr->env(),
+                     /*device=*/nullptr, &optimized_graph,
+                     /*shape_map=*/nullptr, /*cse_consider_fn=*/nullptr,
+                     cf_consider_fn);
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile(
+        absl::StrCat("functionalize_control_flow_after_opt_", func_name),
+        *optimized_graph, fld);
+  }
+
   // If any node has associated functions, functionalize them first.
   // Gather nodes with associated functions first, because rewriting those nodes
   // might involve node deletion/addition. Avoid modifying nodes while iterating
   // it.
   std::vector<std::pair<Node*, std::vector<AssociatedFunctionInfo>>>
       nodes_to_associated_functions;
-  for (auto* n : body->graph->nodes()) {
+  for (auto* n : optimized_graph->nodes()) {
     auto associated_functions = GetAssociatedFunctions(*n, flr);
     if (!associated_functions.empty()) {
       nodes_to_associated_functions.push_back({n, associated_functions});
@@ -118,7 +156,14 @@ Status FunctionalizeControlFlowForFunction(
         // but still rewrite the node.
         new_name = iter->second;
       } else {
-        new_name = fld->UniqueFunctionName(absl::StrCat(name, "_f15n_"));
+        if (associated_function.type() ==
+            AssociatedFunctionInfo::AssociatedFunctionType::kSymbolicGradient) {
+          // For SymbolicGradient, `name` is always "SymbolicGradient",
+          // which is not very informative. Use node name instead.
+          new_name = fld->UniqueFunctionName(absl::StrCat(n->name(), "_f15n_"));
+        } else {
+          new_name = fld->UniqueFunctionName(absl::StrCat(name, "_f15n_"));
+        }
         TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction(
             name, new_name, associated_function.attrs(), fld, flr,
             canonicalized_name_to_new_name));
@@ -129,43 +174,10 @@ Status FunctionalizeControlFlowForFunction(
       // That's fine because in that case, associated_functions will only have
       // one member and the loop will only run once.
       TF_RETURN_IF_ERROR(RewriteAssociatedFunction(
-          body->graph, n, fld, associated_function, new_name));
+          optimized_graph.get(), n, fld, associated_function, new_name));
     }
   }
 
-  // Call graph optimizer. The most important optimization we need is constant
-  // folding, which will replace ops like Shape/BroadcastGradientArgs with
-  // constant shape input. Without this optimization, those ops might become
-  // dynamic input for then/else body function and XLA will complain that input
-  // is not compile time constant. We enable function inlining as well, because
-  // otherwise we won't be able to infer shape for any node depending on
-  // function call nodes.
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("functionalize_control_flow_before_opt_", func_name),
-        *body->graph, fld);
-  }
-  // Optimizer accepts std::unique_ptr<Graph>* as input and might change
-  // underlying pointer, thus we create a new Graph and copy from body->graph.
-  std::unique_ptr<Graph> optimized_graph(new Graph(fld));
-  CopyGraph(*body->graph, optimized_graph.get());
-  OptimizerOptions opts;
-  opts.set_opt_level(OptimizerOptions::L0);
-  opts.set_do_function_inlining(true);
-  opts.set_do_constant_folding(true);
-  GraphOptimizer optimizer(opts);
-  auto cf_consider_fn = [](const Node* n) {
-    // Skip SymbolicGradient op when doing constant folding.
-    // Enabling SymbolicGradient op in constant folding requires
-    // flr->device() to be non-null, and here we have not constructed
-    // proper Device object yet (it will be constructed in XlaCompiler).
-    return n->type_string() != FunctionLibraryDefinition::kGradientOp;
-  };
-  optimizer.Optimize(flr, flr->env(),
-                     /*device=*/nullptr, &optimized_graph,
-                     /*shape_map=*/nullptr, /*cse_consider_fn=*/nullptr,
-                     cf_consider_fn);
-
   // Functionalize the function body.
   if (VLOG_IS_ON(4)) {
     dump_graph::DumpGraphToFile(
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index d6f42bac86..01dd3ba10f 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -336,9 +336,9 @@ bool HasAssociatedFunction(const NodeDef& node_def,
   }
 
   if (node_def.op() == FunctionLibraryDefinition::kGradientOp) {
-    // Skip gradient op. Gradient op has "f" attr, which is set to the function
-    // we are getting gradient for. That function is not associated with the op.
-    return false;
+    // Gradient op has "f" attr, which is set to the function we are getting
+    // gradient for. We need to functionalize the gradient function.
+    return true;
   }
 
   for (const auto& iter : node_def.attr()) {
@@ -357,17 +357,18 @@ std::vector<AssociatedFunctionInfo> GetAssociatedFunctions(
   if (flr->GetFunctionLibraryDefinition()->Contains(op)) {
     // This is a function call node.
     AttrValueMap attrs(node.attrs().begin(), node.attrs().end());
-    results.emplace_back(AssociatedFunctionInfo(op, attrs));
+    results.emplace_back(AssociatedFunctionInfo::FunctionCall(op, attrs));
   } else if (node.type_string() == FunctionLibraryDefinition::kGradientOp) {
-    // Skip gradient op. Gradient op has "f" attr, which is set to the function
-    // we are getting gradient for. That function is not associated with the op.
+    // This is a SymbolicGradient op.
+    AttrValueMap attrs(node.attrs().begin(), node.attrs().end());
+    results.emplace_back(AssociatedFunctionInfo::SymbolicGradient(op, attrs));
   } else {
     // Collect all function attrs for the node.
     for (auto& iter : node.attrs()) {
       if (iter.second.has_func()) {
         VLOG(2) << "Found function attr for node " << node.name() << ": "
                 << iter.first << " = " << iter.second.func().name();
-        results.emplace_back(AssociatedFunctionInfo(
+        results.emplace_back(AssociatedFunctionInfo::FunctionAttr(
             iter.second.func().name(), iter.second.func().attr(), iter.first));
       }
     }
@@ -410,6 +411,21 @@ Status RewriteAssociatedFunction(
       graph->RemoveNode(node);
       break;
     }
+    case AssociatedFunctionInfo::kSymbolicGradient: {
+      NameAttrList func;
+      TF_RETURN_IF_ERROR(GetNodeAttr(
+          node->attrs(), FunctionLibraryDefinition::kFuncAttr, &func));
+      GradientDef gradient_def;
+      gradient_def.set_function_name(func.name());
+      gradient_def.set_gradient_func(rewritten_function_name);
+      string original_grad_func = fld->FindGradient(func.name());
+      if (original_grad_func.empty()) {
+        TF_RETURN_IF_ERROR(fld->AddGradientDef(gradient_def));
+      } else if (original_grad_func != rewritten_function_name) {
+        TF_RETURN_IF_ERROR(fld->ReplaceGradient(gradient_def));
+      }
+      break;
+    }
     case AssociatedFunctionInfo::kFunctionAttr: {
       // Change function attr to rewritten functions.
       NameAttrList func;
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h
index 6065d0bb9a..53eab8b63e 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.h
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.h
@@ -65,21 +65,33 @@ uint32 GetXLARandomSeed();
 class AssociatedFunctionInfo {
  public:
   enum AssociatedFunctionType {
-    kFunctionCallNode = 0,
-    kFunctionAttr = 1,
+    kFunctionAttr = 0,
+    kFunctionCallNode = 1,
+    kSymbolicGradient = 2,
   };
 
-  // The node is a function call.
-  AssociatedFunctionInfo(const string& func_name, const AttrValueMap& attrs)
-      : type_(kFunctionCallNode), func_name_(func_name), attrs_(attrs) {}
-
   // The function is an attr of the node.
-  AssociatedFunctionInfo(const string& func_name, const AttrValueMap& attrs,
-                         const string& attr_name)
-      : type_(kFunctionAttr),
-        func_name_(func_name),
-        attrs_(attrs),
-        attr_name_(attr_name) {}
+  static AssociatedFunctionInfo FunctionAttr(const string& func_name,
+                                             const AttrValueMap& attrs,
+                                             const string& attr_name) {
+    return AssociatedFunctionInfo(kFunctionAttr, func_name, attrs, attr_name);
+  }
+
+  // The node is a function call.
+  static AssociatedFunctionInfo FunctionCall(const string& func_name,
+                                             const AttrValueMap& attrs) {
+    // attr_name will not be used in this case.
+    return AssociatedFunctionInfo(kFunctionCallNode, func_name, attrs,
+                                  /*attr_name=*/"");
+  }
+
+  // The node is a SymbolicGradient op.
+  static AssociatedFunctionInfo SymbolicGradient(const string& func_name,
+                                                 const AttrValueMap& attrs) {
+    // attr_name will not be used in this case.
+    return AssociatedFunctionInfo(kSymbolicGradient, func_name, attrs,
+                                  /*attr_name=*/"");
+  }
 
   AssociatedFunctionType type() const { return type_; }
 
@@ -90,6 +102,13 @@ class AssociatedFunctionInfo {
   const AttrValueMap& attrs() const { return attrs_; }
 
  private:
+  AssociatedFunctionInfo(AssociatedFunctionType type, const string& func_name,
+                         const AttrValueMap& attrs, const string& attr_name)
+      : type_(type),
+        func_name_(func_name),
+        attrs_(attrs),
+        attr_name_(attr_name) {}
+
   // Available for all instances.
   AssociatedFunctionType type_;
   string func_name_;
@@ -105,14 +124,18 @@ bool HasAssociatedFunction(const NodeDef& node_def,
 
 // Gets functions associated with the node. Current cases:
 // 1. For function call node, its function name;
-// 2. For nodes like XlaWhile/XlaIf, all their function attributes.
+// 2. For SymbolicGradient op, returned func_name will be "SymbolicGradient",
+//    and returned attrs will be this node's attributes;
+// 3. For nodes like XlaWhile/XlaIf, all their function attributes.
 std::vector<AssociatedFunctionInfo> GetAssociatedFunctions(
     const Node& node, FunctionLibraryRuntime* flr);
 
 // Changes associated functions for the node. Current cases:
 // 1. For function call node, creates a new node with the new function name and
 //    remove the old node;
-// 2. For nodes like XlaWhile/XlaIf, modify their function attributes.
+// 2. For SymbolicGradient op, add or replace GradientDef in
+//    FunctionLibraryDefinition;
+// 3. For nodes like XlaWhile/XlaIf, modify their function attributes.
 Status RewriteAssociatedFunction(
     Graph* graph, Node* node, FunctionLibraryDefinition* fld,
     const AssociatedFunctionInfo& associated_function,
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index a17959a448..20f957190b 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -1101,6 +1101,14 @@ Status FunctionLibraryDefinition::ReplaceFunction(const string& func,
   return Status::OK();
 }
 
+Status FunctionLibraryDefinition::ReplaceGradient(const GradientDef& grad) {
+  mutex_lock l(mu_);
+  bool added;
+  TF_RETURN_IF_ERROR(RemoveGradient(grad.function_name()));
+  TF_RETURN_IF_ERROR(AddGradientDefHelper(grad, &added));
+  return Status::OK();
+}
+
 Status FunctionLibraryDefinition::RemoveFunction(const string& func) {
   const auto& i = function_defs_.find(func);
   if (i == function_defs_.end()) {
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index e01eb7503d..4d6d68e214 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -331,6 +331,11 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // a non-OK status if "func" was not found in the library, OK otherwise.
   Status ReplaceFunction(const string& func, const FunctionDef& fdef);
 
+  // Replaces the gradient corresponding to `grad.function_name()`. Returns
+  // a non-OK status if "grad.function_name()" was not found in the library, OK
+  // otherwise.
+  Status ReplaceGradient(const GradientDef& grad);
+
   // Adds the functions and gradients in 'other' to this function library.
   // Duplicate functions and gradients are ignored.
   // This operation is atomic.
-- 
GitLab


From 1c4a48ddd49f78fbd8ea3defd3a8755c91284166 Mon Sep 17 00:00:00 2001
From: Rachel Lim <rachelim@google.com>
Date: Fri, 28 Sep 2018 15:22:06 -0700
Subject: [PATCH 167/570] [tf.data] Merged contrib.data's DatasetTestBase with
 the DatasetTestBase in core (and added that as a base class for all the
 contrib tests). Also changed the assertDatasetsEqual functions so they are
 both graph and eager compatible (took the code from CSVDatasetTest) :)

PiperOrigin-RevId: 215004892
---
 .../contrib/data/python/kernel_tests/BUILD    | 37 ++++++---
 .../kernel_tests/batch_dataset_op_test.py     |  9 +--
 .../python/kernel_tests/bucketing_test.py     |  9 ++-
 .../kernel_tests/csv_dataset_op_test.py       | 43 ++--------
 .../dataset_constructor_op_test.py            |  3 +-
 .../directed_interleave_dataset_test.py       |  3 +-
 .../kernel_tests/get_single_element_test.py   |  3 +-
 .../kernel_tests/indexed_dataset_ops_test.py  |  3 +-
 .../interleave_dataset_op_test.py             |  3 +-
 .../python/kernel_tests/iterator_ops_test.py  |  3 +-
 .../kernel_tests/lmdb_dataset_op_test.py      |  3 +-
 .../kernel_tests/map_dataset_op_test.py       |  3 +-
 .../python/kernel_tests/map_defun_op_test.py  |  4 +-
 .../python/kernel_tests/optimization/BUILD    |  9 ++-
 .../assert_next_dataset_op_test.py            |  3 +-
 .../optimization/hoist_random_uniform_test.py |  3 +-
 .../map_and_filter_fusion_test.py             |  3 +-
 .../optimization/map_parallelization_test.py  |  3 +-
 .../optimization/map_vectorization_test.py    | 14 ++--
 .../optimization/model_dataset_op_test.py     |  3 +-
 .../optimization/noop_elimination_test.py     |  3 +-
 .../optimization/optimize_dataset_op_test.py  |  3 +-
 .../python/kernel_tests/parsing_ops_test.py   |  3 +-
 .../kernel_tests/prefetching_ops_test.py      |  7 +-
 .../kernel_tests/range_dataset_op_test.py     |  3 +-
 .../kernel_tests/reader_dataset_ops_test.py   |  3 +-
 .../reader_dataset_ops_test_base.py           | 10 +--
 .../data/python/kernel_tests/resample_test.py |  3 +-
 .../kernel_tests/scan_dataset_op_test.py      |  3 +-
 .../kernel_tests/shuffle_dataset_op_test.py   |  3 +-
 .../kernel_tests/slide_dataset_op_test.py     |  8 +-
 .../kernel_tests/sql_dataset_op_test_base.py  |  5 +-
 .../kernel_tests/stats_dataset_test_base.py   |  4 +-
 .../data/python/kernel_tests/test_utils.py    | 73 -----------------
 .../threadpool_dataset_ops_test.py            |  4 +-
 .../kernel_tests/unique_dataset_op_test.py    |  3 +-
 .../kernel_tests/window_dataset_op_test.py    |  3 +-
 .../python/kernel_tests/writer_ops_test.py    |  3 +-
 tensorflow/python/data/kernel_tests/BUILD     |  3 +
 .../python/data/kernel_tests/test_base.py     | 80 +++++++++++++++++++
 tensorflow/tools/pip_package/BUILD            |  1 -
 41 files changed, 209 insertions(+), 183 deletions(-)
 delete mode 100644 tensorflow/contrib/data/python/kernel_tests/test_utils.py

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 21ac40eb21..33784afa3f 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -31,6 +31,7 @@ py_test(
         "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
@@ -54,6 +55,7 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -77,6 +79,7 @@ py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:session",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:readers",
         "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
@@ -97,6 +100,7 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
     ],
@@ -112,6 +116,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:random_seed",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -130,6 +135,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -144,6 +150,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -169,6 +176,7 @@ py_test(
         "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@six_archive//:six",
     ],
@@ -188,6 +196,7 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/estimator:estimator_py",
     ],
@@ -214,6 +223,7 @@ py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:session",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//third_party/py/numpy",
     ],
 )
@@ -239,6 +249,7 @@ py_test(
         "//tensorflow/python:io_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -258,6 +269,7 @@ py_test(
         "//tensorflow/python:io_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -282,6 +294,7 @@ py_test(
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:session",
+        "//tensorflow/python/data/kernel_tests:test_base",
     ],
 )
 
@@ -300,6 +313,7 @@ py_test(
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
         "//third_party/py/numpy",
@@ -315,6 +329,7 @@ cuda_py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
@@ -340,6 +355,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -365,6 +381,7 @@ py_library(
         "//tensorflow/python:lib",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:iterator_ops",
         "//tensorflow/python/data/ops:readers",
     ],
@@ -411,6 +428,7 @@ py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
@@ -433,6 +451,7 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
@@ -453,6 +472,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -470,6 +490,7 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
@@ -489,6 +510,7 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "@org_sqlite//:python",
     ],
 )
@@ -533,6 +555,7 @@ py_library(
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/kernel_tests:test_base",
     ],
 )
 
@@ -549,6 +572,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
@@ -567,6 +591,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -587,6 +612,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
@@ -604,17 +630,8 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:lib",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:readers",
     ],
 )
-
-py_library(
-    name = "test_utils",
-    srcs = ["test_utils.py"],
-    deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/util:nest",
-    ],
-)
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index e2508de9e9..fed7de5f2b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -25,6 +25,7 @@ import numpy as np
 
 from tensorflow.contrib.data.python.ops import batching
 from tensorflow.python.client import session
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -40,12 +41,8 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-class BatchDatasetTest(test.TestCase, parameterized.TestCase):
+class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
-  def assertSparseValuesEqual(self, a, b):
-    self.assertAllEqual(a.indices, b.indices)
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.dense_shape, b.dense_shape)
 
   def testDenseToSparseBatchDataset(self):
     components = np.random.randint(12, size=(100,)).astype(np.int32)
@@ -723,7 +720,7 @@ class BatchDatasetTest(test.TestCase, parameterized.TestCase):
         self.assertAllEqual([element for _ in range(10)], sess.run(get_next))
 
 
-class RestructuredDatasetTest(test.TestCase):
+class RestructuredDatasetTest(test_base.DatasetTestBase):
 
   def test_assert_element_shape(self):
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
index 48971f2ccc..ae401f786c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
@@ -22,6 +22,7 @@ import random
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import grouping
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -35,7 +36,7 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
-class GroupByReducerTest(test.TestCase):
+class GroupByReducerTest(test_base.DatasetTestBase):
 
   def checkResults(self, dataset, shapes, values):
     self.assertEqual(shapes, dataset.output_shapes)
@@ -198,7 +199,7 @@ class GroupByReducerTest(test.TestCase):
       self.assertEqual(y, 45)
 
 
-class GroupByWindowTest(test.TestCase):
+class GroupByWindowTest(test_base.DatasetTestBase):
 
   def testSimple(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
@@ -345,7 +346,7 @@ class GroupByWindowTest(test.TestCase):
 # NOTE(mrry): These tests are based on the tests in bucket_ops_test.py.
 # Currently, they use a constant batch size, though should be made to use a
 # different batch size per key.
-class BucketTest(test.TestCase):
+class BucketTest(test_base.DatasetTestBase):
 
   def _dynamicPad(self, bucket, window, window_size):
     # TODO(mrry): To match `tf.contrib.training.bucket()`, implement a
@@ -570,7 +571,7 @@ def _get_record_shape(sparse):
   return tensor_shape.TensorShape([None])
 
 
-class BucketBySequenceLength(test.TestCase):
+class BucketBySequenceLength(test_base.DatasetTestBase):
 
   def testBucket(self):
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
index f8e74e4583..5b3c512b64 100644
--- a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
@@ -30,6 +30,7 @@ import numpy as np
 from tensorflow.contrib.data.python.ops import error_ops
 from tensorflow.contrib.data.python.ops import readers
 from tensorflow.python.client import session
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -43,37 +44,7 @@ from tensorflow.python.platform import test
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class CsvDatasetOpTest(test.TestCase):
-
-  def _get_next(self, dataset):
-    # Returns a no argument function whose result is fed to self.evaluate to
-    # yield the next element
-    it = dataset.make_one_shot_iterator()
-    if context.executing_eagerly():
-      return it.get_next
-    else:
-      get_next = it.get_next()
-      return lambda: get_next
-
-  def _assert_datasets_equal(self, ds1, ds2):
-    assert ds1.output_shapes == ds2.output_shapes, ('output_shapes differ: %s, '
-                                                    '%s') % (ds1.output_shapes,
-                                                             ds2.output_shapes)
-    assert ds1.output_types == ds2.output_types
-    assert ds1.output_classes == ds2.output_classes
-    next1 = self._get_next(ds1)
-    next2 = self._get_next(ds2)
-    # Run through datasets and check that outputs match, or errors match.
-    while True:
-      try:
-        op1 = self.evaluate(next1())
-      except (errors.OutOfRangeError, ValueError) as e:
-        # If op1 throws an exception, check that op2 throws same exception.
-        with self.assertRaises(type(e)):
-          self.evaluate(next2())
-        break
-      op2 = self.evaluate(next2())
-      self.assertAllEqual(op1, op2)
+class CsvDatasetOpTest(test_base.DatasetTestBase):
 
   def _setup_files(self, inputs, linebreak='\n', compression_type=None):
     filenames = []
@@ -108,7 +79,7 @@ class CsvDatasetOpTest(test.TestCase):
     """Checks that CsvDataset is equiv to TextLineDataset->map(decode_csv)."""
     dataset_actual, dataset_expected = self._make_test_datasets(
         inputs, **kwargs)
-    self._assert_datasets_equal(dataset_actual, dataset_expected)
+    self.assertDatasetsEqual(dataset_actual, dataset_expected)
 
   def _verify_output_or_err(self,
                             dataset,
@@ -116,7 +87,7 @@ class CsvDatasetOpTest(test.TestCase):
                             expected_err_re=None):
     if expected_err_re is None:
       # Verify that output is expected, without errors
-      nxt = self._get_next(dataset)
+      nxt = self.getNext(dataset)
       expected_output = [[
           v.encode('utf-8') if isinstance(v, str) else v for v in op
       ] for op in expected_output]
@@ -128,7 +99,7 @@ class CsvDatasetOpTest(test.TestCase):
     else:
       # Verify that OpError is produced as expected
       with self.assertRaisesOpError(expected_err_re):
-        nxt = self._get_next(dataset)
+        nxt = self.getNext(dataset)
         while True:
           try:
             self.evaluate(nxt())
@@ -354,7 +325,7 @@ class CsvDatasetOpTest(test.TestCase):
     inputs = [['1,,3,4', '5,6,,8']]
     ds_actual, ds_expected = self._make_test_datasets(
         inputs, record_defaults=record_defaults)
-    self._assert_datasets_equal(
+    self.assertDatasetsEqual(
         ds_actual.repeat(5).prefetch(1),
         ds_expected.repeat(5).prefetch(1))
 
@@ -377,7 +348,7 @@ class CsvDatasetOpTest(test.TestCase):
 
     ds = readers.make_csv_dataset(
         file_path, batch_size=1, shuffle=False, num_epochs=1)
-    nxt = self._get_next(ds)
+    nxt = self.getNext(ds)
 
     result = list(self.evaluate(nxt()).values())
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
index a2ab3de52e..722e87e555 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.data.python.ops import batching
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
@@ -25,7 +26,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class DatasetConstructorTest(test.TestCase):
+class DatasetConstructorTest(test_base.DatasetTestBase):
 
   def testRestructureDataset(self):
     components = (array_ops.placeholder(dtypes.int32),
diff --git a/tensorflow/contrib/data/python/kernel_tests/directed_interleave_dataset_test.py b/tensorflow/contrib/data/python/kernel_tests/directed_interleave_dataset_test.py
index eb110324d1..bc10c21472 100644
--- a/tensorflow/contrib/data/python/kernel_tests/directed_interleave_dataset_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/directed_interleave_dataset_test.py
@@ -20,13 +20,14 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import interleave_ops
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import random_seed
 from tensorflow.python.platform import test
 
 
-class DirectedInterleaveDatasetTest(test.TestCase):
+class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
 
   def testBasic(self):
     selector_dataset = dataset_ops.Dataset.range(10).repeat(100)
diff --git a/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py b/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
index f3968cdc15..cc22ea1df7 100644
--- a/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.contrib.data.python.ops import get_single_element
 from tensorflow.contrib.data.python.ops import grouping
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -30,7 +31,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class GetSingleElementTest(test.TestCase, parameterized.TestCase):
+class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
       ("Zero", 0, 1),
diff --git a/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py
index 46a7127b52..d4d3d4adb2 100644
--- a/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import unittest
 
 from tensorflow.contrib.data.python.ops import indexed_dataset_ops
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -28,7 +29,7 @@ from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.platform import test
 
 
-class IndexedDatasetOpsTest(test.TestCase):
+class IndexedDatasetOpsTest(test_base.DatasetTestBase):
 
   def testLowLevelIndexedDatasetOps(self):
     identity = ged_ops.experimental_identity_indexed_dataset(
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index b9e74dfddb..28bd670ab5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -25,6 +25,7 @@ import time
 from six.moves import zip_longest
 
 from tensorflow.contrib.data.python.ops import interleave_ops
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -36,7 +37,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
-class ParallelInterleaveDatasetTest(test.TestCase):
+class ParallelInterleaveDatasetTest(test_base.DatasetTestBase):
 
   def setUp(self):
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
index 7e2326bd17..58a1d7c93b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.data.python.ops import iterator_ops
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn
@@ -33,7 +34,7 @@ from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
 
 
-class CheckpointInputPipelineHookTest(test.TestCase):
+class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
 
   @staticmethod
   def _model_fn(features, labels, mode, config):
diff --git a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
index 1cc5ddc9a2..d2a72272db 100644
--- a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
@@ -22,6 +22,7 @@ import os
 import shutil
 
 from tensorflow.contrib.data.python.ops import readers
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -31,7 +32,7 @@ from tensorflow.python.util import compat
 prefix_path = "tensorflow/core/lib"
 
 
-class LMDBDatasetTest(test.TestCase):
+class LMDBDatasetTest(test_base.DatasetTestBase):
 
   def setUp(self):
     super(LMDBDatasetTest, self).setUp()
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index e8519381d6..385c4ef6ea 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.data.python.ops import error_ops
 from tensorflow.contrib.data.python.ops import optimization
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -41,7 +42,7 @@ from tensorflow.python.util import compat
 _NUMPY_RANDOM_SEED = 42
 
 
-class MapDatasetTest(test.TestCase):
+class MapDatasetTest(test_base.DatasetTestBase):
 
   def testMapIgnoreError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py
index 25aea0393f..751e6d5b30 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py
@@ -21,6 +21,7 @@ import time
 
 from tensorflow.contrib.data.python.ops import map_defun
 from tensorflow.python.client import session
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -33,7 +34,8 @@ from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
-class MapDefunTest(test.TestCase):
+
+class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunSimple(self):
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD b/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD
index 1ae92bdeff..d7b5edcd9a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD
@@ -15,6 +15,7 @@ py_test(
         "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -31,6 +32,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -57,7 +59,6 @@ py_test(
     srcs = ["map_vectorization_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/kernel_tests:test_utils",
         "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:client_testlib",
@@ -67,6 +68,7 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:session",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
@@ -85,6 +87,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -102,6 +105,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -121,6 +125,7 @@ py_test(
         "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -137,6 +142,7 @@ py_test(
         "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -151,6 +157,7 @@ py_test(
         "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/assert_next_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/assert_next_dataset_op_test.py
index d10da80442..fe1b5280ba 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/assert_next_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/assert_next_dataset_op_test.py
@@ -18,12 +18,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
 
 
-class AssertNextDatasetTest(test.TestCase):
+class AssertNextDatasetTest(test_base.DatasetTestBase):
 
   def testAssertNext(self):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/hoist_random_uniform_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/hoist_random_uniform_test.py
index 9518c2e1ad..b43efb5c7c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/hoist_random_uniform_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/hoist_random_uniform_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -31,7 +32,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
-class HoistRandomUniformTest(test.TestCase, parameterized.TestCase):
+class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @staticmethod
   def map_functions():
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py
index e75edf6086..e9e3fc81e5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -28,7 +29,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class MapAndFilterFusionTest(test.TestCase, parameterized.TestCase):
+class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @staticmethod
   def map_functions():
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/map_parallelization_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/map_parallelization_test.py
index dd547db086..f7907eb890 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/map_parallelization_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/map_parallelization_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -30,7 +31,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
-class MapParallelizationTest(test.TestCase, parameterized.TestCase):
+class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @staticmethod
   def map_functions():
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py
index 5b493f44c9..a5ea85f454 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py
@@ -22,9 +22,9 @@ import time
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests import test_utils
 from tensorflow.contrib.data.python.ops import optimization
 from tensorflow.python.client import session
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -36,7 +36,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class MapVectorizationTest(test_utils.DatasetTestBase, parameterized.TestCase):
+class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _get_test_datasets(self,
                          base_dataset,
@@ -85,7 +85,7 @@ class MapVectorizationTest(test_utils.DatasetTestBase, parameterized.TestCase):
                                                            [3, 4]]).repeat(5)
     unoptimized, optimized = self._get_test_datasets(base_dataset, map_fn,
                                                      num_parallel_calls)
-    self._assert_datasets_equal(unoptimized, optimized)
+    self.assertDatasetsEqual(unoptimized, optimized)
 
   def testOptimizationBadMapFn(self):
     # Test map functions that give an error
@@ -112,7 +112,7 @@ class MapVectorizationTest(test_utils.DatasetTestBase, parameterized.TestCase):
     # TODO(rachelim): when this optimization works, turn on expect_optimized
     unoptimized, optimized = self._get_test_datasets(
         base_dataset, map_fn, expect_optimized=False)
-    self._assert_datasets_equal(optimized, unoptimized)
+    self.assertDatasetsEqual(optimized, unoptimized)
 
   def testOptimizationIgnoreStateful(self):
 
@@ -124,7 +124,7 @@ class MapVectorizationTest(test_utils.DatasetTestBase, parameterized.TestCase):
                                                            [3, 4]]).repeat(5)
     unoptimized, optimized = self._get_test_datasets(
         base_dataset, map_fn, expect_optimized=False)
-    self._assert_datasets_raise_same_error(
+    self.assertDatasetsRaiseSameError(
         unoptimized, optimized, errors.InvalidArgumentError,
         [("OneShotIterator", "OneShotIterator_1", 1),
          ("IteratorGetNext", "IteratorGetNext_1", 1)])
@@ -138,7 +138,7 @@ class MapVectorizationTest(test_utils.DatasetTestBase, parameterized.TestCase):
     base_dataset = dataset_ops.Dataset.range(20).batch(3, drop_remainder=False)
     unoptimized, optimized = self._get_test_datasets(
         base_dataset, map_fn, expect_optimized=False)
-    self._assert_datasets_equal(unoptimized, optimized)
+    self.assertDatasetsEqual(unoptimized, optimized)
 
   def testOptimizationIgnoreRaggedMap(self):
     # Don't optimize when the output of the map fn shapes are unknown.
@@ -148,7 +148,7 @@ class MapVectorizationTest(test_utils.DatasetTestBase, parameterized.TestCase):
     base_dataset = dataset_ops.Dataset.range(20).batch(1, drop_remainder=True)
     unoptimized, optimized = self._get_test_datasets(
         base_dataset, map_fn, expect_optimized=False)
-    self._assert_datasets_raise_same_error(
+    self.assertDatasetsRaiseSameError(
         unoptimized, optimized, errors.InvalidArgumentError,
         [("OneShotIterator", "OneShotIterator_1", 1),
          ("IteratorGetNext", "IteratorGetNext_1", 1)])
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/model_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/model_dataset_op_test.py
index 3b62a7e468..33c250ab2a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/model_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/model_dataset_op_test.py
@@ -23,12 +23,13 @@ import numpy as np
 
 from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class ModelDatasetTest(test.TestCase):
+class ModelDatasetTest(test_base.DatasetTestBase):
 
   def testModelMap(self):
     k = 1024 * 1024
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/noop_elimination_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/noop_elimination_test.py
index 507feda3ad..b9e60cfa4e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/noop_elimination_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/noop_elimination_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -26,7 +27,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class NoopEliminationTest(test.TestCase):
+class NoopEliminationTest(test_base.DatasetTestBase):
 
   def testNoopElimination(self):
     a = constant_op.constant(1, dtype=dtypes.int64)
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/optimize_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/optimize_dataset_op_test.py
index a3fb824ce9..04f499f8c5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/optimize_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/optimize_dataset_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -28,7 +29,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
-class OptimizeDatasetTest(test.TestCase):
+class OptimizeDatasetTest(test_base.DatasetTestBase):
 
   def testOptimizationDefault(self):
     dataset = dataset_ops.Dataset.range(10).apply(
diff --git a/tensorflow/contrib/data/python/kernel_tests/parsing_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/parsing_ops_test.py
index c4623bca73..66ccaceea5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/parsing_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/parsing_ops_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.contrib.data.python.ops import parsing_ops as contrib_parsing_ops
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
@@ -72,7 +73,7 @@ def _compare_output_to_expected(tester, dict_tensors, expected_tensors,
     i += 1
 
 
-class ParseExampleTest(test.TestCase):
+class ParseExampleTest(test_base.DatasetTestBase):
 
   def _test(self,
             input_tensor,
diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
index 33a64ea767..7a6a7a709a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
@@ -22,6 +22,7 @@ import threading
 from tensorflow.contrib.data.python.ops import prefetching_ops
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.compat import compat
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
@@ -35,7 +36,7 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
 
-class PrefetchingKernelsOpsTest(test.TestCase):
+class PrefetchingKernelsOpsTest(test_base.DatasetTestBase):
 
   def setUp(self):
     self._event = threading.Event()
@@ -244,7 +245,7 @@ class PrefetchingKernelsOpsTest(test.TestCase):
       sess.run(destroy_op)
 
 
-class PrefetchToDeviceTest(test.TestCase):
+class PrefetchToDeviceTest(test_base.DatasetTestBase):
 
   def testPrefetchToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
@@ -445,7 +446,7 @@ class PrefetchToDeviceTest(test.TestCase):
         sess.run(next_element)
 
 
-class CopyToDeviceTest(test.TestCase):
+class CopyToDeviceTest(test_base.DatasetTestBase):
 
   def testCopyToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
diff --git a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
index db8fe6aa1b..2e901587f4 100644
--- a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.data.python.ops import counter
 from tensorflow.contrib.data.python.ops import enumerate_ops
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -27,7 +28,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import test
 
 
-class RangeDatasetTest(test.TestCase):
+class RangeDatasetTest(test_base.DatasetTestBase):
 
   def testEnumerateDataset(self):
     components = (["a", "b"], [1, 2], [37.0, 38])
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index ed75b27a44..66ed547b6d 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -25,6 +25,7 @@ import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.contrib.data.python.ops import readers
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
@@ -242,7 +243,7 @@ class ReadBatchFeaturesTest(
         self.assertEqual(32, shape[0])
 
 
-class MakeCsvDatasetTest(test.TestCase):
+class MakeCsvDatasetTest(test_base.DatasetTestBase):
 
   def _make_csv_dataset(self, filenames, batch_size, num_epochs=1, **kwargs):
     return readers.make_csv_dataset(
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test_base.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test_base.py
index 08b9f03816..f443b5501b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test_base.py
@@ -25,6 +25,7 @@ import zlib
 from tensorflow.contrib.data.python.ops import readers
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.framework import constant_op
@@ -32,11 +33,10 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-class FixedLengthRecordDatasetTestBase(test.TestCase):
+class FixedLengthRecordDatasetTestBase(test_base.DatasetTestBase):
   """Base class for setting up and testing FixedLengthRecordDataset."""
 
   def setUp(self):
@@ -63,7 +63,7 @@ class FixedLengthRecordDatasetTestBase(test.TestCase):
     return filenames
 
 
-class ReadBatchFeaturesTestBase(test.TestCase):
+class ReadBatchFeaturesTestBase(test_base.DatasetTestBase):
   """Base class for setting up and testing `make_batched_feature_dataset`."""
 
   def setUp(self):
@@ -273,7 +273,7 @@ class ReadBatchFeaturesTestBase(test.TestCase):
         self.assertAllEqual(expected_batch[i], actual_batch[i])
 
 
-class TextLineDatasetTestBase(test.TestCase):
+class TextLineDatasetTestBase(test_base.DatasetTestBase):
   """Base class for setting up and testing TextLineDataset."""
 
   def _lineText(self, f, l):
@@ -313,7 +313,7 @@ class TextLineDatasetTestBase(test.TestCase):
     return filenames
 
 
-class TFRecordDatasetTestBase(test.TestCase):
+class TFRecordDatasetTestBase(test_base.DatasetTestBase):
   """Base class for setting up and testing TFRecordDataset."""
 
   def setUp(self):
diff --git a/tensorflow/contrib/data/python/kernel_tests/resample_test.py b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
index 16b1441baa..32474bd411 100644
--- a/tensorflow/contrib/data/python/kernel_tests/resample_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
@@ -24,6 +24,7 @@ import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.data.python.ops import resampling
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -57,7 +58,7 @@ def _time_resampling(
   return end_time - start_time
 
 
-class ResampleTest(test.TestCase, parameterized.TestCase):
+class ResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
       ("InitialDistributionKnown", True),
diff --git a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
index dde678bd54..bdf80eae4e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
@@ -22,6 +22,7 @@ import itertools
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import scan_ops
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -33,7 +34,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class ScanDatasetTest(test.TestCase):
+class ScanDatasetTest(test_base.DatasetTestBase):
 
   def _counting_dataset(self, start, scan_fn):
     return dataset_ops.Dataset.from_tensors(0).repeat().apply(
diff --git a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
index 440e48db30..c97002a255 100644
--- a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
@@ -20,13 +20,14 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import shuffle_ops
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
 
 
-class ShuffleAndRepeatTest(test.TestCase):
+class ShuffleAndRepeatTest(test_base.DatasetTestBase):
 
   def _build_ds(self, seed, count=5, num_elements=20):
     return dataset_ops.Dataset.range(num_elements).apply(
diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
index 90d18dca2a..c5a7862322 100644
--- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
@@ -21,6 +21,7 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import sliding
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -30,7 +31,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class SlideDatasetTest(test.TestCase, parameterized.TestCase):
+class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
       ("1", 20, 14, 7, 1),
@@ -197,11 +198,6 @@ class SlideDatasetTest(test.TestCase, parameterized.TestCase):
           sliding.sliding_window_batch(
               window_size=1, stride=1, window_shift=1, window_stride=1))
 
-  def assertSparseValuesEqual(self, a, b):
-    self.assertAllEqual(a.indices, b.indices)
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.dense_shape, b.dense_shape)
-
   def testSlideSparse(self):
 
     def _sparse(i):
diff --git a/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test_base.py b/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test_base.py
index 1f5c725a92..319a2ea263 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test_base.py
@@ -24,12 +24,13 @@ import os
 import sqlite3
 
 from tensorflow.contrib.data.python.ops import readers
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class SqlDatasetTestBase(test.TestCase):
+class SqlDatasetTestBase(test_base.DatasetTestBase):
   """Base class for setting up and testing SqlDataset."""
 
   def _createSqlDataset(self, output_types, num_repeats=1):
@@ -92,5 +93,3 @@ class SqlDatasetTestBase(test.TestCase):
           9007199254740992.0)])
     conn.commit()
     conn.close()
-
-
diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_test_base.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_test_base.py
index b1b4c23510..80f2625927 100644
--- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_test_base.py
@@ -19,10 +19,10 @@ from __future__ import print_function
 
 
 from tensorflow.core.framework import summary_pb2
-from tensorflow.python.platform import test
+from tensorflow.python.data.kernel_tests import test_base
 
 
-class StatsDatasetTestBase(test.TestCase):
+class StatsDatasetTestBase(test_base.DatasetTestBase):
   """Base class for testing statistics gathered in `StatsAggregator`."""
 
   def _assertSummaryContains(self, summary_str, tag):
diff --git a/tensorflow/contrib/data/python/kernel_tests/test_utils.py b/tensorflow/contrib/data/python/kernel_tests/test_utils.py
deleted file mode 100644
index 4c3353fe40..0000000000
--- a/tensorflow/contrib/data/python/kernel_tests/test_utils.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test utilities for tf.data functionality."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import re
-
-from tensorflow.python.data.util import nest
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import test
-
-
-class DatasetTestBase(test.TestCase):
-  """Base class for dataset tests."""
-
-  def _assert_datasets_equal(self, dataset1, dataset2):
-    # TODO(rachelim): support sparse tensor outputs
-    next1 = dataset1.make_one_shot_iterator().get_next()
-    next2 = dataset2.make_one_shot_iterator().get_next()
-    with self.cached_session() as sess:
-      while True:
-        try:
-          op1 = sess.run(next1)
-        except errors.OutOfRangeError:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(next2)
-          break
-        op2 = sess.run(next2)
-
-        op1 = nest.flatten(op1)
-        op2 = nest.flatten(op2)
-        assert len(op1) == len(op2)
-        for i in range(len(op1)):
-          self.assertAllEqual(op1[i], op2[i])
-
-  def _assert_datasets_raise_same_error(self,
-                                        dataset1,
-                                        dataset2,
-                                        exception_class,
-                                        replacements=None):
-    # We are defining next1 and next2 in the same line so that we get identical
-    # file:line_number in the error messages
-    # pylint: disable=line-too-long
-    next1, next2 = dataset1.make_one_shot_iterator().get_next(), dataset2.make_one_shot_iterator().get_next()
-    # pylint: enable=line-too-long
-    with self.cached_session() as sess:
-      try:
-        sess.run(next1)
-        raise ValueError(
-            "Expected dataset to raise an error of type %s, but it did not." %
-            repr(exception_class))
-      except exception_class as e:
-        expected_message = e.message
-        for old, new, count in replacements:
-          expected_message = expected_message.replace(old, new, count)
-        # Check that the first segment of the error messages are the same.
-        with self.assertRaisesRegexp(exception_class,
-                                     re.escape(expected_message)):
-          sess.run(next2)
diff --git a/tensorflow/contrib/data/python/kernel_tests/threadpool_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/threadpool_dataset_ops_test.py
index 8d335e87d5..08de3a9143 100644
--- a/tensorflow/contrib/data/python/kernel_tests/threadpool_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/threadpool_dataset_ops_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.contrib.data.python.ops import threadpool
 from tensorflow.contrib.data.python.ops import unique
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -31,7 +32,8 @@ from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
-class OverrideThreadpoolDatasetTest(test.TestCase, parameterized.TestCase):
+class OverrideThreadpoolDatasetTest(test_base.DatasetTestBase,
+                                    parameterized.TestCase):
 
   @parameterized.named_parameters(
       ("1", 1, None),
diff --git a/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py
index f994c8563f..8856ce5afb 100644
--- a/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.data.python.ops import unique
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -25,7 +26,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-class UniqueDatasetTest(test.TestCase):
+class UniqueDatasetTest(test_base.DatasetTestBase):
 
   def _testSimpleHelper(self, dtype, test_cases):
     """Test the `unique()` transformation on a list of test cases.
diff --git a/tensorflow/contrib/data/python/kernel_tests/window_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/window_dataset_op_test.py
index 8b7b3ac0f7..79134c7bc6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/window_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/window_dataset_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.data.python.ops import grouping
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -31,7 +32,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
-class WindowDatasetTest(test.TestCase, parameterized.TestCase):
+class WindowDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _structuredDataset(self, structure, shape, dtype):
     if structure is None:
diff --git a/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
index 867ee2ba37..fca546a570 100644
--- a/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import os
 
 from tensorflow.contrib.data.python.ops import writers
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.framework import dtypes
@@ -30,7 +31,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-class TFRecordWriterTest(test.TestCase):
+class TFRecordWriterTest(test_base.DatasetTestBase):
 
   def setUp(self):
     super(TFRecordWriterTest, self).setUp()
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 5f9818566f..cadfe7f9e0 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -471,6 +471,9 @@ py_library(
     srcs = ["test_base.py"],
     deps = [
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/util:nest",
     ],
 )
 
diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py
index b4f64115b7..b730e10949 100644
--- a/tensorflow/python/data/kernel_tests/test_base.py
+++ b/tensorflow/python/data/kernel_tests/test_base.py
@@ -17,6 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import re
+
+from tensorflow.python.data.util import nest
+from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.platform import test
 
 
@@ -24,6 +30,80 @@ class DatasetTestBase(test.TestCase):
   """Base class for dataset tests."""
 
   def assertSparseValuesEqual(self, a, b):
+    """Asserts that two SparseTensors/SparseTensorValues are equal."""
     self.assertAllEqual(a.indices, b.indices)
     self.assertAllEqual(a.values, b.values)
     self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def getNext(self, dataset):
+    """Returns a callable that returns the next element of the dataset.
+
+    Example use:
+    ```python
+    # In both graph and eager modes
+    dataset = ...
+    nxt = self.getNext(dataset)
+    result = self.evaluate(nxt())
+    ```
+
+    Args:
+      dataset: A dataset whose next element is returned
+
+    Returns:
+      A callable that returns the next element of `dataset`
+    """
+    it = dataset.make_one_shot_iterator()
+    if context.executing_eagerly():
+      return it.get_next
+    else:
+      nxt = it.get_next()
+      return lambda: nxt
+
+  def assertDatasetsEqual(self, dataset1, dataset2):
+    """Checks that datasets are equal. Supports both graph and eager mode."""
+    self.assertEqual(dataset1.output_types, dataset2.output_types)
+    self.assertEqual(dataset1.output_classes, dataset2.output_classes)
+
+    next1 = self.getNext(dataset1)
+    next2 = self.getNext(dataset2)
+    while True:
+      try:
+        op1 = self.evaluate(next1())
+      except errors.OutOfRangeError:
+        with self.assertRaises(errors.OutOfRangeError):
+          self.evaluate(next2())
+        break
+      op2 = self.evaluate(next2())
+
+      op1 = nest.flatten(op1)
+      op2 = nest.flatten(op2)
+      assert len(op1) == len(op2)
+      for i in range(len(op1)):
+        if isinstance(
+            op1[i],
+            (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
+          self.assertSparseValuesEqual(op1[i], op2[i])
+        else:
+          self.assertAllEqual(op1[i], op2[i])
+
+  def assertDatasetsRaiseSameError(self,
+                                   dataset1,
+                                   dataset2,
+                                   exception_class,
+                                   replacements=None):
+    """Checks that datasets raise the same error on the first get_next call."""
+    next1 = self.getNext(dataset1)
+    next2 = self.getNext(dataset2)
+    try:
+      self.evaluate(next1())
+      raise ValueError(
+          'Expected dataset to raise an error of type %s, but it did not.' %
+          repr(exception_class))
+    except exception_class as e:
+      expected_message = e.message
+      for old, new, count in replacements:
+        expected_message = expected_message.replace(old, new, count)
+      # Check that the first segment of the error messages are the same.
+      with self.assertRaisesRegexp(exception_class,
+                                   re.escape(expected_message)):
+        self.evaluate(next2())
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 7d925a8fef..c621812535 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -66,7 +66,6 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/constrained_optimization:constrained_optimization_pip",
     "//tensorflow/contrib/data/python/kernel_tests/serialization:dataset_serialization_test_base",
     "//tensorflow/contrib/data/python/kernel_tests:stats_dataset_test_base",
-    "//tensorflow/contrib/data/python/kernel_tests:test_utils",
     "//tensorflow/contrib/eager/python/examples:examples_pip",
     "//tensorflow/contrib/eager/python:evaluator",
     "//tensorflow/contrib/gan:gan",
-- 
GitLab


From e4eeda33ca1d4a08ae2be7400f71b218fba25ccc Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Fri, 28 Sep 2018 15:27:40 -0700
Subject: [PATCH 168/570] Internal change.

PiperOrigin-RevId: 215005698
---
 ...nsorflow.-config-proto.-experimental.pbtxt |  24 --
 .../api/golden/tensorflow.-config-proto.pbtxt | 148 ---------
 .../golden/tensorflow.data.-iterator.pbtxt    |  46 ---
 ....estimator.-boosted-trees-classifier.pbtxt |  58 ----
 ...w.estimator.-boosted-trees-regressor.pbtxt |  58 ----
 .../tensorflow.estimator.-run-config.pbtxt    | 105 -------
 .../tools/api/golden/tensorflow.image.pbtxt   | 251 ---------------
 .../api/golden/tensorflow.keras.-model.pbtxt  | 268 ----------------
 .../golden/tensorflow.keras.-sequential.pbtxt | 289 ------------------
 .../golden/tensorflow.keras.activations.pbtxt |  55 ----
 .../tensorflow.keras.models.-model.pbtxt      | 268 ----------------
 .../tensorflow.keras.models.-sequential.pbtxt | 289 ------------------
 12 files changed, 1859 deletions(-)
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.-config-proto.-experimental.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.data.-iterator.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.image.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt

diff --git a/tensorflow/tools/api/golden/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/tensorflow.-config-proto.-experimental.pbtxt
deleted file mode 100644
index eb41deee13..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.-config-proto.-experimental.pbtxt
+++ /dev/null
@@ -1,24 +0,0 @@
-path: "tensorflow.ConfigProto.Experimental"
-tf_proto {
-  descriptor {
-    name: "Experimental"
-    field {
-      name: "collective_group_leader"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "client_handles_error_formatting"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "executor_type"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
deleted file mode 100644
index e565b903d2..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
+++ /dev/null
@@ -1,148 +0,0 @@
-path: "tensorflow.ConfigProto"
-tf_proto {
-  descriptor {
-    name: "ConfigProto"
-    field {
-      name: "device_count"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ConfigProto.DeviceCountEntry"
-    }
-    field {
-      name: "intra_op_parallelism_threads"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "inter_op_parallelism_threads"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "use_per_session_threads"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "session_inter_op_thread_pool"
-      number: 12
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ThreadPoolOptionProto"
-    }
-    field {
-      name: "placement_period"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "device_filters"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "gpu_options"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GPUOptions"
-    }
-    field {
-      name: "allow_soft_placement"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "log_device_placement"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "graph_options"
-      number: 10
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GraphOptions"
-    }
-    field {
-      name: "operation_timeout_in_ms"
-      number: 11
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "rpc_options"
-      number: 13
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.RPCOptions"
-    }
-    field {
-      name: "cluster_def"
-      number: 14
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ClusterDef"
-    }
-    field {
-      name: "isolate_session_state"
-      number: 15
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "experimental"
-      number: 16
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ConfigProto.Experimental"
-    }
-    nested_type {
-      name: "DeviceCountEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      options {
-        map_entry: true
-      }
-    }
-    nested_type {
-      name: "Experimental"
-      field {
-        name: "collective_group_leader"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "client_handles_error_formatting"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
-      field {
-        name: "executor_type"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-iterator.pbtxt
deleted file mode 100644
index 4f0147a523..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.data.-iterator.pbtxt
+++ /dev/null
@@ -1,46 +0,0 @@
-path: "tensorflow.data.Iterator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.iterator_ops.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "initializer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_classes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_types"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'iterator_resource\', \'initializer\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_string_handle"
-    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_structure"
-    argspec: "args=[\'output_types\', \'output_shapes\', \'shared_name\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_next"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_initializer"
-    argspec: "args=[\'self\', \'dataset\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "string_handle"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
deleted file mode 100644
index c23b04b4ef..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-path: "tensorflow.estimator.BoostedTreesClassifier"
-tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "config"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "model_dir"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "model_fn"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "params"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
-  }
-  member_method {
-    name: "eval_dir"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "evaluate"
-    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
-  }
-  member_method {
-    name: "get_variable_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_variable_value"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "latest_checkpoint"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "predict"
-    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
-  }
-  member_method {
-    name: "train"
-    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
deleted file mode 100644
index 6878d28fff..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-path: "tensorflow.estimator.BoostedTreesRegressor"
-tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "config"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "model_dir"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "model_fn"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "params"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
-  }
-  member_method {
-    name: "eval_dir"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "evaluate"
-    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
-  }
-  member_method {
-    name: "get_variable_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_variable_value"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "latest_checkpoint"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "predict"
-    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
-  }
-  member_method {
-    name: "train"
-    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
deleted file mode 100644
index bf1f94b6ae..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
+++ /dev/null
@@ -1,105 +0,0 @@
-path: "tensorflow.estimator.RunConfig"
-tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.run_config.RunConfig\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "cluster_spec"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "device_fn"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "eval_distribute"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "evaluation_master"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "global_id_in_cluster"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "is_chief"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "keep_checkpoint_every_n_hours"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "keep_checkpoint_max"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "log_step_count_steps"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "master"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "model_dir"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "num_ps_replicas"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "num_worker_replicas"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "protocol"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "save_checkpoints_secs"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "save_checkpoints_steps"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "save_summary_steps"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "service"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "session_config"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "task_id"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "task_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "tf_random_seed"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "train_distribute"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\', \'device_fn\', \'protocol\', \'eval_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "replace"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
deleted file mode 100644
index 5c46dc5ee7..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ /dev/null
@@ -1,251 +0,0 @@
-path: "tensorflow.image"
-tf_module {
-  member {
-    name: "ResizeMethod"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "adjust_brightness"
-    argspec: "args=[\'image\', \'delta\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "adjust_contrast"
-    argspec: "args=[\'images\', \'contrast_factor\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "adjust_gamma"
-    argspec: "args=[\'image\', \'gamma\', \'gain\'], varargs=None, keywords=None, defaults=[\'1\', \'1\'], "
-  }
-  member_method {
-    name: "adjust_hue"
-    argspec: "args=[\'image\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "adjust_jpeg_quality"
-    argspec: "args=[\'image\', \'jpeg_quality\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "adjust_saturation"
-    argspec: "args=[\'image\', \'saturation_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "central_crop"
-    argspec: "args=[\'image\', \'central_fraction\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "convert_image_dtype"
-    argspec: "args=[\'image\', \'dtype\', \'saturate\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "crop_and_resize"
-    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "crop_to_bounding_box"
-    argspec: "args=[\'image\', \'offset_height\', \'offset_width\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "decode_and_crop_jpeg"
-    argspec: "args=[\'contents\', \'crop_window\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
-  }
-  member_method {
-    name: "decode_bmp"
-    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
-  }
-  member_method {
-    name: "decode_gif"
-    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "decode_image"
-    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
-  }
-  member_method {
-    name: "decode_jpeg"
-    argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
-  }
-  member_method {
-    name: "decode_png"
-    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'uint8\'>\", \'None\'], "
-  }
-  member_method {
-    name: "draw_bounding_boxes"
-    argspec: "args=[\'images\', \'boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "encode_jpeg"
-    argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'95\', \'False\', \'False\', \'True\', \'in\', \'300\', \'300\', \'\', \'None\'], "
-  }
-  member_method {
-    name: "encode_png"
-    argspec: "args=[\'image\', \'compression\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
-  }
-  member_method {
-    name: "extract_glimpse"
-    argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'uniform_noise\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "extract_image_patches"
-    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "extract_jpeg_shape"
-    argspec: "args=[\'contents\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
-  }
-  member_method {
-    name: "flip_left_right"
-    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flip_up_down"
-    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "grayscale_to_rgb"
-    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "hsv_to_rgb"
-    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "image_gradients"
-    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "is_jpeg"
-    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "non_max_suppression"
-    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'None\'], "
-  }
-  member_method {
-    name: "non_max_suppression_overlaps"
-    argspec: "args=[\'overlaps\', \'scores\', \'max_output_size\', \'overlap_threshold\', \'score_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'None\'], "
-  }
-  member_method {
-    name: "non_max_suppression_padded"
-    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'pad_to_max_output_size\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "pad_to_bounding_box"
-    argspec: "args=[\'image\', \'offset_height\', \'offset_width\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "per_image_standardization"
-    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "psnr"
-    argspec: "args=[\'a\', \'b\', \'max_val\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "random_brightness"
-    argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "random_contrast"
-    argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "random_flip_left_right"
-    argspec: "args=[\'image\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "random_flip_up_down"
-    argspec: "args=[\'image\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "random_hue"
-    argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "random_jpeg_quality"
-    argspec: "args=[\'image\', \'min_jpeg_quality\', \'max_jpeg_quality\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "random_saturation"
-    argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "resize_area"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "resize_bicubic"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "resize_bilinear"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "resize_image_with_crop_or_pad"
-    argspec: "args=[\'image\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "resize_image_with_pad"
-    argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\'], varargs=None, keywords=None, defaults=[\'0\'], "
-  }
-  member_method {
-    name: "resize_images"
-    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
-  }
-  member_method {
-    name: "resize_nearest_neighbor"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "rgb_to_grayscale"
-    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "rgb_to_hsv"
-    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "rgb_to_yiq"
-    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "rgb_to_yuv"
-    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "rot90"
-    argspec: "args=[\'image\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
-  }
-  member_method {
-    name: "sample_distorted_bounding_box"
-    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sobel_edges"
-    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "ssim"
-    argspec: "args=[\'img1\', \'img2\', \'max_val\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "ssim_multiscale"
-    argspec: "args=[\'img1\', \'img2\', \'max_val\', \'power_factors\'], varargs=None, keywords=None, defaults=[\'(0.0448, 0.2856, 0.3001, 0.2363, 0.1333)\'], "
-  }
-  member_method {
-    name: "total_variation"
-    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "transpose_image"
-    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "yiq_to_rgb"
-    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "yuv_to_rgb"
-    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
deleted file mode 100644
index e579fe6a1a..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ /dev/null
@@ -1,268 +0,0 @@
-path: "tensorflow.keras.Model"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_spec"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "layers"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "stateful"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
-  }
-  member_method {
-    name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_layer"
-    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "load_weights"
-    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
-  }
-  member_method {
-    name: "predict_on_batch"
-    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "save"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
-  }
-  member_method {
-    name: "save_weights"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "summary"
-    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "to_json"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "to_yaml"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
deleted file mode 100644
index 6f05cdd093..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ /dev/null
@@ -1,289 +0,0 @@
-path: "tensorflow.keras.Sequential"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.sequential.Sequential\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_spec"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "layers"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "stateful"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'layers\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add"
-    argspec: "args=[\'self\', \'layer\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
-  }
-  member_method {
-    name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_layer"
-    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "load_weights"
-    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "pop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "predict_classes"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
-  }
-  member_method {
-    name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
-  }
-  member_method {
-    name: "predict_on_batch"
-    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "predict_proba"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "save"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
-  }
-  member_method {
-    name: "save_weights"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "summary"
-    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "symbolic_set_inputs"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "to_json"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "to_yaml"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt
deleted file mode 100644
index 2e9de9ebb2..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt
+++ /dev/null
@@ -1,55 +0,0 @@
-path: "tensorflow.keras.activations"
-tf_module {
-  member_method {
-    name: "deserialize"
-    argspec: "args=[\'name\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "elu"
-    argspec: "args=[\'x\', \'alpha\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
-  }
-  member_method {
-    name: "get"
-    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "hard_sigmoid"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "linear"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "relu"
-    argspec: "args=[\'x\', \'alpha\', \'max_value\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'0\'], "
-  }
-  member_method {
-    name: "selu"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "serialize"
-    argspec: "args=[\'activation\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "sigmoid"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "softmax"
-    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "softplus"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "softsign"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "tanh"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
deleted file mode 100644
index 56914e1746..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ /dev/null
@@ -1,268 +0,0 @@
-path: "tensorflow.keras.models.Model"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_spec"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "layers"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "stateful"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
-  }
-  member_method {
-    name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_layer"
-    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "load_weights"
-    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
-  }
-  member_method {
-    name: "predict_on_batch"
-    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "save"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
-  }
-  member_method {
-    name: "save_weights"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "summary"
-    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "to_json"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "to_yaml"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
deleted file mode 100644
index 4c1c54001d..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ /dev/null
@@ -1,289 +0,0 @@
-path: "tensorflow.keras.models.Sequential"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.sequential.Sequential\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_spec"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "layers"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "stateful"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'layers\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add"
-    argspec: "args=[\'self\', \'layer\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
-  }
-  member_method {
-    name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_layer"
-    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "load_weights"
-    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "pop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "predict_classes"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
-  }
-  member_method {
-    name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
-  }
-  member_method {
-    name: "predict_on_batch"
-    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "predict_proba"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "save"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
-  }
-  member_method {
-    name: "save_weights"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "summary"
-    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "symbolic_set_inputs"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "to_json"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "to_yaml"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-}
-- 
GitLab


From f5086804c758812ec9ed67233c58e18236246299 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 28 Sep 2018 15:48:20 -0700
Subject: [PATCH 169/570] Add documentation of the ownership semantics to
 {Lookup,Create,LookupOrCreate}Resource().

PiperOrigin-RevId: 215008650
---
 tensorflow/core/framework/resource_mgr.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index abb6635984..4a531648d9 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -248,10 +248,16 @@ Status HandleFromInput(OpKernelContext* ctx, StringPiece input,
                        ResourceHandle* handle);
 
 // Create a resource pointed by a given resource handle.
+//
+// If successful, the caller transfers the ownership of one ref on `resource` to
+// `ctx->resource_mgr()`.
 template <typename T>
 Status CreateResource(OpKernelContext* ctx, const ResourceHandle& p, T* value);
 
 // Looks up a resource pointed by a given resource handle.
+//
+// If the lookup is successful, the caller takes the ownership of one ref on
+// `*value`, and must call its `Unref()` method when it has finished using it.
 template <typename T>
 Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p, T** value);
 
@@ -262,6 +268,11 @@ Status LookupResources(
     std::vector<std::unique_ptr<T, core::RefCountDeleter>>* values);
 
 // Looks up or creates a resource.
+//
+// If successful, the caller takes the ownership of one ref on `*value`, and
+// must call its `Unref()` method when it has finished using it. If the
+// `creator` is invoked, its reference on the created resource is transferred
+// to `ctx->resource_mgr()`.
 template <typename T>
 Status LookupOrCreateResource(OpKernelContext* ctx, const ResourceHandle& p,
                               T** value, std::function<Status(T**)> creator);
-- 
GitLab


From 3f4423fad57694bc8d7adc427d65e5a18c8592b2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 15:58:19 -0700
Subject: [PATCH 170/570] Internal changes only.

PiperOrigin-RevId: 215009955
---
 .../contrib/tpu/ops/tpu_embedding_ops.cc      | 42 +++----------------
 1 file changed, 6 insertions(+), 36 deletions(-)

diff --git a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
index 1bd1a31e11..bc1a0c5284 100644
--- a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
+++ b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
@@ -103,19 +103,10 @@ Status RegisterPerTableLoadOpsForAlgorithmBody(
       arg->set_type(DT_FLOAT);
     }
   }
-  {
-    auto* table_id_attr = op_def->add_attr();
-    table_id_attr->set_name("table_id");
-    table_id_attr->set_type("int");
-    table_id_attr->set_has_minimum(true);
-    table_id_attr->set_minimum(-1);
-    table_id_attr->mutable_default_value()->set_i(-1);
-  }
   {
     auto* table_name_attr = op_def->add_attr();
     table_name_attr->set_name("table_name");
     table_name_attr->set_type("string");
-    table_name_attr->mutable_default_value()->set_s("");
   }
   {
     auto* num_shards_attr = op_def->add_attr();
@@ -147,11 +138,9 @@ parameters that are loaded from a checkpoint before a training loop is
 executed.
 %s
 table_name: Name of this table; must match a name in the
-  EmbeddingLayerConfiguration proto (overrides table_id).
+  EmbeddingLayerConfiguration proto.
 num_shards: Number of shards into which the embedding tables are divided.
 shard_id: Identifier of shard for this operation.
-table_id: Index of this table in the EmbeddingLayerConfiguration proto
-  (deprecated).
 )doc",
                                           parameter_descriptions.c_str()));
   op_def->set_is_commutative(false);
@@ -160,14 +149,10 @@ table_id: Index of this table in the EmbeddingLayerConfiguration proto
   auto shape_inference_function =
       [state_variable_specs,
        is_debug_op](shape_inference::InferenceContext* c) -> Status {
-    int table_id;
-    TF_RETURN_IF_ERROR(c->GetAttr("table_id", &table_id));
     string table_name;
     TF_RETURN_IF_ERROR(c->GetAttr("table_name", &table_name));
-    // Exactly one must be non-default.
-    if ((table_id >= 0) == (!table_name.empty())) {
-      return errors::InvalidArgument(
-          "exactly one of table_id or table_name must be non-default");
+    if (table_name.empty()) {
+      return errors::InvalidArgument("table_name attribute must be set");
     }
     int num_shards;
     TF_RETURN_IF_ERROR(c->GetAttr("num_shards", &num_shards));
@@ -240,19 +225,10 @@ Status RegisterPerTableRetrieveOpsForAlgorithmBody(
       arg->set_type(DT_FLOAT);
     }
   }
-  {
-    auto* table_id_attr = op_def->add_attr();
-    table_id_attr->set_name("table_id");
-    table_id_attr->set_type("int");
-    table_id_attr->set_has_minimum(true);
-    table_id_attr->set_minimum(-1);
-    table_id_attr->mutable_default_value()->set_i(-1);
-  }
   {
     auto* table_name_attr = op_def->add_attr();
     table_name_attr->set_name("table_name");
     table_name_attr->set_type("string");
-    table_name_attr->mutable_default_value()->set_s("");
   }
   {
     auto* num_shards_attr = op_def->add_attr();
@@ -283,11 +259,9 @@ the correct embedding table configuration. For example, this op is
 used to retrieve updated parameters before saving a checkpoint.
 %s
 table_name: Name of this table; must match a name in the
-  EmbeddingLayerConfiguration proto (overrides table_id).
+  EmbeddingLayerConfiguration proto.
 num_shards: Number of shards into which the embedding tables are divided.
 shard_id: Identifier of shard for this operation.
-table_id: Index of this table in the EmbeddingLayerConfiguration proto
-  (deprecated).
 )doc",
                                           parameter_descriptions.c_str()));
   op_def->set_is_commutative(false);
@@ -296,14 +270,10 @@ table_id: Index of this table in the EmbeddingLayerConfiguration proto
   auto shape_inference_function =
       [state_variable_specs,
        is_debug_op](shape_inference::InferenceContext* c) -> Status {
-    int table_id;
-    TF_RETURN_IF_ERROR(c->GetAttr("table_id", &table_id));
     string table_name;
     TF_RETURN_IF_ERROR(c->GetAttr("table_name", &table_name));
-    // Exactly one must be non-default.
-    if ((table_id >= 0) == (!table_name.empty())) {
-      return errors::InvalidArgument(
-          "exactly one of table_id or table_name must be non-default");
+    if (table_name.empty()) {
+      return errors::InvalidArgument("table_name must be non-empty");
     }
     int num_shards;
     TF_RETURN_IF_ERROR(c->GetAttr("num_shards", &num_shards));
-- 
GitLab


From 0a1132ece84bd76d6dceaf8d29211959b5dca216 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 16:03:41 -0700
Subject: [PATCH 171/570] fix broken tests.

PiperOrigin-RevId: 215010842
---
 .../opt/python/training/shampoo_test.py       | 40 +++++++++----------
 .../timeseries/python/timeseries/head_test.py |  2 +-
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/shampoo_test.py b/tensorflow/contrib/opt/python/training/shampoo_test.py
index 05bcf2cfa3..a2fd8fbd87 100644
--- a/tensorflow/contrib/opt/python/training/shampoo_test.py
+++ b/tensorflow/contrib/opt/python/training/shampoo_test.py
@@ -54,9 +54,9 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     grad_np_2 = np.random.rand(size)
 
     with self.cached_session() as sess:
-      global_step = variables.Variable(
+      global_step = variables.VariableV1(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
-      var = variables.Variable(
+      var = variables.VariableV1(
           init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = constant_op.constant(grad_np, dtype=dtypes.float32)
       grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)
@@ -105,9 +105,9 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     grad_np_2 = np.random.rand(size[0], size[1])
 
     with self.cached_session() as sess:
-      global_step = variables.Variable(
+      global_step = variables.VariableV1(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
-      var = variables.Variable(
+      var = variables.VariableV1(
           init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = constant_op.constant(grad_np, dtype=dtypes.float32)
       grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)
@@ -164,9 +164,9 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     grad_np_2 = np.random.rand(size[0], size[1], size[2])
 
     with self.cached_session() as sess:
-      global_step = variables.Variable(
+      global_step = variables.VariableV1(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
-      var = variables.Variable(
+      var = variables.VariableV1(
           init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = constant_op.constant(grad_np, dtype=dtypes.float32)
       grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)
@@ -254,9 +254,9 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     grad_np_2 = np.random.rand(size)
 
     with self.cached_session() as sess:
-      global_step = variables.Variable(
+      global_step = variables.VariableV1(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
-      var = variables.Variable(
+      var = variables.VariableV1(
           init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = constant_op.constant(grad_np, dtype=dtypes.float32)
       grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)
@@ -310,9 +310,9 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     grad_np_2 = np.random.rand(size[0], size[1])
 
     with self.cached_session() as sess:
-      global_step = variables.Variable(
+      global_step = variables.VariableV1(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
-      var = variables.Variable(
+      var = variables.VariableV1(
           init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = constant_op.constant(grad_np, dtype=dtypes.float32)
       grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)
@@ -383,9 +383,9 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     grad_np_2 = np.random.rand(sample_size_2, size[1])
 
     with self.cached_session() as sess:
-      global_step = variables.Variable(
+      global_step = variables.VariableV1(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
-      var = variables.Variable(
+      var = variables.VariableV1(
           init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = ops.IndexedSlices(
           constant_op.constant(grad_np, dtype=dtypes.float32),
@@ -463,9 +463,9 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     grad_np = np.random.rand(sample_size, size[1], size[2])
 
     with self.cached_session() as sess:
-      global_step = variables.Variable(
+      global_step = variables.VariableV1(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
-      var = variables.Variable(
+      var = variables.VariableV1(
           init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = ops.IndexedSlices(
           constant_op.constant(grad_np, dtype=dtypes.float32),
@@ -533,9 +533,9 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     gbar_weight = 0.1
 
     with self.cached_session() as sess:
-      global_step = variables.Variable(
+      global_step = variables.VariableV1(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
-      var = variables.Variable(
+      var = variables.VariableV1(
           init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = constant_op.constant(grad_np, dtype=dtypes.float32)
       grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)
@@ -628,9 +628,9 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     mat_g3 = np.zeros_like(mat_g3_a)
 
     with self.cached_session() as sess:
-      global_step = variables.Variable(
+      global_step = variables.VariableV1(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
-      var = variables.Variable(
+      var = variables.VariableV1(
           init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = array_ops.placeholder(dtypes.float32, shape=size)
 
@@ -705,9 +705,9 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     mat_g3 = np.zeros_like(mat_g3_a)
 
     with self.cached_session() as sess:
-      global_step = variables.Variable(
+      global_step = variables.VariableV1(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
-      var = variables.Variable(
+      var = variables.VariableV1(
           init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = array_ops.placeholder(dtypes.float32, shape=size)
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
index 647455ae42..04d17bc123 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
@@ -104,7 +104,7 @@ class EvaluationMetricsTests(test.TestCase):
           "ticker":
               array_ops.reshape(
                   math_ops.cast(
-                      variables.Variable(
+                      variables.VariableV1(
                           name="ticker",
                           initial_value=0,
                           dtype=dtypes.int64,
-- 
GitLab


From a98bac521406bedef3ff2b9af9564b21ddda4d82 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Fri, 28 Sep 2018 16:09:49 -0700
Subject: [PATCH 172/570] [TF:XLA] Bump open source abseil revision to
 48cd2c3f351ff188bc85684b84a91b6e6d17d896

This has absl::flat_hash_map in it.

PiperOrigin-RevId: 215011713
---
 tensorflow/contrib/makefile/Makefile | 3 ++-
 tensorflow/workspace.bzl             | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index d962a5e12d..36125c198e 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -133,7 +133,8 @@ $(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*benchmark*.cc) \
 $(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*benchmark*.cc) \
 $(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*/*benchmark*.cc) \
 $(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*/*/*benchmark*.cc) \
-tensorflow/contrib/makefile/downloads/absl/absl/synchronization/internal/mutex_nonprod.cc
+tensorflow/contrib/makefile/downloads/absl/absl/synchronization/internal/mutex_nonprod.cc \
+tensorflow/contrib/makefile/downloads/absl/absl/hash/internal/print_hash_of.cc
 
 ABSL_CC_SRCS := $(filter-out $(ABSL_CC_EXCLUDE_SRCS), $(ABSL_CC_ALL_SRCS))
 
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 70bade060e..9b4b698874 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -110,11 +110,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "com_google_absl",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
-        sha256 = "278a1af58b633be886fe81bf7061dca6b5fea99566850d1319fffdaa1a061792",
-        strip_prefix = "abseil-cpp-e291c279e458761e77a69b09b129d3d1e81f1e80",
+        sha256 = "7dd09690ae7ca4551de3111d4a86b75b23ec17445f273d3c42bdcdc1c7b02e4e",
+        strip_prefix = "abseil-cpp-48cd2c3f351ff188bc85684b84a91b6e6d17d896",
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/e291c279e458761e77a69b09b129d3d1e81f1e80.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/e291c279e458761e77a69b09b129d3d1e81f1e80.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/48cd2c3f351ff188bc85684b84a91b6e6d17d896.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/48cd2c3f351ff188bc85684b84a91b6e6d17d896.tar.gz",
         ],
     )
 
-- 
GitLab


From 478d370eb116ad2294134d75a886637a7d6da225 Mon Sep 17 00:00:00 2001
From: Rachel Lim <rachelim@google.com>
Date: Fri, 28 Sep 2018 16:10:45 -0700
Subject: [PATCH 173/570] [tf.data] Use Graph instead of GraphDef/FunctionDef
 for vectorization transforms

PiperOrigin-RevId: 215011835
---
 .../core/grappler/optimizers/data/BUILD       |   7 +-
 .../grappler/optimizers/data/graph_utils.h    |   4 +-
 .../optimizers/data/map_vectorization.cc      |  28 +-
 .../optimizers/data/map_vectorization_test.cc | 112 +++--
 .../optimizers/data/vectorization/BUILD       |   3 +-
 .../data/vectorization/cast_vectorizer.cc     |  29 +-
 .../data/vectorization/unpack_vectorizer.cc   |  36 +-
 .../data/vectorization/vectorizer.h           |  23 +-
 .../vectorization/vectorizer_registry_test.cc |  16 +-
 .../optimizers/data/vectorization_utils.cc    | 451 +++++++++++-------
 .../optimizers/data/vectorization_utils.h     |  35 +-
 .../data/vectorization_utils_test.cc          | 205 +++++---
 12 files changed, 574 insertions(+), 375 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index 81c1bddf67..5a3abbb545 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -124,10 +124,10 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
-        "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
     ] + tf_protos_all(),
 )
@@ -523,6 +523,7 @@ cc_library(
         ":function_utils",
         ":graph_utils",
         "@com_google_absl//absl/strings",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -538,6 +539,7 @@ tf_cc_test(
     srcs = ["vectorization_utils_test.cc"],
     visibility = ["//visibility:public"],
     deps = [
+        ":graph_utils",
         ":function_utils",
         ":vectorization_utils",
         "//tensorflow/core:framework",
@@ -547,7 +549,10 @@ tf_cc_test(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        # For ops we need registered
+        "//tensorflow/core/kernels/data:dataset_ops",
         "//tensorflow/core/kernels:cast_op",
+        "//tensorflow/core/kernels:logging_ops",
         "//tensorflow/tools/graph_transforms:transform_utils",
     ] + tf_protos_all(),
 )
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.h b/tensorflow/core/grappler/optimizers/data/graph_utils.h
index 5dd7819100..3af34f6904 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.h
@@ -116,8 +116,8 @@ std::vector<int> FindAllGraphNodesWithOp(const string& op,
 // is unique across the graph.
 void SetUniqueGraphNodeName(StringPiece prefix, GraphDef* graph, NodeDef* node);
 
-// Sets the node name using the `prefix` name as a prefix while guaranteeing the
-// name is unique across the graph.
+// Sets the function name using the `prefix` name as a prefix while guaranteeing
+// the name is unique across the function library.
 void SetUniqueGraphFunctionName(StringPiece prefix, FunctionDefLibrary* library,
                                 FunctionDef* function);
 
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
index 32ab912619..9328a7ca99 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
@@ -86,21 +86,19 @@ FunctionDef* AddVectorizedFunction(const NodeDef& map_node,
   // efficient vectorization with VectorizeMapDefun.
   FunctionDef* vectorized_func =
       CreateMapDefunWrapper(map_node, orig_func, library);
-  NodeDef* map_defun_node = vectorized_func->mutable_node_def()->Mutable(0);
-  DCHECK_EQ(map_defun_node->op(), "MapDefun");
-
-  // Create a copy of the original function so that we can mutate it, and
-  // attach that to the map defun node.
-  FunctionDef* map_defun_fn = library->add_function();
-  *map_defun_fn = orig_func;
-  graph_utils::SetUniqueGraphFunctionName(orig_func.signature().name(), library,
-                                          map_defun_fn);
-  (*map_defun_node->mutable_attr())["f"].mutable_func()->set_name(
-      map_defun_fn->signature().name());
-
-  vectorization_utils::VectorizeMapDefun(vectorized_func, map_defun_fn,
-                                         map_defun_node);
-  return vectorized_func;
+  const NodeDef& map_defun_node = vectorized_func->node_def(0);
+  DCHECK_EQ(map_defun_node.op(), "MapDefun");
+
+  // TODO(b/116285210): Unreferenced functions should get cleaned up later
+  FunctionDef* result;
+  Status s = vectorization_utils::VectorizeMapDefun(
+      *vectorized_func, map_defun_node, library, &result);
+
+  if (!s.ok()) {
+    LOG(ERROR) << "VectorizeMapDefun failed: " << s;
+    return vectorized_func;
+  }
+  return result;
 }
 
 bool IsOutputShapesFullyDefined(const NodeDef& node) {
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
index ed1bd6bc97..f4faf41549 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
@@ -30,72 +30,51 @@ namespace {
 using test::function::GDef;
 using test::function::NDef;
 
-void MakeTensorShapeProtoHelper(const gtl::ArraySlice<int> dims,
-                                TensorShapeProto* t) {
-  for (size_t i = 0; i < dims.size(); ++i) {
-    auto* d = t->add_dim();
-    d->set_size(dims[i]);
-  }
-}
-
-AttrValue MakeShapeListAttr(
-    const gtl::ArraySlice<const gtl::ArraySlice<int>>& shapes) {
-  AttrValue shapes_attr;
-  for (size_t i = 0; i < shapes.size(); ++i) {
-    MakeTensorShapeProtoHelper(shapes[i],
-                               shapes_attr.mutable_list()->add_shape());
-  }
-
-  return shapes_attr;
-}
-
-NodeDef MakeMapNodeHelper(
-    StringPiece name, StringPiece input_node_name, StringPiece function_name,
-    StringPiece map_op_name,
-    const gtl::ArraySlice<const gtl::ArraySlice<int>>& output_shapes,
-    const gtl::ArraySlice<DataType>& output_types) {
+NodeDef MakeMapNodeHelper(StringPiece name, StringPiece input_node_name,
+                          StringPiece function_name, StringPiece map_op_name,
+                          gtl::ArraySlice<PartialTensorShape> output_shapes,
+                          gtl::ArraySlice<DataType> output_types) {
   return test::function::NDef(
       name, map_op_name, {string(input_node_name)},
       {{"f", FunctionDefHelper::FunctionRef(string(function_name))},
        {"Targuments", {}},
-       {"output_shapes", MakeShapeListAttr(output_shapes)},
+       {"output_shapes", output_shapes},
        {"output_types", output_types}});
 }
 
-NodeDef MakeMapNode(
-    StringPiece name, StringPiece input_node_name, StringPiece function_name,
-    const gtl::ArraySlice<const gtl::ArraySlice<int>>& output_shapes,
-    const gtl::ArraySlice<DataType>& output_types) {
+NodeDef MakeMapNode(StringPiece name, StringPiece input_node_name,
+                    StringPiece function_name,
+                    gtl::ArraySlice<PartialTensorShape> output_shapes,
+                    gtl::ArraySlice<DataType> output_types) {
   return MakeMapNodeHelper(name, input_node_name, function_name, "MapDataset",
                            output_shapes, output_types);
 }
 
-NodeDef MakeBatchNode(
-    StringPiece name, StringPiece input_node_name,
-    StringPiece input_batch_size_name,
-    const gtl::ArraySlice<const gtl::ArraySlice<int>>& output_shapes,
-    const gtl::ArraySlice<DataType>& output_types) {
-  return NDef(name, "BatchDataset",
-              {string(input_node_name), string(input_batch_size_name)},
-              {{"output_types", output_types},
-               {"output_shapes", MakeShapeListAttr(output_shapes)}});
+NodeDef MakeBatchNode(StringPiece name, StringPiece input_node_name,
+                      StringPiece input_batch_size_name,
+                      gtl::ArraySlice<PartialTensorShape> output_shapes,
+                      gtl::ArraySlice<DataType> output_types) {
+  return NDef(
+      name, "BatchDataset",
+      {string(input_node_name), string(input_batch_size_name)},
+      {{"output_types", output_types}, {"output_shapes", output_shapes}});
 }
 
-NodeDef MakeBatchV2Node(
-    StringPiece name, StringPiece input_node_name,
-    StringPiece input_batch_size_name, StringPiece input_drop_remainder_name,
-    const gtl::ArraySlice<const gtl::ArraySlice<int>>& output_shapes,
-    const gtl::ArraySlice<DataType>& output_types) {
-  return NDef(name, "BatchDatasetV2",
-              {string(input_node_name), string(input_batch_size_name),
-               string(input_drop_remainder_name)},
-              {{"output_types", output_types},
-               {"output_shapes", MakeShapeListAttr(output_shapes)}});
+NodeDef MakeBatchV2Node(StringPiece name, StringPiece input_node_name,
+                        StringPiece input_batch_size_name,
+                        StringPiece input_drop_remainder_name,
+                        gtl::ArraySlice<PartialTensorShape> output_shapes,
+                        gtl::ArraySlice<DataType> output_types) {
+  return NDef(
+      name, "BatchDatasetV2",
+      {string(input_node_name), string(input_batch_size_name),
+       string(input_drop_remainder_name)},
+      {{"output_types", output_types}, {"output_shapes", output_shapes}});
 }
 
-NodeDef MakeRangeNode(StringPiece name, const gtl::ArraySlice<string>& inputs) {
+NodeDef MakeRangeNode(StringPiece name, gtl::ArraySlice<string> inputs) {
   return NDef(name, "RangeDataset", inputs,
-              {{"output_shapes", MakeShapeListAttr({{}})},
+              {{"output_shapes", gtl::ArraySlice<TensorShape>({{}})},
                {"output_types", gtl::ArraySlice<DataType>({DT_INT64})}});
 }
 
@@ -184,7 +163,7 @@ TEST(MapVectorizationTest, VectorizeWithUndefinedOutputTypes) {
   item.graph = GDef(
       {NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
        NDef("input", "InputDataset", {},
-            {{"output_shapes", MakeShapeListAttr({{}})}}),
+            {{"output_shapes", gtl::ArraySlice<TensorShape>({{}})}}),
        MakeMapNode("map", "input", "XTimesTwo", {{}}, {DT_INT32}),
        MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
       // FunctionLib
@@ -196,6 +175,37 @@ TEST(MapVectorizationTest, VectorizeWithUndefinedOutputTypes) {
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 }
 
+TEST(MapVectorizationTest, VectorizeWithFullyDefinedFunction) {
+  GrapplerItem item;
+  item.graph = GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       MakeRangeNode("range", {"start", "stop", "step"}),
+       MakeMapNode("map", "range", "Func", {{}}, {DT_INT32}),
+       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
+      // FunctionLib
+      {FunctionDefHelper::Create(
+          "Func", {"x: int64", "y: int64"}, {"res: int64", "res2: int64"}, {},
+          {{{"o"}, "Mul", {"x", "x"}, {{"T", DT_INT64}}}},
+          {{"res", "o:z"}, {"res2", "o:z"}})});
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("MapDataset", output).size(),
+            1);
+  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("BatchDataset", output).size(),
+            1);
+  const NodeDef& map_node =
+      output.node(graph_utils::FindGraphNodeWithOp("MapDataset", output));
+  const NodeDef& batch_node =
+      output.node(graph_utils::FindGraphNodeWithOp("BatchDataset", output));
+  EXPECT_EQ(map_node.input(0), batch_node.name());
+  EXPECT_EQ(batch_node.input(0), "range");
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/BUILD b/tensorflow/core/grappler/optimizers/data/vectorization/BUILD
index 1462cb234d..37aa24b947 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/BUILD
@@ -9,13 +9,14 @@ load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_all")
 
 VECTORIZER_DEPS = [
     ":vectorizer_registry",
-    "//tensorflow/core/grappler/optimizers/data:function_utils",
+    "//tensorflow/core/grappler/optimizers/data:graph_utils",
 ] + tf_protos_all()
 
 cc_library(
     name = "vectorizer",
     hdrs = ["vectorizer.h"],
     deps = [
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib",
     ] + tf_protos_all(),
 )
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/cast_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/cast_vectorizer.cc
index c1739737a0..3af6bab409 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/cast_vectorizer.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/cast_vectorizer.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
 
 namespace tensorflow {
@@ -23,26 +23,21 @@ namespace vectorization_utils {
 
 class CastVectorizer : public Vectorizer {
  public:
-  Status Vectorize(const NodeDef& node, gtl::ArraySlice<string> inputs,
-                   FunctionDef* outer_scope,
-                   std::map<string, string>* conversion_map) override {
-    if (inputs.size() != 1) {
+  Status Vectorize(const Node& node, Graph* outer_scope,
+                   std::vector<Port>* input_ports,
+                   std::vector<Port>* output_ports) override {
+    Status s;
+    if (node.num_inputs() != 1) {
       return errors::Internal("Cast op should only have one input.");
     }
 
-    // Add new Cast node
-    NodeDef* new_cast_node = outer_scope->add_node_def();
-    *new_cast_node = node;
-    new_cast_node->clear_name();
-    function_utils::SetUniqueFunctionNodeName(
-        strings::StrCat("vectorized/", node.name()), outer_scope,
-        new_cast_node);
-    new_cast_node->set_input(0, inputs[0]);
-
-    // Add the output mapping to conversion map
-    (*conversion_map)[strings::StrCat(node.name(), ":y:0")] =
-        strings::StrCat(new_cast_node->name(), ":y:0");
+    // Add new Cast node with the same op and attrs as the original node
+    auto new_cast_node = outer_scope->AddNode(node.def(), &s);
+    TF_RETURN_IF_ERROR(s);
 
+    // Add input and output mappings
+    input_ports->push_back({new_cast_node, 0});
+    output_ports->push_back({new_cast_node, 0});
     return Status::OK();
   }
 };
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc
index 776d3179c5..74ce520ce1 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
 
 namespace tensorflow {
@@ -23,31 +23,29 @@ namespace vectorization_utils {
 
 class UnpackVectorizer : public Vectorizer {
  public:
-  Status Vectorize(const NodeDef& node, gtl::ArraySlice<string> inputs,
-                   FunctionDef* outer_scope,
-                   std::map<string, string>* conversion_map) override {
-    if (inputs.size() != 1) {
+  Status Vectorize(const Node& node, Graph* outer_scope,
+                   std::vector<Port>* input_ports,
+                   std::vector<Port>* output_ports) override {
+    Status s;
+    if (node.num_inputs() != 1) {
       return errors::Internal("Unpack op should only have one input.");
     }
 
-    // Add new Unpack node
-    NodeDef* new_unpack_node = outer_scope->add_node_def();
-    *new_unpack_node = node;
-    new_unpack_node->clear_name();
-    function_utils::SetUniqueFunctionNodeName(
-        strings::StrCat("vectorized/", node.name()), outer_scope,
-        new_unpack_node);
+    // Add new Unpack node with the same op and attrs as the original node
+    auto new_unpack_node = outer_scope->AddNode(node.def(), &s);
+    TF_RETURN_IF_ERROR(s);
 
     // Increment "axis" attr by 1:
-    (*new_unpack_node->mutable_attr())["axis"].set_i(
-        node.attr().at("axis").i() + 1);
-    new_unpack_node->set_input(0, inputs[0]);
+    int new_axis = node.def().attr().at("axis").i() + 1;
+    new_unpack_node->AddAttr("axis", new_axis);
 
-    // Add the output mappings to conversion map
-    int num = new_unpack_node->attr().at("num").i();
+    // Add the input mappings
+    input_ports->push_back({new_unpack_node, 0});
+
+    // Add the output mappings
+    int num = node.def().attr().at("num").i();
     for (int i = 0; i < num; ++i) {
-      (*conversion_map)[strings::StrCat(node.name(), ":output:", i)] =
-          strings::StrCat(new_unpack_node->name(), ":output:", i);
+      output_ports->push_back({new_unpack_node, i});
     }
 
     return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h
index d341dbba7d..56eb88c95e 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h
@@ -17,30 +17,33 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_VECTORIZATION_VECTORIZER_H_
 
 #include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace vectorization_utils {
 
+// Describes a tensor with its operation Node and output position
+typedef std::pair<Node*, int> Port;
+
 // Interface for vectorization of TensorFlow operations. See `CastVectorizer`
 // for an example.
 class Vectorizer {
  public:
   virtual ~Vectorizer() {}
 
-  // Vectorizes an operation, `node`, by adding operation(s) to `outer_scope`
+  // Vectorizes an operation, `node`, by adding Node(s) to `outer_scope`
   // that produce the same vector output(s) as executing `node`'s op
-  // on elements of the vector inputs, and adding mappings to `conversion_map`
-  // from old output tensor names to new (vectorized) output tensor names.
-  // The new node(s) collectively have the same number of inputs and outputs as
-  // the node being converted, and use the tensor names in `inputs` as their
-  // inputs.
-  virtual Status Vectorize(const NodeDef& node, gtl::ArraySlice<string> inputs,
-                           FunctionDef* outer_scope,
-                           std::map<string, string>* conversion_map) = 0;
+  // on elements of the vector inputs. The new Node(s) collectively have the
+  // same number of input and output ports as the node being converted.
+  // Adds mappings for the new nodes' input and output ports to `inputs` and
+  // `outputs` respectively, where the i'th Port in inputs/outputs
+  // corresponds to the i'th input/output port of the node to be converted.
+  virtual Status Vectorize(const Node& node, Graph* outer_scope,
+                           std::vector<Port>* input_ports,
+                           std::vector<Port>* output_ports) = 0;
 };
 
 }  // namespace vectorization_utils
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
index 86e303564b..663ceba027 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
@@ -24,9 +24,9 @@ namespace vectorization_utils {
 
 class TestVectorizer : public Vectorizer {
  public:
-  Status Vectorize(const NodeDef& node, gtl::ArraySlice<string> inputs,
-                   FunctionDef* outer_scope,
-                   std::map<string, string>* conversion_map) override {
+  Status Vectorize(const Node& node, Graph* outer_scope,
+                   std::vector<Port>* inputs,
+                   std::vector<Port>* outputs) override {
     return Status::OK();
   }
 };
@@ -39,10 +39,12 @@ TEST(TestVectorizer, TestTestVectorizer) {
   auto vectorizer = VectorizerRegistry::Global()->Get("test_op");
   EXPECT_NE(vectorizer, nullptr);
 
-  FunctionDef function;
-  NodeDef node;
-  std::map<string, string> conversion_map;
-  EXPECT_TRUE(vectorizer->Vectorize(node, {}, &function, &conversion_map).ok());
+  Graph g(OpRegistry::Global());
+  NodeDef node_def;
+  Status s;
+  Node* node = g.AddNode(node_def, &s);
+  std::vector<Port> inputs, outputs;
+  EXPECT_TRUE(vectorizer->Vectorize(*node, &g, &inputs, &outputs).ok());
 }
 
 }  // namespace vectorization_utils
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
index cb56b65985..cea667f668 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
@@ -14,13 +14,17 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/data/vectorization_utils.h"
+#include <memory>
 #include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
 
 #include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/types.h"
@@ -36,255 +40,346 @@ namespace tensorflow {
 namespace grappler {
 namespace vectorization_utils {
 
-using function_utils::FunctionDefTensorDesc;
-
 namespace {
 
-void AddMapDefunOutput(FunctionDef* map_defun_fn, NodeDef* map_defun_node,
-                       const string& output_retval, const DataType t) {
-  // Set to unknown shape
-  TensorShapeProto tensor_shape_proto;
-  PartialTensorShape().AsProto(&tensor_shape_proto);
+// Describes a tensor with its operation Node and output position
+typedef std::pair<Node*, int> TensorDesc;
 
-  function_utils::AddFunctionOutputWithUniqueName(
-      "vectorized_out", output_retval, map_defun_fn, t);
+const char* const kRetValOp = "_Retval";
 
-  *(*map_defun_node->mutable_attr())["output_shapes"]
-       .mutable_list()
-       ->add_shape() = tensor_shape_proto;
-  (*map_defun_node->mutable_attr())["output_types"].mutable_list()->add_type(t);
+void ReplaceEdgeSources(const TensorDesc& old_src, const TensorDesc& new_src,
+                        Graph* graph) {
+  // NOTE: We need two for loops here because we can't mutate the set of output
+  // edges as we iterate over them.
+  std::vector<const Edge*> edges_to_replace;
+  for (auto edge : old_src.first->out_edges()) {
+    if (edge->src_output() == old_src.second) {
+      edges_to_replace.push_back(edge);
+    }
+  }
+  for (auto edge : edges_to_replace) {
+    graph->AddEdge(new_src.first, new_src.second, edge->dst(),
+                   edge->dst_input());
+    graph->RemoveEdge(edge);
+  }
 }
 
-void RemoveMapDefunOutput(FunctionDef* outer_scope, FunctionDef* map_defun_fn,
-                          NodeDef* map_defun_node, int output_position) {
-  DCHECK_LT(output_position, map_defun_fn->signature().output_arg_size())
-      << "Trying to remove output that doesn't exist. Output number: "
-      << output_position;
+Status AddMapDefunOutput(FunctionBody* map_defun_fn, Node* map_defun_node,
+                         const TensorDesc& output) {
+  // Note that we don't update MapDefun attrs as we go, only when we are done
+  DataType type = output.first->output_type(output.second);
+  int index = map_defun_fn->ret_nodes.size();
 
-  int num_later_outputs =
-      map_defun_fn->signature().output_arg_size() - output_position - 1;
+  NodeDef ret_node_def;
+  ret_node_def.set_name("map_out");
+  ret_node_def.set_op(kRetValOp);
+  AddNodeAttr("T", type, &ret_node_def);
+  AddNodeAttr("index", index, &ret_node_def);
 
-  // Remove from map_defun_fn's ret dict and output args
-  map_defun_fn->mutable_ret()->erase(
-      map_defun_fn->signature().output_arg(output_position).name());
-  map_defun_fn->mutable_signature()->mutable_output_arg()->DeleteSubrange(
-      output_position, 1);
+  Status s;
+  Node* ret_node = map_defun_fn->graph->AddNode(ret_node_def, &s);
+  TF_RETURN_IF_ERROR(s);
 
-  // Renumber outputs that come after
-  for (int i = 0; i < num_later_outputs; ++i) {
-    function_utils::ReplaceReferences(
-        strings::StrCat(map_defun_node->name(),
-                        ":output:", output_position + i + 1),
-        strings::StrCat(map_defun_node->name(),
-                        ":output:", output_position + i),
-        outer_scope);
-  }
-  map_defun_node->mutable_attr()
-      ->at("output_shapes")
-      .mutable_list()
-      ->mutable_shape()
-      ->DeleteSubrange(output_position, 1);
-  map_defun_node->mutable_attr()
-      ->at("output_types")
-      .mutable_list()
-      ->mutable_type()
-      ->ExtractSubrange(output_position, 1, nullptr);
+  map_defun_fn->graph->AddEdge(output.first, output.second, ret_node, 0);
+  map_defun_fn->ret_nodes.push_back(ret_node);
+  map_defun_fn->ret_types.push_back(type);
+
+  return s;
 }
 
-int FindOutputToConvert(const FunctionDef& function,
-                        const std::set<string>& unconvertible,
-                        FunctionDefTensorDesc* f) {
-  for (int i = function.signature().output_arg_size() - 1; i >= 0; --i) {
-    const string& ret_key = function.signature().output_arg(i).name();
-    *f = FunctionDefTensorDesc(function.ret().at(ret_key));
+void RemoveMapDefunOutput(int output_position, Graph* outer_scope,
+                          FunctionBody* map_defun_fn, Node* map_defun_node) {
+  // Note that we don't update MapDefun attrs as we go, only when we are done
+  DCHECK_LT(output_position, map_defun_fn->ret_nodes.size())
+      << "Trying to remove output that doesn't exist. Output number: "
+      << output_position;
+
+  int num_later_outputs = map_defun_fn->ret_nodes.size() - output_position - 1;
 
-    if (unconvertible.find(f->node_name) == unconvertible.end()) {
-      return i;
-    }
+  // Modify map_defun_fn's signature and remove the output node from its graph
+  map_defun_fn->graph->RemoveNode(map_defun_fn->ret_nodes[output_position]);
+  map_defun_fn->ret_nodes.erase(map_defun_fn->ret_nodes.begin() +
+                                output_position);
+  map_defun_fn->ret_types.erase(map_defun_fn->ret_types.begin() +
+                                output_position);
+
+  // Renumber the nodes and edges that come after
+  for (int i = 0; i < num_later_outputs; ++i) {
+    ReplaceEdgeSources({map_defun_node, output_position + i + 1},
+                       {map_defun_node, output_position + i}, outer_scope);
+    // Each ret node has an "index" attr that has to be updated
+    map_defun_fn->ret_nodes[output_position + i]->AddAttr("index",
+                                                          output_position + i);
   }
-  return -1;
 }
 
 // Helper class that vectorizes the body of a MapDefun node, adding new
 // operations to the graph that collectively compute the same value as what
 // running the MapDefun function on slices of the input would produce.
-// Each instance of the class encapsulates all the data necessary to vectorize a
-// MapDefun op in place.
+// This class transforms the input FunctionDefs into their corresponding
+// Graph objects and works on the graphs directly, then converts them back
+// to FunctionDefs when GetResult is called.
 class Vectorization {
  public:
-  Vectorization(FunctionDef* outer_scope, FunctionDef* map_defun_fn,
-                NodeDef* map_defun_node)
-      : outer_scope_(outer_scope),
-        map_defun_fn_(map_defun_fn),
-        map_defun_node_(map_defun_node) {}
+  explicit Vectorization(FunctionDefLibrary* lib)
+      : lib_(lib), lib_def_(OpRegistry::Global(), *lib) {}
 
-  // Repeatedly tries to convert outputs of map_defun_fn_ into new nodes in
-  // the outer_scope_, until there are no convertible outputs remaining.
-  // This method is idempotent.
-  void Vectorize();
+  // Adds the vectorized function and new map_defun_fn to lib, and points
+  // vectorized_function to the former. Returns an error status if
+  // the conversion between FunctionDef -> Graph -> FunctionDef failed anywhere
+  // along the way.
+  Status Vectorize(const FunctionDef& outer_scope,
+                   const NodeDef& map_defun_node, FunctionDef** result);
 
  private:
-  // Vectorizes the map defun function's output at output_position
-  Status ConvertOutput(int output_position, const FunctionDefTensorDesc& desc);
-  // Given a descriptor of the original output tensor, gets a string
-  // corresponding to the converted output tensor.
-  Status ConvertOutputHelper(const FunctionDefTensorDesc& output_desc,
-                             string* converted);
-  Status AddConversionMappingFromInput(
-      const FunctionDefTensorDesc& output_desc);
+  // Converts FunctionDefs to Graphs.
+  Status Initialize(const FunctionDef& outer_scope,
+                    const NodeDef& map_defun_node);
+
+  // Converts Graphs back to FunctionDefs and adds them to `lib_`.
+  Status GetResult(FunctionDef** vectorized_function);
+
+  // Repeatedly tries to convert outputs of `map_defun_fn_` into new nodes in
+  // `outer_scope_`, until there are no convertible outputs remaining.
+  void VectorizeHelper();
+
+  // Vectorizes map_defun_fn's output at output_position.
+  Status ConvertOutput(int output_position);
 
   // Adds mappings from node's outputs tensors to converted output tensors,
   // creating the necessary new node(s). Generally, the steps to convert an op
   // are:
-  // 1) Promote the inputs of the op inputs to outputs of the map_defun_fn_,
-  //    and modify map_defun_node_ attrs accordingly
-  // 2) Create new node(s) in outer_scope_ that act on batched input tensors.
+  // 1) Create new node(s) in `outer_scope_` that act on batched input tensors.
   //    These operations collectively compute the same value as what running
   //    the original operation on slices of the input tensors would produce.
   //    For example, a Cast op in MapDefun translates to a Cast op in
-  //    outer_scope_, since the vectorized version of Cast is itself.
-  // 3) Set inputs of new node(s) to the corresponding converted inputs (that
-  //    are now outputs of map_defun_node_)
-  // 4) For each output of the old node, add the mapping of output strings to
-  //    the conversion map (eg "Cast:y:0" -> "Vectorize/Cast:y:0")
-  Status AddConversionMappingFromOp(const NodeDef& node,
-                                    const FunctionDefTensorDesc& output_desc);
-
-  // Maps a tensor name to the name of the corresponding vectorized tensor. For
-  // example, "Cast:y:0" -> "Vectorize/Cast:y:0"
-  std::map<string, string> conversion_map_;
-  // Unconvertible node names
-  std::set<string> unconvertible_;
-
-  FunctionDef* outer_scope_;
-  FunctionDef* map_defun_fn_;
-  NodeDef* map_defun_node_;
+  //    `outer_scope_`, since the vectorized version of Cast is itself.
+  // 2) Promote the inputs of the op inputs to outputs of the
+  //    `map_defun_node_` and `map_defun_fn_`.
+  // 3) Add edges between the promoted inputs (that are now outputs of
+  //    `map_defun_node`) and the inputs ports of the new node(s).
+  // 4) For each output of the old node, add the mapping of output tensors to
+  //    the conversion map.
+  Status AddConversionMapping(Node* op_node);
+
+  // Maps a tensor to the corresponding vectorized tensor. For example,
+  // {"Cast" Node*, 0} -> {"Vectorize/Cast" Node*, 0}
+  std::map<TensorDesc, TensorDesc> conversion_map_;
+
+  // Unconvertible ret nodes
+  std::set<Node*> unconvertible_;
+
+  FunctionDefLibrary* lib_;  // Not owned
+  FunctionLibraryDefinition lib_def_;
+  // Note that FunctionBody has a pointer to a Graph object that corresponds
+  // to the function's subgraph, with additional kArgOp and kRetValOp nodes
+  // that denote that function arguments and return values. These nodes have the
+  // attrs "T" for the type, and "index" for the argument / retval index
+  // respectively. FunctionBody also keeps track of arg/ret_nodes and
+  // arg/ret_types, that should be ordered according to argument/output indices.
+  std::unique_ptr<Graph> outer_scope_;
+  std::unique_ptr<FunctionBody> map_defun_fn_;
+  Node* map_defun_node_ = nullptr;  // Owned by `outer_scope`
+  Status status_;
 };
 
-Status Vectorization::AddConversionMappingFromOp(
-    const NodeDef& node, const FunctionDefTensorDesc& output_desc) {
-  for (const string& input_name : node.input()) {
-    if (IsControlInput(input_name)) {
+Status Vectorization::AddConversionMapping(Node* op_node) {
+  for (auto edge : op_node->in_edges()) {
+    if (edge->IsControlEdge()) {
       return errors::InvalidArgument(
           "Vectorizing outputs with control inputs is currently not "
           "supported.");
     }
   }
 
-  // TODO(rachelim): Have some mechanism for registering converters and some
-  // uniform, simpler way to represent them.
-
-  DataTypeVector types;
-  const OpDef* op_def = nullptr;
-  TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef(node.op(), &op_def));
-  TF_RETURN_IF_ERROR(InputTypesForNode(node, *op_def, &types));
-
-  std::vector<string> promoted_inputs;
-  promoted_inputs.reserve(node.input_size());
-  for (int i = 0; i < node.input_size(); ++i) {
-    promoted_inputs.push_back(strings::StrCat(
-        map_defun_node_->name(),
-        ":output:", map_defun_fn_->signature().output_arg_size() + i));
-  }
-
-  auto vectorizer = VectorizerRegistry::Global()->Get(node.op());
+  auto vectorizer = VectorizerRegistry::Global()->Get(op_node->type_string());
   if (vectorizer == nullptr) {
     return errors::Unimplemented("No vectorizer registered for op: ",
-                                 node.op());
+                                 op_node->type_string());
+  }
+  std::vector<Port> input_ports, output_ports;
+  input_ports.reserve(op_node->num_inputs());
+  output_ports.reserve(op_node->num_outputs());
+  TF_RETURN_IF_ERROR(vectorizer->Vectorize(*op_node, outer_scope_.get(),
+                                           &input_ports, &output_ports));
+
+  std::vector<const Edge*> input_edges;
+  TF_RETURN_IF_ERROR(op_node->input_edges(&input_edges));
+
+  if (op_node->num_outputs() != output_ports.size() ||
+      op_node->num_inputs() != input_ports.size() ||
+      input_edges.size() != input_ports.size()) {
+    return errors::Internal("Vectorizer inputs/outputs don't match.");
   }
 
-  TF_RETURN_IF_ERROR(vectorizer->Vectorize(node, promoted_inputs, outer_scope_,
-                                           &conversion_map_));
+  // Promote the inputs of the op to MapDefun outputs and connect the edges
+  // accordingly.
+  for (size_t i = 0; i < op_node->num_inputs(); ++i) {
+    auto edge = input_edges[i];
+    TF_RETURN_IF_ERROR(AddMapDefunOutput(map_defun_fn_.get(), map_defun_node_,
+                                         {edge->src(), edge->src_output()}));
+    outer_scope_->AddEdge(map_defun_node_, map_defun_fn_->ret_nodes.size() - 1,
+                          input_ports[i].first, input_ports[i].second);
+  }
 
-  // If we get here, the conversion was successful, so we promote the inputs
-  // of the ops to MapDefun outputs.
-  for (int i = 0; i < types.size(); ++i) {
-    AddMapDefunOutput(map_defun_fn_, map_defun_node_, node.input(i), types[i]);
+  // Add output mappings.
+  for (size_t i = 0; i < op_node->num_outputs(); ++i) {
+    conversion_map_.insert({{op_node, i}, std::move(output_ports[i])});
   }
 
   return Status::OK();
 }
 
-Status Vectorization::AddConversionMappingFromInput(
-    const FunctionDefTensorDesc& output_desc) {
-  int input_index = function_utils::FindFunctionInputWithName(
-      output_desc.node_name, *map_defun_fn_);
-  if (input_index == -1) {
-    return errors::Internal("Cannot convert non-existent input.");
+Status Vectorization::ConvertOutput(int output_position) {
+  // ret_edge->src() is the actual op that generated the retval, and
+  // ret_edge->dst() is the retval node whose op is "_Retval"
+  const Edge* ret_edge;
+  TF_RETURN_IF_ERROR(
+      map_defun_fn_->ret_nodes[output_position]->input_edge(0, &ret_edge));
+
+  TensorDesc output({ret_edge->src(), ret_edge->src_output()});
+  TensorDesc converted_output;
+  if (auto found = gtl::FindOrNull(conversion_map_, output)) {
+    // It's possible the output already has a mapping, if it comes from a node
+    // that has already been converted.
+    converted_output = *found;
+  } else {
+    TF_RETURN_IF_ERROR(AddConversionMapping(output.first));
+    converted_output = conversion_map_.at(output);
   }
 
-  conversion_map_[output_desc.full_str] = map_defun_node_->input(input_index);
+  ReplaceEdgeSources({map_defun_node_, output_position}, converted_output,
+                     outer_scope_.get());
+  RemoveMapDefunOutput(output_position, outer_scope_.get(), map_defun_fn_.get(),
+                       map_defun_node_);
+
   return Status::OK();
 }
 
-Status Vectorization::ConvertOutputHelper(
-    const FunctionDefTensorDesc& output_desc, string* converted) {
-  // It's possible the output already has a mapping, if it comes from a node
-  // that has already been converted.
-  if (auto found = gtl::FindOrNull(conversion_map_, output_desc.full_str)) {
-    *converted = *found;
-    return Status::OK();
+Status Vectorization::Vectorize(const FunctionDef& outer_scope,
+                                const NodeDef& map_defun_node,
+                                FunctionDef** result) {
+  TF_RETURN_IF_ERROR(Initialize(outer_scope, map_defun_node));
+  VectorizeHelper();
+  return GetResult(result);
+}
+
+void Vectorization::VectorizeHelper() {
+  while (true) {
+    int output_position = graph_utils::GetFirstElementIndexWithPredicate(
+        [this](Node* n) {
+          return this->unconvertible_.find(n) == this->unconvertible_.end();
+        },
+        map_defun_fn_->ret_nodes);
+
+    // No outputs left to convert
+    if (output_position == -1) break;
+
+    Status s = ConvertOutput(output_position);
+    if (!s.ok()) {
+      Node* output_node = map_defun_fn_->ret_nodes.at(output_position);
+      VLOG(2) << "Could not convert the output at node: "
+              << output_node->DebugString() << "\nError: " << s;
+      unconvertible_.insert(output_node);
+    }
   }
 
-  int index = function_utils::FindFunctionNodeWithName(output_desc.node_name,
-                                                       *map_defun_fn_);
-  if (index == -1) {  // The output comes from an input
-    TF_RETURN_IF_ERROR(AddConversionMappingFromInput(output_desc));
+  // If we've converted all the outputs of the MapDefun function, we no longer
+  // need the MapDefun node and can delete it.
+  if (map_defun_fn_->ret_nodes.empty()) {
+    outer_scope_->RemoveNode(map_defun_node_);
   } else {
-    TF_RETURN_IF_ERROR(AddConversionMappingFromOp(
-        map_defun_fn_->node_def(index), output_desc));
+    // Update MapDefun node attrs accordingly
+    DCHECK_EQ(map_defun_fn_->ret_types.size(), map_defun_fn_->ret_nodes.size());
+    map_defun_node_->AddAttr(
+        "output_shapes",
+        std::vector<PartialTensorShape>(map_defun_fn_->ret_types.size()));
+    map_defun_node_->AddAttr("output_types", map_defun_fn_->ret_types);
   }
-  *converted = conversion_map_.at(output_desc.full_str);
-  return Status::OK();
 }
+Status Vectorization::Initialize(const FunctionDef& outer_scope,
+                                 const NodeDef& map_defun_node) {
+  // Convert outer_scope and map_defun_fn to FunctionBodys so we can
+  // work on Graphs directly.
+  const FunctionDef* map_defun_fn =
+      lib_def_.Find(map_defun_node.attr().at("f").func().name());
+
+  if (map_defun_fn == nullptr) {
+    return errors::NotFound("Could not find function with name ",
+                            map_defun_node.attr().at("f").func().name(),
+                            " in function library.");
+  }
 
-Status Vectorization::ConvertOutput(int output_position,
-                                    const FunctionDefTensorDesc& output_desc) {
-  string converted_output_name;
-  TF_RETURN_IF_ERROR(ConvertOutputHelper(output_desc, &converted_output_name));
+  auto get_func_sig = [this](const string& op, const OpDef** sig) {
+    return this->lib_def_.LookUpOpDef(op, sig);
+  };
+
+  FunctionBody* outer_fn;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(outer_scope, {}, &lib_def_,
+                                             get_func_sig, &outer_fn));
+  // We don't need outer_fn, just the graph
+  outer_scope_.reset(outer_fn->graph);
+  outer_fn->graph = nullptr;
+  delete outer_fn;
+
+  FunctionBody* tmp;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(*map_defun_fn, {}, &lib_def_,
+                                             get_func_sig, &tmp));
+  map_defun_fn_.reset(tmp);
+
+  // Find the MapDefun node in outer_scope_
+  int node_id = graph_utils::GetFirstElementIndexWithPredicate(
+      [&map_defun_node](Node* n) { return n->name() == map_defun_node.name(); },
+      outer_scope_->nodes());
+  if (node_id == -1) {
+    return errors::NotFound("Could not find node with name ",
+                            map_defun_node.name(), " in outer_scope.");
+  }
+  map_defun_node_ = outer_scope_->FindNodeId(node_id);
+
+  // Add mappings from map_defun_fn_ arg nodes to map_defun_node_ input nodes to
+  // the conversion map
+  for (auto arg_node : map_defun_fn_->arg_nodes) {
+    Node* input_node;
+    TF_RETURN_IF_ERROR(map_defun_node_->input_node(
+        arg_node->attrs().Find("index")->i(), &input_node));
 
-  // Remove the old output and make everything that referenced it point
-  // to the new string
-  function_utils::ReplaceReferences(
-      strings::StrCat(map_defun_node_->name(), ":output:", output_position),
-      converted_output_name, outer_scope_);
-  RemoveMapDefunOutput(outer_scope_, map_defun_fn_, map_defun_node_,
-                       output_position);
+    conversion_map_.insert({{arg_node, 0}, {input_node, 0}});
+  }
 
   return Status::OK();
 }
 
-void Vectorization::Vectorize() {
-  while (true) {
-    FunctionDefTensorDesc desc;
-    int output_position =
-        FindOutputToConvert(*map_defun_fn_, unconvertible_, &desc);
-    if (output_position == -1) break;
+Status Vectorization::GetResult(FunctionDef** vectorized_function) {
+  TF_RETURN_IF_ERROR(status_);
 
-    if (!ConvertOutput(output_position, desc).ok()) {
-      unconvertible_.insert(desc.node_name);
-    }
-  }
+  if (!map_defun_fn_->ret_nodes.empty()) {
+    FunctionDef* map_defun_fn = lib_->add_function();
+    graph_utils::SetUniqueGraphFunctionName("map_defun_fn", lib_, map_defun_fn);
+    TF_RETURN_IF_ERROR(GraphToFunctionDef(
+        *map_defun_fn_->graph, map_defun_fn->signature().name(), map_defun_fn));
 
-  // If we've converted all the outputs of the MapDefun function, we no longer
-  // need the MapDefun node and can delete it.
-  if (map_defun_fn_->signature().output_arg_size() == 0) {
-    outer_scope_->mutable_node_def()->DeleteSubrange(
-        function_utils::FindFunctionNodeWithName(map_defun_node_->name(),
-                                                 *outer_scope_),
-        1);
+    AttrValue func_attr;
+    func_attr.mutable_func()->set_name(map_defun_fn->signature().name());
+    map_defun_node_->AddAttr("f", func_attr);
   }
 
-  if (!unconvertible_.empty()) {
-    VLOG(2) << "The following nodes could not be converted: ["
-            << absl::StrJoin(unconvertible_, ", ") << "].";
-  }
+  *vectorized_function = lib_->add_function();
+  graph_utils::SetUniqueGraphFunctionName("vectorized_fn", lib_,
+                                          *vectorized_function);
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(
+      *outer_scope_, (*vectorized_function)->signature().name(),
+      *vectorized_function));
+  return Status::OK();
 }
+
 }  // namespace
 
-void VectorizeMapDefun(FunctionDef* outer_scope, FunctionDef* map_defun_fn,
-                       NodeDef* map_defun_node) {
-  Vectorization(outer_scope, map_defun_fn, map_defun_node).Vectorize();
+Status VectorizeMapDefun(const FunctionDef& outer_scope,
+                         const NodeDef& map_defun_node, FunctionDefLibrary* lib,
+                         FunctionDef** result) {
+  *result = nullptr;
+  return Vectorization(lib).Vectorize(outer_scope, map_defun_node, result);
 }
 
 }  // end namespace vectorization_utils
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils.h b/tensorflow/core/grappler/optimizers/data/vectorization_utils.h
index bb405faa77..bd7d390900 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils.h
@@ -24,22 +24,28 @@ namespace tensorflow {
 namespace grappler {
 namespace vectorization_utils {
 
-// Given a function, `map_defun_fn`, that is mapped across some input vector
-// elements via a MapDefun operation, `VectorizeMapDefun` attempts to
-// vectorize the MapDefun by "lifting" operations from the `map_defun_fn` to the
-// `outer_scope`; that is, replacing `map_defun_fn` operations with new
-// `outer_scope` operations that produce the same vector output(s) as executing
-// the `map_defun_fn` operations on elements of vector input(s) would. If all
-// `map_defun_fn` operations are successfully lifted, `map_defun_node` is
-// eliminated from `outer_scope` altogether. However, if some operations cannot
-// be lifted, and this vectorization only succeeds partially, `map_defun_node`
-// remains to be used for operations that were not lifted.
+// Given a MapDefun node (`map_defun_node`) in a FunctionDef (`outer_scope`)
+// that maps a function in lib across some input vector elements,
+// `VectorizeMapDefun` attempts to create a vectorized version of `outer_scope`
+// by "lifting" operations from the MapDefun function to the new function
+// (`result`); that is, replacing operations in the MapDefun function with
+// operations that produce the same vector output(s) as executing the original
+// operations on elements of vector input(s) would. If all operations in the
+// MapDefun function are successfully lifted, `result` has no MapDefun node
+// altogether. However, if some operations cannot be lifted, and this
+// vectorization only succeeds partially, a MapDefun node remains in `result` to
+// be used for operations that were not lifted, and the modified MapDefun
+// function is added to `lib`. The newly vectorized function `result` is also
+// added to `lib`.
+//
+// Returns Status::OK() if the vectorization is completely or partially
+// successful. Otherwise, returns an error, and sets `result` to nullptr.
 //
 // Example:
 //   If the input to the `VectorizeMapDefun` function is a MapDefun
 // whose `map_defun_fn` performs the Cast operation, the vectorization will
 // eliminate the MapDefun. This is because the Cast operation supports
-// any tensor shape and can thus be lifted to the `outer_scope`.
+// any tensor shape and can thus be lifted to `result`.
 //
 // Before:
 //
@@ -68,7 +74,7 @@ namespace vectorization_utils {
 //
 // After:
 //
-// outer_scope     +------+
+// result          +------+
 // +---------------+ Arg0 +---------+
 // |               +---+--+         |
 // |                   |            |
@@ -80,8 +86,9 @@ namespace vectorization_utils {
 // +---------------+ Ret0 +---------+
 //                 +------+
 //
-void VectorizeMapDefun(FunctionDef* outer_scope, FunctionDef* map_defun_fn,
-                       NodeDef* map_defun_node);
+Status VectorizeMapDefun(const FunctionDef& outer_scope,
+                         const NodeDef& map_defun_node, FunctionDefLibrary* lib,
+                         FunctionDef** result);
 
 }  // end namespace vectorization_utils
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc
index e129fa9237..1ff62217dd 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/grappler/optimizers/data/function_utils.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
@@ -60,6 +61,11 @@ NodeDef* AddMapDefunNode(const string& name, const std::vector<string>& inputs,
   return node;
 }
 
+string GetRetval(const FunctionDef& function_def, int index) {
+  return function_def.ret().at(
+      function_def.signature().output_arg(index).name());
+}
+
 // TODO(rachelim): Use FunctionDefHelper::Create instead
 FunctionDef CreateFunction(
     StringPiece name, const std::vector<std::pair<string, DataType>>& inputs,
@@ -85,7 +91,6 @@ FunctionDef CreateFunction(
   return func;
 }
 
-TEST(FunctionDefInputDescTest, ConstructedCorrectly) {}
 
 // Before:
 //
@@ -133,10 +138,15 @@ TEST(VectorizeMapDefunTest, VectorizeDefunNoOps) {
       {{}, {}}, inner.signature().name(), &outer);
   CHECK_NOTNULL(map_defun);
 
-  VectorizeMapDefun(&outer, &inner, map_defun);
-  EXPECT_TRUE(!function_utils::ContainsFunctionNodeWithOp("MapDefun", outer));
-  EXPECT_EQ(outer.ret().at("mapdefun"), "ret0");
-  EXPECT_EQ(outer.ret().at("mapdefun_0"), "ret1");
+  FunctionDefLibrary lib;
+  *lib.add_function() = outer;
+  *lib.add_function() = inner;
+  FunctionDef* vectorized;
+  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+  EXPECT_EQ(GetRetval(*vectorized, 0), "ret0");
+  EXPECT_EQ(GetRetval(*vectorized, 1), "ret1");
 }
 
 // Before:
@@ -149,12 +159,12 @@ TEST(VectorizeMapDefunTest, VectorizeDefunNoOps) {
 // |   +-----------+ Arg0 +---+ Arg1 +----+   |
 // |   |           +---+--+   +---+--+    |   |
 // |   |               |          |       |   |
-// |   |   +------+    |      +---v--+    |   |
-// |   |   |Const |    |      | Op0  |    |   |
-// |   |   +---v--+    |      +---+--+    |   |
+// |   |   +------+    |          |       |   |
+// |   |   |Const |    |          |       |   |
+// |   |   +---v--+    |          |       |   |
 // |   |       |       |          |       |   |
 // |   |       |   +---v--+   +---v--+    |   |
-// |   |       +---| XOp1 |   | XOp2 |    |   |
+// |   |       +---| XOp1 |   | Cast |    |   |
 // |   |           +---+--+   +---+--+    |   |
 // |   |               |          |       |   |
 // |   | MapDefun  +---v--+   +---v--+    |   |
@@ -165,23 +175,50 @@ TEST(VectorizeMapDefunTest, VectorizeDefunNoOps) {
 // +---------------+ Ret0 +---+ Ret1 +--------+
 //                 +------+   +------+
 //
-//   where XOp1 and XOp2 are not convertible.
+//   where XOp1 is not convertible.
 //
 // After:
 //
-// No change because the ops are not convertible.
+//
+//                 +------+   +------+
+// +---------------+ Arg0 +---+ Arg1 +--------+
+// |               +---+--+   +---+--+        |
+// |                   |          |           |
+// |               +---v--+       |           |
+// |   +-----------+ Arg0 +-+     |           |
+// |   |           +---+--+ |     |           |
+// |   |               |    |     |           |
+// |   |   +------+    |    |     |           |
+// |   |   |Const |    |    |     |           |
+// |   |   +---v--+    |    |     |           |
+// |   |       |       |    |     |           |
+// |   |       |   +---v--+ | +---v--+        |
+// |   |       +---| XOp1 | | | Cast |        |
+// |   |           +---+--+ | +---+--+        |
+// |   |               |    |     |           |
+// |   | MapDefun  +---v--+ |     |           |
+// |   +-----------+ Ret0 +-+     |           |
+// |               +---+--+       |           |
+// |                   |          |           |
+// |               +---v--+   +---v--+        |
+// +---------------+ Ret0 +---+ Ret1 +--------+
+//                 +------+   +------+
 //
 TEST(VectorizeMapDefunTest, VectorizeDefunUnconvertible) {
   FunctionDef inner =
       CreateFunction("inner_function", {{"arg0", DT_INT32}, {"arg1", DT_INT32}},
                      {{"ret0", DT_INT32}, {"ret1", DT_INT32}},
-                     {{"ret0", "XOp1:output:0"}, {"ret1", "XOp2:output:0"}});
+                     {{"ret0", "MatMul:product:0"}, {"ret1", "Cast:y:0"}});
+  // TODO(rachelim): If we ever write a converter for MatMul, we have to
+  // change this test.
   NodeDef* x_op1 =
-      function_utils::AddNode("XOp1", "XOp1", {"const", "arg0"}, {}, &inner);
+      function_utils::AddNode("MatMul", "MatMul", {"arg0", "arg0"}, {}, &inner);
   CHECK_NOTNULL(x_op1);
+  graph_transforms::SetNodeAttr("T", DT_INT32, x_op1);
 
-  NodeDef* x_op2 = function_utils::AddNode("XOp2", "XOp2", {"op1"}, {}, &inner);
-  CHECK_NOTNULL(x_op2);
+  NodeDef* cast_node =
+      AddCastNode("Cast", {"arg1"}, DT_INT32, DT_INT32, false, &inner);
+  CHECK_NOTNULL(cast_node);
 
   FunctionDef outer = CreateFunction(
       "outer_function", {{"x", DT_INT32}, {"y", DT_INT32}},
@@ -193,12 +230,22 @@ TEST(VectorizeMapDefunTest, VectorizeDefunUnconvertible) {
       {{}, {}}, inner.signature().name(), &outer);
   CHECK_NOTNULL(map_defun);
 
-  FunctionDef outer_copy(outer);
-  FunctionDef inner_copy(inner);
-  VectorizeMapDefun(&outer, &inner, map_defun);
-  // They should be unchanged
-  EXPECT_TRUE(FunctionDefsEqual(outer_copy, outer));
-  EXPECT_TRUE(FunctionDefsEqual(inner_copy, inner));
+  FunctionDefLibrary lib;
+  *lib.add_function() = outer;
+  *lib.add_function() = inner;
+  FunctionDef* vectorized;
+  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+
+  auto map_defun_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("MapDefun", *vectorized));
+  // The Cast node should be converted just fine.
+  EXPECT_EQ(GetRetval(*vectorized, 1), "Cast:y:0");
+
+  // The inner function should only have one retval.
+  FunctionLibraryDefinition lib_def(OpRegistry::Global(), lib);
+  const FunctionDef* map_defun_fn =
+      lib_def.Find(map_defun_node.attr().at("f").func().name());
+  EXPECT_EQ(map_defun_fn->signature().output_arg_size(), 1);
 }
 
 // Before:
@@ -257,14 +304,19 @@ TEST(VectorizeMapDefunTest, VectorizeDefunSimpleCast) {
                       inner.signature().name(), &outer);
   CHECK_NOTNULL(map_defun);
 
-  VectorizeMapDefun(&outer, &inner, map_defun);
-  EXPECT_TRUE(!function_utils::ContainsFunctionNodeWithOp("MapDefun", outer));
-  const NodeDef& cast_node =
-      outer.node_def(function_utils::FindFunctionNodeWithOp("Cast", outer));
+  FunctionDefLibrary lib;
+  *lib.add_function() = outer;
+  *lib.add_function() = inner;
+  FunctionDef* vectorized;
+  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+  const NodeDef& cast_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
   EXPECT_EQ(cast_node.input(0), "x");
-  EXPECT_EQ(outer.ret().at("mapdefun"),
+  EXPECT_EQ(GetRetval(*vectorized, 0),
             strings::StrCat(cast_node.name(), ":y:0"));
-  EXPECT_EQ(outer.node_def_size(), 1);
+  EXPECT_EQ(vectorized->node_def_size(), 1);
 }
 
 // Before:
@@ -330,16 +382,21 @@ TEST(VectorizeMapDefunTest, VectorizeDefunCastUsedTwice) {
                       {{}, {}}, inner.signature().name(), &outer);
   CHECK_NOTNULL(map_defun);
 
-  VectorizeMapDefun(&outer, &inner, map_defun);
-  EXPECT_TRUE(!function_utils::ContainsFunctionNodeWithOp("MapDefun", outer));
-  const NodeDef& cast_node =
-      outer.node_def(function_utils::FindFunctionNodeWithOp("Cast", outer));
+  FunctionDefLibrary lib;
+  *lib.add_function() = outer;
+  *lib.add_function() = inner;
+  FunctionDef* vectorized;
+  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+  const NodeDef& cast_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
   EXPECT_EQ(cast_node.input(0), "x");
-  EXPECT_EQ(outer.ret().at("mapdefun"),
+  EXPECT_EQ(GetRetval(*vectorized, 0),
             strings::StrCat(cast_node.name(), ":y:0"));
-  EXPECT_EQ(outer.ret().at("mapdefun_0"),
+  EXPECT_EQ(GetRetval(*vectorized, 1),
             strings::StrCat(cast_node.name(), ":y:0"));
-  EXPECT_EQ(outer.node_def_size(), 1);
+  EXPECT_EQ(vectorized->node_def_size(), 1);
 }
 
 // Before:
@@ -411,21 +468,26 @@ TEST(VectorizeMapDefunTest, VectorizeDefunOpWithMultipleOutputs) {
       {{1}, {1}, {1}}, inner.signature().name(), &outer);
   CHECK_NOTNULL(map_defun);
 
-  VectorizeMapDefun(&outer, &inner, map_defun);
-  EXPECT_TRUE(!function_utils::ContainsFunctionNodeWithOp("MapDefun", outer));
-  const NodeDef& unpack_node =
-      outer.node_def(function_utils::FindFunctionNodeWithOp("Unpack", outer));
+  FunctionDefLibrary lib;
+  *lib.add_function() = outer;
+  *lib.add_function() = inner;
+  FunctionDef* vectorized;
+  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+  const NodeDef& unpack_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Unpack", *vectorized));
   EXPECT_EQ(unpack_node.input(0), "x");
   EXPECT_EQ(unpack_node.attr().at("axis").i(), 1);
   EXPECT_EQ(unpack_node.attr().at("T").type(), DT_INT32);
   EXPECT_EQ(unpack_node.attr().at("num").i(), 3);
-  EXPECT_EQ(outer.ret().at("mapdefun"),
+  EXPECT_EQ(GetRetval(*vectorized, 0),
             strings::StrCat(unpack_node.name(), ":output:0"));
-  EXPECT_EQ(outer.ret().at("mapdefun_0"),
+  EXPECT_EQ(GetRetval(*vectorized, 1),
             strings::StrCat(unpack_node.name(), ":output:1"));
-  EXPECT_EQ(outer.ret().at("mapdefun_1"),
+  EXPECT_EQ(GetRetval(*vectorized, 2),
             strings::StrCat(unpack_node.name(), ":output:2"));
-  EXPECT_EQ(outer.node_def_size(), 1);
+  EXPECT_EQ(vectorized->node_def_size(), 1);
 }
 
 // Before:
@@ -486,7 +548,7 @@ TEST(VectorizeMapDefunTest, VectorizeDefunChainedConvertibleOps) {
        {"ret1", "MyUnstack:output:1"},
        {"ret2", "MyUnstack:output:2"}});
   NodeDef* cast_op =
-      AddCastNode("Cast", {"arg0"}, DT_INT32, DT_INT64, false, &inner);
+      AddCastNode("Cast", {"arg0"}, DT_INT32, DT_INT32, false, &inner);
   CHECK_NOTNULL(cast_op);
   NodeDef* unstack_op =
       AddUnstackNode("MyUnstack", {"Cast:y:0"}, DT_INT32, 0, 3, &inner);
@@ -505,25 +567,30 @@ TEST(VectorizeMapDefunTest, VectorizeDefunChainedConvertibleOps) {
       {{1}, {1}, {1}}, inner.signature().name(), &outer);
   CHECK_NOTNULL(map_defun);
 
-  VectorizeMapDefun(&outer, &inner, map_defun);
-  EXPECT_TRUE(!function_utils::ContainsFunctionNodeWithOp("MapDefun", outer));
-  const NodeDef& cast_node =
-      outer.node_def(function_utils::FindFunctionNodeWithOp("Cast", outer));
+  FunctionDefLibrary lib;
+  *lib.add_function() = outer;
+  *lib.add_function() = inner;
+  FunctionDef* vectorized;
+  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+  const NodeDef& cast_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
   EXPECT_EQ(cast_node.input(0), "x");
-  const NodeDef& unpack_node =
-      outer.node_def(function_utils::FindFunctionNodeWithOp("Unpack", outer));
+  const NodeDef& unpack_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Unpack", *vectorized));
   EXPECT_EQ(unpack_node.input(0), strings::StrCat(cast_node.name(), ":y:0"));
   EXPECT_EQ(unpack_node.attr().at("axis").i(), 1);
   EXPECT_EQ(unpack_node.attr().at("T").type(), DT_INT32);
   EXPECT_EQ(unpack_node.attr().at("num").i(), 3);
 
-  EXPECT_EQ(outer.ret().at("mapdefun"),
+  EXPECT_EQ(GetRetval(*vectorized, 0),
             strings::StrCat(unpack_node.name(), ":output:0"));
-  EXPECT_EQ(outer.ret().at("mapdefun_0"),
+  EXPECT_EQ(GetRetval(*vectorized, 1),
             strings::StrCat(unpack_node.name(), ":output:1"));
-  EXPECT_EQ(outer.ret().at("mapdefun_1"),
+  EXPECT_EQ(GetRetval(*vectorized, 2),
             strings::StrCat(unpack_node.name(), ":output:2"));
-  EXPECT_EQ(outer.node_def_size(), 2);
+  EXPECT_EQ(vectorized->node_def_size(), 2);
 }
 
 // Before:
@@ -561,9 +628,11 @@ TEST(VectorizeMapDefunTest, VectorizeDefunWithControlInputs) {
   FunctionDef inner =
       CreateFunction("inner_function", {{"arg0", DT_INT32}},
                      {{"ret0", DT_INT64}}, {{"ret0", "Cast:y:0"}});
-  // The attrs aren't relevant
-  NodeDef* print_op =
-      function_utils::AddNode("Print", "Print", {"arg0", "arg0"}, {}, &inner);
+  NodeDef* print_op = function_utils::AddNode(
+      "Print", "Print", {"arg0", "arg0"}, {/*attrs*/}, &inner);
+  graph_transforms::SetNodeAttr("T", DT_INT32, print_op);
+  graph_transforms::SetNodeAttr("U", gtl::ArraySlice<DataType>({DT_INT32}),
+                                print_op);
   CHECK_NOTNULL(print_op);
   NodeDef* cast_op = AddCastNode("Cast", {"arg0", "^Print"}, DT_INT32, DT_INT64,
                                  false, &inner);
@@ -578,11 +647,27 @@ TEST(VectorizeMapDefunTest, VectorizeDefunWithControlInputs) {
                       inner.signature().name(), &outer);
   CHECK_NOTNULL(map_defun);
 
-  FunctionDef outer_copy(outer);
-  FunctionDef inner_copy(inner);
-  VectorizeMapDefun(&outer, &inner, map_defun);
+  FunctionDefLibrary lib;
+  *lib.add_function() = outer;
+  *lib.add_function() = inner;
+  FunctionDef* vectorized;
+  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
   // They should be unchanged
-  EXPECT_TRUE(FunctionDefsEqual(outer_copy, outer));
+  // We check this somewhat manually as the names of nodes may have changed
+  EXPECT_EQ(vectorized->node_def_size(), 1);
+  const NodeDef& map_defun_node = vectorized->node_def(0);
+  EXPECT_EQ(map_defun_node.op(), "MapDefun");
+  FunctionLibraryDefinition lib_def(OpRegistry::Global(), lib);
+  const FunctionDef* map_defun_fn =
+      lib_def.Find(map_defun_node.attr().at("f").func().name());
+
+  const NodeDef& print_node = map_defun_fn->node_def(
+      function_utils::FindFunctionNodeWithOp("Print", *map_defun_fn));
+  const NodeDef& cast_node = map_defun_fn->node_def(
+      function_utils::FindFunctionNodeWithOp("Cast", *map_defun_fn));
+  string control_input = strings::StrCat("^", print_node.name());
+  EXPECT_TRUE(cast_node.input(0) == control_input ||
+              cast_node.input(1) == control_input);
 }
 
 // TODO(rachelim): More test cases when we get around to implementing them:
-- 
GitLab


From 4eef4925853a284fdfd4b5fae4b65f594a883b3b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 16:31:05 -0700
Subject: [PATCH 174/570] Add a rewrite_config option to disable
 meta_optimizer.

PiperOrigin-RevId: 215014737
---
 tensorflow/core/grappler/optimizers/meta_optimizer.cc | 6 ++++++
 tensorflow/core/protobuf/rewriter_config.proto        | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index e18a5f21d2..406c1b60ce 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -115,6 +115,9 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
 
 Status MetaOptimizer::InitializeOptimizers(
     std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
+  if (cfg_.disable_meta_optimizer()) {
+    return Status::OK();
+  }
   if (!cfg_.disable_model_pruning()) {
     optimizers->push_back(MakeUnique<ModelPruner>());
   }
@@ -489,6 +492,9 @@ void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
 }
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
+  if (cfg.disable_meta_optimizer()) {
+    return false;
+  }
   return !cfg.disable_model_pruning() ||
          cfg.layout_optimizer() != RewriterConfig::OFF ||
          cfg.function_optimization() != RewriterConfig::OFF ||
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 482178a540..8e0448d536 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -77,6 +77,8 @@ message RewriterConfig {
   Toggle scoped_allocator_optimization = 15;
   // Force small ops onto the CPU (default is ON).
   Toggle pin_to_host_optimization = 18;
+  // Disable the entire meta optimizer (off by default).
+  bool disable_meta_optimizer = 19;
 
   // Controls how many times we run the optimizers in meta optimizer (default
   // is once).
-- 
GitLab


From 0a341bbcb35d72d14bfda17f7f0cb0c61f323bce Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 16:36:07 -0700
Subject: [PATCH 175/570] Internal change

PiperOrigin-RevId: 215015490
---
 tensorflow/contrib/lite/examples/android/BUILD                   | 1 +
 tensorflow/contrib/lite/java/demo/app/src/main/BUILD             | 1 +
 tensorflow/contrib/lite/java/ovic/demo/app/BUILD                 | 1 +
 .../contrib/lite/models/smartreply/demo/app/src/main/BUILD       | 1 +
 4 files changed, 4 insertions(+)

diff --git a/tensorflow/contrib/lite/examples/android/BUILD b/tensorflow/contrib/lite/examples/android/BUILD
index 4d2437e7d3..d180cb4785 100644
--- a/tensorflow/contrib/lite/examples/android/BUILD
+++ b/tensorflow/contrib/lite/examples/android/BUILD
@@ -28,6 +28,7 @@ android_binary(
     srcs = glob([
         "app/src/main/java/**/*.java",
     ]),
+    aapt_version = "aapt",
     # Package assets from assets dir as well as all model targets.
     # Remove undesired models (and corresponding Activities in source)
     # to reduce APK size.
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/BUILD b/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
index 220d6c2159..5ad738389e 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
@@ -7,6 +7,7 @@ licenses(["notice"])  # Apache 2.0
 android_binary(
     name = "TfLiteCameraDemo",
     srcs = glob(["java/**/*.java"]),
+    aapt_version = "aapt",
     assets = [
         "//tensorflow/contrib/lite/java/demo/app/src/main/assets:labels_mobilenet_quant_v1_224.txt",
         "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/BUILD b/tensorflow/contrib/lite/java/ovic/demo/app/BUILD
index b2e3a9bd7d..058240aada 100644
--- a/tensorflow/contrib/lite/java/ovic/demo/app/BUILD
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/BUILD
@@ -8,6 +8,7 @@ android_binary(
     srcs = [
         "OvicBenchmarkerActivity.java",
     ],
+    aapt_version = "aapt",
     assets = [
         "//tensorflow/contrib/lite/java/ovic/src/testdata:ovic_testdata",
         "//tensorflow/contrib/lite/java/ovic/src/testdata:labels.txt",
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD
index f18a2ca07a..2e5033dab1 100644
--- a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD
@@ -20,6 +20,7 @@ filegroup(
 android_binary(
     name = "SmartReplyDemo",
     srcs = glob(["java/**/*.java"]),
+    aapt_version = "aapt",
     assets = [":assets"],
     assets_dir = "",
     custom_package = "com.example.android.smartreply",
-- 
GitLab


From 541677bfee008a093daab2d033bd72650d886126 Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Fri, 28 Sep 2018 16:41:58 -0700
Subject: [PATCH 176/570] Add option to disable initialization/shutdown of the
 TPU.

PiperOrigin-RevId: 215016286
---
 tensorflow/contrib/tpu/__init__.py                   |  3 +++
 .../contrib/tpu/python/tpu/async_checkpoint.py       | 12 ++++++------
 tensorflow/contrib/tpu/python/tpu/tpu_estimator.py   |  9 +++++++--
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/tpu/__init__.py b/tensorflow/contrib/tpu/__init__.py
index 766466968a..6ce6b779a2 100644
--- a/tensorflow/contrib/tpu/__init__.py
+++ b/tensorflow/contrib/tpu/__init__.py
@@ -55,7 +55,9 @@
 
 @@TPUDistributionStrategy
 @@keras_to_tpu_model
+
 @@AsyncCheckpointSaverHook
+@@TPUInMemoryEvalHook
 """
 
 from __future__ import absolute_import
@@ -65,6 +67,7 @@ from __future__ import print_function
 # pylint: disable=wildcard-import,unused-import
 from tensorflow.contrib.tpu.python import profiler
 from tensorflow.contrib.tpu.python.ops.tpu_ops import *
+from tensorflow.contrib.tpu.python.tpu.async_checkpoint import *
 from tensorflow.contrib.tpu.python.tpu.bfloat16 import *
 from tensorflow.contrib.tpu.python.tpu.device_assignment import *
 from tensorflow.contrib.tpu.python.tpu.keras_support import tpu_model as keras_to_tpu_model
diff --git a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
index e06a720e82..20b7ba0997 100644
--- a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
+++ b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ======================================
-
 """Hook for asynchronous checkpointing.
 
 This hook dispatches checkpoint writing operations in a separate thread to
@@ -28,18 +27,16 @@ import threading
 import time
 
 from tensorflow.core.util.event_pb2 import SessionLog
-
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 from tensorflow.python.training.session_run_hook import SessionRunArgs
 from tensorflow.python.training.summary_io import SummaryWriterCache
 
 
-class AsyncCheckpointSaverHook(session_run_hook.SessionRunHook):
+class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
   """Saves checkpoints every N steps or seconds."""
 
   def __init__(self,
@@ -67,7 +64,7 @@ class AsyncCheckpointSaverHook(session_run_hook.SessionRunHook):
       ValueError: One of `save_steps` or `save_secs` should be set.
       ValueError: At most one of `saver` or `scaffold` should be set.
     """
-    logging.info("Create CheckpointSaverHook.")
+    logging.info("Create AsyncCheckpointSaverHook.")
     if saver is not None and scaffold is not None:
       raise ValueError("You cannot provide both saver and scaffold.")
     self._saver = saver
@@ -144,6 +141,10 @@ class AsyncCheckpointSaverHook(session_run_hook.SessionRunHook):
   def _save(self, session, step, asynchronous=True):
     """Saves the latest checkpoint, returns should_stop."""
 
+    # Skip saving on step 0
+    if step == 0:
+      return
+
     def _save_fn():
       """Run the saver process."""
       logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
@@ -162,7 +163,6 @@ class AsyncCheckpointSaverHook(session_run_hook.SessionRunHook):
                    end_time - start_time)
       logging.info("Checkpoint finished for %d into %s.", step, self._save_path)
 
-    logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
     for l in self._listeners:
       l.before_save(session, step)
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 764d85877a..545cee637f 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -404,12 +404,17 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
 
     self._feed_error = None
     self._finished = False
+    self._should_initialize_tpu = True
 
   def begin(self):
     logging.info('TPU job name %s', self._master_job)
     self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-    self._init_ops = [tpu.initialize_system(job=self._master_job)]
-    self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
+    if self._should_initialize_tpu:
+      self._init_ops = [tpu.initialize_system(job=self._master_job)]
+      self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
+    else:
+      self._init_ops = []
+      self._finalize_ops = []
 
     summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
     self._init_ops.extend(summary_writer_init_ops)
-- 
GitLab


From 43e4905a8e554291656bcf65eb7d17d6019df9f8 Mon Sep 17 00:00:00 2001
From: Andrew Selle <aselle@google.com>
Date: Fri, 28 Sep 2018 16:47:53 -0700
Subject: [PATCH 177/570] Synchronize open source and closed source build with
 regard to flex delegate.

PiperOrigin-RevId: 215016968
---
 tensorflow/contrib/lite/BUILD | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index f320b53d94..f3ebe3b245 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -26,6 +26,14 @@ config_setting(
     },
 )
 
+# Enables inclusion of TensorFlow kernels via the TF Lite Flex delegate.
+# WARNING: This build flag is experimental and subject to change.
+config_setting(
+    name = "with_tflite_flex",
+    define_values = {"with_tflite_flex": "true"},
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "schema_fbs_version",
     hdrs = ["version.h"],
@@ -157,6 +165,10 @@ cc_library(
         "stderr_reporter.h",
     ],
     copts = tflite_copts(),
+    defines = select({
+        ":with_tflite_flex": ["TFLITE_FLEX"],
+        "//conditions:default": [],
+    }),
     linkopts = [
     ] + select({
         "//tensorflow:android": [
@@ -180,7 +192,12 @@ cc_library(
         "//tensorflow/contrib/lite/nnapi:nnapi_lib",
         "//tensorflow/contrib/lite/profiling:profiler",
         "//tensorflow/contrib/lite/schema:schema_fbs",
-    ],
+    ] + select({
+        ":with_tflite_flex": [
+            "//tensorflow/contrib/lite/delegates/flex:delegate",
+        ],
+        "//conditions:default": [],
+    }),
 )
 
 cc_library(
-- 
GitLab


From 3c01aa2b00ee4c3fda412b23da39fd0894c04cf7 Mon Sep 17 00:00:00 2001
From: Piotr Padlewski <prazek@google.com>
Date: Fri, 28 Sep 2018 17:04:06 -0700
Subject: [PATCH 178/570] Bunch of micro move optimizations

PiperOrigin-RevId: 215018984
---
 tensorflow/core/framework/node_def_util.h   |  1 -
 tensorflow/core/framework/op.h              | 20 ++++----
 tensorflow/core/framework/op_def_builder.cc | 24 ++++-----
 tensorflow/core/framework/op_def_builder.h  | 14 +++---
 tensorflow/core/grappler/utils/functions.cc | 55 +++++++++++----------
 tensorflow/core/grappler/utils/functions.h  |  5 +-
 6 files changed, 62 insertions(+), 57 deletions(-)

diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index 187bfa2c88..0ff67554eb 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_UTIL_H_
 
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 #include "tensorflow/core/framework/attr_value_util.h"
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index 25f8de8dcc..81ed5f95f0 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -209,16 +209,16 @@ template <>
 class OpDefBuilderWrapper<true> {
  public:
   OpDefBuilderWrapper(const char name[]) : builder_(name) {}
-  OpDefBuilderWrapper<true>& Attr(StringPiece spec) {
-    builder_.Attr(spec);
+  OpDefBuilderWrapper<true>& Attr(string spec) {
+    builder_.Attr(std::move(spec));
     return *this;
   }
-  OpDefBuilderWrapper<true>& Input(StringPiece spec) {
-    builder_.Input(spec);
+  OpDefBuilderWrapper<true>& Input(string spec) {
+    builder_.Input(std::move(spec));
     return *this;
   }
-  OpDefBuilderWrapper<true>& Output(StringPiece spec) {
-    builder_.Output(spec);
+  OpDefBuilderWrapper<true>& Output(string spec) {
+    builder_.Output(std::move(spec));
     return *this;
   }
   OpDefBuilderWrapper<true>& SetIsCommutative() {
@@ -237,12 +237,12 @@ class OpDefBuilderWrapper<true> {
     builder_.SetAllowsUninitializedInput();
     return *this;
   }
-  OpDefBuilderWrapper<true>& Deprecated(int version, StringPiece explanation) {
-    builder_.Deprecated(version, explanation);
+  OpDefBuilderWrapper<true>& Deprecated(int version, string explanation) {
+    builder_.Deprecated(version, std::move(explanation));
     return *this;
   }
-  OpDefBuilderWrapper<true>& Doc(StringPiece text) {
-    builder_.Doc(text);
+  OpDefBuilderWrapper<true>& Doc(string text) {
+    builder_.Doc(std::move(text));
     return *this;
   }
   OpDefBuilderWrapper<true>& SetShapeFn(
diff --git a/tensorflow/core/framework/op_def_builder.cc b/tensorflow/core/framework/op_def_builder.cc
index 34a7a43d38..8a9bb63182 100644
--- a/tensorflow/core/framework/op_def_builder.cc
+++ b/tensorflow/core/framework/op_def_builder.cc
@@ -526,32 +526,32 @@ void FinalizeDoc(const string& text, OpDef* op_def,
 
 }  // namespace
 
-OpDefBuilder::OpDefBuilder(StringPiece op_name) {
-  op_def()->set_name(string(op_name));  // NOLINT
+OpDefBuilder::OpDefBuilder(string op_name) {
+  op_def()->set_name(std::move(op_name));
 }
 
-OpDefBuilder& OpDefBuilder::Attr(StringPiece spec) {
-  attrs_.emplace_back(spec.data(), spec.size());
+OpDefBuilder& OpDefBuilder::Attr(string spec) {
+  attrs_.push_back(std::move(spec));
   return *this;
 }
 
-OpDefBuilder& OpDefBuilder::Input(StringPiece spec) {
-  inputs_.emplace_back(spec.data(), spec.size());
+OpDefBuilder& OpDefBuilder::Input(string spec) {
+  inputs_.push_back(std::move(spec));
   return *this;
 }
 
-OpDefBuilder& OpDefBuilder::Output(StringPiece spec) {
-  outputs_.emplace_back(spec.data(), spec.size());
+OpDefBuilder& OpDefBuilder::Output(string spec) {
+  outputs_.push_back(std::move(spec));
   return *this;
 }
 
 #ifndef TF_LEAN_BINARY
-OpDefBuilder& OpDefBuilder::Doc(StringPiece text) {
+OpDefBuilder& OpDefBuilder::Doc(string text) {
   if (!doc_.empty()) {
     errors_.push_back(
         strings::StrCat("Extra call to Doc() for Op ", op_def()->name()));
   } else {
-    doc_.assign(text.data(), text.size());
+    doc_ = std::move(text);
   }
   return *this;
 }
@@ -577,14 +577,14 @@ OpDefBuilder& OpDefBuilder::SetAllowsUninitializedInput() {
   return *this;
 }
 
-OpDefBuilder& OpDefBuilder::Deprecated(int version, StringPiece explanation) {
+OpDefBuilder& OpDefBuilder::Deprecated(int version, string explanation) {
   if (op_def()->has_deprecation()) {
     errors_.push_back(
         strings::StrCat("Deprecated called twice for Op ", op_def()->name()));
   } else {
     OpDeprecation* deprecation = op_def()->mutable_deprecation();
     deprecation->set_version(version);
-    deprecation->set_explanation(string(explanation));
+    deprecation->set_explanation(std::move(explanation));
   }
   return *this;
 }
diff --git a/tensorflow/core/framework/op_def_builder.h b/tensorflow/core/framework/op_def_builder.h
index 0b39d6e848..8077b20598 100644
--- a/tensorflow/core/framework/op_def_builder.h
+++ b/tensorflow/core/framework/op_def_builder.h
@@ -51,7 +51,7 @@ struct OpRegistrationData {
 class OpDefBuilder {
  public:
   // Constructs an OpDef with just the name field set.
-  explicit OpDefBuilder(StringPiece op_name);
+  explicit OpDefBuilder(string op_name);
 
   // Adds an attr to this OpDefBuilder (and returns *this). The spec has
   // format "<name>:<type>" or "<name>:<type>=<default>"
@@ -84,7 +84,7 @@ class OpDefBuilder {
   // * Ability to restrict the type of the tensor like the existing
   //   restrictions for type attrs.
   // Perhaps by linking the type of the tensor to a type attr?
-  OpDefBuilder& Attr(StringPiece spec);
+  OpDefBuilder& Attr(string spec);
 
   // Adds an input or output to this OpDefBuilder (and returns *this).
   // The spec has form "<name>:<type-expr>" or "<name>:Ref(<type-expr>)"
@@ -101,8 +101,8 @@ class OpDefBuilder {
   // in the spec?
   // TODO(josh11b): SparseInput() and SparseOutput() matching the Python
   // handling?
-  OpDefBuilder& Input(StringPiece spec);
-  OpDefBuilder& Output(StringPiece spec);
+  OpDefBuilder& Input(string spec);
+  OpDefBuilder& Output(string spec);
 
   // Turns on the indicated boolean flag in this OpDefBuilder (and
   // returns *this).
@@ -112,7 +112,7 @@ class OpDefBuilder {
   OpDefBuilder& SetAllowsUninitializedInput();
 
   // Deprecate the op at a certain GraphDef version.
-  OpDefBuilder& Deprecated(int version, StringPiece explanation);
+  OpDefBuilder& Deprecated(int version, string explanation);
 
   // Adds docs to this OpDefBuilder (and returns *this).
   // Docs have the format:
@@ -128,9 +128,9 @@ class OpDefBuilder {
   // to suppress the automatically-generated type documentation in
   // generated output.
 #ifndef TF_LEAN_BINARY
-  OpDefBuilder& Doc(StringPiece text);
+  OpDefBuilder& Doc(string text);
 #else
-  OpDefBuilder& Doc(StringPiece text) { return *this; }
+  OpDefBuilder& Doc(string text) { return *this; }
 #endif
 
   // Sets the shape function to be used for shape inference.
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index a428aea7f5..6861fb423c 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -41,7 +41,8 @@ Status RegisterFunctionBodyOutputs(const OpRegistrationData& registration,
   tensorflow::NameRangeMap outputs_range_map;
   TF_RETURN_IF_ERROR(tensorflow::NameRangesForNode(
       node, registration.op_def, nullptr, &outputs_range_map));
-  connectivity->RegisterFunctionBodyOutputs(node.name(), outputs_range_map);
+  connectivity->RegisterFunctionBodyOutputs(node.name(),
+                                            std::move(outputs_range_map));
   return Status::OK();
 }
 
@@ -75,20 +76,22 @@ Status ResolveFunctionBodyNodeAttrPlaceholders(
 }  // namespace
 
 void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
-    const InputArgExpansion& input_arg_expansion) {
-  const auto& input_name = input_arg_expansion.input_name;
+    InputArgExpansion input_arg_expansion) {
+  string input_name = input_arg_expansion.input_name;
   const auto& placeholders = input_arg_expansion.placeholders;
-  input_arg_expansions_.emplace(input_name, input_arg_expansion);
+
   for (int i = 0; i < placeholders.size(); ++i) {
     const string& placeholder = input_arg_expansion.placeholders[i];
-    input_arg_placeholders_.emplace(
-        placeholder, InputArgPlaceholder{input_name, /*position=*/i});
+    input_arg_placeholders_.insert(
+        {placeholder, InputArgPlaceholder{input_name, /*position=*/i}});
   }
+  input_arg_expansions_.insert(
+      {std::move(input_name), std::move(input_arg_expansion)});
 }
 
 void GrapplerFunctionConnectivity::RegisterFunctionBodyOutputs(
-    const string& node_name, const tensorflow::NameRangeMap& outputs) {
-  function_body_outputs_[node_name] = outputs;
+    const string& node_name, tensorflow::NameRangeMap&& outputs) {
+  function_body_outputs_[node_name] = std::move(outputs);
 }
 
 Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
@@ -174,11 +177,12 @@ Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
         const auto& output_range = output->second;
 
         if (position == -1) {
+          graph_def_inputs->reserve(graph_def_inputs->size() +
+                                    output_range.second - output_range.first);
           // If position is not defined expand node output range
           for (int i = output_range.first; i < output_range.second; ++i) {
-            i == 0 ? graph_def_inputs->push_back(node_name)
-                   : graph_def_inputs->push_back(
-                         strings::StrCat(node_name, ":", i));
+            graph_def_inputs->push_back(
+                i == 0 ? node_name : strings::StrCat(node_name, ":", i));
           }
         } else {
           if (position > (output_range.second - output_range.first)) {
@@ -187,9 +191,8 @@ Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
                 " position: ", position, " (out of range)");
           }
           int pos = output_range.first + position;
-          pos == 0 ? graph_def_inputs->push_back(node_name)
-                   : graph_def_inputs->push_back(
-                         strings::StrCat(node_name, ":", pos));
+          graph_def_inputs->push_back(
+              pos == 0 ? node_name : strings::StrCat(node_name, ":", pos));
         }
 
         return Status::OK();
@@ -211,8 +214,8 @@ Status GrapplerFunctionConnectivity::ExpandNodeInputs(
   }
 
   function_body_node->clear_input();
-  for (const string& expanded_input : expanded_inputs)
-    function_body_node->add_input(expanded_input);
+  for (string& expanded_input : expanded_inputs)
+    function_body_node->add_input(std::move(expanded_input));
   return Status::OK();
 }
 
@@ -323,7 +326,7 @@ GrapplerFunctionItem::GrapplerFunctionItem(
   // Fill the feed nodes with input placeholders.
   for (const InputArgExpansion& input_arg : input_arg_expansions_) {
     for (const string& placeholder : input_arg.placeholders) {
-      feed.emplace_back(placeholder, Tensor());
+      feed.push_back({placeholder, Tensor()});
       input_arg_placeholders_.insert(placeholder);
     }
   }
@@ -460,7 +463,7 @@ Status InstantiationBodyParameters(
 
       auto it = func_instantiation_attr.find(placeholder);
       if (it != func_instantiation_attr.end()) {
-        body_parameters->emplace(placeholder, it->second);
+        body_parameters->insert({placeholder, it->second});
       } else {
         return errors::InvalidArgument("Can't resolve placeholder: ",
                                        placeholder);
@@ -498,10 +501,6 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
   // GraphDef input format (name[:position])
   GrapplerFunctionConnectivity connectivity;
 
-  std::vector<InputArgExpansion> inputs;
-  std::vector<OutputArgExpansion> outputs;
-  std::vector<string> keep_nodes;
-
   // Function body shares the library with the graph that instantiated it.
   GraphDef function_body;
   *function_body.mutable_library() = flib.ToProto();
@@ -518,6 +517,9 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     }
   }
 
+  std::vector<InputArgExpansion> inputs;
+  inputs.reserve(signature.input_arg_size());
+
   // For each input argument create a placeholder in function body.
   for (const OpDef::ArgDef& input : signature.input_arg()) {
     if (!input.type_list_attr().empty() || !input.number_attr().empty()) {
@@ -542,9 +544,10 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                       /*is_ref*/ input.is_ref(),
                                       /*placeholders=*/{input.name()}};
     connectivity.RegisterInputArgExpansion(input_expansion);
-    inputs.push_back(input_expansion);
+    inputs.push_back(std::move(input_expansion));
   }
 
+  std::vector<string> keep_nodes;
   // Add all function nodes to the function body
   for (const NodeDef& func_def_node : func.node_def()) {
     NodeDef* new_node = function_body.add_node();
@@ -572,6 +575,8 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     TF_RETURN_IF_ERROR(connectivity.ExpandNodeInputs(&node));
   }
 
+  std::vector<OutputArgExpansion> outputs;
+  outputs.reserve(signature.output_arg_size());
   // Add function outputs
   for (const OpDef::ArgDef& out : signature.output_arg()) {
     std::vector<string> output_tensors;
@@ -589,8 +594,8 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     OutputArgExpansion output{/*output_name=*/out.name(),
                               /*data_type=*/output_data_type,
                               /*is_ref=*/out.is_ref(),
-                              /*output_tensors=*/output_tensors};
-    outputs.push_back(output);
+                              /*output_tensors=*/std::move(output_tensors)};
+    outputs.push_back(std::move(output));
   }
 
   bool is_stateful = signature.is_stateful();
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 733caf325f..ef944ced09 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -70,9 +71,9 @@ struct OutputArgExpansion {
 // and fold it back when doing backward conversion.
 class GrapplerFunctionConnectivity {
  public:
-  void RegisterInputArgExpansion(const InputArgExpansion& input_arg_expansion);
+  void RegisterInputArgExpansion(InputArgExpansion input_arg_expansion);
   void RegisterFunctionBodyOutputs(const string& node_name,
-                                   const tensorflow::NameRangeMap& outputs);
+                                   tensorflow::NameRangeMap&& outputs);
 
   // Expand input encoded in FunctionDef format (name[:output][:position]) into
   // multiple inputs in GraphDef format (name[:position]).
-- 
GitLab


From 6d354f6bd686d748d02039f26197f590b817b8c3 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 28 Sep 2018 17:04:41 -0700
Subject: [PATCH 179/570] [tf.data] Use `std::make_shared` as appropriate in
 `ParallelMapIterator`.

PiperOrigin-RevId: 215019058
---
 .../kernels/data/parallel_map_iterator.cc     | 40 +++++++++----------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index 8393024c51..da067a4e6f 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -106,18 +106,17 @@ class ParallelMapIterator : public DatasetBaseIterator {
     TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("invocation_results.size"),
                                            invocation_results_.size()));
     for (size_t i = 0; i < invocation_results_.size(); i++) {
-      std::shared_ptr<InvocationResult> result = invocation_results_[i];
-      TF_RETURN_IF_ERROR(WriteStatusLocked(writer, i, result->status));
+      const auto& result = *(invocation_results_[i]);
+      TF_RETURN_IF_ERROR(WriteStatusLocked(writer, i, result.status));
       TF_RETURN_IF_ERROR(writer->WriteScalar(
           full_name(strings::StrCat("invocation_results[", i, "].size")),
-          result->return_values.size()));
-      for (size_t j = 0; j < result->return_values.size(); j++) {
-        TF_RETURN_IF_ERROR(
-            writer->WriteTensor(full_name(strings::StrCat(
-                                    "invocation_results[", i, "][", j, "]")),
-                                result->return_values[j]));
+          result.return_values.size()));
+      for (size_t j = 0; j < result.return_values.size(); j++) {
+        TF_RETURN_IF_ERROR(writer->WriteTensor(
+            full_name(strings::StrCat("invocation_results[", i, "][", j, "]")),
+            result.return_values[j]));
       }
-      if (result->end_of_input) {
+      if (result.end_of_input) {
         TF_RETURN_IF_ERROR(writer->WriteScalar(
             full_name(
                 strings::StrCat("invocation_results[", i, "].end_of_input")),
@@ -135,9 +134,9 @@ class ParallelMapIterator : public DatasetBaseIterator {
     TF_RETURN_IF_ERROR(reader->ReadScalar(
         full_name("invocation_results.size"), &invocation_results_size));
     for (size_t i = 0; i < invocation_results_size; i++) {
-      std::shared_ptr<InvocationResult> result(new InvocationResult());
-      invocation_results_.push_back(result);
-      TF_RETURN_IF_ERROR(ReadStatusLocked(reader, i, &result->status));
+      invocation_results_.push_back(std::make_shared<InvocationResult>());
+      auto& result = *invocation_results_.back();
+      TF_RETURN_IF_ERROR(ReadStatusLocked(reader, i, &result.status));
       size_t num_return_values;
       {
         int64 size;
@@ -153,17 +152,16 @@ class ParallelMapIterator : public DatasetBaseIterator {
               ": ", size, " is not a valid value of type size_t."));
         }
       }
-      result->return_values.reserve(num_return_values);
+      result.return_values.reserve(num_return_values);
       for (size_t j = 0; j < num_return_values; j++) {
-        result->return_values.emplace_back();
-        TF_RETURN_IF_ERROR(
-            reader->ReadTensor(full_name(strings::StrCat(
-                                   "invocation_results[", i, "][", j, "]")),
-                               &result->return_values.back()));
+        result.return_values.emplace_back();
+        TF_RETURN_IF_ERROR(reader->ReadTensor(
+            full_name(strings::StrCat("invocation_results[", i, "][", j, "]")),
+            &result.return_values.back()));
       }
-      result->end_of_input = reader->Contains(full_name(
+      result.end_of_input = reader->Contains(full_name(
           strings::StrCat("invocation_results[", i, "].end_of_input")));
-      result->notification.Notify();
+      result.notification.Notify();
     }
     return Status::OK();
   }
@@ -259,7 +257,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
           return;
         }
         while (!busy()) {
-          invocation_results_.emplace_back(new InvocationResult());
+          invocation_results_.push_back(std::make_shared<InvocationResult>());
           new_calls.push_back(invocation_results_.back());
           num_calls_++;
         }
-- 
GitLab


From 05e5d2a1a9d5471d634043135834ecae4355926a Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 28 Sep 2018 17:11:09 -0700
Subject: [PATCH 180/570] [TF] Fix incorrect type constraint on _VarHandlesOp
 kernel on GPU. The kernel supports any combination of the dtypes, and does
 not need a separate kernel for each dtype.

PiperOrigin-RevId: 215019812
---
 .../core/kernels/resource_variable_ops.cc      | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 427044ca67..23d76986bf 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -172,17 +172,21 @@ REGISTER_KERNEL_BUILDER(
                               .Device(DEVICE_GPU)              \
                               .HostMemory("resource")          \
                               .TypeConstraint<type>("dtype"),  \
-                          ResourceHandleOp<Var>)               \
-  REGISTER_KERNEL_BUILDER(Name("_VarHandlesOp")                \
-                              .Device(DEVICE_GPU)              \
-                              .HostMemory("resources")         \
-                              .TypeConstraint<type>("dtypes"), \
-                          ResourceHandlesOp<Var>)
-
+                          ResourceHandleOp<Var>)
 TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
 TF_CALL_int64(REGISTER_GPU_KERNELS);
 TF_CALL_variant(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
+
+REGISTER_KERNEL_BUILDER(Name("_VarHandlesOp")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resources")
+                            .TypeConstraint("dtypes",
+                                            {DT_INT64, DT_COMPLEX64,
+                                             DT_COMPLEX128, DT_HALF, DT_FLOAT,
+                                             DT_DOUBLE, DT_BOOL, DT_VARIANT}),
+                        ResourceHandlesOp<Var>);
+
 #endif  // GOOGLE_CUDA
 
 template <typename T>
-- 
GitLab


From 47503fdbfa72357e1419972986a4415ab3ad92a6 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Fri, 28 Sep 2018 17:17:36 -0700
Subject: [PATCH 181/570] Disable jemalloc, and remove its configuration
 option.

PiperOrigin-RevId: 215020524
---
 configure.py   | 2 --
 tools/bazel.rc | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/configure.py b/configure.py
index 9899ae10e8..0a3b9a7894 100644
--- a/configure.py
+++ b/configure.py
@@ -1513,8 +1513,6 @@ def main():
   if is_ppc64le():
     write_action_env_to_bazelrc('OMP_NUM_THREADS', 1)
 
-  set_build_var(environ_cp, 'TF_NEED_JEMALLOC', 'jemalloc as malloc',
-                'with_jemalloc', True)
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
                 True, 'xla')
 
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 3734fab715..0cd148ed87 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -73,6 +73,7 @@ build --define=grpc_no_ares=true
 build --spawn_strategy=standalone
 build --genrule_strategy=standalone
 build -c opt
+build --define=with_jemalloc=false
 
 # Other build flags.
 build --define=grpc_no_ares=true
-- 
GitLab


From 737915c01dcab743256df7f7b1ff1545b951252d Mon Sep 17 00:00:00 2001
From: Revan Sopher <rsopher@google.com>
Date: Fri, 28 Sep 2018 17:26:51 -0700
Subject: [PATCH 182/570] Internal change.

PiperOrigin-RevId: 215021487
---
 tensorflow/core/util/tensor_bundle/BUILD | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/core/util/tensor_bundle/BUILD b/tensorflow/core/util/tensor_bundle/BUILD
index 4d4db86df2..f40ec9b752 100644
--- a/tensorflow/core/util/tensor_bundle/BUILD
+++ b/tensorflow/core/util/tensor_bundle/BUILD
@@ -65,6 +65,10 @@ tf_cc_test(
     name = "tensor_bundle_test",
     srcs = ["tensor_bundle_test.cc"],
     data = glob(["testdata/**"]),
+    tags = [
+        "nomsan",
+        "notsan",
+    ],
     deps = [
         ":tensor_bundle",
         "//tensorflow/core:framework",
-- 
GitLab


From 5f822d694af6e4aa57fe8a426032a91dc61e30d6 Mon Sep 17 00:00:00 2001
From: Revan Sopher <rsopher@google.com>
Date: Fri, 28 Sep 2018 18:07:33 -0700
Subject: [PATCH 183/570] Internal change.

PiperOrigin-RevId: 215025019
---
 tensorflow/contrib/factorization/BUILD                | 9 ++++++++-
 tensorflow/contrib/opt/BUILD                          | 5 +++++
 tensorflow/contrib/timeseries/python/timeseries/BUILD | 7 ++++++-
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index e344d7a23b..510f292508 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -154,6 +154,8 @@ tf_py_test(
     ],
     tags = [
         "no_pip",  # b/38283730
+        "noasan",  # b/116875897
+        "nomsan",
         "notsan",  # Flaky: b/30756419
     ],
 )
@@ -177,7 +179,11 @@ tf_py_test(
         "//tensorflow/python:random_seed",
         "//tensorflow/python:variables",
     ],
-    tags = ["notsan"],  # b/62863147
+    tags = [
+        "noasan",  # b/116875897
+        "nomsan",
+        "notsan",  # b/62863147
+    ],
 )
 
 py_library(
@@ -276,6 +282,7 @@ tf_py_test(
         "manual",
         "noasan",  # times out b/63678675
         "nomsan",
+        "notsan",  # b/116875897
     ],
 )
 
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index f4ac70eb1a..6a67c6295d 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -377,6 +377,11 @@ py_test(
     size = "large",
     srcs = ["python/training/shampoo_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "noasan",  # b/116875897
+        "nomsan",
+        "notsan",
+    ],
     deps = [
         ":opt_py",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index c230919168..cb1f707028 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -159,7 +159,12 @@ py_test(
     ],
     shard_count = 4,
     srcs_version = "PY2AND3",
-    tags = ["no_pip_gpu"],  # b/63391119
+    tags = [
+        "no_pip_gpu",  # b/63391119
+        "noasan",  # b/116875897
+        "nomsan",
+        "notsan",
+    ],
     deps = [
         ":estimators",
         ":feature_keys",
-- 
GitLab


From b34ddf043324e52ee0acdfe62cb18beab7fed08e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 18:22:13 -0700
Subject: [PATCH 184/570] Added flag to enable non-lazy Adam optimizer
 implementation for TPU embeddings (actual implementation is pending). Added
 comments with pointers to C++ implementations of optimizers.

PiperOrigin-RevId: 215026002
---
 .../tpu/proto/optimization_parameters.proto   | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
index fc1320501b..a43f45554f 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
@@ -22,13 +22,22 @@ message LearningRate {
   }
 }
 
+// Each optimizer's parameter proto has a link to its documentation and CPU
+// implementation (if available) for user reference.
+
+// https://www.tensorflow.org/api_docs/python/tf/train/AdagradOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L151
 message AdagradParameters {
   float initial_accumulator = 1;
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L423
 message StochasticGradientDescentParameters {
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/FtrlOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L192
 message FtrlParameters {
   float l1 = 1;
   float l2 = 2;
@@ -41,21 +50,38 @@ message FtrlParameters {
 // learning rate feature instead, setting the learning rate to:
 // user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
 // Here, t is the current timestep.
+//
+// https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer
 // https://github.com/tensorflow/tensorflow/blob/ab51450c817674c8ff08a7ae4f8ac50cdc4bed8b/tensorflow/python/training/adam.py#L54
+//
+// Note that the code by default implements the lazy version of Adam
+// (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/LazyAdamOptimizer)
+// unless the use_non_lazy_adam parameter is set, in which case it implements
+// the normal version of Adam that updates all parameters in the embedding
+// table, even for entries that are not used in the current minibatch
+// (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/AdamOptimizer). If
+// use_non_lazy_adam is enabled, use_gradient_accumulation is also required in
+// order to get correct results; a warning will be printed otherwise (which may
+// change to an error in the future).
 message AdamParameters {
   float beta1 = 3;
   float beta2 = 4;
   float epsilon = 5;
   float initial_m = 6;
   float initial_v = 7;
+  bool use_non_lazy_adam = 8;
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L271
 message MomentumParameters {
   float momentum = 1;
   bool use_nesterov = 2;
   float initial_accum = 3;
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L356
 message RmsPropParameters {
   float rho = 1;
   float momentum = 2;
@@ -64,6 +90,8 @@ message RmsPropParameters {
   float initial_mom = 5;
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L372
 message CenteredRmsPropParameters {
   float rho = 1;
   float momentum = 2;
@@ -73,6 +101,7 @@ message CenteredRmsPropParameters {
   float initial_mg = 6;
 }
 
+// Variant of algorithm in http://proceedings.mlr.press/v44/shamir15.pdf
 message MdlAdagradLightParameters {
   float l2 = 1;
   float lr_power = 2;
@@ -91,6 +120,8 @@ message MdlAdagradLightParameters {
   float initial_benefit = 15;
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L68
 message AdadeltaParameters {
   float rho = 1;
   float epsilon = 2;
@@ -98,6 +129,8 @@ message AdadeltaParameters {
   float initial_update = 4;
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L164
 message ProximalAdagradParameters {
   float l1 = 1;
   float l2 = 2;
-- 
GitLab


From e4fea9419ac387ddcb9c932abaa8e92fb045e29f Mon Sep 17 00:00:00 2001
From: knightXun <badgangkiller@gmail.com>
Date: Sat, 29 Sep 2018 00:42:23 +0800
Subject: [PATCH 185/570] print error information, when the os is not supported

---
 tensorflow/go/test.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/go/test.sh b/tensorflow/go/test.sh
index 6083608f22..47c3a68379 100755
--- a/tensorflow/go/test.sh
+++ b/tensorflow/go/test.sh
@@ -63,6 +63,9 @@ then
   else
     export DYLD_LIBRARY_PATH="${PWD}/tensorflow:${DYLD_LIBRARY_PATH}"
   fi
+else 
+  echo "Only support Linux/Darwin, System $OS is not supported"
+  exit 1
 fi
 
 # Document the Go version and run tests
-- 
GitLab


From abd5c32c0fa6451e73b491affdd86d852a74177f Mon Sep 17 00:00:00 2001
From: Revan Sopher <rsopher@google.com>
Date: Fri, 28 Sep 2018 18:27:37 -0700
Subject: [PATCH 186/570] Automated rollback of commit
 70f071f7afb2deffddbd9937d7a76b1e1c0b2b75

PiperOrigin-RevId: 215026418
---
 .../estimator_batch/dnn_tree_combined_estimator_test.py       | 3 +--
 .../contrib/boosted_trees/estimator_batch/estimator_test.py   | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
index 83a8dee632..839eedd3a8 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
@@ -188,8 +188,7 @@ class CoreDNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
 
     # Train for a few steps.
     est.train(input_fn=_train_input_fn, steps=1000)
-    # 10 steps for dnn + 3 for 1 tree of depth 3 + 1 after the tree finished
-    # + 1 for resource variables.
+    # 10 steps for dnn, 3  for 1 tree of depth 3 + 1 after the tree finished
     self._assert_checkpoint(est.model_dir, global_step=14)
     res = est.evaluate(input_fn=_eval_input_fn, steps=1)
     self.assertLess(0.5, res["auc"])
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index d7b14e00ba..c155128c0e 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -238,8 +238,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
         output_leaf_index=False)
 
     classifier.fit(input_fn=_train_input_fn, steps=15)
-    # When no override of global steps, 6 steps were used.
-    self._assert_checkpoint(classifier.model_dir, global_step=6)
+    # When no override of global steps, 5 steps were used.
+    self._assert_checkpoint(classifier.model_dir, global_step=5)
 
   def testOverridesGlobalSteps(self):
     learner_config = learner_pb2.LearnerConfig()
-- 
GitLab


From d37f771cc5a208cdc88a50a65f491b3c06c9f262 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Fri, 28 Sep 2018 18:41:31 -0700
Subject: [PATCH 187/570] Move TPU variables to the TPU device in TPUStrategy.

PiperOrigin-RevId: 215027511
---
 tensorflow/contrib/distribute/python/BUILD    |   1 +
 .../contrib/distribute/python/tpu_strategy.py | 175 +++++++-
 .../contrib/distribute/python/values.py       | 381 ++++++++++++++++++
 .../tpu/python/tpu/keras_tpu_variables.py     |   2 +-
 tensorflow/contrib/tpu/python/tpu/tpu.py      |  11 +-
 tensorflow/python/eager/backprop.py           |   2 +-
 tensorflow/python/estimator/estimator.py      |   4 +
 tensorflow/python/estimator/util.py           |   8 +-
 tensorflow/python/training/optimizer.py       |   5 +-
 tensorflow/python/training/session_manager.py |   5 +
 10 files changed, 565 insertions(+), 29 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 422983dbef..cfb9d42a6f 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -28,6 +28,7 @@ py_library(
         "//tensorflow/python:device_util",
         "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index a6762e5e87..1b555482d3 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -37,9 +38,13 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
 
 
+_TPU_INITIALIZE_SYSTEM_COLLECTION = "TPU_STRATEGY_INITIALIZE"
+
+
 def get_tpu_system_metadata(tpu_cluster_resolver):
   """Retrieves TPU system metadata given a TPUClusterResolver."""
   master = tpu_cluster_resolver.master()
@@ -56,6 +61,58 @@ def get_tpu_system_metadata(tpu_cluster_resolver):
   return tpu_system_metadata
 
 
+# TODO(jhseu): Deduplicate with MirroredStrategy?
+def _create_tpu_mirrored_variable(devices, real_mirrored_creator, *args,
+                                  **kwargs):  # pylint: disable=g-missing-docstring
+  # Figure out what collections this variable should be added to.
+  # We'll add the TPUMirroredVariable to those collections instead.
+  collections = kwargs.pop("collections", None)
+  if collections is None:
+    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+  kwargs["collections"] = []
+
+  # TODO(jhseu): Should we have different behavior for different
+  # synchronization settings?
+
+  # Get aggregation value
+  # TODO(jhseu): Support aggregation in a tower context.
+  aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
+  if aggregation not in [
+      vs.VariableAggregation.NONE,
+      vs.VariableAggregation.SUM,
+      vs.VariableAggregation.MEAN,
+      vs.VariableAggregation.ONLY_FIRST_TOWER,
+  ]:
+    raise ValueError("Invalid variable aggregation mode: {} for variable: {}"
+                     .format(aggregation, kwargs["name"]))
+
+  # Ignore user-specified caching device, not needed for mirrored variables.
+  kwargs.pop("caching_device", None)
+
+  # TODO(josh11b,apassos): It would be better if variable initialization
+  # was never recorded on the tape instead of having to do this manually
+  # here.
+  with tape.stop_recording():
+    index = real_mirrored_creator(devices, *args, **kwargs)
+    result = values.TPUMirroredVariable(index, index[devices[0]], aggregation)
+
+  if not context.executing_eagerly():
+    g = ops.get_default_graph()
+    # If "trainable" is True, next_creator() will add the member variables
+    # to the TRAINABLE_VARIABLES collection, so we manually remove
+    # them and replace with the MirroredVariable. We can't set
+    # "trainable" to False for next_creator() since that causes functions
+    # like implicit_gradients to skip those variables.
+    if kwargs.get("trainable", True):
+      collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+      l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
+      for v in index.values():
+        l.remove(v)
+    g.add_to_collections(collections, result)
+  return result
+
+
+# TODO(jhseu): Stop inheriting from OneDeviceStrategy.
 class TPUStrategy(one_device_strategy.OneDeviceStrategy):
   """Experimental TPU distribution strategy implementation."""
 
@@ -82,6 +139,15 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
     # TODO(sourabhbajaj): Change this from num_cores to metadata_override
     self._num_cores_override = num_cores
 
+    # TODO(jhseu): Switch to DeviceAssignment to support pods and model
+    # parallelism.
+    device_map = {d.name: i for i, d in enumerate(self._tpu_metadata.devices)
+                  if "device:TPU:" in d.name}
+    self._device_index = values.PerDevice(device_map)
+    self._tpu_devices = sorted(device_map.keys())
+    # Only create variables for the number of towers we're running.
+    self._tpu_devices = self._tpu_devices[:self.num_towers]
+
     # TODO(sourabhbajaj): Remove this once performance of running one step
     # at a time is comparable to multiple steps.
     self.steps_per_run = steps_per_run
@@ -239,6 +305,8 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
     return ctx
 
   def _call_for_each_tower(self, fn, *args, **kwargs):
+    # TODO(jhseu): Consider making it so call_for_each_tower implies that we're
+    # in a tpu.rewrite(), and update TPUMirroredVariable accordingly.
     kwargs.pop('run_concurrently', None)
     with one_device_strategy._OneDeviceTowerContext(self):  # pylint: disable=protected-access
       return fn(*args, **kwargs)
@@ -248,7 +316,15 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
       # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
       raise NotImplementedError('Eager mode not supported in TPUStrategy.')
     else:
-      return [tpu.initialize_system()]
+      # TODO(jhseu): We need this hack because DistributionStrategies must be
+      # pickleable for copy.deepcopy(). Remove when initialize_system goes away.
+      graph = ops.get_default_graph()
+      tpu_init = graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION)
+      if tpu_init:
+        return tpu_init
+      graph.add_to_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION,
+                              tpu.initialize_system())
+      return graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION)
 
   def finalize(self):
     if context.executing_eagerly():
@@ -257,21 +333,53 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
     else:
       return [tpu.shutdown_system()]
 
+  def _get_devices_from(self, colocate_with=None):
+     # TODO(jhseu): Change this when we support model parallelism.
+    return self._tpu_devices
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    """Create a TPUMirroredVariable. See `DistributionStrategy.scope`."""
+    colocate_with = kwargs.pop("colocate_with", None)
+    devices = self._get_devices_from(colocate_with)
+
+    def _real_mirrored_creator(devices, *args, **kwargs):  # pylint: disable=g-missing-docstring
+      index = {}
+      for i, d in enumerate(devices):
+        with ops.device(d):
+          if i > 0:
+            # Give replicas meaningful distinct names:
+            var0name = index[devices[0]].name.split(":")[0]
+            # We append a / to variable names created on towers with id > 0 to
+            # ensure that we ignore the name scope and instead use the given
+            # name as the absolute name of the variable.
+            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
+            # Initialize replicas with the same value:
+            if context.executing_eagerly():
+              kwargs["initial_value"] = array_ops.identity(
+                  index[devices[0]].value())
+            else:
+              def initial_value_fn(device=d):
+                with ops.device(device):
+                  return array_ops.identity(index[devices[0]].initial_value)
+              kwargs["initial_value"] = initial_value_fn
+          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+            v = next_creator(*args, **kwargs)
+          assert not isinstance(v, values.TPUMirroredVariable)
+          index[d] = v
+      return index
+
+    return _create_tpu_mirrored_variable(devices, _real_mirrored_creator, *args,
+                                         **kwargs)
+
   def _reduce(self, aggregation, value, destinations):
-    graph = ops.get_default_graph()
-    cf_context = graph._get_control_flow_context()  # pylint: disable=protected-access
-    # If we're inside the ReplicateContext, reduction should be done using
-    # CrossReplicaSum while outside we can directly use an add_n op.
-    while cf_context:
-      if isinstance(cf_context, tpu.TPUReplicateContext):
-        if aggregation == vs.VariableAggregation.MEAN:
-          # TODO(jhseu):  Revisit once we support model-parallelism.
-          value *= (1. / self.num_towers)
-        elif aggregation != vs.VariableAggregation.SUM:
-          raise NotImplementedError(
-              'Currently only support sum & mean in TPUStrategy.')
-        return tpu_ops.cross_replica_sum(value)
-      cf_context = cf_context.outer_context
+    if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
+      if aggregation == vs.VariableAggregation.MEAN:
+        # TODO(jhseu):  Revisit once we support model-parallelism.
+        value *= (1. / self.num_towers)
+      elif aggregation != vs.VariableAggregation.SUM:
+        raise NotImplementedError(
+            "Currently only support sum & mean in TPUStrategy.")
+      return tpu_ops.cross_replica_sum(value)
 
     # Validate that the destination is same as the host device
     # Note we don't do this when in replicate context as the reduction is
@@ -290,6 +398,35 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
       return output * (1. / len(value))
     return output
 
+  def _update(self, var, fn, *args, **kwargs):
+    # TODO(jhseu): Consider supporting grouped==False.
+    assert isinstance(var, values.TPUMirroredVariable)
+    if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
+      return fn(var, *args, **kwargs)
+
+    # Otherwise, we revert to MirroredStrategy behavior and update each variable
+    # directly.
+    updates = {}
+    for d, v in var._index.items():  # pylint: disable=protected-access
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        # If args and kwargs are not mirrored, the value is returned as is.
+        updates[d] = fn(v,
+                        *values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+
+    # Make a single control dependency to keep the variables mirrored. If one
+    # assignment is fetched, then run all assignments.
+    sorted_keys = sorted(updates.keys())
+    update_tuple = control_flow_ops.tuple([updates[d] for d in sorted_keys])
+    for i, d in enumerate(sorted_keys):
+      updates[d] = update_tuple[i]
+    return values.regroup(updates, values.Mirrored)
+
+  def read_var(self, var):
+    assert isinstance(var, values.TPUMirroredVariable)
+    return var.read_value()
+
   def _unwrap(self, value):
     if isinstance(value, list):
       return value
@@ -323,6 +460,14 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
   def should_save_summary(self):
     return True
 
+  @property
+  def worker_devices(self):
+    return self._tpu_devices
+
+  @property
+  def parameter_devices(self):
+    return self._tpu_devices
+
   def get_host_cpu_device(self, host_id):
     if self._tpu_cluster_resolver.get_master() in ('', 'local'):
       return '/replica:0/task:0/device:CPU:0'
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 4955ded4d5..c18faeb67d 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -22,17 +22,20 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import contextlib
 import weakref
 import six
 
 from tensorflow.contrib.distribute.python import input_ops
 from tensorflow.contrib.distribute.python import prefetching_ops_v2
 from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as variables_lib
@@ -453,6 +456,384 @@ ops.register_tensor_conversion_function(MirroredVariable,
                                         _tensor_conversion_mirrored)
 
 
+def _enclosing_tpu_context():
+  # pylint: disable=protected-access
+  tpu_context = ops.get_default_graph()._get_control_flow_context()
+  # pylint: enable=protected-access
+  while tpu_context is not None and not isinstance(
+      tpu_context, control_flow_ops.XLAControlFlowContext):
+    tpu_context = tpu_context.outer_context
+  return tpu_context
+
+
+# TODO(jhseu): Deduplicate code. We copy code because we don't want to
+# inherit from DistributedDelegate. DistributedDelegate will not work in a
+# tpu.replicate() because it assumes that you're in a device context where you
+# can operate on a single version of the variable, but a tpu.replicate()
+# operates on all variables and is replicated during a rewrite pass.
+class TPUMirroredVariable(checkpointable.CheckpointableBase):
+  """Holds a map from device to TPU variables whose values are kept in sync."""
+
+  def __init__(self, index, primary_var, aggregation):
+    # Use a weakref to make it easy to map from the contained values
+    # to the container without introducing a reference cycle.
+    for v in six.itervalues(index):
+      v._mirrored_container = weakref.ref(self)  # pylint: disable=protected-access
+    self._index = {device_util.canonicalize(key): value
+                   for key, value in six.iteritems(index)}
+    self._primary_var = primary_var
+    self._common_name = self._primary_var.name.split(":")[0]
+    self._aggregation = aggregation
+    # Needed for GradientTape
+    self._trainable = self._primary_var.trainable
+
+  def _get(self, device=None):
+    """Returns the value for the current device or raises a ValueError."""
+    if device is None:
+      tower_context = distribution_strategy_context.get_tower_context()
+      if tower_context:
+        device = tower_context.device
+      else:
+        device = distribute_lib.get_update_device()
+        if device is None:
+          return self._get_cross_tower()
+    device = device_util.canonicalize(device)
+    try:
+      return self._index[device]
+    except KeyError as e:
+      six.raise_from(
+          ValueError("Device %s not found in %s (current device %s)" %
+                     (device, self._index.keys(), device_util.current())), e)
+
+  # pylint: disable=multiple-statements
+  def __add__(self, o): return self.read_value() + o
+  def __radd__(self, o): return o + self.read_value()
+  def __sub__(self, o): return self.read_value() - o
+  def __rsub__(self, o): return o - self.read_value()
+  def __mul__(self, o): return self.read_value() * o
+  def __rmul__(self, o): return o * self.read_value()
+  def __truediv__(self, o): return self.read_value() / o
+  def __rtruediv__(self, o): return o / self.read_value()
+  def __floordiv__(self, o): return self.read_value() // o
+  def __rfloordiv__(self, o): return o // self.read_value()
+  def __mod__(self, o): return self.read_value() % o
+  def __rmod__(self, o): return o % self.read_value()
+  def __lt__(self, o): return self.read_value() < o
+  def __le__(self, o): return self.read_value() <= o
+  def __gt__(self, o): return self.read_value() > o
+  def __ge__(self, o): return self.read_value() >= o
+  def __and__(self, o): return self.read_value() & o
+  def __rand__(self, o): return o & self.read_value()
+  def __or__(self, o): return self.read_value() | o
+  def __ror__(self, o): return o | self.read_value()
+  def __xor__(self, o): return self.read_value() ^ o
+  def __rxor__(self, o): return o ^ self.read_value()
+  def __getitem__(self, o): return self.read_value()[o]
+  def __pow__(self, o, modulo=None): return pow(self.read_value(), o, modulo)
+  def __rpow__(self, o): return pow(o, self.read_value())
+  def __invert__(self): return ~self.read_value()
+  def __neg__(self): return -self.read_value()
+  def __abs__(self): return abs(self.read_value())
+
+  def __div__(self, o):
+    try:
+      return self.read_value().__div__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rdiv__(self, o):
+    try:
+      return self.read_value().__rdiv__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __matmul__(self, o):
+    try:
+      return self.read_value().__matmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rmatmul__(self, o):
+    try:
+      return self.read_value().__rmatmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  @property
+  def handle(self):
+    # If we're in a tpu.rewrite(), return the replicated handle.
+    tpu_context = _enclosing_tpu_context()
+    if tpu_context is not None:
+      return tpu_context.get_replicated_var_handle(
+          self._common_name, nest.flatten(self._index))
+
+    device = distribute_lib.get_update_device()
+    if device is None:
+      return self._primary_var.handle
+    device = device_util.canonicalize(device)
+    try:
+      return self._index[device].handle
+    except KeyError as e:
+      six.raise_from(
+          ValueError("Device %s not found in %s (current device %s)" %
+                     (device, self._index.keys(), device_util.current())), e)
+
+  # The arguments to update() are automatically unwrapped so the update()
+  # function would normally see regular variables, not MirroredVariables.
+  # However, the update function can still operate on wrapped MirroredVariables
+  # through object members, captured arguments, etc. This is more likely in an
+  # update_non_slot() function (like OptimizerV2._finish), which can
+  # update several non-slot variables in one call.
+  def _assign_func(self, *args, **kwargs):
+    if distribution_strategy_context.get_distribution_strategy().__class__.__name__ != "TPUStrategy":
+      raise ValueError("You may only assign to a TPUMirroredVariable within a "
+                       "TPUStrategy.")
+    f = kwargs.pop("f")
+    if distribution_strategy_context.get_cross_tower_context():
+      if _enclosing_tpu_context() is not None:
+        return distribution_strategy_context.get_distribution_strategy().update(
+            self, f, *args, **kwargs)
+
+      update_device = distribute_lib.get_update_device()
+      # We are calling update on the mirrored variable in cross tower context.
+      if update_device is not None:
+        # We are calling an assign function on the mirrored variable in cross
+        # tower context.
+        v = self._get(device=update_device)
+        return f(v, *args, **kwargs)
+
+      return distribution_strategy_context.get_distribution_strategy().update(
+          self, f, *args, **kwargs)
+    else:
+      _assert_tower_context()
+      # We are calling an assign function on the mirrored variable in tower
+      # context.
+      # We reduce the value we want to assign/add/sub. More details about how we
+      # handle the different use cases can be found in the _reduce method.
+      # We call the function on each of the mirrored variables with the reduced
+      # value.
+      if self._aggregation == vs.VariableAggregation.NONE:
+        raise ValueError("You must specify an aggregation method to update a "
+                         "TPUMirroredVariable in Tower Context.")
+
+      def merge_fn(strategy, value, *other_args, **other_kwargs):
+        return strategy.update(
+            self, f,
+            strategy.reduce(
+                aggregation=self._aggregation, value=value, destinations=self),
+            *other_args, **other_kwargs)
+
+      return distribution_strategy_context.get_tower_context().merge_call(
+          merge_fn, *args, **kwargs)
+
+  @contextlib.contextmanager
+  def _handle_graph(self, handle):
+    # Note: might have an eager tensor but not be executing eagerly when
+    # building functions.
+    if (context.executing_eagerly() or isinstance(handle, ops.EagerTensor)
+        or ops.has_default_graph()):
+      yield
+    else:
+      with handle.graph.as_default():
+        yield
+
+  @property
+  def trainable(self):
+    return self._trainable
+
+  def _read_variable_op(self, parent_op=None):
+    if self.trainable:
+      tape.variable_accessed(self)
+    if parent_op is not None:
+      with ops.control_dependencies([parent_op]):
+        return gen_resource_variable_ops.read_variable_op(
+            self.handle, self.dtype)
+
+    return gen_resource_variable_ops.read_variable_op(
+        self.handle, self.dtype)
+
+  def read_value(self):
+    return self._read_variable_op()
+
+  def assign_sub(self, *args, **kwargs):
+    def assign_sub_fn(var, delta, **kw):
+      name = kw.pop("name", None)
+      read_value = kw.pop("read_value", True)
+      with self._handle_graph(var.handle):
+        op = gen_resource_variable_ops.assign_sub_variable_op(
+            var.handle, ops.convert_to_tensor(delta, dtype=self.dtype),
+            name=name)
+      if read_value:
+        return self._read_variable_op(parent_op=op)
+      return op
+
+    return self._assign_func(f=assign_sub_fn, *args, **kwargs)
+
+  def assign_add(self, *args, **kwargs):
+    def assign_add_fn(var, delta, **kw):
+      name = kw.pop("name", None)
+      read_value = kw.pop("read_value", True)
+      with self._handle_graph(var.handle):
+        op = gen_resource_variable_ops.assign_add_variable_op(
+            var.handle, ops.convert_to_tensor(delta, dtype=self.dtype),
+            name=name)
+      if read_value:
+        return self._read_variable_op(parent_op=op)
+      return op
+
+    return self._assign_func(f=assign_add_fn, *args, **kwargs)
+
+  def assign(self, *args, **kwargs):
+    def assign_fn(var, value, **kw):
+      name = kw.pop("name", None)
+      read_value = kw.pop("read_value", True)
+      with self._handle_graph(var.handle):
+        op = gen_resource_variable_ops.assign_variable_op(
+            var.handle, ops.convert_to_tensor(value, dtype=self.dtype),
+            name=name)
+      if read_value:
+        return self._read_variable_op(parent_op=op)
+      return op
+
+    return self._assign_func(f=assign_fn, *args, **kwargs)
+
+  @property
+  def aggregation(self):
+    return self._aggregation
+
+  @property
+  def constraint(self):
+    return None
+
+  @property
+  def initializer(self):
+    return control_flow_ops.group(
+        [v.initializer for v in nest.flatten(self._index)])
+
+  @property
+  def graph(self):
+    return self._primary_var.graph
+
+  @property
+  def _shared_name(self):
+    return self._common_name
+
+  @property
+  def _unique_id(self):
+    return self._primary_var._unique_id  # pylint: disable=protected-access
+
+  @property
+  def name(self):
+    return self._primary_var.name
+
+  @property
+  def dtype(self):
+    return self._primary_var.dtype
+
+  @property
+  def shape(self):
+    return self._primary_var.shape
+
+  def get_shape(self):
+    return self._primary_var.get_shape()
+
+  def to_proto(self, export_scope=None):
+    return self._primary_var.to_proto(export_scope=export_scope)
+
+  def _get_cross_tower(self):
+    device = device_util.canonicalize(device_util.current())
+    if device in self._index:
+      return self._index[device]
+    return self._primary_var
+
+  def _as_graph_element(self):
+    # pylint: disable=protected-access
+    if distribution_strategy_context.get_cross_tower_context():
+      return self._primary_var._as_graph_element()
+    return self._read_variable_op()
+
+  def _gather_saveables_for_checkpoint(self):
+    """Overrides CheckpointableBase method.
+
+    This allows both name-based and object-based save and restore of
+    MirroredVariables.
+
+    Returns:
+      A dictionary mapping attribute names to `SaveableObject` factories.
+    """
+    def _saveable_factory(name=self._common_name):
+      return _MirroredSaveable(self, self._primary_var, name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+  def _should_act_as_resource_variable(self):
+    """Pass resource_variable_ops.is_resource_variable check."""
+    pass
+
+  # Needed to pass ResourceVariable checks.
+  @property
+  def op(self):
+    return self._primary_var.op
+
+  @property
+  def _in_graph_mode(self):
+    return self._primary_var._in_graph_mode   # pylint: disable=protected-access
+
+  def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
+    """Converts a variable to a tensor."""
+    # pylint: disable=protected-access
+    if _enclosing_tpu_context() is None:
+      return self._get()._dense_var_to_tensor(dtype, name, as_ref)
+    # pylint: enable=protected-access
+    if dtype is not None and dtype != self.dtype:
+      raise NotImplementedError
+    if as_ref:
+      return self.handle
+    else:
+      return self.read_value()
+
+  def is_initialized(self, name=None):
+    """Identifies if all the component variables are initialized.
+
+    Args:
+      name: Name of the final `logical_and` op.
+
+    Returns:
+      The op that evaluates to True or False depending on if all the
+      component variables are initialized.
+    """
+    # TODO(jhseu): Do we need TPU context implementation?
+
+    # We have to cast the self._index.values() to a `list` because when we
+    # use `model_to_estimator` to run tf.keras models, self._index.values() is
+    # of type `dict_values` and not `list`.
+    values_list = nest.flatten(self._index)
+    result = values_list[0].is_initialized()
+    # We iterate through the list of values except the last one to allow us to
+    # name the final `logical_and` op the same name that is passed by the user
+    # to the `is_initialized` op. For distributed variables, the
+    # `is_initialized` op is a `logical_and` op.
+    for v in values_list[1:-1]:
+      result = math_ops.logical_and(result, v.is_initialized())
+    result = math_ops.logical_and(result, values_list[-1].is_initialized(),
+                                  name=name)
+    return result
+
+
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+def _tensor_conversion_tpu_mirrored(var, dtype=None, name=None, as_ref=False):
+  return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref)  # pylint: disable=protected-access
+
+
+ops.register_tensor_conversion_function(TPUMirroredVariable,
+                                        _tensor_conversion_tpu_mirrored)
+ops.register_dense_tensor_like_type(TPUMirroredVariable)
+
+
 class _TowerLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
   """Class for defining how to restore a TowerLocalVariable."""
 
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
index 598da7418e..004b1012e5 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
@@ -78,7 +78,7 @@ class ReplicatedVariable(object):
     if tpu_context is None:
       return self._primary_var.handle
 
-    return tpu_context.get_replicated_var_handle(self)
+    return tpu_context.get_replicated_var_handle(self._name, self._vars)
 
   @contextlib.contextmanager
   def _assign_dependencies(self):
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 883e08bf47..11aaa1c66a 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -155,19 +155,20 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     self._pivot = pivot
     self._replicated_vars = {}
 
-  def get_replicated_var_handle(self, var):
+  def get_replicated_var_handle(self, name, vars_):
     """Returns a variable handle for replicated TPU variable 'var'.
 
     This is a method used by an experimental replicated variable implementation
     and is not intended as a public API.
 
     Args:
-      var: The replicated TPU variable.
+      name: The common name of the variable.
+      vars_: The replicated TPU variables.
 
     Returns:
       The handle of the TPU replicated input node.
     """
-    handle = self._replicated_vars.get(var)
+    handle = self._replicated_vars.get(name)
     if handle is not None:
       return handle
 
@@ -183,10 +184,10 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     saved_context = graph._get_control_flow_context()
     graph._set_control_flow_context(self.outer_context)
     handle = tpu_ops.tpu_replicated_input(
-        [v.handle for v in var._vars], name=var.name + "/handle")
+        [v.handle for v in vars_], name=name + "/handle")
     graph._set_control_flow_context(saved_context)
     # pylint: enable=protected-access
-    self._replicated_vars[var] = handle
+    self._replicated_vars[name] = handle
     return handle
 
   def report_unsupported_operations(self):
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 78f3198011..deac29111f 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -619,7 +619,7 @@ pywrap_tensorflow.TFE_Py_RegisterVSpace(_default_vspace)
 
 def _handle_or_self(x):
   """If x is ResourceVariable, return its handle, else x."""
-  if isinstance(x, resource_variable_ops.ResourceVariable):
+  if resource_variable_ops.is_resource_variable(x):
     x = x.handle
   return x
 
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 34faf03bb0..e6d82f0db7 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -468,6 +468,10 @@ class Estimator(object):
 
       with ops.Graph().as_default():
         if self._eval_distribution:
+          # We want to create the iterations variable outside the distribution
+          # scope as that is just stored on the host and mainly used to drive
+          # the loop and doesn't need to be a Mirrored/Device variable.
+          training.get_or_create_steps_per_run_variable()
           with self._eval_distribution.scope():
             return _evaluate()
         else:
diff --git a/tensorflow/python/estimator/util.py b/tensorflow/python/estimator/util.py
index 31e4778e72..fb110c4b7b 100644
--- a/tensorflow/python/estimator/util.py
+++ b/tensorflow/python/estimator/util.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 import os
 import time
 
-from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training
@@ -144,14 +143,11 @@ class StrategyInitFinalizeHook(training.SessionRunHook):
     self._finalize_fn = finalize_fn
 
   def begin(self):
+    # We only create the init ops, but don't run it. We rely on SessionManager
+    # to run it for us.
     self._init_ops = self._initialization_fn()
     self._finalize_ops = self._finalize_fn()
 
-  def after_create_session(self, session, coord):
-    logging.info('Initialize system')
-    session.run(self._init_ops,
-                options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
-
   def end(self, session):
     logging.info('Finalize system.')
     session.run(self._finalize_ops)
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index f004f3944a..30b0ed20c8 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -471,7 +471,10 @@ class Optimizer(
 
       if var_list is None:
         var_list = tape.watched_variables()
-      grads = tape.gradient(loss_value, var_list, grad_loss)
+      # TODO(jhseu): Figure out why GradientTape's gradients don't require loss
+      # to be executed.
+      with ops.control_dependencies([loss_value]):
+        grads = tape.gradient(loss_value, var_list, grad_loss)
       return list(zip(grads, var_list))
 
     # Non-callable/Tensor loss case
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index a2e0645ba8..5e4749f306 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -182,6 +183,10 @@ class SessionManager(object):
     """
     self._target = master
     sess = session.Session(self._target, graph=self._graph, config=config)
+    # TODO(jhseu): Delete once tpu.initialize_system() goes away.
+    sess.run(
+        distribution_strategy_context.get_distribution_strategy().initialize()
+    )
 
     if checkpoint_dir and checkpoint_filename_with_path:
       raise ValueError("Can not provide both checkpoint_dir and "
-- 
GitLab


From 3760cb47f3603638cf88c8771640af9debd30bad Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Fri, 28 Sep 2018 19:07:21 -0700
Subject: [PATCH 188/570] Fix bad reference to self._name in TPUEstimator
 infeed loop.

PiperOrigin-RevId: 215029224
---
 tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 545cee637f..3aa5b6efa1 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -426,10 +426,10 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
   def _run_infeed(self, queue_ctx, session):
     logging.info('Starting infeed thread controller.')
     if self._initial_infeed_sleep_secs:
-      logging.info('%s thread sleeping for %d seconds.', self._name,
+      logging.info('Infeed thread sleeping for %d seconds.',
                    self._initial_infeed_sleep_secs)
       time.sleep(self._initial_infeed_sleep_secs)
-      logging.info('%s thread starting after sleep', self._name)
+      logging.info('Infeed thread starting after sleep')
 
     with self._rendezvous.catch_errors(source='infeed', session=session):
       if self._run_infeed_loop_on_coordinator:
-- 
GitLab


From b5c66300d2c15a9bf1a8631161efa1a057e6ed31 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 19:35:15 -0700
Subject: [PATCH 189/570] Add learning_rates input to the
 SendTPUEmbeddingGradients op. This allows the learning rate to be modified at
 runtime. The implementation is not yet complete.

PiperOrigin-RevId: 215030536
---
 tensorflow/contrib/tpu/BUILD                  |  3 +
 .../contrib/tpu/ops/tpu_embedding_ops.cc      | 86 ++++++++++++-------
 tensorflow/contrib/tpu/python/ops/tpu_ops.py  | 27 ++++++
 3 files changed, 87 insertions(+), 29 deletions(-)

diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index e9aa037634..0c4bdab191 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -133,6 +133,9 @@ tf_custom_op_library(
 
 tf_gen_op_wrapper_py(
     name = "tpu_ops",
+    hidden = [
+        "SendTPUEmbeddingGradients",
+    ],
     deps = [
         ":cross_replica_ops_op_lib",
         ":heartbeat_ops_op_lib",
diff --git a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
index bc1a0c5284..6b0730b40c 100644
--- a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
+++ b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
@@ -365,11 +365,11 @@ REGISTER_OP("RecvTPUEmbeddingActivations")
 An op that receives embedding activations on the TPU.
 
 The TPU system performs the embedding lookups and aggregations specified by
-the arguments to TPUEmbeddingEnqueueSparseBatch. The results of these
-aggregations are visible to the Tensorflow Graph as the outputs of a
-TPUEmbeddingDequeueActivations Op. This op returns a list containing one
-Tensor of activations per table specified in the model. There can be at most
-one ReceieveActivations op in the TPU graph.
+the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The
+results of these aggregations are visible to the Tensorflow Graph as the
+outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing
+one Tensor of activations per table specified in the model. There can be at
+most one RecvTPUEmbeddingActivations op in the TPU graph.
 
 outputs: A TensorList of embedding activations containing one Tensor per
     embedding table in the model.
@@ -407,10 +407,25 @@ lookup_id: Identifier of the set of embedding indices which produced these
 
 REGISTER_OP("SendTPUEmbeddingGradients")
     .Input("inputs: N * float32")
+    .Input("learning_rates: NN * float32")
     .Attr("N: int >= 1")
+    .Attr("NN: int >= 0 = 0")
     .Attr("config: string")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
+    .SetShapeFn([](shape_inference::InferenceContext* c) -> Status {
+      int nn;
+      TF_RETURN_IF_ERROR(c->GetAttr("NN", &nn));
+      std::vector<shape_inference::ShapeHandle> learning_rates;
+      TF_RETURN_IF_ERROR(c->input("learning_rates", &learning_rates));
+      for (int i = 0; i < nn; ++i) {
+        // Verify that each learning_rates element is scalar
+        shape_inference::ShapeHandle learning_rates_shape;
+        TF_RETURN_IF_ERROR(
+            c->WithRank(learning_rates[i], 0, &learning_rates_shape));
+      }
+
+      return Status::OK();
+    })
     .Doc(R"doc(
 An op that performs gradient updates of embedding tables.
 
@@ -421,6 +436,11 @@ from these gradients via the optimizer specified in the configuration given
 to tpu.initialize_system.
 
 inputs: A TensorList of gradients with which to update embedding tables.
+    It contains one tensor per embedding table in the model.
+learning_rates: A list of float32 scalars, one for each embedding table,
+    containing the learning rates for each table when dynamic learning rate is
+    enabled through the OptimizationParameters in TPUEmbeddingConfiguration.
+    When the learning rate is constant, the list should be empty.
 config: Serialized TPUEmbeddingConfiguration proto.
 )doc");
 
@@ -434,10 +454,9 @@ REGISTER_OP("EnqueueTPUEmbeddingIntegerBatch")
 An op that enqueues a list of input batch tensors to TPUEmbedding.
 
 batch: A list of 1D tensors, one for each embedding table, containing the
-batch inputs represented as integers.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
+    indices into the tables.
+device_ordinal: The TPU device to use. Should be >= 0 and less than the number
+    of TPU cores in the task on which the node is placed.
 )doc");
 
 REGISTER_OP("EnqueueTPUEmbeddingSparseBatch")
@@ -467,7 +486,8 @@ An op that enqueues TPUEmbedding input indices from a SparseTensor.
 This Op eases the porting of code that uses embedding_lookup_sparse(),
 although some Python preprocessing of the SparseTensor arguments to
 embedding_lookup_sparse() is required to produce the arguments to this Op,
-since only a single EnqueueTPUEmbedding Op is allowed per training step.
+since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training
+step.
 
 The tensors at corresponding positions in the three input lists
 must have the same shape, i.e. rank 1 with dim_size() equal to the total
@@ -477,15 +497,18 @@ sample_indices: A list of Rank 1 Tensors specifying the training example and
     feature to which the corresponding embedding_indices and aggregation_weights
     values belong. sample_indices[i] must equal b * nf + f, where nf is the
     number of features from the corresponding table, f is in [0, nf), and
-    b is in [0, training batch size).
+    b is in [0, batch size).
 embedding_indices: A list of Rank 1 Tensors, indices into the embedding tables.
 aggregation_weights: A list of Rank 1 Tensors containing per sample -- i.e. per
     (training example, feature) -- aggregation weights.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-combiners: A list of string scalars whose values are 'mean', 'sum', or 'sqrtn'
-to specify how to normalize the embedding activations after weighted summation.
+device_ordinal: The TPU device to use. Should be >= 0 and less than the number
+    of TPU cores in the task on which the node is placed.
+combiners: A list of string scalars, one for each embedding table that specify
+    how to normalize the embedding activations after weighted summation.
+    Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+    the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+    0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+    all tables.
 )doc");
 
 REGISTER_OP("EnqueueTPUEmbeddingSparseTensorBatch")
@@ -505,22 +528,27 @@ sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond
 to ith feature. table_ids[i] indicates which embedding table to look up ith
 feature.
 
+The tensors at corresponding positions in the three input lists (sample_indices,
+embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1
+with dim_size() equal to the total number of lookups into the table described by
+the corresponding feature.
+
 sample_indices: A list of Rank 1 Tensors, corresponds to sp_ids.indices[:,0] in
-embedding_lookup_sparse().
+    embedding_lookup_sparse().
 embedding_indices: A list of Rank 1 Tensors, corresponds to sp_ids.values
- in embedding_lookup_sparse().
+    in embedding_lookup_sparse().
 aggregation_weights: A list of Rank 1 Tensors, corresponds to sp_weights.values
- in embedding_lookup_sparse().
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-combiners: A list of strings, one for each embedding table, specifying the
-reduction operation.  Currently, 'sum', 'mean' and 'sqrtn' are supported. It is
-invalid to have the sum of the weights be 0 for 'mean' or the sum of the squared
-weights be 0 for 'sqrtn'. If combiners isn't passed, the default is to
-use 'sum' for all tables.
+    in embedding_lookup_sparse().
+device_ordinal: The TPU device to use. Should be >= 0 and less than the number
+    of TPU cores in the task on which the node is placed.
+combiners: A list of string scalars, one for each embedding table that specify
+    how to normalize the embedding activations after weighted summation.
+    Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+    the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+    0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+    all tables.
 table_ids: A list of int. table_ids[i] indicates which embedding table to look
-up ith feature.
+    up ith feature in the list.
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index a1aee69691..e2e4acadab 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -200,6 +200,33 @@ if platform.system() != "Windows":
     return gen_tpu_ops.infeed_dequeue_tuple(dtypes, shapes, name=name)
   # pylint: enable=redefined-outer-name
 
+  # pylint: disable=protected-access
+  def send_tpu_embedding_gradients(inputs,
+                                   config,
+                                   learning_rates=None,
+                                   name=None):
+    """A placeholder op for feeding per-sample gradients to the embedding layer.
+
+    Args:
+      inputs: A TensorList of gradients with which to update embedding tables.
+        Contains one tensor per embedding table in the model.
+      config: Serialized TPUEmbeddingConfiguration proto.
+      learning_rates: A TensorList of float32 scalars, one for each embedding
+        table, containing the learning rates for each table when dynamic
+        learning rate is enabled through the OptimizationParameters in
+        TPUEmbeddingConfiguration. When the learning rate is constant, the list
+        should be empty (optional).
+      name: A name for the operation (optional).
+
+    Returns:
+      A SendTPUEmbeddingGradients operation.
+    """
+    if learning_rates is None:
+      learning_rates = []
+    return gen_tpu_ops._send_tpu_embedding_gradients(
+        inputs=inputs, learning_rates=learning_rates, config=config, name=name)
+
+
 else:
   # We have already built the appropriate libraries into the binary via CMake
   # if we have built contrib, so we don't need this
-- 
GitLab


From d936d819752916d3122f02def571ecac9e995029 Mon Sep 17 00:00:00 2001
From: "Xiaoming (Jason) Cui" <xiaoming.cui@intel.com>
Date: Fri, 28 Sep 2018 19:49:23 -0700
Subject: [PATCH 190/570] Lower the MKLCpuAllocator priority so that it can use
 default allocator when MKL is disabled, and with some  minor changes

---
 .../core/common_runtime/mkl_cpu_allocator.h   | 54 ++++++-------------
 .../core/common_runtime/process_util.cc       | 37 ++++++-------
 .../core/common_runtime/threadpool_device.cc  |  4 +-
 3 files changed, 36 insertions(+), 59 deletions(-)

diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 516138d28d..429b19599b 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mem.h"
-#include "tensorflow/core/util/util.h"
 #include "tensorflow/core/platform/numa.h"
 
 #ifndef INTEL_MKL_DNN_ONLY
@@ -164,12 +163,6 @@ class MklCPUAllocator : public Allocator {
   }
 
   Status Initialize() {
-    if (DisableMKL()) {
-        VLOG(1) << "TF-MKL: Disabling pool allocator";
-        tf_disable_pool_allocator_flag_ = true;
-        return Status::OK();
-    }
-
     VLOG(2) << "MklCPUAllocator: In MklCPUAllocator";
 
     // Set upper bound on memory allocation to physical RAM available on the
@@ -224,10 +217,6 @@ class MklCPUAllocator : public Allocator {
   inline string Name() override { return kName; }
 
   inline void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    if (tf_disable_pool_allocator_flag_) {
-      return port::AlignedMalloc(num_bytes, alignment);
-    }
-
     // If the allocation size is less than threshold, call small allocator,
     // otherwise call large-size allocator (BFC). We found that BFC allocator
     // does not deliver good performance for small allocations when
@@ -238,10 +227,6 @@ class MklCPUAllocator : public Allocator {
   }
 
   inline void DeallocateRaw(void* ptr) override {
-    if (tf_disable_pool_allocator_flag_) {
-      port::AlignedFree(ptr);
-      return;
-    }
     // Check if ptr is for "small" allocation. If it is, then call Free
     // directly. Otherwise, call BFC to handle free.
     if (small_size_allocator_->IsSmallSizeAllocation(ptr)) {
@@ -252,30 +237,26 @@ class MklCPUAllocator : public Allocator {
   }
 
   void GetStats(AllocatorStats* stats) override {
-    if (!tf_disable_pool_allocator_flag_) {
-      AllocatorStats l_stats, s_stats;
-      small_size_allocator_->GetStats(&s_stats);
-      large_size_allocator_->GetStats(&l_stats);
-
-      // Combine statistics from small-size and large-size allocator.
-      stats->num_allocs = l_stats.num_allocs + s_stats.num_allocs;
-      stats->bytes_in_use = l_stats.bytes_in_use + s_stats.bytes_in_use;
-      stats->max_bytes_in_use =
-          l_stats.max_bytes_in_use + s_stats.max_bytes_in_use;
-
-      // Since small-size allocations go to MklSmallSizeAllocator,
-      // max_alloc_size from large_size_allocator would be the maximum
-      // size allocated by MklCPUAllocator.
-      stats->max_alloc_size = l_stats.max_alloc_size;
-      stats->bytes_limit = std::max(s_stats.bytes_limit, l_stats.bytes_limit);
-    }
+    AllocatorStats l_stats, s_stats;
+    small_size_allocator_->GetStats(&s_stats);
+    large_size_allocator_->GetStats(&l_stats);
+
+    // Combine statistics from small-size and large-size allocator.
+    stats->num_allocs = l_stats.num_allocs + s_stats.num_allocs;
+    stats->bytes_in_use = l_stats.bytes_in_use + s_stats.bytes_in_use;
+    stats->max_bytes_in_use =
+        l_stats.max_bytes_in_use + s_stats.max_bytes_in_use;
+
+    // Since small-size allocations go to MklSmallSizeAllocator,
+    // max_alloc_size from large_size_allocator would be the maximum
+    // size allocated by MklCPUAllocator.
+    stats->max_alloc_size = l_stats.max_alloc_size;
+    stats->bytes_limit = std::max(s_stats.bytes_limit, l_stats.bytes_limit);
   }
 
   void ClearStats() override {
-    if (!tf_disable_pool_allocator_flag_) {
-      small_size_allocator_->ClearStats();
-      large_size_allocator_->ClearStats();
-    }
+    small_size_allocator_->ClearStats();
+    large_size_allocator_->ClearStats();
   }
 
  private:
@@ -314,7 +295,6 @@ class MklCPUAllocator : public Allocator {
   // The alignment that we need for the allocations
   static constexpr const size_t kAlignment = 64;
 
-  bool tf_disable_pool_allocator_flag_ = false;
   Allocator* large_size_allocator_;              // owned by this class
   MklSmallSizeAllocator* small_size_allocator_;  // owned by this class.
 
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index 60fa601907..b3064a4c08 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -57,28 +57,25 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
   const int32 inter_op = options.config.inter_op_parallelism_threads();
   if (inter_op != 0) return inter_op;
 #ifdef INTEL_MKL
-  // Early return if MKL is disabled
-  if (DisableMKL())
-    return port::NumSchedulableCPUs();
-
-  // MKL library executes ops in parallel using OMP threads
-  // Set inter_op conservatively to avoid thread oversubscription that could
-  // lead to severe perf degradations and OMP resource exhaustion
-  int mkl_intra_op = 1;
-#ifdef _OPENMP
-  mkl_intra_op = omp_get_max_threads();
-#endif  // _OPENMP
-  CHECK_GE(mkl_intra_op, 1);
-  const int32 mkl_inter_op = std::max(
-      (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
-  VLOG(0) << "Creating new thread pool with default inter op setting: "
-          << mkl_inter_op
-          << ". Tune using inter_op_parallelism_threads for best performance.";
-  return mkl_inter_op;
-#else
+  if (!DisableMKL()) {
+    // MKL library executes ops in parallel using OMP threads
+    // Set inter_op conservatively to avoid thread oversubscription that could
+    // lead to severe perf degradations and OMP resource exhaustion
+    int mkl_intra_op = 1;
+  #ifdef _OPENMP
+    mkl_intra_op = omp_get_max_threads();
+  #endif  // _OPENMP
+    CHECK_GE(mkl_intra_op, 1);
+    const int32 mkl_inter_op = std::max(
+        (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
+    VLOG(0) << "Creating new thread pool with default inter op setting: "
+            << mkl_inter_op
+            << ". Tune using inter_op_parallelism_threads for best performance.";
+    return mkl_inter_op;
+  }
+#endif  // INTEL_MKL
   // Default to using the number of cores available in the process.
   return port::NumSchedulableCPUs();
-#endif  // INTEL_MKL
 }
 
 thread::ThreadPool* NewThreadPoolFromSessionOptions(
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 29c01d7f72..f188016610 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -50,7 +50,7 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
       allocator_(allocator),
       scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {
 #ifdef INTEL_MKL
-  // Eearly return when MKL is disabled
+  // Early return when MKL is disabled
   if (DisableMKL())
     return;
 #ifdef _OPENMP
@@ -118,7 +118,7 @@ class MklCPUAllocatorFactory : public AllocatorFactory {
 };
 
 #ifdef ENABLE_MKL
-REGISTER_MEM_ALLOCATOR("MklCPUAllocator", 200, MklCPUAllocatorFactory);
+REGISTER_MEM_ALLOCATOR("MklCPUAllocator", (DisableMKL() ? 50 : 200), MklCPUAllocatorFactory);
 #endif  // ENABLE_MKL
 
 }  // namespace
-- 
GitLab


From 2e0e934e0b3c00863918c78bf55524eea3f0c0dc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 28 Sep 2018 20:51:11 -0700
Subject: [PATCH 191/570] Make tf.contrib.crf compatible with TPUs by using
 utils.smart_cond instead of tf.cond, which allows the static shape to be
 propagated correctly when available.

PiperOrigin-RevId: 215034102
---
 tensorflow/contrib/crf/python/ops/crf.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 2a91dcb63a..43bb43129b 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -56,7 +56,6 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
@@ -214,10 +213,11 @@ def crf_log_norm(inputs, sequence_lengths, transition_params):
                                log_norm)
     return log_norm
 
-  max_seq_len = array_ops.shape(inputs)[1]
-  return control_flow_ops.cond(pred=math_ops.equal(max_seq_len, 1),
-                               true_fn=_single_seq_fn,
-                               false_fn=_multi_seq_fn)
+  return utils.smart_cond(
+      pred=math_ops.equal(inputs.shape[1].value or
+                          array_ops.shape(inputs)[1], 1),
+      true_fn=_single_seq_fn,
+      false_fn=_multi_seq_fn)
 
 
 def crf_log_likelihood(inputs,
-- 
GitLab


From d78595d333c9b5c8a0705ba6852c08b107d6c462 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 29 Sep 2018 00:59:02 -0700
Subject: [PATCH 192/570] Make cuda_py_test create a gpu and cpu target.

Currently, we run tests on machines with GPUs based on the "gpu" tag, and the
tests automatically adapt to whether a GPU is available. Creating two targets,
one tagged with "gpu" and one not, will make us run the tests in both modes.

PiperOrigin-RevId: 215045035
---
 tensorflow/python/data/kernel_tests/BUILD     |  1 +
 tensorflow/tensorflow.bzl                     | 39 +++++++++++--------
 .../tools/pip_package/pip_smoke_test.py       |  2 +-
 3 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index cadfe7f9e0..99d7f70513 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -318,6 +318,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
     ],
     tags = [
+        "no_oss",  # TODO(b/116813115): Investigate timeout and re-enable.
         "no_windows_gpu",
     ],
 )
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index cad5de1b0c..dead44c57e 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1798,22 +1798,29 @@ def cuda_py_test(
         flaky = 0,
         xla_enabled = False,
         grpc_enabled = False):
-    test_tags = tags + tf_cuda_tests_tags()
-    tf_py_test(
-        name = name,
-        size = size,
-        srcs = srcs,
-        data = data,
-        main = main,
-        args = args,
-        tags = test_tags,
-        shard_count = shard_count,
-        additional_deps = additional_deps,
-        kernels = kernels,
-        flaky = flaky,
-        xla_enabled = xla_enabled,
-        grpc_enabled = grpc_enabled,
-    )
+    if main == None:
+        main = name + ".py"
+    for config in ["cpu", "gpu"]:
+        test_name = name
+        test_tags = tags
+        if config == "gpu":
+            test_name += "_gpu"
+            test_tags = test_tags + tf_cuda_tests_tags()
+        tf_py_test(
+            name = test_name,
+            size = size,
+            srcs = srcs,
+            data = data,
+            main = main,
+            args = args,
+            tags = test_tags,
+            shard_count = shard_count,
+            additional_deps = additional_deps,
+            kernels = kernels,
+            flaky = flaky,
+            xla_enabled = xla_enabled,
+            grpc_enabled = grpc_enabled,
+        )
 
 register_extension_info(
     extension_name = "cuda_py_test",
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index c6ef82ccdc..e7f9628fa6 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -142,7 +142,7 @@ def main():
 
   missing_dependencies = []
   # File extensions and endings to ignore
-  ignore_extensions = ["_test", "_test.py"]
+  ignore_extensions = ["_test", "_test.py", "_test_gpu", "_test_gpu.py"]
 
   ignored_files = 0
   blacklisted_files = len(BLACKLIST)
-- 
GitLab


From d8db18b4201d9d82d1c93ed5453914ff16f1adf4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 29 Sep 2018 02:02:02 -0700
Subject: [PATCH 193/570] compat: Update forward compatibility horizon to
 2018-09-29

PiperOrigin-RevId: 215048726
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index b74fce3a4c..24a795c787 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 9, 28)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 9, 29)
 
 
 @tf_export("compat.forward_compatible")
-- 
GitLab


From f16111286b19f4145df63b73c45be1645bde8737 Mon Sep 17 00:00:00 2001
From: Bairen Yi <byi@connect.ust.hk>
Date: Sat, 29 Sep 2018 22:13:09 +0800
Subject: [PATCH 194/570] Added log entries for copying unpinned memory RDMA

Currently there are large number of tensors managed
by non-visitable memory allocators in CPU-only PS.
GPU workers seem less prone to this problem.

Copying large sized tensor buffers may introduce
non-trivial overhead. Should probably fix this.

Signed-off-by: Bairen Yi <byi@connect.ust.hk>
---
 tensorflow/contrib/gdr/gdr_memory_manager.cc | 156 +++++++++++--------
 1 file changed, 93 insertions(+), 63 deletions(-)

diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index bb06f1c41c..3549cedb70 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include <fstream>
 #include <list>
 #include <map>
-#include <set>
 
 #include <fcntl.h>
 #include <rdma/rdma_cma.h>
@@ -30,19 +29,17 @@ limitations under the License.
 #include <sys/epoll.h>
 
 #include "tensorflow/contrib/gdr/gdr.pb.h"
-#include "tensorflow/core/common_runtime/bfc_allocator.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
-#include "tensorflow/core/common_runtime/pool_allocator.h"
 #include "tensorflow/core/common_runtime/process_state.h"
 #if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #endif  // GOOGLE_CUDA
-#include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 
 namespace tensorflow {
 
@@ -70,14 +67,11 @@ bool IsGDRAvailable() {
 int TryToReadNumaNode(ibv_device* device) {
 #if defined(__APPLE__)
   LOG(INFO) << "OS X does not support NUMA - returning NUMA node 0";
-  return 0;
+  return port::kNUMANoAffinity;
 #elif defined(PLATFORM_WINDOWS)
   // Windows support for NUMA is not currently implemented. Return node 0.
-  return 0;
+  return port::kNUMANoAffinity;
 #else
-  VLOG(2) << "Trying to read NUMA node for device: " << device->name;
-  static const int kUnknownNumaNode = -1;
-
   auto filename = string(device->ibdev_path) + "/device/numa_node";
 
   std::ifstream ifs(filename.c_str());
@@ -91,12 +85,12 @@ int TryToReadNumaNode(ibv_device* device) {
                 << value
                 << "), but there must be at least one NUMA node"
                    ", so returning NUMA node zero";
-      return 0;
+      return port::kNUMANoAffinity;
     }
     LOG(INFO) << "NUMA node for device: " << device->name << " is " << value;
     return value;
   }
-  return kUnknownNumaNode;
+  return port::kNUMANoAffinity;
 #endif
 }
 
@@ -138,8 +132,6 @@ class GdrMemoryManager : public RemoteMemoryManager {
       Device* device, DeviceContext* device_context, bool on_host,
       StatusCallback done) override;
 
-  static void RegMemVisitors();
-
  protected:
   Status CreateEndpoint(const string& host, const string& port,
                         RdmaEndpointPtr& endpoint);
@@ -150,7 +142,8 @@ class GdrMemoryManager : public RemoteMemoryManager {
 
   ibv_mr* FindMemoryRegion(void* addr, size_t length);
 
-  void InsertMemoryRegion(void* addr, size_t length);
+  void InsertMemoryRegion(void* addr, size_t length,
+                          const std::string& allocator_name);
 
   void EvictMemoryRegion(void* addr, size_t length);
 
@@ -160,6 +153,7 @@ class GdrMemoryManager : public RemoteMemoryManager {
   RdmaEndpointPtr listening_;
   std::atomic<bool> stopped_;
   int epfd_;
+  int numa_node_;
 
   // Server side endpoints
   // Accessed sequentially in Run() so not protected by lock
@@ -190,46 +184,10 @@ GdrMemoryManager::GdrMemoryManager(const string& host, const string& port)
       port_(port),
       listening_(nullptr, EndpointDeleter),
       stopped_(true),
-      next_key_(0) {
-  static std::once_flag flag;
-  std::call_once(flag, []() { RegMemVisitors(); });
-}
+      next_key_(0) {}
 
 GdrMemoryManager::~GdrMemoryManager() { close(epfd_); }
 
-/*static*/ void GdrMemoryManager::RegMemVisitors() {
-  SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node,
-                                           size_t num_bytes) {
-    GdrMemoryManager::Singleton().InsertMemoryRegion(
-        ptr, num_bytes, strings::StrCat("CPU:", numa_node));
-  };
-  SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node,
-                                          size_t num_bytes) {
-    GdrMemoryManager::Singleton().EvictMemoryRegion(ptr, num_bytes);
-  };
-  ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
-  ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
-
-#if GOOGLE_CUDA
-  if (IsGDRAvailable()) {
-    int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
-
-    // Note we don't free allocated GPU memory so there is no free visitor
-    SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id,
-                                                  size_t num_bytes) {
-      RdmaMemoryMgr::Singleton().InsertMemoryRegion(
-          ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
-    };
-    GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
-                                                     cuda_alloc_visitor);
-    GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
-                                                          alloc_visitor);
-    GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
-    LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
-  }
-#endif  // GOOGLE_CUDA
-}
-
 Status GdrMemoryManager::Init() {
   epfd_ = epoll_create1(0);
   if (epfd_ == -1) {
@@ -289,6 +247,42 @@ Status GdrMemoryManager::Init() {
                                "cannot add server to epoll");
   }
 
+  numa_node_ = TryToReadNumaNode(listening_->verbs->device);
+
+  SubAllocator::Visitor alloc_visitor = [this](void* ptr, int numa_node,
+                                               size_t num_bytes) {
+    VLOG(2) << "Registering RDMA capable memory region on numa_node "
+            << numa_node;
+    InsertMemoryRegion(ptr, num_bytes, strings::StrCat("CPU:", numa_node));
+  };
+  SubAllocator::Visitor free_visitor = [this](void* ptr, int numa_node,
+                                              size_t num_bytes) {
+    VLOG(2) << "De-registering RDMA capable memory region on numa_node "
+            << numa_node;
+    EvictMemoryRegion(ptr, num_bytes);
+  };
+  ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
+  ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
+  LOG(INFO) << "Instrumenting CPU allocator(s)";
+
+#if GOOGLE_CUDA
+  if (IsGDRAvailable()) {
+    int bus_id = numa_node_ + 1;
+
+    SubAllocator::Visitor cuda_alloc_visitor = [this](void* ptr, int gpu_id,
+                                                      size_t num_bytes) {
+      VLOG(2) << "Registering RDMA capable memory region on GPU " << gpu_id;
+      InsertMemoryRegion(ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
+    };
+    GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
+                                                     cuda_alloc_visitor);
+    GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
+                                                          alloc_visitor);
+    GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
+    LOG(INFO) << "Instrumenting GPU allocator(s) with bus_id " << bus_id;
+  }
+#endif  // GOOGLE_CUDA
+
   return Status::OK();
 }
 
@@ -405,7 +399,7 @@ void GdrMemoryManager::TransportOptionsFromTensor(
   ibv_mr* mr = FindMemoryRegion(addr, length);
 
 #if GOOGLE_CUDA
-  if (!on_host) {
+  if (device->tensorflow_gpu_device_info() && !on_host) {
     Allocator* alloc = GPUProcessState::singleton()->GetCUDAHostAllocator(0);
     Tensor* host_copy = new Tensor(alloc, tensor.dtype(), tensor.shape());
     GPUUtil::CopyGPUTensorToCPU(
@@ -456,11 +450,27 @@ void GdrMemoryManager::TransportOptionsFromTensor(
 #endif
 
   if (mr == nullptr) {
-    done(errors::Unavailable("Cannot find pinned memory region"));
-    return;
+    Allocator* alloc = ProcessState::singleton()->GetCPUAllocator(numa_node_);
+    Tensor host_copy(alloc, tensor.dtype(), tensor.shape());
+
+    std::memcpy(DMAHelper::buffer(&host_copy)->data(), buffer->data(), length);
+    VLOG(2) << "Copying " << length << " bytes unpinned tensor buffer";
+
+    buffer = DMAHelper::buffer(&host_copy);
+    addr = buffer->data();
+    length = buffer->size();
+
+    mr = FindMemoryRegion(addr, length);
+    if (mr == nullptr) {
+      done(errors::Unavailable("Cannot find pinned memory region"));
+      return;
+    }
+
+    buffer->Ref();
+  } else {
+    buffer->Ref();
   }
 
-  buffer->Ref();
   TensorKey tensor_key = next_key_++;
   {
     mutex_lock l(server_mu_);
@@ -470,7 +480,7 @@ void GdrMemoryManager::TransportOptionsFromTensor(
   uint64_t checksum = 0;
   if (VLOG_IS_ON(2)) {
 #ifdef GOOGLE_CUDA
-    if (!on_host) {
+    if (device->tensorflow_gpu_device_info() && !on_host) {
       checksum = GPUUtil::Checksum(device, device_context, tensor);
     } else {
       checksum = GPUUtil::Checksum(tensor);
@@ -508,7 +518,8 @@ void GdrMemoryManager::TensorFromTransportOptions(
   Tensor host_copy;
 #if GOOGLE_CUDA
   if (mr == nullptr && !on_host) {
-    Allocator* alloc = GPUProcessState::singleton()->GetCUDAHostAllocator(0);
+    Allocator* alloc =
+        GPUProcessState::singleton()->GetCUDAHostAllocator(numa_node_);
     host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
     buffer = DMAHelper::buffer(&host_copy);
     addr = buffer->data();
@@ -518,8 +529,18 @@ void GdrMemoryManager::TensorFromTransportOptions(
 #endif  // GOOGLE_CUDA
 
   if (mr == nullptr) {
-    done(errors::Unavailable("Cannot find pinned memory region"));
-    return;
+    Allocator* alloc = ProcessState::singleton()->GetCPUAllocator(numa_node_);
+    host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
+
+    buffer = DMAHelper::buffer(&host_copy);
+    addr = buffer->data();
+    length = buffer->size();
+
+    mr = FindMemoryRegion(addr, length);
+    if (mr == nullptr) {
+      done(errors::Unavailable("Cannot find pinned memory region"));
+      return;
+    }
   }
 
   decltype(clients_)::iterator iter;
@@ -568,7 +589,8 @@ void GdrMemoryManager::TensorFromTransportOptions(
   }
 
 #if GOOGLE_CUDA
-  if (host_copy.NumElements() > 0) {
+  if (device->tensorflow_gpu_device_info() && !on_host &&
+      host_copy.NumElements() > 0) {
     uint64_t checksum = 0;
     if (VLOG_IS_ON(2)) {
       checksum = GPUUtil::Checksum(host_copy);
@@ -598,6 +620,12 @@ void GdrMemoryManager::TensorFromTransportOptions(
   }
 #endif  // GOOGLE_CUDA
 
+  if ((on_host || !device->tensorflow_gpu_device_info()) &&
+      host_copy.NumElements() > 0) {
+    std::memcpy(DMAHelper::buffer(tensor)->data(), addr, length);
+    VLOG(2) << "Copying " << length << " bytes unpinned tensor buffer";
+  }
+
   uint64_t end = Env::Default()->NowMicros();
 
   VLOG(2) << "RDMA from remote memory region " << remote_mr.rkey()
@@ -607,7 +635,7 @@ void GdrMemoryManager::TensorFromTransportOptions(
   uint64_t checksum = 0;
   if (VLOG_IS_ON(2)) {
 #ifdef GOOGLE_CUDA
-    if (device->tensorflow_gpu_device_info() && (!on_host)) {
+    if (device->tensorflow_gpu_device_info() && !on_host) {
       checksum = GPUUtil::Checksum(device, device_context, *tensor);
     } else {
       checksum = GPUUtil::Checksum(*tensor);
@@ -668,7 +696,8 @@ ibv_mr* GdrMemoryManager::FindMemoryRegion(void* addr, size_t length) {
   }
 }
 
-void GdrMemoryManager::InsertMemoryRegion(void* addr, size_t length) {
+void GdrMemoryManager::InsertMemoryRegion(void* addr, size_t length,
+                                          const std::string& allocator_name) {
   if (length == 0) return;
   ibv_mr* mr = rdma_reg_read(listening_.get(), addr, length);
   if (mr != nullptr) {
@@ -676,7 +705,8 @@ void GdrMemoryManager::InsertMemoryRegion(void* addr, size_t length) {
     auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
     mrs_.insert(iter, {mr, &MRDeleter});
   } else {
-    LOG(WARNING) << "Cannot register memory region";
+    LOG(WARNING) << "Cannot register memory region allocated by "
+                 << allocator_name;
   }
 }
 
-- 
GitLab


From eb6c1bdcbf6093888f2b443fdb49f836f3352316 Mon Sep 17 00:00:00 2001
From: Joe Yearsley <josephelliotyearsley@gmail.com>
Date: Tue, 13 Mar 2018 07:23:18 +0000
Subject: [PATCH 195/570] Update core.py

Added `data_format` to flatten to allow changing of it during inference time.
---
 tensorflow/python/layers/core.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 9879e5020f..5f89e3c0c3 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -268,7 +268,14 @@ def dropout(inputs,
 @tf_export('layers.Flatten')
 class Flatten(keras_layers.Flatten, base.Layer):
   """Flattens an input tensor while preserving the batch axis (axis 0).
-
+  
+  Arguments:
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+      
   Examples:
 
   ```
@@ -285,11 +292,16 @@ class Flatten(keras_layers.Flatten, base.Layer):
 
 
 @tf_export('layers.flatten')
-def flatten(inputs, name=None):
+def flatten(inputs, data_format='channels_last', name=None):
   """Flattens an input tensor while preserving the batch axis (axis 0).
 
   Arguments:
     inputs: Tensor input.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
     name: The name of the layer (string).
 
   Returns:
@@ -307,7 +319,7 @@ def flatten(inputs, name=None):
     # now `y` has shape `(None, None)`
   ```
   """
-  layer = Flatten(name=name)
+  layer = Flatten(data_format=data_format, name=name)
   return layer.apply(inputs)
 
 
-- 
GitLab


From dd928d5ae31dd0484e5e4a96c6322adecc4e511b Mon Sep 17 00:00:00 2001
From: josephyearsley <joggino23@gmail.com>
Date: Sun, 18 Mar 2018 19:24:10 +0000
Subject: [PATCH 196/570] Added Flatten Test

---
 tensorflow/python/layers/core_test.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index d26f3f4789..0d019897aa 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -476,6 +476,22 @@ class FlattenTest(test.TestCase):
     shape = core_layers.Flatten().compute_output_shape((None, 3, None))
     self.assertEqual(shape.as_list(), [None, None])
 
+  def testDataFormat(self):
+    np_input_channels_last = np.arange(3, 7).reshape([1, 2, 3, 2])
+
+    with self.test_session() as sess:
+      x = array_ops.placeholder(shape=(1, 2, 3, 2), dtype='float32')
+      y = core_layers.Flatten(data_format='channels_last')(x)
+      np_output_cl = sess.run(y, feed_dict={x: np_input_channels_last})
+
+      x = array_ops.placeholder(shape=(1, 2, 3, 2), dtype='float32')
+      y = core_layers.Flatten(data_format='channels_first')(x)
+      np_input_channels_first = np.transpose(np_input_channels_last,
+                                             [0, 3, 1, 2])
+      np_output_cf = sess.run(y, feed_dict={x: np_input_channels_first})
+
+      self.assertEqual(np_output_cl, np_output_cf)
+
   def testFunctionalFlatten(self):
     x = array_ops.placeholder(shape=(None, 2, 3), dtype='float32')
     y = core_layers.flatten(x, name='flatten')
-- 
GitLab


From 579aecd2de1f0582858f83e3c8da2a8dbb57993b Mon Sep 17 00:00:00 2001
From: josephyearsley <joggino23@gmail.com>
Date: Sun, 18 Mar 2018 20:08:59 +0000
Subject: [PATCH 197/570] added dtype to test

---
 tensorflow/python/layers/core_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index 0d019897aa..31f3a4e0b0 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -477,7 +477,7 @@ class FlattenTest(test.TestCase):
     self.assertEqual(shape.as_list(), [None, None])
 
   def testDataFormat(self):
-    np_input_channels_last = np.arange(3, 7).reshape([1, 2, 3, 2])
+    np_input_channels_last = np.arange(12, dtype='float32').reshape([1, 2, 3, 2])
 
     with self.test_session() as sess:
       x = array_ops.placeholder(shape=(1, 2, 3, 2), dtype='float32')
-- 
GitLab


From 76964f315f7c52d63ce6578d87278a96c7394ece Mon Sep 17 00:00:00 2001
From: josephyearsley <joggino23@gmail.com>
Date: Sun, 18 Mar 2018 22:01:21 +0000
Subject: [PATCH 198/570] pylint compliance

---
 tensorflow/python/layers/core.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 5f89e3c0c3..5919fa543e 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -268,14 +268,14 @@ def dropout(inputs,
 @tf_export('layers.Flatten')
 class Flatten(keras_layers.Flatten, base.Layer):
   """Flattens an input tensor while preserving the batch axis (axis 0).
-  
+
   Arguments:
     data_format: A string, one of `channels_last` (default) or `channels_first`.
       The ordering of the dimensions in the inputs.
       `channels_last` corresponds to inputs with shape
       `(batch, ..., channels)` while `channels_first` corresponds to
       inputs with shape `(batch, channels, ...)`.
-      
+
   Examples:
 
   ```
-- 
GitLab


From 110baa57112a95c2644896ce6ff75894e1ae61c7 Mon Sep 17 00:00:00 2001
From: josephyearsley <joggino23@gmail.com>
Date: Sun, 18 Mar 2018 23:10:55 +0000
Subject: [PATCH 199/570] Extended to N-dims

---
 tensorflow/python/layers/core_test.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index 31f3a4e0b0..d5b8a0ff65 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -476,15 +476,31 @@ class FlattenTest(test.TestCase):
     shape = core_layers.Flatten().compute_output_shape((None, 3, None))
     self.assertEqual(shape.as_list(), [None, None])
 
-  def testDataFormat(self):
-    np_input_channels_last = np.arange(12, dtype='float32').reshape([1, 2, 3, 2])
+  def testDataFormat5d(self):
+    np_input_channels_last = np.arange(120, dtype='float32').reshape([1, 5, 4, 3, 2])
 
     with self.test_session() as sess:
-      x = array_ops.placeholder(shape=(1, 2, 3, 2), dtype='float32')
+      x = array_ops.placeholder(shape=(1, 5, 4, 3, 2), dtype='float32')
       y = core_layers.Flatten(data_format='channels_last')(x)
       np_output_cl = sess.run(y, feed_dict={x: np_input_channels_last})
 
-      x = array_ops.placeholder(shape=(1, 2, 3, 2), dtype='float32')
+      x = array_ops.placeholder(shape=(1, 2, 5, 4, 3), dtype='float32')
+      y = core_layers.Flatten(data_format='channels_first')(x)
+      np_input_channels_first = np.transpose(np_input_channels_last,
+                                             [0, 4, 1, 2, 3])
+      np_output_cf = sess.run(y, feed_dict={x: np_input_channels_first})
+
+      self.assertEqual(np_output_cl, np_output_cf)
+
+  def testDataFormat4d(self):
+    np_input_channels_last = np.arange(24, dtype='float32').reshape([1, 4, 3, 2])
+
+    with self.test_session() as sess:
+      x = array_ops.placeholder(shape=(1, 4, 3, 2), dtype='float32')
+      y = core_layers.Flatten(data_format='channels_last')(x)
+      np_output_cl = sess.run(y, feed_dict={x: np_input_channels_last})
+
+      x = array_ops.placeholder(shape=(1, 2, 4, 3), dtype='float32')
       y = core_layers.Flatten(data_format='channels_first')(x)
       np_input_channels_first = np.transpose(np_input_channels_last,
                                              [0, 3, 1, 2])
-- 
GitLab


From 4de591a03a9bd49a05d67fe48f9358dbdac51561 Mon Sep 17 00:00:00 2001
From: josephyearsley <joggino23@gmail.com>
Date: Sat, 30 Jun 2018 08:14:40 +0100
Subject: [PATCH 200/570] Fixed Pylint Issues

---
 tensorflow/python/layers/core_test.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index d5b8a0ff65..8ad0e8c4ba 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -477,7 +477,8 @@ class FlattenTest(test.TestCase):
     self.assertEqual(shape.as_list(), [None, None])
 
   def testDataFormat5d(self):
-    np_input_channels_last = np.arange(120, dtype='float32').reshape([1, 5, 4, 3, 2])
+    np_input_channels_last = np.arange(120, dtype='float32').reshape(
+        [1, 5, 4, 3, 2])
 
     with self.test_session() as sess:
       x = array_ops.placeholder(shape=(1, 5, 4, 3, 2), dtype='float32')
@@ -493,7 +494,8 @@ class FlattenTest(test.TestCase):
       self.assertEqual(np_output_cl, np_output_cf)
 
   def testDataFormat4d(self):
-    np_input_channels_last = np.arange(24, dtype='float32').reshape([1, 4, 3, 2])
+    np_input_channels_last = np.arange(24, dtype='float32').reshape(
+        [1, 4, 3, 2])
 
     with self.test_session() as sess:
       x = array_ops.placeholder(shape=(1, 4, 3, 2), dtype='float32')
-- 
GitLab


From 46fc7a9530e9c8f6bf909de8df8c97e4b38a99a5 Mon Sep 17 00:00:00 2001
From: josephyearsley <joggino23@gmail.com>
Date: Mon, 23 Jul 2018 23:06:48 +0100
Subject: [PATCH 201/570] Fixed Tests

---
 tensorflow/python/layers/core_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index 8ad0e8c4ba..22ed75dda7 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -491,7 +491,7 @@ class FlattenTest(test.TestCase):
                                              [0, 4, 1, 2, 3])
       np_output_cf = sess.run(y, feed_dict={x: np_input_channels_first})
 
-      self.assertEqual(np_output_cl, np_output_cf)
+      self.assertAllEqual(np_output_cl, np_output_cf)
 
   def testDataFormat4d(self):
     np_input_channels_last = np.arange(24, dtype='float32').reshape(
@@ -508,7 +508,7 @@ class FlattenTest(test.TestCase):
                                              [0, 3, 1, 2])
       np_output_cf = sess.run(y, feed_dict={x: np_input_channels_first})
 
-      self.assertEqual(np_output_cl, np_output_cf)
+      self.assertAllEqual(np_output_cl, np_output_cf)
 
   def testFunctionalFlatten(self):
     x = array_ops.placeholder(shape=(None, 2, 3), dtype='float32')
-- 
GitLab


From da930ea7fd16c903346ff36f5f57548dbea98bdc Mon Sep 17 00:00:00 2001
From: josephyearsley <joggino23@gmail.com>
Date: Tue, 21 Aug 2018 08:17:29 +0100
Subject: [PATCH 202/570] Updated golden

---
 tensorflow/tools/api/golden/v1/tensorflow.pbtxt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 509ceff9df..e65ffeb12e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -832,10 +832,6 @@ tf_module {
     name: "broadcast_static_shape"
     argspec: "args=[\'shape_x\', \'shape_y\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "broadcast_to"
-    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "case"
     argspec: "args=[\'pred_fn_pairs\', \'default\', \'exclusive\', \'strict\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'False\', \'case\'], "
-- 
GitLab


From 459accb2b7bdea542415f3a744cbe9e348f847d6 Mon Sep 17 00:00:00 2001
From: josephyearsley <joggino23@gmail.com>
Date: Tue, 21 Aug 2018 21:02:13 +0100
Subject: [PATCH 203/570] Updated layers

---
 tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt
index df74c32e1f..5d9ea2e5a3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt
@@ -122,7 +122,7 @@ tf_module {
   }
   member_method {
     name: "flatten"
-    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'inputs\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'channels_last\', \'None\'], "
   }
   member_method {
     name: "max_pooling1d"
-- 
GitLab


From a58135a6a9637db0908c88f39df22b69bafaec3d Mon Sep 17 00:00:00 2001
From: Joe Yearsley <josephelliotyearsley@gmail.com>
Date: Sat, 25 Aug 2018 16:04:34 +0100
Subject: [PATCH 204/570] Updated protobuf

---
 tensorflow/tools/api/golden/v1/tensorflow.pbtxt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index e65ffeb12e..509ceff9df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -832,6 +832,10 @@ tf_module {
     name: "broadcast_static_shape"
     argspec: "args=[\'shape_x\', \'shape_y\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "broadcast_to"
+    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "case"
     argspec: "args=[\'pred_fn_pairs\', \'default\', \'exclusive\', \'strict\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'False\', \'case\'], "
-- 
GitLab


From 8e87c649fc290c758c4240bf202de0c7f0f3a4ad Mon Sep 17 00:00:00 2001
From: Joe Yearsley <josephelliotyearsley@gmail.com>
Date: Sat, 29 Sep 2018 17:38:44 +0100
Subject: [PATCH 205/570] Updated v2

---
 tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
index df74c32e1f..5fd6ba1192 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
@@ -122,8 +122,8 @@ tf_module {
   }
   member_method {
     name: "flatten"
-    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
+    argspec: "args=[\'inputs\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'channels_last\', \'None\'], "  
+}
   member_method {
     name: "max_pooling1d"
     argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
-- 
GitLab


From 32059ed204ecbee7828057d23a1c1daf561c87fd Mon Sep 17 00:00:00 2001
From: Joe Yearsley <josephelliotyearsley@gmail.com>
Date: Sat, 29 Sep 2018 17:42:52 +0100
Subject: [PATCH 206/570] Update tensorflow.layers.pbtxt

---
 tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
index 5fd6ba1192..5d9ea2e5a3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
@@ -122,8 +122,8 @@ tf_module {
   }
   member_method {
     name: "flatten"
-    argspec: "args=[\'inputs\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'channels_last\', \'None\'], "  
-}
+    argspec: "args=[\'inputs\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'channels_last\', \'None\'], "
+  }
   member_method {
     name: "max_pooling1d"
     argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
-- 
GitLab


From 70a395f9795a48c21bc35cdf1dc44778f73a7bba Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 29 Sep 2018 11:58:55 -0700
Subject: [PATCH 207/570] Automated rollback of commit
 d78595d333c9b5c8a0705ba6852c08b107d6c462

PiperOrigin-RevId: 215073584
---
 tensorflow/python/data/kernel_tests/BUILD     |  1 -
 tensorflow/tensorflow.bzl                     | 39 ++++++++-----------
 .../tools/pip_package/pip_smoke_test.py       |  2 +-
 3 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 99d7f70513..cadfe7f9e0 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -318,7 +318,6 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
     ],
     tags = [
-        "no_oss",  # TODO(b/116813115): Investigate timeout and re-enable.
         "no_windows_gpu",
     ],
 )
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index dead44c57e..cad5de1b0c 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1798,29 +1798,22 @@ def cuda_py_test(
         flaky = 0,
         xla_enabled = False,
         grpc_enabled = False):
-    if main == None:
-        main = name + ".py"
-    for config in ["cpu", "gpu"]:
-        test_name = name
-        test_tags = tags
-        if config == "gpu":
-            test_name += "_gpu"
-            test_tags = test_tags + tf_cuda_tests_tags()
-        tf_py_test(
-            name = test_name,
-            size = size,
-            srcs = srcs,
-            data = data,
-            main = main,
-            args = args,
-            tags = test_tags,
-            shard_count = shard_count,
-            additional_deps = additional_deps,
-            kernels = kernels,
-            flaky = flaky,
-            xla_enabled = xla_enabled,
-            grpc_enabled = grpc_enabled,
-        )
+    test_tags = tags + tf_cuda_tests_tags()
+    tf_py_test(
+        name = name,
+        size = size,
+        srcs = srcs,
+        data = data,
+        main = main,
+        args = args,
+        tags = test_tags,
+        shard_count = shard_count,
+        additional_deps = additional_deps,
+        kernels = kernels,
+        flaky = flaky,
+        xla_enabled = xla_enabled,
+        grpc_enabled = grpc_enabled,
+    )
 
 register_extension_info(
     extension_name = "cuda_py_test",
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index e7f9628fa6..c6ef82ccdc 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -142,7 +142,7 @@ def main():
 
   missing_dependencies = []
   # File extensions and endings to ignore
-  ignore_extensions = ["_test", "_test.py", "_test_gpu", "_test_gpu.py"]
+  ignore_extensions = ["_test", "_test.py"]
 
   ignored_files = 0
   blacklisted_files = len(BLACKLIST)
-- 
GitLab


From 639d0dd8c1ba8d2956ccb59604c157de7ba0a7f2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 29 Sep 2018 12:00:53 -0700
Subject: [PATCH 208/570] Cleanup

PiperOrigin-RevId: 215073641
---
 tensorflow/core/BUILD          | 3 ---
 tensorflow/core/profiler/BUILD | 1 -
 2 files changed, 4 deletions(-)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 7da4b9fbd0..57819cec70 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -239,7 +239,6 @@ tf_proto_library(
     srcs = [],
     cc_api_version = 2,
     default_header = True,
-    java_api_version = 2,
     js_api_version = 2,
     protodeps = [
         ":protos_all_proto",
@@ -2385,7 +2384,6 @@ tf_proto_library(
     srcs = ERROR_CODES_PROTO_SRCS,
     cc_api_version = 2,
     default_header = True,
-    java_api_version = 2,
     js_api_version = 2,
     provide_cc_alias = True,
 )
@@ -2406,7 +2404,6 @@ tf_proto_library(
     srcs = COMMON_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS,
     cc_api_version = 2,
     default_header = True,
-    java_api_version = 2,
     js_api_version = 2,
     protodeps = [
         ":error_codes_proto",
diff --git a/tensorflow/core/profiler/BUILD b/tensorflow/core/profiler/BUILD
index af034bdd7d..2bf371276e 100644
--- a/tensorflow/core/profiler/BUILD
+++ b/tensorflow/core/profiler/BUILD
@@ -40,7 +40,6 @@ tf_proto_library(
     name = "protos_all",
     srcs = glob(["**/*.proto"]),
     cc_api_version = 2,
-    java_api_version = 2,
     protodeps = tf_additional_all_protos(),
     visibility = ["//visibility:public"],
 )
-- 
GitLab


From 4cf1b45b2e9188086bcb7d12654cd3e130e9b823 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 29 Sep 2018 14:13:01 -0700
Subject: [PATCH 209/570] Disable PinToHostOptimizer for NoOp.

PiperOrigin-RevId: 215079134
---
 .../core/grappler/optimizers/pin_to_host_optimizer.cc     | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
index 2190d38937..89eb76046e 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
@@ -169,7 +169,13 @@ bool IsTPUGraphDef(const GraphDef& def) {
 }
 
 // All the nodes that should be blacklisted and not swapped.
-bool IsBlacklisted(const NodeDef& node) { return IsCollective(node); }
+bool IsBlacklisted(const NodeDef& node) {
+  return
+      // Collective ops should not be swapped.
+      IsCollective(node) ||
+      // NoOp breaks perf regression tests (probably due to group dependencies).
+      IsNoOp(node);
+}
 }  // end namespace internal
 
 Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-- 
GitLab


From 2538e68a69e585696175bd972cae119e06bde294 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 29 Sep 2018 16:13:51 -0700
Subject: [PATCH 210/570] Remove workaround for symlinked headers.

PiperOrigin-RevId: 215083669
---
 third_party/gpus/cuda_configure.bzl | 33 +++++++++--------------------
 third_party/py/python_configure.bzl |  4 ++--
 2 files changed, 12 insertions(+), 25 deletions(-)

diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index f5fdd3a75e..69f4599c16 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -1107,8 +1107,8 @@ def symlink_genrule_for_dir(
             # $(@D) will include the full path to the file.
             dest = "$(@D)/" + dest_dir + dest_files[i] if len(dest_files) != 1 else "$(@D)/" + dest_files[i]
 
-            # On Windows, symlink is not supported, so we just copy all the files.
-            cmd = "cp -f" if _is_windows(repository_ctx) else "ln -s"
+            # Copy the headers to create a sandboxable setup.
+            cmd = "cp -f"
             command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
             outs.append('        "' + dest_dir + dest_files[i] + '",')
     genrule = _genrule(
@@ -1334,27 +1334,14 @@ def _create_local_cuda_repository(repository_ctx):
         cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc"
         cuda_defines["%{host_compiler_warnings}"] = ""
 
-        # TODO(klimek): We currently need to inject "/" as builtin directory path
-        # to disable bazel's dependency checks.
-        # The problem is that:
-        # - the python rules symlink the python headers into the bazel root
-        # - the rules use 'includes' in the BUILD file to redirect includes of the
-        #   python headers through those paths
-        # - bazel currently uses -isystem for include paths specified via 'includes'
-        # - gcc follows symlinks when resolving files via -isystem paths, and puts
-        #   the resolved paths into the .d file, which makes the dependency check
-        #   fail for bazel
-        # There are multiple possible ways to solve this:
-        # 1. make bazel not use -isystem for paths specified via 'includes'
-        # 2. cp the headers instead of symlinking them
-        #
-        # Once this is fixed, the right builtin directory path is:
-        # (host_compiler_includes +
-        #    "\n  cxx_builtin_include_directory: \"%s\"" % cuda_include_path)
-        # The cuda directory needs to be passed, as there is currently no rule
-        # providing the cuda headers in the same way the python headers are
-        # provided.
-        cuda_defines["%{host_compiler_includes}"] = "\n  cxx_builtin_include_directory: \"/\""
+        # nvcc has the system include paths built in and will automatically
+        # search them; we cannot work around that, so we add the relevant cuda
+        # system paths to the allowed compiler specific include paths.
+        cuda_defines["%{host_compiler_includes}"] = (
+            host_compiler_includes + "\n" +
+            _cuda_include_path(repository_ctx, cuda_config) +
+            "\n  cxx_builtin_include_directory: \"%s\"" % cupti_header_dir +
+            "\n  cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir)
         nvcc_path = str(repository_ctx.path("%s/bin/nvcc%s" %
                                             (
                                                 cuda_config.cuda_toolkit_path,
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
index 3c7e5c8469..53264630a1 100644
--- a/third_party/py/python_configure.bzl
+++ b/third_party/py/python_configure.bzl
@@ -130,8 +130,8 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
       # If we have only one file to link we do not want to use the dest_dir, as
       # $(@D) will include the full path to the file.
       dest = '$(@D)/' + dest_dir + dest_files[i] if len(dest_files) != 1 else '$(@D)/' + dest_files[i]
-      # On Windows, symlink is not supported, so we just copy all the files.
-      cmd = 'cp -f' if _is_windows(repository_ctx) else 'ln -s'
+      # Copy the headers to create a sandboxable setup.
+      cmd = 'cp -f'
       command.append(cmd + ' "%s" "%s"' % (src_files[i] , dest))
       outs.append('        "' + dest_dir + dest_files[i] + '",')
   genrule = _genrule(src_dir, genrule_name, " && ".join(command),
-- 
GitLab


From e0da6256cd116d17057374594f2fc191cf201f42 Mon Sep 17 00:00:00 2001
From: AG Ramesh <ag.ramesh@intel.com>
Date: Sat, 29 Sep 2018 23:29:28 -0700
Subject: [PATCH 211/570] Fixed format errors reported by clang-format

---
 tensorflow/core/common_runtime/process_util.cc      | 11 ++++++-----
 tensorflow/core/common_runtime/threadpool_device.cc |  6 +++---
 tensorflow/core/util/util.cc                        |  8 ++------
 tensorflow/core/util/util.h                         |  2 +-
 4 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index b3064a4c08..4570496637 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -62,15 +62,16 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
     // Set inter_op conservatively to avoid thread oversubscription that could
     // lead to severe perf degradations and OMP resource exhaustion
     int mkl_intra_op = 1;
-  #ifdef _OPENMP
+#ifdef _OPENMP
     mkl_intra_op = omp_get_max_threads();
-  #endif  // _OPENMP
+#endif  // _OPENMP
     CHECK_GE(mkl_intra_op, 1);
     const int32 mkl_inter_op = std::max(
         (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
-    VLOG(0) << "Creating new thread pool with default inter op setting: "
-            << mkl_inter_op
-            << ". Tune using inter_op_parallelism_threads for best performance.";
+    VLOG(0)
+        << "Creating new thread pool with default inter op setting: "
+        << mkl_inter_op
+        << ". Tune using inter_op_parallelism_threads for best performance.";
     return mkl_inter_op;
   }
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index f188016610..6404d8bc6a 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -51,8 +51,7 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
       scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {
 #ifdef INTEL_MKL
   // Early return when MKL is disabled
-  if (DisableMKL())
-    return;
+  if (DisableMKL()) return;
 #ifdef _OPENMP
   const char* user_omp_threads = getenv("OMP_NUM_THREADS");
   if (user_omp_threads == nullptr) {
@@ -118,7 +117,8 @@ class MklCPUAllocatorFactory : public AllocatorFactory {
 };
 
 #ifdef ENABLE_MKL
-REGISTER_MEM_ALLOCATOR("MklCPUAllocator", (DisableMKL() ? 50 : 200), MklCPUAllocatorFactory);
+REGISTER_MEM_ALLOCATOR("MklCPUAllocator", (DisableMKL() ? 50 : 200),
+                       MklCPUAllocatorFactory);
 #endif  // ENABLE_MKL
 
 }  // namespace
diff --git a/tensorflow/core/util/util.cc b/tensorflow/core/util/util.cc
index 44d5becb9c..489999d1e8 100644
--- a/tensorflow/core/util/util.cc
+++ b/tensorflow/core/util/util.cc
@@ -122,11 +122,7 @@ string SliceDebugString(const TensorShape& shape, const int64 flat) {
 
 #ifdef INTEL_MKL
 bool DisableMKL() {
-  enum MklStatus {
-    MKL_DEFAULT = 0,
-    MKL_ON = 1,
-    MKL_OFF = 2
-  };
+  enum MklStatus { MKL_DEFAULT = 0, MKL_ON = 1, MKL_OFF = 2 };
   static MklStatus status = MKL_DEFAULT;
   if (status == MKL_DEFAULT) {
     char* tf_disable_mkl = getenv("TF_DISABLE_MKL");
@@ -139,5 +135,5 @@ bool DisableMKL() {
   }
   return status == MKL_OFF ? true : false;
 }
-#endif
+#endif  // INTEL_MKL
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/util.h b/tensorflow/core/util/util.h
index ba90ad52c2..4aa47aa48a 100644
--- a/tensorflow/core/util/util.h
+++ b/tensorflow/core/util/util.h
@@ -59,7 +59,7 @@ string SliceDebugString(const TensorShape& shape, const int64 flat);
 // disable MKL in runtime
 #ifdef INTEL_MKL
 bool DisableMKL();
-#endif
+#endif  // INTEL_MKL
 
 }  // namespace tensorflow
 
-- 
GitLab


From 2b456a2b5dc6b5bb092b3986a400acb77b21a30f Mon Sep 17 00:00:00 2001
From: "Xiaoming (Jason) Cui" <xiaoming.cui@intel.com>
Date: Sun, 30 Sep 2018 01:12:34 -0700
Subject: [PATCH 212/570] Added some minor format changes

---
 tensorflow/core/common_runtime/process_util.cc      | 6 +++---
 tensorflow/core/common_runtime/threadpool_device.cc | 6 +++---
 tensorflow/core/util/util.cc                        | 2 +-
 tensorflow/core/util/util.h                         | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index b3064a4c08..c75d8a8ce6 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -62,15 +62,15 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
     // Set inter_op conservatively to avoid thread oversubscription that could
     // lead to severe perf degradations and OMP resource exhaustion
     int mkl_intra_op = 1;
-  #ifdef _OPENMP
+#ifdef _OPENMP
     mkl_intra_op = omp_get_max_threads();
-  #endif  // _OPENMP
+#endif  // _OPENMP
     CHECK_GE(mkl_intra_op, 1);
     const int32 mkl_inter_op = std::max(
         (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
     VLOG(0) << "Creating new thread pool with default inter op setting: "
             << mkl_inter_op
-            << ". Tune using inter_op_parallelism_threads for best performance.";
+            << ".Tune using inter_op_parallelism_threads for best performance.";
     return mkl_inter_op;
   }
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index f188016610..6404d8bc6a 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -51,8 +51,7 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
       scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {
 #ifdef INTEL_MKL
   // Early return when MKL is disabled
-  if (DisableMKL())
-    return;
+  if (DisableMKL()) return;
 #ifdef _OPENMP
   const char* user_omp_threads = getenv("OMP_NUM_THREADS");
   if (user_omp_threads == nullptr) {
@@ -118,7 +117,8 @@ class MklCPUAllocatorFactory : public AllocatorFactory {
 };
 
 #ifdef ENABLE_MKL
-REGISTER_MEM_ALLOCATOR("MklCPUAllocator", (DisableMKL() ? 50 : 200), MklCPUAllocatorFactory);
+REGISTER_MEM_ALLOCATOR("MklCPUAllocator", (DisableMKL() ? 50 : 200),
+                       MklCPUAllocatorFactory);
 #endif  // ENABLE_MKL
 
 }  // namespace
diff --git a/tensorflow/core/util/util.cc b/tensorflow/core/util/util.cc
index 44d5becb9c..6e78777dd9 100644
--- a/tensorflow/core/util/util.cc
+++ b/tensorflow/core/util/util.cc
@@ -139,5 +139,5 @@ bool DisableMKL() {
   }
   return status == MKL_OFF ? true : false;
 }
-#endif
+#endif  // INTEL_MKL
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/util.h b/tensorflow/core/util/util.h
index ba90ad52c2..4aa47aa48a 100644
--- a/tensorflow/core/util/util.h
+++ b/tensorflow/core/util/util.h
@@ -59,7 +59,7 @@ string SliceDebugString(const TensorShape& shape, const int64 flat);
 // disable MKL in runtime
 #ifdef INTEL_MKL
 bool DisableMKL();
-#endif
+#endif  // INTEL_MKL
 
 }  // namespace tensorflow
 
-- 
GitLab


From a00fe72261cf6fe4a00467139e401de14c16224c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 30 Sep 2018 02:00:58 -0700
Subject: [PATCH 213/570] compat: Update forward compatibility horizon to
 2018-09-30

PiperOrigin-RevId: 215109054
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 24a795c787..1f7cfe48b3 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 9, 29)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 9, 30)
 
 
 @tf_export("compat.forward_compatible")
-- 
GitLab


From 4ecce5aa64587afe1cd07ee4c92bbb5ce2cf85df Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Sun, 30 Sep 2018 06:52:22 -0700
Subject: [PATCH 214/570] Removing the setuptools upper limit.

PiperOrigin-RevId: 215120867
---
 tensorflow/tools/pip_package/setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index b95e1f5c87..a9d8b0cff5 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -56,7 +56,6 @@ REQUIRED_PACKAGES = [
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.6.0',
-    'setuptools <= 39.1.0',
     'tensorboard >= 1.11.0, < 1.12.0',
     'termcolor >= 1.1.0',
 ]
-- 
GitLab


From 5fa4e1ac928b0512b28e955c588c5a7eab2ea046 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 30 Sep 2018 11:57:45 -0700
Subject: [PATCH 215/570] Parallel_for: fix converters for some ops that don't
 support broadcasting.

PiperOrigin-RevId: 215133508
---
 tensorflow/python/ops/parallel_for/pfor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py
index e0f6d51881..83cbe64ff2 100644
--- a/tensorflow/python/ops/parallel_for/pfor.py
+++ b/tensorflow/python/ops/parallel_for/pfor.py
@@ -1987,14 +1987,12 @@ def _convert_cast(pfor_input):
 @RegisterPForWithArgs("Pow", math_ops.pow)
 @RegisterPForWithArgs("RealDiv", math_ops.divide)
 @RegisterPForWithArgs("Real", math_ops.real)
-@RegisterPForWithArgs("ReciprocalGrad", math_ops.reciprocal_grad)
 @RegisterPForWithArgs("Reciprocal", math_ops.reciprocal)
 @RegisterPForWithArgs("Relu6", nn_ops.relu6)
 @RegisterPForWithArgs("Relu", nn_ops.relu)
 @RegisterPForWithArgs("RightShift", bitwise_ops.right_shift)
 @RegisterPForWithArgs("Rint", math_ops.rint)
 @RegisterPForWithArgs("Round", math_ops.round)
-@RegisterPForWithArgs("RsqrtGrad", math_ops.rsqrt_grad)
 @RegisterPForWithArgs("Rsqrt", math_ops.rsqrt)
 @RegisterPForWithArgs("Selu", nn_ops.selu)
 @RegisterPForWithArgs("Sigmoid", math_ops.sigmoid)
@@ -2003,7 +2001,6 @@ def _convert_cast(pfor_input):
 @RegisterPForWithArgs("Sin", math_ops.sin)
 @RegisterPForWithArgs("Softplus", nn_ops.softplus)
 @RegisterPForWithArgs("Softsign", nn_ops.softsign)
-@RegisterPForWithArgs("SqrtGrad", math_ops.sqrt_grad)
 @RegisterPForWithArgs("Sqrt", math_ops.sqrt)
 @RegisterPForWithArgs("SquaredDifference", math_ops.squared_difference)
 @RegisterPForWithArgs("Square", math_ops.square)
@@ -2095,6 +2092,9 @@ def _convert_biasaddgrad(pfor_input):
 @RegisterPForWithArgs("SoftplusGrad")
 @RegisterPForWithArgs("SoftsignGrad")
 @RegisterPForWithArgs("TanhGrad")
+@RegisterPForWithArgs("SqrtGrad")
+@RegisterPForWithArgs("RsqrtGrad")
+@RegisterPForWithArgs("ReciprocalGrad")
 def _convert_grads(pfor_input, op_type, *args, **kw_args):
   del args
   del kw_args
-- 
GitLab


From 76c4853b50f201b4a809ac66746c798e049b294c Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Sun, 30 Sep 2018 20:03:29 -0700
Subject: [PATCH 216/570] Bump the version of protobuf TF pip package depends
 on.

Fixes #21719

PiperOrigin-RevId: 215154273
---
 tensorflow/tools/pip_package/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index a9d8b0cff5..88c9c20d36 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -55,7 +55,7 @@ REQUIRED_PACKAGES = [
     'keras_preprocessing >= 1.0.3',
     'numpy >= 1.13.3',
     'six >= 1.10.0',
-    'protobuf >= 3.6.0',
+    'protobuf >= 3.6.1',
     'tensorboard >= 1.11.0, < 1.12.0',
     'termcolor >= 1.1.0',
 ]
-- 
GitLab


From b797bfb750504e03a38a988c44e3c52e902e87c4 Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Sun, 30 Sep 2018 22:34:28 -0700
Subject: [PATCH 217/570] [HloOrdering] Make parameter always defined before
 other instructions.

- Make parameter always defined before other instructions.
- Add extra indentations to the predecessor field in ToString() method to make it clear.

PiperOrigin-RevId: 215162840
---
 .../compiler/xla/service/hlo_ordering.cc      | 10 +++++++---
 .../compiler/xla/service/hlo_ordering_test.cc | 20 +++++++++++++++++++
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index f1dc08bafa..23d41d91d6 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -92,14 +92,18 @@ bool HloOrdering::ExecutesBefore(const HloInstruction* a,
 }
 
 bool HloOrdering::IsDefinedBefore(const HloValue& a, const HloValue& b) const {
-  // If 'b' is an entry param then 'a' cannot be defined before 'b' because 'b'
-  // is live into the module.
+  // Entry parameter should always be defined before other instructions.
   const HloModule* module = b.defining_instruction()->parent()->parent();
   if (b.defining_instruction()->parent() == module->entry_computation() &&
       b.defining_instruction()->opcode() == HloOpcode::kParameter) {
     return false;
   }
 
+  if (a.defining_instruction()->parent() == module->entry_computation() &&
+      a.defining_instruction()->opcode() == HloOpcode::kParameter) {
+    return true;
+  }
+
   // Phi values require special handling. Because XLA does not have a phi
   // instruction, the definition instruction of the phis values are
   // placeholders: either the subcomputation parameter (body or condition) or
@@ -316,7 +320,7 @@ string PredecessorHloOrdering::ToStringHelper(const string& name) const {
       for (auto predecessor : all) {
         if (predecessors_.at(computation)
                 ->IsReachable(predecessor, instruction)) {
-          pieces.push_back(absl::StrFormat("  %s", predecessor->name()));
+          pieces.push_back(absl::StrFormat("    %s", predecessor->name()));
         }
       }
     }
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index 00970bcda3..b045adc964 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -174,6 +174,26 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) {
   EXPECT_FALSE(ordering.ExecutesBefore(body_param, cond_param));
 }
 
+TEST_F(HloOrderingTest, ParametersDefinedBeforeOthers) {
+  // Entry parameter should always be defined before other instruction.
+  auto module = CreateNewModule();
+  const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
+  auto builder = HloComputation::Builder(TestName());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param"));
+  module->AddEntryComputation(builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(auto dataflow,
+                          HloDataflowAnalysis::Run(*module, /*ssa_form=*/true));
+
+  DependencyHloOrdering ordering(module.get());
+  EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(param),
+                                       dataflow->GetValueDefinedAt(constant)));
+  EXPECT_TRUE(!ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(constant),
+                                        dataflow->GetValueDefinedAt(param)));
+}
+
 TEST_F(HloOrderingTest, ValuesInWhileComputations) {
   // Tests the ordering of values (defined by dataflow analysis) in the body and
   // condition of a while instruction. HLO code:
-- 
GitLab


From 03c5f9cdce62f6711b91fe81505e3c085e54a771 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 02:03:50 -0700
Subject: [PATCH 218/570] compat: Update forward compatibility horizon to
 2018-10-01

PiperOrigin-RevId: 215179315
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 1f7cfe48b3..bea5aa990f 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 9, 30)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 1)
 
 
 @tf_export("compat.forward_compatible")
-- 
GitLab


From 0fd21d8c34e15bc3013e93014d101b672e1f3687 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 02:41:01 -0700
Subject: [PATCH 219/570] [TF:XLA] Teach deadness analysis more of distributive
 property.

PiperOrigin-RevId: 215183847
---
 tensorflow/compiler/jit/deadness_analysis.cc  | 107 ++++++++++++++----
 .../compiler/jit/deadness_analysis_test.cc    |  31 ++++-
 2 files changed, 112 insertions(+), 26 deletions(-)

diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index 9128b48da3..25e2e9a7af 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/jit/deadness_analysis.h"
+#include "absl/algorithm/container.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/jit/deadness_analysis_internal.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -383,6 +384,8 @@ class PredicateFactory {
   }
 
   Predicate* MakeAndOrImpl(absl::Span<Predicate* const> operands, bool is_and);
+  Predicate* MakeInternedAndOr(std::vector<Predicate*> simplified_ops,
+                               Predicate::Kind pred_kind);
 
   // Predicate instances are interned, meaning that there is only a single
   // instance of a Predicate object with a given content.  This makes checking
@@ -429,11 +432,40 @@ class PredicateFactory {
       interned_symbol_instances_;
 };
 
+Predicate* PredicateFactory::MakeInternedAndOr(
+    std::vector<Predicate*> simplified_ops, Predicate::Kind pred_kind) {
+  std::stable_sort(
+      simplified_ops.begin(), simplified_ops.end(),
+      [](Predicate* a, Predicate* b) { return a->hash() < b->hash(); });
+
+  auto it = interned_and_or_instances_.find({pred_kind, simplified_ops});
+  if (it != interned_and_or_instances_.end()) {
+    return it->second.get();
+  }
+
+  simplified_ops.shrink_to_fit();
+  // NB!  Because we'll use a non-owning reference to simplified_ops in the
+  // key for interned_and_or_instances_ we need to be careful to std::move()
+  // it all the way through.
+  absl::Span<Predicate* const> operands_slice = simplified_ops;
+  std::unique_ptr<Predicate> new_pred =
+      pred_kind == Predicate::Kind::kAnd
+          ? Make<AndPredicate>(std::move(simplified_ops))
+          : Make<OrPredicate>(std::move(simplified_ops));
+
+  Predicate* new_pred_ptr = new_pred.get();
+  interned_and_or_instances_.emplace(
+      SignatureForAndOr(pred_kind, operands_slice), std::move(new_pred));
+  return new_pred_ptr;
+}
+
 // Common code to create AndPredicate or OrPredicate instances.
 Predicate* PredicateFactory::MakeAndOrImpl(
     absl::Span<Predicate* const> operands, bool is_and) {
   Predicate::Kind pred_kind =
       is_and ? Predicate::Kind::kAnd : Predicate::Kind::kOr;
+  Predicate::Kind other_pred_kind =
+      is_and ? Predicate::Kind::kOr : Predicate::Kind::kAnd;
   gtl::FlatSet<Predicate*> simplified_ops_set;
   std::vector<Predicate*> simplified_ops;
   for (Predicate* op : operands) {
@@ -472,30 +504,63 @@ Predicate* PredicateFactory::MakeAndOrImpl(
     }
   }
 
-  std::stable_sort(
-      simplified_ops.begin(), simplified_ops.end(),
-      [](Predicate* a, Predicate* b) { return a->hash() < b->hash(); });
+  // If all ops contain the same subop, then factor it out thanks to the
+  // distributive property. Such as:
+  // - (A & B) | (A & C) | (A & D) => A & (B | C | D)
+  // - (A | B) & (A | C) & (A | D) => A | (B & C & D)
+  //
+  // First find any predicates contained in all subops.
+  std::vector<Predicate*> common_inner_operands;
+  gtl::FlatSet<Predicate*> common_inner_operands_set;
+  for (Predicate* op : simplified_ops) {
+    if (op->kind() != other_pred_kind) {
+      common_inner_operands.clear();
+      break;
+    }
 
-  auto it = interned_and_or_instances_.find({pred_kind, simplified_ops});
-  if (it == interned_and_or_instances_.end()) {
-    simplified_ops.shrink_to_fit();
-    // NB!  Because we'll use a non-owning reference to simplified_ops in the
-    // key for interned_and_or_instances_ we need to be careful to std::move()
-    // it all the way through.
-    absl::Span<Predicate* const> operands_slice = simplified_ops;
-    std::unique_ptr<Predicate> new_pred =
-        is_and ? Make<AndPredicate>(std::move(simplified_ops))
-               : Make<OrPredicate>(std::move(simplified_ops));
+    if (common_inner_operands.empty()) {
+      common_inner_operands.insert(common_inner_operands.end(),
+                                   op->GetOperands().begin(),
+                                   op->GetOperands().end());
+    } else {
+      std::vector<Predicate*> sub_ops_intersection;
+      common_inner_operands.clear();
+      absl::c_copy_if(op->GetOperands(),
+                      std::back_inserter(common_inner_operands),
+                      [&](Predicate* sub_op) {
+                        return common_inner_operands_set.count(sub_op) == 1;
+                      });
+    }
+    if (common_inner_operands.empty()) break;
+    common_inner_operands_set.clear();
+    common_inner_operands_set.insert(common_inner_operands.begin(),
+                                     common_inner_operands.end());
+  }
 
-    Predicate* new_pred_ptr = new_pred.get();
-    CHECK(interned_and_or_instances_
-              .emplace(SignatureForAndOr(pred_kind, operands_slice),
-                       std::move(new_pred))
-              .second);
-    return new_pred_ptr;
-  } else {
-    return it->second.get();
+  if (common_inner_operands.empty()) {
+    return MakeInternedAndOr(std::move(simplified_ops), pred_kind);
   }
+
+  // For all predicates that can be factored out, remove them and recreate the
+  // subops.
+  std::vector<Predicate*> factored_ops;
+  for (Predicate* op : simplified_ops) {
+    std::vector<Predicate*> new_sub_op_ops;
+    absl::c_copy_if(op->GetOperands(), std::back_inserter(new_sub_op_ops),
+                    [&](Predicate* sub_op) {
+                      return std::find(common_inner_operands.begin(),
+                                       common_inner_operands.end(),
+                                       sub_op) == common_inner_operands.end();
+                    });
+    factored_ops.push_back(MakeAndOrImpl(new_sub_op_ops, !is_and));
+  }
+
+  Predicate* new_inner_op = MakeAndOrImpl(factored_ops, is_and);
+  std::vector<Predicate*> outer_ops;
+  outer_ops.push_back(new_inner_op);
+  outer_ops.insert(outer_ops.end(), common_inner_operands.begin(),
+                   common_inner_operands.end());
+  return MakeAndOrImpl(outer_ops, !is_and);
 }
 
 class DeadnessAnalysisImpl : public DeadnessAnalysis {
diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc
index 28a56044d5..617e31488c 100644
--- a/tensorflow/compiler/jit/deadness_analysis_test.cc
+++ b/tensorflow/compiler/jit/deadness_analysis_test.cc
@@ -384,10 +384,31 @@ TEST(DeadnessAnalysisTest, OrOfAnd) {
   EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add2.node()));
 }
 
-TEST(DeadnessAnalysisTest, NEGATIVE_AndOrDistributive) {
-  // This demonstrates one of the weaknesses in the current approach -- since we
-  // only do some basic simplifications we can't see that "(A|B)&C" ==
-  // "(A&C)|(B&C)".
+TEST(DeadnessAnalysisTest, AndOrDistributiveSimplified) {
+  // (*A | (~*A & ((~*B & ~*A) | (~*A & *B)))) == #true
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  ops::Switch sw_0 = CreateSwitch(root, "A");
+  ops::Switch sw_1 = CreateSwitch(root, "B");
+  Output add0 =
+      ops::Add(root.WithOpName("and0"), sw_0.output_false, sw_1.output_true);
+  Output add1 =
+      ops::Add(root.WithOpName("and1"), sw_0.output_false, sw_1.output_false);
+  ops::Merge or2(root.WithOpName("or2"), {add0, add1});
+  Output add3 =
+      ops::Add(root.WithOpName("and3"), or2.output, sw_0.output_false);
+  ops::Merge or4(root.WithOpName("or4"), {add3, sw_0.output_true});
+
+  std::unique_ptr<DeadnessAnalysis> result;
+  TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
+
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+  EXPECT_EQ(predicate_map[ControlOutputFor(or4.output)], "#true");
+}
+
+TEST(DeadnessAnalysisTest, AndOrDistributive) {
+  // (A|B)&C == (A&C)|(B&C)
   Scope root = Scope::NewRootScope().ExitOnError();
 
   ops::Switch sw_0 = CreateSwitch(root, "0");
@@ -408,7 +429,7 @@ TEST(DeadnessAnalysisTest, NEGATIVE_AndOrDistributive) {
   std::unique_ptr<DeadnessAnalysis> result;
   TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
 
-  EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add2.node()));
+  EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add3.node()));
 }
 
 TEST(DeadnessAnalysisTest, Ternary) {
-- 
GitLab


From c1c63c936c4bc51b401b82fbe54ed1945f49a314 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 03:27:05 -0700
Subject: [PATCH 220/570] Moves the creation of regularizer ops in get_variable
 out of surrounding context.

This resembles the behaviour for initializer ops.

PiperOrigin-RevId: 215187942
---
 tensorflow/python/ops/variable_scope.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index af5c7d4050..5032ca79f9 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -939,7 +939,8 @@ class _VariableStore(object):
     if regularizer:
       with ops.colocate_with(v):
         with ops.name_scope(name + "/Regularizer/"):
-          loss = regularizer(v)
+          with ops.init_scope():
+            loss = regularizer(v)
         if loss is not None:
           if context.executing_eagerly():
             v_name = "v_%s" % type(v)
-- 
GitLab


From 9a169bf3ba840af8ab3caae7ea1c69c682be3ab7 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 1 Oct 2018 03:34:35 -0700
Subject: [PATCH 221/570] Add allowed optimizations to GrapplerItem.

(1) Skip UnaryOpComposition rewrite if the optimized graph needs to have a gradient registered for all nodes.

PiperOrigin-RevId: 215188461
---
 tensorflow/core/grappler/grappler_item.cc     |   1 +
 tensorflow/core/grappler/grappler_item.h      |   9 ++
 tensorflow/core/grappler/op_types.cc          |   4 +
 tensorflow/core/grappler/op_types.h           |   1 +
 tensorflow/core/grappler/optimizers/BUILD     |   2 +
 .../optimizers/arithmetic_optimizer.cc        |   4 +
 .../grappler/optimizers/meta_optimizer.cc     |  19 +++
 .../optimizers/meta_optimizer_test.cc         | 126 ++++++++++++++++++
 8 files changed, 166 insertions(+)

diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index bbc0fedd22..2c490f3966 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -38,6 +38,7 @@ GrapplerItem::GrapplerItem(const GrapplerItem& other, GraphDef* graph_def) {
   restore_op = other.restore_op;
   save_restore_loc_tensor = other.save_restore_loc_tensor;
   queue_runners = other.queue_runners;
+  allowed_optimizations = other.allowed_optimizations;
   graph.Swap(graph_def);
 }
 
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index 939e5fa046..a0748abfe6 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -77,6 +77,15 @@ struct GrapplerItem {
   // Return a set of node names that must be preserved. This includes feed and
   // fetch nodes, keep_ops, init_ops.
   std::unordered_set<string> NodesToPreserve() const;
+
+  // Restrict types of optimizations that are allowed for this GrapplerItem.
+  struct AllowedOptimizations {
+    // Is it allowed to add nodes to the graph that do not have registered
+    // gradient function.
+    bool non_differentiable_rewrites = true;
+  };
+
+  AllowedOptimizations allowed_optimizations;
 };
 
 // Return the transitive fanin of a set of terminal nodes.
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 3521669b63..9f0d9dbf28 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -425,6 +425,10 @@ bool IsSwitch(const NodeDef& node) {
   return op == "Switch" || op == "RefSwitch";
 }
 
+bool IsSymbolicGradient(const NodeDef& node) {
+  return node.op() == "SymbolicGradient";
+}
+
 bool IsTanhGrad(const NodeDef& node) { return node.op() == "TanhGrad"; }
 
 bool IsTile(const NodeDef& node) { return node.op() == "Tile"; }
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 25ab6b65ac..7f86a5f295 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -149,6 +149,7 @@ bool IsStridedSliceGrad(const NodeDef& node);
 bool IsSub(const NodeDef& node);
 bool IsSum(const NodeDef& node);
 bool IsSwitch(const NodeDef& node);
+bool IsSymbolicGradient(const NodeDef& node);
 bool IsTanhGrad(const NodeDef& node);
 bool IsTile(const NodeDef& node);
 bool IsTranspose(const NodeDef& node);
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 960d1addb3..c708f84948 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -525,6 +525,7 @@ cc_library(
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/utils:colocation",
@@ -541,6 +542,7 @@ tf_cuda_cc_test(
         ":custom_graph_optimizer_registry",
         ":meta_optimizer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 3388ee8035..7d5014ee0a 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -3249,6 +3249,10 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
   optimized_graph_ = &optimized_item.graph;
   node_map_.reset(new NodeMap(optimized_graph_));
 
+  // Disable restricted graph rewrites.
+  options_.unary_ops_composition &=
+      item.allowed_optimizations.non_differentiable_rewrites;
+
   if (options_.dedup_computations) {
     DedupComputations();
   }
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 406c1b60ce..a5f851fb1a 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
@@ -413,6 +414,15 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   FunctionLibraryDefinition flib(OpRegistry::Global(),
                                  optimized_graph->library());
 
+  // Find functions for which we might need to compute a gradient at runtime.
+  gtl::FlatSet<string> differentiable_functions;
+  for (const NodeDef& node : optimized_graph->node()) {
+    if (IsSymbolicGradient(node)) {
+      const auto* f_attr = gtl::FindOrNull(node.attr(), "f");
+      if (f_attr) differentiable_functions.insert(f_attr->func().name());
+    }
+  }
+
   // Optimize each function only once.
   std::unordered_set<string> optimized_funcs;
   bool optimize_function_library = true;
@@ -428,6 +438,8 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
       // Skip parametrized functions (function type or body is defined only at
       // function call time by caller node attributes).
+      // They should be specialized to their instantiation type parameters by
+      // the function optimizer, before we can optimize function body.
       if (IsParametrized(func)) continue;
 
       VLOG(3) << "Optimize function: function=" << func_name;
@@ -442,6 +454,13 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
           func, flib, item.graph.versions().producer(), &func_item));
 
+      // If we need to compute the gradient of optimized function at runtime, we
+      // can't perform non-differentiable rewrites.
+      if (differentiable_functions.find(func_name) !=
+          differentiable_functions.end()) {
+        func_item.allowed_optimizations.non_differentiable_rewrites = false;
+      }
+
       // Optimize function body graph.
       GraphDef optimized_func_graph;
       TF_RETURN_IF_ERROR(
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index c477c4d4b1..3f3f43382f 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -82,6 +83,48 @@ class TestOptimizerWithParams : public TestOptimizer {
 
 REGISTER_GRAPH_OPTIMIZER(TestOptimizerWithParams);
 
+// Record various properties of the GrapplerItems passed for optimization.
+class GrapplerItemPropertiesAccumulator : public CustomGraphOptimizer {
+ public:
+  static void SetAllowedOptimizations(
+      gtl::FlatMap<string, GrapplerItem::AllowedOptimizations>*
+          allowed_optimizations) {
+    allowed_optimizations_ = allowed_optimizations;
+  }
+  static void ResetAllowedOptimizations() { allowed_optimizations_ = nullptr; }
+
+  GrapplerItemPropertiesAccumulator() {}
+  string name() const override {
+    return "grappler_item_properties_accumulator";
+  }
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+    return Status::OK();
+  }
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* optimized_graph) override {
+    *optimized_graph = item.graph;
+    if (allowed_optimizations_) {
+      allowed_optimizations_->insert({item.id, item.allowed_optimizations});
+    }
+    return Status::OK();
+  }
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimized_graph, double result) override {}
+
+ private:
+  static gtl::FlatMap<string, GrapplerItem::AllowedOptimizations>*
+      allowed_optimizations_;
+};
+
+gtl::FlatMap<string, GrapplerItem::AllowedOptimizations>*
+    GrapplerItemPropertiesAccumulator::allowed_optimizations_;
+
+REGISTER_GRAPH_OPTIMIZER(GrapplerItemPropertiesAccumulator);
+
 class MetaOptimizerTest : public GrapplerTest {};
 
 TEST_F(MetaOptimizerTest, RunsCustomOptimizer) {
@@ -335,6 +378,89 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   test::ExpectTensorEqual<int>(tensors_expected[1], tensors[1]);
 }
 
+TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  // We will record what type of optimizations meta optimizer allows for each
+  // GrapplerItem (main graph and graphs for each function).
+  gtl::FlatMap<string, GrapplerItem::AllowedOptimizations>
+      allowed_optimizations;
+  GrapplerItemPropertiesAccumulator::SetAllowedOptimizations(
+      &allowed_optimizations);
+
+  // Just record properties of optimized Grappler items.
+  RewriterConfig rewriter_config;
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+  rewriter_config.add_optimizers("GrapplerItemPropertiesAccumulator");
+  rewriter_config.set_min_graph_nodes(-1);
+
+  MetaOptimizer optimizer(nullptr, rewriter_config);
+
+  // Define simple function library with two identical mul functions.
+  FunctionDef mul_func_1 = FunctionDefHelper::Create(
+      "MyMul1", {"x:float", "y:float"}, {"z:float"}, {},
+      {{{"mul"}, "Mul", {"x", "y"}, {}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  FunctionDef mul_func_2 = FunctionDefHelper::Create(
+      "MyMul2", {"x:float", "y:float"}, {"z:float"}, {},
+      {{{"mul"}, "Mul", {"x", "y"}, {}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // Tensorflow graph:
+  //
+  //   x0 = tf.Placeholder(tf.float);
+  //   x1 = tf.Placeholder(tf.float);
+  //   dy = tf.Placeholder(tf.float);
+  //
+  //   mul_1 = MyMul1(x0, x1);
+  //   mul_2 = MyMul2(x0, x1);
+  //   dx = SymbolicGradient({x0, x1, dy}, f=MyMul2)
+  GrapplerItem item;
+  item.id = "main";
+  item.graph = test::function::GDef(
+      {NDef("x0", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("dy", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       // Calls into function library
+       NDef("mul_1", "MyMul1", {"x0", "x1"}, {}, kDevice),
+       NDef("mul_2", "MyMul2", {"x0", "x1"}, {}, kDevice),
+       // Symbolic gradient of a MyMul2
+       NDef("dx", "SymbolicGradient", {"x0", "x1", "dy"},
+            {{"f", FDH::FunctionRef("MyMul2", {})},
+             {"Tin", DataTypeSlice{DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT}}},
+            kDevice)},
+      // FunctionLib
+      {mul_func_1, mul_func_2});
+  item.fetch = {"mul_1", "mul_2", "dx"};
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  // Our custom optimizer must be called for the main graph and for the two
+  // functions.
+  ASSERT_EQ(allowed_optimizations.size(), 3);
+
+  auto allowed_optimizations_main =
+      gtl::FindOrNull(allowed_optimizations, "main");
+  ASSERT_NE(allowed_optimizations_main, nullptr);
+  EXPECT_TRUE(allowed_optimizations_main->non_differentiable_rewrites);
+
+  auto allowed_optimizations_my_mul_1 =
+      gtl::FindOrNull(allowed_optimizations, "MyMul1");
+  ASSERT_NE(allowed_optimizations_my_mul_1, nullptr);
+  EXPECT_TRUE(allowed_optimizations_my_mul_1->non_differentiable_rewrites);
+
+  auto allowed_optimizations_my_mul_2 =
+      gtl::FindOrNull(allowed_optimizations, "MyMul2");
+  ASSERT_NE(allowed_optimizations_my_mul_2, nullptr);
+  EXPECT_FALSE(allowed_optimizations_my_mul_2->non_differentiable_rewrites);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
-- 
GitLab


From b73c5f80926de3b724a92a57cf0bc49aa7de37bd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 05:50:51 -0700
Subject: [PATCH 222/570] Automated rollback of commit
 3f4423fad57694bc8d7adc427d65e5a18c8592b2

PiperOrigin-RevId: 215200418
---
 .../contrib/tpu/ops/tpu_embedding_ops.cc      | 42 ++++++++++++++++---
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
index 6b0730b40c..5c27d59f82 100644
--- a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
+++ b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
@@ -103,10 +103,19 @@ Status RegisterPerTableLoadOpsForAlgorithmBody(
       arg->set_type(DT_FLOAT);
     }
   }
+  {
+    auto* table_id_attr = op_def->add_attr();
+    table_id_attr->set_name("table_id");
+    table_id_attr->set_type("int");
+    table_id_attr->set_has_minimum(true);
+    table_id_attr->set_minimum(-1);
+    table_id_attr->mutable_default_value()->set_i(-1);
+  }
   {
     auto* table_name_attr = op_def->add_attr();
     table_name_attr->set_name("table_name");
     table_name_attr->set_type("string");
+    table_name_attr->mutable_default_value()->set_s("");
   }
   {
     auto* num_shards_attr = op_def->add_attr();
@@ -138,9 +147,11 @@ parameters that are loaded from a checkpoint before a training loop is
 executed.
 %s
 table_name: Name of this table; must match a name in the
-  EmbeddingLayerConfiguration proto.
+  EmbeddingLayerConfiguration proto (overrides table_id).
 num_shards: Number of shards into which the embedding tables are divided.
 shard_id: Identifier of shard for this operation.
+table_id: Index of this table in the EmbeddingLayerConfiguration proto
+  (deprecated).
 )doc",
                                           parameter_descriptions.c_str()));
   op_def->set_is_commutative(false);
@@ -149,10 +160,14 @@ shard_id: Identifier of shard for this operation.
   auto shape_inference_function =
       [state_variable_specs,
        is_debug_op](shape_inference::InferenceContext* c) -> Status {
+    int table_id;
+    TF_RETURN_IF_ERROR(c->GetAttr("table_id", &table_id));
     string table_name;
     TF_RETURN_IF_ERROR(c->GetAttr("table_name", &table_name));
-    if (table_name.empty()) {
-      return errors::InvalidArgument("table_name attribute must be set");
+    // Exactly one must be non-default.
+    if ((table_id >= 0) == (!table_name.empty())) {
+      return errors::InvalidArgument(
+          "exactly one of table_id or table_name must be non-default");
     }
     int num_shards;
     TF_RETURN_IF_ERROR(c->GetAttr("num_shards", &num_shards));
@@ -225,10 +240,19 @@ Status RegisterPerTableRetrieveOpsForAlgorithmBody(
       arg->set_type(DT_FLOAT);
     }
   }
+  {
+    auto* table_id_attr = op_def->add_attr();
+    table_id_attr->set_name("table_id");
+    table_id_attr->set_type("int");
+    table_id_attr->set_has_minimum(true);
+    table_id_attr->set_minimum(-1);
+    table_id_attr->mutable_default_value()->set_i(-1);
+  }
   {
     auto* table_name_attr = op_def->add_attr();
     table_name_attr->set_name("table_name");
     table_name_attr->set_type("string");
+    table_name_attr->mutable_default_value()->set_s("");
   }
   {
     auto* num_shards_attr = op_def->add_attr();
@@ -259,9 +283,11 @@ the correct embedding table configuration. For example, this op is
 used to retrieve updated parameters before saving a checkpoint.
 %s
 table_name: Name of this table; must match a name in the
-  EmbeddingLayerConfiguration proto.
+  EmbeddingLayerConfiguration proto (overrides table_id).
 num_shards: Number of shards into which the embedding tables are divided.
 shard_id: Identifier of shard for this operation.
+table_id: Index of this table in the EmbeddingLayerConfiguration proto
+  (deprecated).
 )doc",
                                           parameter_descriptions.c_str()));
   op_def->set_is_commutative(false);
@@ -270,10 +296,14 @@ shard_id: Identifier of shard for this operation.
   auto shape_inference_function =
       [state_variable_specs,
        is_debug_op](shape_inference::InferenceContext* c) -> Status {
+    int table_id;
+    TF_RETURN_IF_ERROR(c->GetAttr("table_id", &table_id));
     string table_name;
     TF_RETURN_IF_ERROR(c->GetAttr("table_name", &table_name));
-    if (table_name.empty()) {
-      return errors::InvalidArgument("table_name must be non-empty");
+    // Exactly one must be non-default.
+    if ((table_id >= 0) == (!table_name.empty())) {
+      return errors::InvalidArgument(
+          "exactly one of table_id or table_name must be non-default");
     }
     int num_shards;
     TF_RETURN_IF_ERROR(c->GetAttr("num_shards", &num_shards));
-- 
GitLab


From 7c5eb354a6b5b2d5a2e27d8ce3dc4861cb51153c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 07:15:23 -0700
Subject: [PATCH 223/570] In TensorFlow configure, write the
 .tf_configure.bazelrc into the --workspace path if provided.

This allows repositories that depend on TensorFlow to execute
'bazel run @org_tensorflow//:configure -- --workspace $(pwd)'
to configure TensorFlow.
END_PUBLIC

Before this change, the .tf_configure.bazelrc ended up in the bazel exec root, and 'bazel clean' would undo the configuration.

PiperOrigin-RevId: 215209207
---
 configure.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/configure.py b/configure.py
index 0a3b9a7894..796c6231e8 100644
--- a/configure.py
+++ b/configure.py
@@ -48,10 +48,9 @@ _SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15, 16]
 
 _DEFAULT_PROMPT_ASK_ATTEMPTS = 10
 
-_TF_WORKSPACE_ROOT = os.path.abspath(os.path.dirname(__file__))
 _TF_BAZELRC_FILENAME = '.tf_configure.bazelrc'
-_TF_BAZELRC = os.path.join(_TF_WORKSPACE_ROOT, _TF_BAZELRC_FILENAME)
-_TF_WORKSPACE = os.path.join(_TF_WORKSPACE_ROOT, 'WORKSPACE')
+_TF_WORKSPACE_ROOT = ''
+_TF_BAZELRC = ''
 
 if platform.machine() == 'ppc64le':
   _DEFAULT_TENSORRT_PATH_LINUX = '/usr/lib/powerpc64le-linux-gnu/'
@@ -243,10 +242,10 @@ def setup_python(environ_cp):
     f.write('export PYTHON_BIN_PATH="%s"' % python_bin_path)
 
 
-def reset_tf_configure_bazelrc(workspace_path):
+def reset_tf_configure_bazelrc():
   """Reset file that contains customized config settings."""
   open(_TF_BAZELRC, 'w').close()
-  bazelrc_path = os.path.join(workspace_path, '.bazelrc')
+  bazelrc_path = os.path.join(_TF_WORKSPACE_ROOT, '.bazelrc')
 
   data = []
   if os.path.exists(bazelrc_path):
@@ -1469,21 +1468,27 @@ def config_info_line(name, help_text):
 
 
 def main():
+  global _TF_WORKSPACE_ROOT
+  global _TF_BAZELRC
+
   parser = argparse.ArgumentParser()
   parser.add_argument(
       '--workspace',
       type=str,
-      default=_TF_WORKSPACE_ROOT,
+      default=os.path.abspath(os.path.dirname(__file__)),
       help='The absolute path to your active Bazel workspace.')
   args = parser.parse_args()
 
+  _TF_WORKSPACE_ROOT = args.workspace
+  _TF_BAZELRC = os.path.join(_TF_WORKSPACE_ROOT, _TF_BAZELRC_FILENAME)
+
   # Make a copy of os.environ to be clear when functions and getting and setting
   # environment variables.
   environ_cp = dict(os.environ)
 
   check_bazel_version('0.15.0')
 
-  reset_tf_configure_bazelrc(args.workspace)
+  reset_tf_configure_bazelrc()
   cleanup_makefile()
   setup_python(environ_cp)
 
-- 
GitLab


From 9a2f872acd0c38d74d60e4f67701241aa1a26419 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 08:21:58 -0700
Subject: [PATCH 224/570] Move from deprecated self.test_session() to
 self.cached_session() or self.session().

* Move from self.test_session(graph=ops.Graph(), ...) to self.session(...) (semantically equivalent).
* Move from self.test_session() to self.cached_session(config=self.config) when run_in_graph_and_eager_modes(config=config) is set to be consistent between eager and non eager modes.

self.test_session() has been deprecated in 9962eb5e84b15e309410071b06c2ed2d6148ed44 as its name confuses readers of the test. Moving to cached_session() instead which is more explicit about:
* the fact that the session may be reused.
* the session is not closed even when doing a "with self.test_session()" statement.

PiperOrigin-RevId: 215216964
---
 tensorflow/contrib/distribute/python/values_test.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index ae3e134333..121d2fbb3f 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -641,7 +641,7 @@ class MirroredVariableTest(test.TestCase):
     if context.num_gpus() < 1 and context.executing_eagerly():
       self.skipTest("A GPU is not available for this test in eager mode.")
 
-    with self.test_session() as sess:
+    with self.cached_session(config=self.config) as sess:
       v, devices, mirrored = _make_mirrored()
 
       # Overwrite the initial values.
@@ -744,7 +744,7 @@ class MirroredVariableTest(test.TestCase):
     if context.num_gpus() < 1 or context.executing_eagerly():
       self.skipTest("A GPU is not available for this test or it's eager mode.")
 
-    with self.test_session(
+    with self.session(
         graph=ops.Graph()) as sess, mirrored_strategy.MirroredStrategy(
             ["/device:GPU:0"]).scope():
       with ops.device("/device:GPU:0"):
@@ -827,7 +827,7 @@ class TowerLocalVariableTest(test.TestCase):
     if context.num_gpus() < 1 and context.executing_eagerly():
       self.skipTest("A GPU is not available for this test in eager mode.")
 
-    with self.test_session() as sess:
+    with self.cached_session(config=self.config) as sess:
       v, tower_local = _make_tower_local(variable_scope.VariableAggregation.SUM)
 
       # Overwrite the initial values.
@@ -850,7 +850,7 @@ class TowerLocalVariableTest(test.TestCase):
     if context.num_gpus() < 1 and context.executing_eagerly():
       self.skipTest("A GPU is not available for this test in eager mode.")
 
-    with self.test_session() as sess:
+    with self.cached_session(config=self.config) as sess:
       v, tower_local = _make_tower_local(
           variable_scope.VariableAggregation.MEAN)
 
-- 
GitLab


From e285dea8d9626b832f34d65159639f294c2d6881 Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Mon, 1 Oct 2018 09:23:48 -0700
Subject: [PATCH 225/570] Update documentation. - Use absolute links instead of
 relative links. Relative links break when published on website. - Correct
 NNAPI abbreviation.

PiperOrigin-RevId: 215225415
---
 tensorflow/contrib/lite/g3doc/performance.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/g3doc/performance.md b/tensorflow/contrib/lite/g3doc/performance.md
index 0ae9400068..6b7943caf8 100644
--- a/tensorflow/contrib/lite/g3doc/performance.md
+++ b/tensorflow/contrib/lite/g3doc/performance.md
@@ -7,12 +7,12 @@ Mobile and embedded devices have limited computational resources and it is impor
 Some models may be too large to run on embedded devices. Instead of large models it is better to use a slightly less precise but smaller model for embedded devices. Smaller models not only use less disk space and memory but are generally faster and more energy efficient. One example of models optimized for mobile devices are [MobileNets](https://arxiv.org/abs/1704.04861), which are optimized for mobile vision applications. Tensorflow Lite [models page](models.md) lists several other models that have been optimized specifically for mobile and embedded devices.
 
 You can retrain the listed models on your own dataset by using transfer learning. Check out our transfer learning tutorial for
-[image classification] (https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0) and
+[image classification](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0) and
  [object detection](https://medium.com/tensorflow/training-and-serving-a-realtime-mobile-object-detector-in-30-minutes-with-cloud-tpus-b78971cf1193).
 
 
 ## Profile your model
-Before starting any optimization, it is a good practice to profile and benchmark your model. Tensorflow Lite [benchmarking tool](../tools/benchmark) has a built-in profiler that shows per operator profiling statistics. This can help in understanding performance bottlenecks and which operators dominate the computation time.
+Before starting any optimization, it is a good practice to profile and benchmark your model. Tensorflow Lite [benchmarking tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark) has a built-in profiler that shows per operator profiling statistics. This can help in understanding performance bottlenecks and which operators dominate the computation time.
 
 ## Profile and optimize operators in the graph
 If a particular operator appears frequently in the model and based on profiling you find the operator consuming the most amount of time, you can look into optimizing the operator.
@@ -22,7 +22,7 @@ If a particular operator appears frequently in the model and based on profiling
 If your model uses floating point weights or activations then it may be possible to reduce the size of model up to ~4x by using quantization and other model optimizations. Check out our [model optimization toolkit](https://www.tensorflow.org/performance/model_optimization) for details about optimizing your model. Fully quantized models can be remarkably power efficient as well.
 
 ## Tweak the number of threads
-Tensorflow Lite supports multi-threaded kernels for many operators. You can increase the number of threads and speed up execution of operators. Increasing the number of threads will however make your model use more resources and power. For some applications latency may be more important than energy efficiency. You can increase the number of threads by setting the number of [interpreter](../interpreter.h) threads.
+Tensorflow Lite supports multi-threaded kernels for many operators. You can increase the number of threads and speed up execution of operators. Increasing the number of threads will however make your model use more resources and power. For some applications latency may be more important than energy efficiency. You can increase the number of threads by setting the number of [interpreter](https://github.com/tensorflow/tensorflow/blob/1084594657a5d139102ac794f84d1427a710e39a/tensorflow/contrib/lite/interpreter.h#L337) threads.
 
 ## Eliminate redundant copies
 Tensorflow Lite is optimized to reduce redundant copies. The APIs allow user to [mmap a model file](https://github.com/tensorflow/tensorflow/blob/9982fd6c8831cbd2f58954f79ea71f26660393bc/tensorflow/contrib/lite/model.h#L152) and avoid copies. If your application is not careful, there can be redundant copies when feeding the input to the model and reading output from the model. Make sure to eliminate redundant copies. If you are using higher level APIs like Java API, make sure to carefully check the documentation for performance caveats. For example, the Java API is a lot faster if ByteBuffers are used as [inputs](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java#L151).
@@ -31,8 +31,8 @@ Tensorflow Lite is optimized to reduce redundant copies. The APIs allow user to
 Platform specific tools like [Android profiler](https://developer.android.com/studio/profile/android-profiler) and [Instruments](https://help.apple.com/instruments/mac/current/) provide a wealth of profiling information that can be used to debug your app. Sometimes the performance bug may be not in the model but in parts of application code that interact with the model. Make sure to familiarize yourself with platform specific profiling tools and best practices for your platform.
 
 ## Use hardware accelerators available on the device
-Tensorflow Lite is working on adding support for accelerators like GPU and provides acceleration through [NNAPI](https://developer.android.com/ndk/guides/neuralnetworks/) on Android.
-You can utilize these hardware accelerator backends to improve the speed and efficiency of your model. To enable NNAPI call [UseNNAPI](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/contrib/lite/interpreter.h#L334) on the interpreter instance.
+Tensorflow Lite is working on adding support for accelerators like GPU and provides acceleration through [Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/) on Android.
+You can utilize these hardware accelerator backends to improve the speed and efficiency of your model. To enable Neural Networks API call [UseNNAPI](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/contrib/lite/interpreter.h#L334) on the interpreter instance.
 
 ## Need more help
 The Tensorflow team is happy to help diagnose and address specific performance issues you may be facing. Please file a bug on [github](https://github.com/tensorflow/tensorflow/issues) with details of the issue.
-- 
GitLab


From 03a18ca576410d49e8f0692464e35e900a54f59f Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Mon, 1 Oct 2018 10:01:20 -0700
Subject: [PATCH 226/570] Remove outdated integration test in preparation for
 update of keras_preprocessing.

PiperOrigin-RevId: 215231309
---
 .../python/keras/preprocessing/image_test.py  | 37 -------------------
 1 file changed, 37 deletions(-)

diff --git a/tensorflow/python/keras/preprocessing/image_test.py b/tensorflow/python/keras/preprocessing/image_test.py
index 362cbc1dc9..4abaadfcd3 100644
--- a/tensorflow/python/keras/preprocessing/image_test.py
+++ b/tensorflow/python/keras/preprocessing/image_test.py
@@ -94,43 +94,6 @@ class TestImage(test.TestCase):
         self.assertEqual(x.shape[1:], images.shape[1:])
         break
 
-  def test_image_data_generator_with_validation_split(self):
-    if PIL is None:
-      return  # Skip test if PIL is not available.
-
-    for test_images in _generate_test_images():
-      img_list = []
-      for im in test_images:
-        img_list.append(keras.preprocessing.image.img_to_array(im)[None, ...])
-
-      images = np.vstack(img_list)
-      generator = keras.preprocessing.image.ImageDataGenerator(
-          validation_split=0.5)
-      seq = generator.flow(
-          images,
-          np.arange(images.shape[0]),
-          shuffle=False,
-          batch_size=3,
-          subset='validation')
-      _, y = seq[0]
-      self.assertEqual(list(y), [0, 1, 2])
-      seq = generator.flow(
-          images,
-          np.arange(images.shape[0]),
-          shuffle=False,
-          batch_size=3,
-          subset='training')
-      _, y2 = seq[0]
-      self.assertEqual(list(y2), [4, 5, 6])
-
-      with self.assertRaises(ValueError):
-        generator.flow(
-            images,
-            np.arange(images.shape[0]),
-            shuffle=False,
-            batch_size=3,
-            subset='foo')
-
   def test_image_data_generator_with_split_value_error(self):
     with self.assertRaises(ValueError):
       keras.preprocessing.image.ImageDataGenerator(validation_split=5)
-- 
GitLab


From a5fc8b064884b926ade9f7973dc096c0677a14e0 Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Mon, 1 Oct 2018 10:35:02 -0700
Subject: [PATCH 227/570] Name fusion parameters simply "param_X". Where "X" is
 the parameter number. Previously, fusion parameter names including the name
 of the original instruction which produced the value which was confusing.

PiperOrigin-RevId: 215238171
---
 .../compiler/xla/service/hlo_computation.cc   | 36 +++----------------
 .../compiler/xla/service/hlo_instructions.cc  |  3 +-
 2 files changed, 6 insertions(+), 33 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 0e5920af7a..4613d6762e 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -122,30 +122,6 @@ HloInstruction* HloComputation::AddParameter(
   return instructions_.back().get();
 }
 
-namespace {
-
-// Returns the new name for a fusion parameter when we change its number.
-//
-// Fusion parameters are named foo.param_1, bar.param_2, etc. We are
-// renumbering the parameters, so replace the final number in the name with
-// the updated value.
-string RenameFusionParameter(const string& original_name, int64 new_param_no) {
-  const string param_underscore = ".param_";
-  size_t index = original_name.rfind(param_underscore);
-  if (index == string::npos) {
-    return original_name;
-  }
-  string after_param = original_name.substr(index + param_underscore.size());
-  int64 numeric_suffix;
-  if (absl::SimpleAtoi(after_param, &numeric_suffix)) {
-    return StrCat(original_name.substr(0, index + param_underscore.size()),
-                  new_param_no);
-  }
-  return original_name;
-}
-
-}  // namespace
-
 Status HloComputation::RemoveParameter(int64 param_no) {
   CHECK_GE(param_no, 0);
   CHECK_LT(param_no, param_instructions_.size());
@@ -158,11 +134,9 @@ Status HloComputation::RemoveParameter(int64 param_no) {
 
   while (param_no < param_instructions_.size()) {
     param_instruction = param_instructions_[param_no];
-    string param_name =
-        RenameFusionParameter(param_instruction->name(), param_no);
     HloInstruction* new_instr =
         AddInstructionInternal(HloInstruction::CreateParameter(
-            param_no, param_instruction->shape(), param_name));
+            param_no, param_instruction->shape(), StrCat("param_", param_no)));
     TF_RETURN_IF_ERROR(param_instruction->ReplaceAllUsesWith(new_instr));
     param_instructions_[param_no] = new_instr;
     TF_RETURN_IF_ERROR(RemoveInstruction(param_instruction));
@@ -186,11 +160,9 @@ Status HloComputation::RemoveUnusedParameters() {
 
     if (removed > 0) {
       const int64 param_no = i - removed;
-      string param_name =
-          RenameFusionParameter(param_instruction->name(), param_no);
-      HloInstruction* new_instr =
-          AddInstructionInternal(HloInstruction::CreateParameter(
-              param_no, param_instruction->shape(), param_name));
+      HloInstruction* new_instr = AddInstructionInternal(
+          HloInstruction::CreateParameter(param_no, param_instruction->shape(),
+                                          StrCat("param_", param_no)));
       TF_RETURN_IF_ERROR(param_instruction->ReplaceAllUsesWith(new_instr));
       param_instructions_[param_no] = new_instr;
       TF_RETURN_IF_ERROR(RemoveInstruction(param_instruction));
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index cd71bc3323..ad45a82941 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -1042,7 +1042,8 @@ HloInstruction* HloFusionInstruction::AddFusionOperand(
   const int64 param_no = operand_count();
   // Name the parameter after the instruction it represents in the outer
   // (non-fusion) computation.
-  string param_name = StrCat(new_operand->name(), ".param_", param_no);
+  // string param_name = StrCat(new_operand->name(), ".param_", param_no);
+  string param_name = StrCat("param_", param_no);
   HloInstruction* fused_parameter =
       fused_instructions_computation()->AddParameter(
           HloInstruction::CreateParameter(param_no, new_operand->shape(),
-- 
GitLab


From a6478312ef296ba9684931135851e9c7bb460444 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Mon, 1 Oct 2018 10:36:07 -0700
Subject: [PATCH 228/570] Replace the tf.name_scope call with an internal
 context manager that can contain additional boilerplate later on.
 Unfortunately it could not be extended to include the error handling.

PiperOrigin-RevId: 215238369
---
 tensorflow/python/autograph/converters/BUILD  |  6 +--
 .../{name_scopes.py => function_scopes.py}    | 32 ++++++++-------
 ...scopes_test.py => function_scopes_test.py} | 40 +++++++++----------
 tensorflow/python/autograph/core/BUILD        | 12 ++++++
 .../autograph/core/converter_testing.py       |  2 +
 .../autograph/core/function_wrapping.py       | 30 ++++++++++++++
 .../autograph/core/function_wrapping_test.py  | 34 ++++++++++++++++
 .../python/autograph/impl/conversion.py       |  6 ++-
 8 files changed, 122 insertions(+), 40 deletions(-)
 rename tensorflow/python/autograph/converters/{name_scopes.py => function_scopes.py} (72%)
 rename tensorflow/python/autograph/converters/{name_scopes_test.py => function_scopes_test.py} (71%)
 create mode 100644 tensorflow/python/autograph/core/function_wrapping.py
 create mode 100644 tensorflow/python/autograph/core/function_wrapping_test.py

diff --git a/tensorflow/python/autograph/converters/BUILD b/tensorflow/python/autograph/converters/BUILD
index 7b029de8ed..f06dc78f0e 100644
--- a/tensorflow/python/autograph/converters/BUILD
+++ b/tensorflow/python/autograph/converters/BUILD
@@ -27,10 +27,10 @@ py_library(
         "decorators.py",
         "directives.py",
         "error_handlers.py",
+        "function_scopes.py",
         "list_comprehensions.py",
         "lists.py",
         "logical_expressions.py",
-        "name_scopes.py",
         "return_statements.py",
         "side_effect_guards.py",
         "slices.py",
@@ -157,8 +157,8 @@ py_test(
 )
 
 py_test(
-    name = "name_scopes_test",
-    srcs = ["name_scopes_test.py"],
+    name = "function_scopes_test",
+    srcs = ["function_scopes_test.py"],
     deps = [
         ":converters",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/autograph/converters/name_scopes.py b/tensorflow/python/autograph/converters/function_scopes.py
similarity index 72%
rename from tensorflow/python/autograph/converters/name_scopes.py
rename to tensorflow/python/autograph/converters/function_scopes.py
index a9c55ccff0..284b5b3519 100644
--- a/tensorflow/python/autograph/converters/name_scopes.py
+++ b/tensorflow/python/autograph/converters/function_scopes.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Wraps a function body with a `name_scope` of the function name."""
+"""Wraps the body of a converted function with auxiliary constructs."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,8 +24,8 @@ from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.pyct import templates
 
 
-class FunctionNameScopeTransformer(converter.Base):
-  """Wrap a function body with a `name_scope` of the function name."""
+class FunctionBodyTransformer(converter.Base):
+  """Wraps function bodies around autograph-specific boilerplate."""
 
   def _name_for_current_scope(self):
     innermost = self.enclosing_entities[-1]
@@ -49,26 +49,28 @@ class FunctionNameScopeTransformer(converter.Base):
   def visit_FunctionDef(self, node):
     node = self.generic_visit(node)
 
-    unscoped_body = []
-    scoped_body = node.body
-    if scoped_body:
-      first = scoped_body[0]
-      if isinstance(first, gast.Expr) and isinstance(first.value, gast.Str):
-        # Skip any docstring.
-        unscoped_body = scoped_body[:1]
-        scoped_body = scoped_body[1:]
+    final_body = []
+    indented_body = node.body
+    if node.body:
+      first_statement = node.body[0]
+      # Skip the docstring, if any.
+      if (isinstance(first_statement, gast.Expr) and
+          isinstance(first_statement.value, gast.Str)):
+        indented_body = indented_body[1:]
+        final_body.append(first_statement)
 
     template = """
-      with tf.name_scope(scope_name):
+      with ag__.function_scope(scope_name):
         body
     """
     scoped_body = templates.replace(
         template,
         scope_name=gast.Str(self._name_for_current_scope()),
-        body=scoped_body)
-    node.body = unscoped_body + scoped_body
+        body=indented_body)
+    final_body.extend(scoped_body)
+    node.body = final_body
     return node
 
 
 def transform(node, ctx):
-  return FunctionNameScopeTransformer(ctx).visit(node)
+  return FunctionBodyTransformer(ctx).visit(node)
diff --git a/tensorflow/python/autograph/converters/name_scopes_test.py b/tensorflow/python/autograph/converters/function_scopes_test.py
similarity index 71%
rename from tensorflow/python/autograph/converters/name_scopes_test.py
rename to tensorflow/python/autograph/converters/function_scopes_test.py
index 73933c1c4f..e5ce03a109 100644
--- a/tensorflow/python/autograph/converters/name_scopes_test.py
+++ b/tensorflow/python/autograph/converters/function_scopes_test.py
@@ -12,51 +12,51 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for for_canonicalization module."""
+"""Tests for function_scopes module."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.autograph.converters import name_scopes
+from tensorflow.python.autograph.converters import function_scopes
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
 
 
-class FunctionNameScopeTransformer(converter_testing.TestCase):
+class FunctionBodyTransformerTest(converter_testing.TestCase):
 
   def test_basic(self):
 
     def test_fn(l):
-      """This should stay here."""
+      """Docstring."""
       a = 1
       l += a
       return l
 
-    with self.converted(test_fn, name_scopes, {}, ops.name_scope) as result:
+    with self.converted(test_fn, function_scopes, {}) as result:
       result_op = result.test_fn(constant_op.constant(1))
       self.assertIn('test_fn/', result_op.op.name)
-      self.assertEqual('This should stay here.', result.test_fn.__doc__)
+      self.assertEqual('Docstring.', result.test_fn.__doc__)
 
-  def test_long_docstring(self):
+  def test_multiline_docstring(self):
 
-    def test_fn(l):
-      """Multi-line docstring.
+    tf = None
+
+    def test_fn():
+      """First sentence.
 
-      Args:
-        l: A thing.
-      Returns:
-        l
+      Second sentence.
       """
-      return l + 1
+      return tf.constant(1)
 
-    with self.converted(test_fn, name_scopes, {}, ops.name_scope) as result:
-      result_op = result.test_fn(constant_op.constant(1))
+    with self.converted(test_fn, function_scopes, {},
+                        constant_op.constant) as result:
+      result_op = result.test_fn()
       self.assertIn('test_fn/', result_op.op.name)
-      self.assertIn('Multi-line docstring.', result.test_fn.__doc__)
-      self.assertIn('Returns:', result.test_fn.__doc__)
+      self.assertIn('First sentence.', result.test_fn.__doc__)
+      self.assertIn('Second sentence.', result.test_fn.__doc__)
 
   def test_nested_functions(self):
 
@@ -68,7 +68,7 @@ class FunctionNameScopeTransformer(converter_testing.TestCase):
       l += 1
       return l, inner_fn(l)
 
-    with self.converted(test_fn, name_scopes, {}, ops.name_scope) as result:
+    with self.converted(test_fn, function_scopes, {}, ops.name_scope) as result:
       first, second = result.test_fn(constant_op.constant(1))
       self.assertIn('test_fn/', first.op.name)
       self.assertNotIn('inner_fn', first.op.name)
@@ -88,7 +88,7 @@ class FunctionNameScopeTransformer(converter_testing.TestCase):
 
     ns = {'TestClass': TestClass}
     node, ctx = self.prepare(TestClass, ns, owner_type=TestClass)
-    node = name_scopes.transform(node, ctx)
+    node = function_scopes.transform(node, ctx)
 
     with self.compiled(node, {}, ops.name_scope) as result:
       first, second = result.TestClass().test_fn(constant_op.constant(1))
diff --git a/tensorflow/python/autograph/core/BUILD b/tensorflow/python/autograph/core/BUILD
index 85fecf084d..843e381f31 100644
--- a/tensorflow/python/autograph/core/BUILD
+++ b/tensorflow/python/autograph/core/BUILD
@@ -20,11 +20,13 @@ py_library(
         "config.py",
         "converter.py",
         "errors.py",
+        "function_wrapping.py",
         "naming.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python/autograph/pyct",
         "//tensorflow/python/autograph/pyct/static_analysis",
         "//tensorflow/python/autograph/utils",
@@ -46,6 +48,16 @@ py_test(
     ],
 )
 
+py_test(
+    name = "function_wrapping_test",
+    srcs = ["function_wrapping_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":core",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "naming_test",
     srcs = ["naming_test.py"],
diff --git a/tensorflow/python/autograph/core/converter_testing.py b/tensorflow/python/autograph/core/converter_testing.py
index 7ce1b7c4c5..dc2d419d34 100644
--- a/tensorflow/python/autograph/core/converter_testing.py
+++ b/tensorflow/python/autograph/core/converter_testing.py
@@ -29,6 +29,7 @@ from tensorflow.python.autograph import utils
 from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import errors
+from tensorflow.python.autograph.core import function_wrapping
 from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import pretty_printer
@@ -112,6 +113,7 @@ class TestCase(test.TestCase):
       fake_ag.__dict__['utils'] = utils
       fake_ag.__dict__['rewrite_graph_construction_error'] = (
           errors.rewrite_graph_construction_error)
+      fake_ag.__dict__['function_scope'] = function_wrapping.function_scope
       result.__dict__['ag__'] = fake_ag
       for k, v in namespace.items():
         result.__dict__[k] = v
diff --git a/tensorflow/python/autograph/core/function_wrapping.py b/tensorflow/python/autograph/core/function_wrapping.py
new file mode 100644
index 0000000000..21b66eff02
--- /dev/null
+++ b/tensorflow/python/autograph/core/function_wrapping.py
@@ -0,0 +1,30 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Support for wrapping converted functions bodies with auxiliary logic."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+from tensorflow.python.framework import ops
+
+
+@contextlib.contextmanager
+def function_scope(function_name):
+  """Returns a context manager for the converted body of a function."""
+  with ops.name_scope(function_name):
+    yield
diff --git a/tensorflow/python/autograph/core/function_wrapping_test.py b/tensorflow/python/autograph/core/function_wrapping_test.py
new file mode 100644
index 0000000000..5e217055c7
--- /dev/null
+++ b/tensorflow/python/autograph/core/function_wrapping_test.py
@@ -0,0 +1,34 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for function_wrapping module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.autograph.core import function_wrapping
+from tensorflow.python.framework import constant_op
+from tensorflow.python.platform import test
+
+
+class FunctionWrappingTest(test.TestCase):
+
+  def test_function_scope_name(self):
+    with function_wrapping.function_scope('test_name'):
+      t = constant_op.constant(1)
+    self.assertIn('test_name', t.name)
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py
index a0d13c82a8..52abd40626 100644
--- a/tensorflow/python/autograph/impl/conversion.py
+++ b/tensorflow/python/autograph/impl/conversion.py
@@ -34,15 +34,16 @@ from tensorflow.python.autograph.converters import control_flow
 from tensorflow.python.autograph.converters import decorators
 from tensorflow.python.autograph.converters import directives
 from tensorflow.python.autograph.converters import error_handlers
+from tensorflow.python.autograph.converters import function_scopes
 from tensorflow.python.autograph.converters import lists
 from tensorflow.python.autograph.converters import logical_expressions
-from tensorflow.python.autograph.converters import name_scopes
 from tensorflow.python.autograph.converters import return_statements
 from tensorflow.python.autograph.converters import side_effect_guards
 from tensorflow.python.autograph.converters import slices
 from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import errors
+from tensorflow.python.autograph.core import function_wrapping
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import origin_info
@@ -257,6 +258,7 @@ def _add_self_references(namespace, autograph_module):
     ag_internal.converted_call = autograph_module.converted_call
     ag_internal.ConversionOptions = autograph_module.ConversionOptions
     ag_internal.utils = utils
+    ag_internal.function_scope = function_wrapping.function_scope
     ag_internal.rewrite_graph_construction_error = (
         errors.rewrite_graph_construction_error)
     # TODO(mdan): Add safeguards against name clashes.
@@ -346,7 +348,7 @@ def node_to_graph(node, context, rewrite_errors=True):
   node = converter.apply_(node, context, conditional_expressions)
   node = converter.apply_(node, context, logical_expressions)
   node = converter.apply_(node, context, side_effect_guards)
-  node = converter.apply_(node, context, name_scopes)
+  node = converter.apply_(node, context, function_scopes)
   if rewrite_errors:
     node = converter.apply_(node, context, error_handlers)
   return node
-- 
GitLab


From 57a831d20929e71279d164905fed93e1f518ee37 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 10:41:58 -0700
Subject: [PATCH 229/570] Bugfix: When a subgraph is encapsulated and replaced
 by XlaLaunch op, the requested device placement of the XlaLaunch op must be
 derived from the subgraph. PiperOrigin-RevId: 215239672

---
 tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc    | 6 ++++++
 .../compiler/jit/encapsulate_xla_computations_pass.cc    | 2 ++
 .../jit/encapsulate_xla_computations_pass_test.cc        | 9 ++++++---
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index e0632ff7e4..15faf31077 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -748,6 +748,12 @@ Node* Encapsulator::Subgraph::MakeNodeImage(const Graph* graph_in, Node* node) {
     graph_->set_versions(graph_in->versions());
   }
 
+  // TODO(b/116981129): Enhance how the device for the encapsulated subgraph is
+  // determined. In case of hard placement, ensure all the encapsulated nodes
+  // have the same requested device, which in turn will be the requested device
+  // for the entire encapsulated subgraph. In case of soft placement, use a
+  // deterministic approach to fill in the requested device. Handle co-location
+  // constraints similarly if they exist.
   if (device_.empty()) {
     device_ = node->assigned_device_name().empty()
                   ? node->requested_device()
diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
index 97ef8cd3cb..755c364c62 100644
--- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
@@ -297,7 +297,9 @@ Status RewriteSubgraph(const std::vector<OutputTensor>& arg_source_tensors,
 
     // Target the XLA CPU/GPU backends.
     VLOG(2) << "Replacing with XlaLaunch";
+    VLOG(2) << "Device is " << launch->requested_device();
     def.set_op("XlaLaunch");
+    def.set_device(launch->requested_device());
     AddNodeAttr("Tconstants", DataTypeVector{}, &def);
     AddNodeAttr("Targs", arg_types, &def);
     AddNodeAttr("Nresources", num_variables, &def);
diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc
index f643fb0cfe..479038ac8e 100644
--- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc
@@ -55,6 +55,7 @@ static std::unique_ptr<Graph> MakeOuterGraph(
           .Input(u.node()->name(), 0, DT_RESOURCE)
           .Input(v.node()->name(), 0, DT_RESOURCE)
           .Input(w.node()->name(), 0, DT_RESOURCE)
+          .Device("/gpu:0")
           .Attr(EncapsulateXlaComputationsPass::kXlaClusterAttr, "launch0")
           .Attr("_variable_start_index", 4)
           .Finalize(&def));
@@ -107,10 +108,11 @@ static std::unique_ptr<Graph> MakeBodyGraph() {
 
   auto add_attrs = [](Node* node) {
     node->AddAttr(EncapsulateXlaComputationsPass::kXlaClusterAttr, "launch0");
+    node->set_requested_device("/gpu:0");
   };
 
   auto b_identity = ops::Identity(scope.WithOpName("B_identity"), arg1);
-
+  add_attrs(b_identity.node());
   auto read_u = ops::ReadVariableOp(scope.WithOpName("ReadU"), arg4, DT_FLOAT);
   add_attrs(read_u.node());
   auto read_v = ops::ReadVariableOp(scope.WithOpName("ReadV"), arg5, DT_FLOAT);
@@ -215,6 +217,7 @@ TEST(EncapsulateXlaComputations, Encapsulate) {
 
     auto add_attrs = [](Node* node) {
       node->AddAttr(EncapsulateXlaComputationsPass::kXlaClusterAttr, "launch0");
+      node->set_requested_device("/gpu:0");
     };
 
     auto b_identity = ops::Identity(scope.WithOpName("B_identity"), b);
@@ -317,8 +320,8 @@ TEST(EncapsulateXlaComputations, BuildXlaLaunchOp) {
   NameAttrList function;
   function.set_name("launch0");
   auto launch = ops::XlaLaunch(
-      scope.WithOpName("launch0"), std::initializer_list<Input>{},
-      std::initializer_list<Input>{a, b, c, d},
+      scope.WithOpName("launch0").WithDevice("/gpu:0"),
+      std::initializer_list<Input>{}, std::initializer_list<Input>{a, b, c, d},
       std::initializer_list<Input>{u, v, w},
       DataTypeVector{DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT}, function);
 
-- 
GitLab


From ec2b5f889fb3eb677f7b8198cbd8d505b2779fa7 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 1 Oct 2018 10:42:14 -0700
Subject: [PATCH 230/570] Automated rollback of commit
 5f822d694af6e4aa57fe8a426032a91dc61e30d6

PiperOrigin-RevId: 215239710
---
 tensorflow/contrib/factorization/BUILD           |  9 +--------
 .../contrib/factorization/python/ops/gmm_ops.py  | 14 +++++++-------
 .../factorization/python/ops/wals_test.py        | 16 ++++++++--------
 tensorflow/contrib/opt/BUILD                     |  5 -----
 .../contrib/timeseries/python/timeseries/BUILD   |  7 +------
 5 files changed, 17 insertions(+), 34 deletions(-)

diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index 510f292508..e344d7a23b 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -154,8 +154,6 @@ tf_py_test(
     ],
     tags = [
         "no_pip",  # b/38283730
-        "noasan",  # b/116875897
-        "nomsan",
         "notsan",  # Flaky: b/30756419
     ],
 )
@@ -179,11 +177,7 @@ tf_py_test(
         "//tensorflow/python:random_seed",
         "//tensorflow/python:variables",
     ],
-    tags = [
-        "noasan",  # b/116875897
-        "nomsan",
-        "notsan",  # b/62863147
-    ],
+    tags = ["notsan"],  # b/62863147
 )
 
 py_library(
@@ -282,7 +276,6 @@ tf_py_test(
         "manual",
         "noasan",  # times out b/63678675
         "nomsan",
-        "notsan",  # b/116875897
     ],
 )
 
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index e076631bc1..d365ad1117 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -154,10 +154,10 @@ class GmmAlgorithm(object):
   def _create_variables(self):
     """Initializes GMM algorithm."""
     init_value = array_ops.constant([], dtype=dtypes.float32)
-    self._means = variables.Variable(init_value,
-                                     name=self.CLUSTERS_VARIABLE,
-                                     validate_shape=False)
-    self._covs = variables.Variable(
+    self._means = variables.VariableV1(init_value,
+                                       name=self.CLUSTERS_VARIABLE,
+                                       validate_shape=False)
+    self._covs = variables.VariableV1(
         init_value, name=self.CLUSTERS_COVS_VARIABLE, validate_shape=False)
     # Mixture weights, representing the probability that a randomly
     # selected unobservable data (in EM terms) was generated by component k.
@@ -165,9 +165,9 @@ class GmmAlgorithm(object):
         array_ops.tile([1.0 / self._num_classes], [self._num_classes]),
         name=self.CLUSTERS_WEIGHT,
         validate_shape=False)
-    self._cluster_centers_initialized = variables.Variable(False,
-                                                           dtype=dtypes.bool,
-                                                           name='initialized')
+    self._cluster_centers_initialized = variables.VariableV1(False,
+                                                             dtype=dtypes.bool,
+                                                             name='initialized')
 
   def _initialize_variables(self, data, initial_means=None):
     """Initializes variables.
diff --git a/tensorflow/contrib/factorization/python/ops/wals_test.py b/tensorflow/contrib/factorization/python/ops/wals_test.py
index 9bdbd05015..75d577f429 100644
--- a/tensorflow/contrib/factorization/python/ops/wals_test.py
+++ b/tensorflow/contrib/factorization/python/ops/wals_test.py
@@ -420,13 +420,13 @@ class WALSMatrixFactorizationUnsupportedTest(test.TestCase):
 class SweepHookTest(test.TestCase):
 
   def test_sweeps(self):
-    is_row_sweep_var = variables.Variable(True)
-    is_sweep_done_var = variables.Variable(False)
-    init_done = variables.Variable(False)
-    row_prep_done = variables.Variable(False)
-    col_prep_done = variables.Variable(False)
-    row_train_done = variables.Variable(False)
-    col_train_done = variables.Variable(False)
+    is_row_sweep_var = variables.VariableV1(True)
+    is_sweep_done_var = variables.VariableV1(False)
+    init_done = variables.VariableV1(False)
+    row_prep_done = variables.VariableV1(False)
+    col_prep_done = variables.VariableV1(False)
+    row_train_done = variables.VariableV1(False)
+    col_train_done = variables.VariableV1(False)
 
     init_op = state_ops.assign(init_done, True)
     row_prep_op = state_ops.assign(row_prep_done, True)
@@ -486,7 +486,7 @@ class StopAtSweepHookTest(test.TestCase):
 
   def test_stop(self):
     hook = wals_lib._StopAtSweepHook(last_sweep=10)
-    completed_sweeps = variables.Variable(
+    completed_sweeps = variables.VariableV1(
         8, name=wals_lib.WALSMatrixFactorization.COMPLETED_SWEEPS)
     train_op = state_ops.assign_add(completed_sweeps, 1)
     hook.begin()
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 6a67c6295d..f4ac70eb1a 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -377,11 +377,6 @@ py_test(
     size = "large",
     srcs = ["python/training/shampoo_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "noasan",  # b/116875897
-        "nomsan",
-        "notsan",
-    ],
     deps = [
         ":opt_py",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index cb1f707028..c230919168 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -159,12 +159,7 @@ py_test(
     ],
     shard_count = 4,
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip_gpu",  # b/63391119
-        "noasan",  # b/116875897
-        "nomsan",
-        "notsan",
-    ],
+    tags = ["no_pip_gpu"],  # b/63391119
     deps = [
         ":estimators",
         ":feature_keys",
-- 
GitLab


From ce1cdd52eda4b40ff8fb8c09bc178210883b3773 Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Mon, 1 Oct 2018 10:57:32 -0700
Subject: [PATCH 231/570] Make GCS filesystem/metadata lookup retries
 configurable

PiperOrigin-RevId: 215243030
---
 .../cloud/compute_engine_metadata_client.cc   |   15 +-
 .../cloud/compute_engine_metadata_client.h    |   10 +-
 .../compute_engine_metadata_client_test.cc    |    6 +-
 .../compute_engine_zone_provider_test.cc      |    8 +-
 .../core/platform/cloud/gcs_file_system.cc    |   25 +-
 .../core/platform/cloud/gcs_file_system.h     |    7 +-
 .../platform/cloud/gcs_file_system_test.cc    | 1286 +++++++++--------
 .../cloud/google_auth_provider_test.cc        |   20 +-
 .../platform/cloud/retrying_file_system.h     |   67 +-
 .../cloud/retrying_file_system_test.cc        |  102 +-
 .../core/platform/cloud/retrying_utils.cc     |   35 +-
 .../core/platform/cloud/retrying_utils.h      |   29 +-
 .../platform/cloud/retrying_utils_test.cc     |   32 +-
 13 files changed, 849 insertions(+), 793 deletions(-)

diff --git a/tensorflow/core/platform/cloud/compute_engine_metadata_client.cc b/tensorflow/core/platform/cloud/compute_engine_metadata_client.cc
index f41b83ac34..affb68ebbb 100644
--- a/tensorflow/core/platform/cloud/compute_engine_metadata_client.cc
+++ b/tensorflow/core/platform/cloud/compute_engine_metadata_client.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <utility>
 #include "tensorflow/core/platform/cloud/curl_http_request.h"
-#include "tensorflow/core/platform/cloud/retrying_utils.h"
 
 namespace tensorflow {
 
@@ -25,21 +24,14 @@ namespace {
 
 // The URL to retrieve metadata when running in Google Compute Engine.
 constexpr char kGceMetadataBaseUrl[] = "http://metadata/computeMetadata/v1/";
-// The default initial delay between retries with exponential backoff.
-constexpr int kInitialRetryDelayUsec = 500000;  // 0.5 sec
 
 }  // namespace
 
-ComputeEngineMetadataClient::ComputeEngineMetadataClient(
-    std::shared_ptr<HttpRequest::Factory> http_request_factory)
-    : ComputeEngineMetadataClient(std::move(http_request_factory),
-                                  kInitialRetryDelayUsec) {}
-
 ComputeEngineMetadataClient::ComputeEngineMetadataClient(
     std::shared_ptr<HttpRequest::Factory> http_request_factory,
-    int64 initial_retry_delay_usec)
+    const RetryConfig& config)
     : http_request_factory_(std::move(http_request_factory)),
-      initial_retry_delay_usec_(initial_retry_delay_usec) {}
+      retry_config_(config) {}
 
 Status ComputeEngineMetadataClient::GetMetadata(
     const string& path, std::vector<char>* response_buffer) {
@@ -52,8 +44,7 @@ Status ComputeEngineMetadataClient::GetMetadata(
     return Status::OK();
   };
 
-  return RetryingUtils::CallWithRetries(get_metadata_from_gce,
-                                        initial_retry_delay_usec_);
+  return RetryingUtils::CallWithRetries(get_metadata_from_gce, retry_config_);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/compute_engine_metadata_client.h b/tensorflow/core/platform/cloud/compute_engine_metadata_client.h
index 534ccf30b2..7f060327da 100644
--- a/tensorflow/core/platform/cloud/compute_engine_metadata_client.h
+++ b/tensorflow/core/platform/cloud/compute_engine_metadata_client.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/cloud/http_request.h"
+#include "tensorflow/core/platform/cloud/retrying_utils.h"
 
 namespace tensorflow {
 
@@ -31,10 +32,11 @@ namespace tensorflow {
 class ComputeEngineMetadataClient {
  public:
   explicit ComputeEngineMetadataClient(
-      std::shared_ptr<HttpRequest::Factory> http_request_factory);
-  ComputeEngineMetadataClient(
       std::shared_ptr<HttpRequest::Factory> http_request_factory,
-      int64 initial_retry_delay_usec);
+      const RetryConfig& config = RetryConfig(
+          10000,  /* init_delay_time_us = 1 ms */
+          1000000 /* max_delay_time_us = 1 s */
+          ));
   virtual ~ComputeEngineMetadataClient() {}
 
   /// \brief Get the metadata value for a given attribute of the metadata
@@ -54,7 +56,7 @@ class ComputeEngineMetadataClient {
 
  private:
   std::shared_ptr<HttpRequest::Factory> http_request_factory_;
-  const int64 initial_retry_delay_usec_;
+  const RetryConfig retry_config_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(ComputeEngineMetadataClient);
 };
diff --git a/tensorflow/core/platform/cloud/compute_engine_metadata_client_test.cc b/tensorflow/core/platform/cloud/compute_engine_metadata_client_test.cc
index 4c41ccaa0e..e891b4a5e9 100644
--- a/tensorflow/core/platform/cloud/compute_engine_metadata_client_test.cc
+++ b/tensorflow/core/platform/cloud/compute_engine_metadata_client_test.cc
@@ -30,7 +30,8 @@ TEST(ComputeEngineMetadataClientTest, GetMetadata) {
 
   std::shared_ptr<HttpRequest::Factory> http_factory =
       std::make_shared<FakeHttpRequestFactory>(&requests);
-  ComputeEngineMetadataClient client(http_factory, 0);
+  ComputeEngineMetadataClient client(http_factory,
+                                     RetryConfig(0 /* init_delay_time_us */));
 
   std::vector<char> result;
   TF_EXPECT_OK(
@@ -56,7 +57,8 @@ TEST(ComputeEngineMetadataClientTest, RetryOnFailure) {
 
   std::shared_ptr<HttpRequest::Factory> http_factory =
       std::make_shared<FakeHttpRequestFactory>(&requests);
-  ComputeEngineMetadataClient client(http_factory, 0);
+  ComputeEngineMetadataClient client(http_factory,
+                                     RetryConfig(0 /* init_delay_time_us */));
 
   std::vector<char> result;
   TF_EXPECT_OK(
diff --git a/tensorflow/core/platform/cloud/compute_engine_zone_provider_test.cc b/tensorflow/core/platform/cloud/compute_engine_zone_provider_test.cc
index f7477eca23..476e4f9c1f 100644
--- a/tensorflow/core/platform/cloud/compute_engine_zone_provider_test.cc
+++ b/tensorflow/core/platform/cloud/compute_engine_zone_provider_test.cc
@@ -34,8 +34,8 @@ TEST_F(ComputeEngineZoneProviderTest, GetZone) {
 
   auto httpRequestFactory = std::make_shared<FakeHttpRequestFactory>(&requests);
 
-  auto metadata_client =
-      std::make_shared<ComputeEngineMetadataClient>(httpRequestFactory, 0);
+  auto metadata_client = std::make_shared<ComputeEngineMetadataClient>(
+      httpRequestFactory, RetryConfig(0 /* init_delay_time_us */));
 
   ComputeEngineZoneProvider provider(metadata_client);
 
@@ -55,8 +55,8 @@ TEST_F(ComputeEngineZoneProviderTest, InvalidZoneString) {
 
   auto httpRequestFactory = std::make_shared<FakeHttpRequestFactory>(&requests);
 
-  auto metadata_client =
-      std::make_shared<ComputeEngineMetadataClient>(httpRequestFactory, 0);
+  auto metadata_client = std::make_shared<ComputeEngineMetadataClient>(
+      httpRequestFactory, RetryConfig(0 /* init_delay_time_us */));
 
   ComputeEngineZoneProvider provider(metadata_client);
 
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 83ea8539ed..c61b68aeeb 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -333,14 +333,14 @@ class GcsWritableFile : public WritableFile {
                   GcsFileSystem* filesystem,
                   GcsFileSystem::TimeoutConfig* timeouts,
                   std::function<void()> file_cache_erase,
-                  int64 initial_retry_delay_usec)
+                  RetryConfig retry_config)
       : bucket_(bucket),
         object_(object),
         filesystem_(filesystem),
         timeouts_(timeouts),
         file_cache_erase_(std::move(file_cache_erase)),
         sync_needed_(true),
-        initial_retry_delay_usec_(initial_retry_delay_usec) {
+        retry_config_(retry_config) {
     // TODO: to make it safer, outfile_ should be constructed from an FD
     if (GetTmpFilename(&tmp_content_filename_).ok()) {
       outfile_.open(tmp_content_filename_,
@@ -357,14 +357,14 @@ class GcsWritableFile : public WritableFile {
                   GcsFileSystem* filesystem, const string& tmp_content_filename,
                   GcsFileSystem::TimeoutConfig* timeouts,
                   std::function<void()> file_cache_erase,
-                  int64 initial_retry_delay_usec)
+                  RetryConfig retry_config)
       : bucket_(bucket),
         object_(object),
         filesystem_(filesystem),
         timeouts_(timeouts),
         file_cache_erase_(std::move(file_cache_erase)),
         sync_needed_(true),
-        initial_retry_delay_usec_(initial_retry_delay_usec) {
+        retry_config_(retry_config) {
     tmp_content_filename_ = tmp_content_filename;
     outfile_.open(tmp_content_filename_,
                   std::ofstream::binary | std::ofstream::app);
@@ -441,7 +441,7 @@ class GcsWritableFile : public WritableFile {
           first_attempt = false;
           return UploadToSession(session_uri, already_uploaded);
         },
-        initial_retry_delay_usec_);
+        retry_config_);
     if (upload_status.code() == errors::Code::NOT_FOUND) {
       // GCS docs recommend retrying the whole upload. We're relying on the
       // RetryingFileSystem to retry the Sync() call.
@@ -586,7 +586,7 @@ class GcsWritableFile : public WritableFile {
   GcsFileSystem::TimeoutConfig* timeouts_;
   std::function<void()> file_cache_erase_;
   bool sync_needed_;  // whether there is buffered data that needs to be synced
-  int64 initial_retry_delay_usec_;
+  RetryConfig retry_config_;
 };
 
 class GcsReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
@@ -791,7 +791,7 @@ GcsFileSystem::GcsFileSystem(
     std::unique_ptr<ZoneProvider> zone_provider, size_t block_size,
     size_t max_bytes, uint64 max_staleness, uint64 stat_cache_max_age,
     size_t stat_cache_max_entries, uint64 matching_paths_cache_max_age,
-    size_t matching_paths_cache_max_entries, int64 initial_retry_delay_usec,
+    size_t matching_paths_cache_max_entries, RetryConfig retry_config,
     TimeoutConfig timeouts, const std::unordered_set<string>& allowed_locations,
     std::pair<const string, const string>* additional_header)
     : auth_provider_(std::move(auth_provider)),
@@ -806,7 +806,7 @@ GcsFileSystem::GcsFileSystem(
           kCacheNeverExpire, kBucketLocationCacheMaxEntries)),
       allowed_locations_(allowed_locations),
       timeouts_(timeouts),
-      initial_retry_delay_usec_(initial_retry_delay_usec),
+      retry_config_(retry_config),
       additional_header_(additional_header) {}
 
 Status GcsFileSystem::NewRandomAccessFile(
@@ -941,7 +941,7 @@ Status GcsFileSystem::NewWritableFile(const string& fname,
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
   result->reset(new GcsWritableFile(bucket, object, this, &timeouts_,
                                     [this, fname]() { ClearFileCaches(fname); },
-                                    initial_retry_delay_usec_));
+                                    retry_config_));
   return Status::OK();
 }
 
@@ -981,7 +981,7 @@ Status GcsFileSystem::NewAppendableFile(const string& fname,
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
   result->reset(new GcsWritableFile(
       bucket, object, this, old_content_filename, &timeouts_,
-      [this, fname]() { ClearFileCaches(fname); }, initial_retry_delay_usec_));
+      [this, fname]() { ClearFileCaches(fname); }, retry_config_));
   return Status::OK();
 }
 
@@ -1534,7 +1534,7 @@ Status GcsFileSystem::RenameObject(const string& src, const string& target) {
   // on the server side, we can't just retry the whole RenameFile operation
   // because the source object is already gone.
   return RetryingUtils::DeleteWithRetries(
-      [this, &src]() { return DeleteFile(src); }, initial_retry_delay_usec_);
+      [this, &src]() { return DeleteFile(src); }, retry_config_);
 }
 
 Status GcsFileSystem::IsDirectory(const string& fname) {
@@ -1590,8 +1590,7 @@ Status GcsFileSystem::DeleteRecursively(const string& dirname,
     // and therefore RetryingFileSystem won't pay attention to the failures,
     // we need to make sure these failures are properly retried.
     const auto& delete_file_status = RetryingUtils::DeleteWithRetries(
-        [this, &full_path]() { return DeleteFile(full_path); },
-        initial_retry_delay_usec_);
+        [this, &full_path]() { return DeleteFile(full_path); }, retry_config_);
     if (!delete_file_status.ok()) {
       if (IsDirectory(full_path).ok()) {
         // The object is a directory marker.
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index 71db707687..d0840a3046 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -93,7 +93,7 @@ class GcsFileSystem : public FileSystem {
                 uint64 stat_cache_max_age, size_t stat_cache_max_entries,
                 uint64 matching_paths_cache_max_age,
                 size_t matching_paths_cache_max_entries,
-                int64 initial_retry_delay_usec, TimeoutConfig timeouts,
+                RetryConfig retry_config, TimeoutConfig timeouts,
                 const std::unordered_set<string>& allowed_locations,
                 std::pair<const string, const string>* additional_header);
 
@@ -332,7 +332,7 @@ class GcsFileSystem : public FileSystem {
   GcsStatsInterface* stats_ = nullptr;  // Not owned.
 
   /// The initial delay for exponential backoffs when retrying failed calls.
-  const int64 initial_retry_delay_usec_ = 1000000L;
+  RetryConfig retry_config_;
 
   // Additional header material to be transmitted with all GCS requests
   std::unique_ptr<std::pair<const string, const string>> additional_header_;
@@ -344,7 +344,8 @@ class GcsFileSystem : public FileSystem {
 class RetryingGcsFileSystem : public RetryingFileSystem<GcsFileSystem> {
  public:
   RetryingGcsFileSystem()
-      : RetryingFileSystem(std::unique_ptr<GcsFileSystem>(new GcsFileSystem)) {}
+      : RetryingFileSystem(std::unique_ptr<GcsFileSystem>(new GcsFileSystem),
+                           RetryConfig(100000 /* init_delay_time_us */)) {}
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 14376ad339..702802b185 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -24,6 +24,8 @@ namespace tensorflow {
 namespace {
 
 static GcsFileSystem::TimeoutConfig kTestTimeoutConfig(5, 1, 10, 20, 30);
+static RetryConfig kTestRetryConfig(0 /* init_delay_time_us */);
+
 // Default (empty) constraint config
 static std::unordered_set<string>* kAllowedLocationsDefault =
     new std::unordered_set<string>();
@@ -62,16 +64,16 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache) {
            "Range: 6-11\n"
            "Timeouts: 5 1 20\n",
            "6789")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -108,9 +110,9 @@ TEST(GcsFileSystemTest,
                    0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */, kTestTimeoutConfig,
-                   *kAllowedLocationsAuto, nullptr /* gcs additional header */);
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsAuto,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -150,9 +152,9 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithLocationConstraintCaching) {
                    0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */, kTestTimeoutConfig,
-                   *kAllowedLocationsAuto, nullptr /* gcs additional header */);
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsAuto,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<RandomAccessFile> file;
 
@@ -191,9 +193,9 @@ TEST(GcsFileSystemTest,
                    0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */, kTestTimeoutConfig,
-                   *kAllowedLocationsAuto, nullptr /* gcs additional header */);
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsAuto,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<RandomAccessFile> file;
   EXPECT_EQ(tensorflow::errors::FailedPrecondition(
@@ -216,16 +218,16 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache_DifferentN) {
            "Range: 3-12\n"
            "Timeouts: 5 1 20\n",
            "3456789")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -283,7 +285,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 9 /* block size */,
       18 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
 
@@ -372,7 +374,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_Flush) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 9 /* block size */,
       18 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
 
@@ -414,17 +416,17 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_MaxStaleness) {
                            "Range: 8-15\n"
                            "Timeouts: 5 1 20\n",
                            "89abcdef")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 8 /* block size */,
-      16 /* max bytes */, 3600 /* max staleness */,
-      3600 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   8 /* block size */, 16 /* max bytes */,
+                   3600 /* max staleness */, 3600 /* stat cache max age */,
+                   0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
   char scratch[100];
   StringPiece result;
   // There should only be two HTTP requests issued to GCS even though we iterate
@@ -492,7 +494,7 @@ TEST(GcsFileSystemTest,
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 9 /* block size */,
       18 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
 
@@ -513,17 +515,17 @@ TEST(GcsFileSystemTest,
 
 TEST(GcsFileSystemTest, NewRandomAccessFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-      0 /* read ahead bytes */, 0 /* max bytes */, 0 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* read ahead bytes */, 0 /* max bytes */,
+                   0 /* max staleness */, 0 /* stat cache max age */,
+                   0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<RandomAccessFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -547,16 +549,16 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_InconsistentRead) {
            "012")});
 
   // Set stat_cache_max_age to 1000s so that StatCache could work.
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 1e3 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   1e3 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   // Stat the file first so that the file stats are cached.
   FileStatistics stat;
@@ -621,7 +623,7 @@ TEST(GcsFileSystemTest, NewWritableFile) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 8 /* block size */,
       8 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
 
@@ -703,16 +705,16 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
                            "Timeouts: 5 1 30\n"
                            "Put body: t2\n",
                            "")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -773,17 +775,17 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceedsOnGetStatus) {
            "Range: 0-7\n"
            "Timeouts: 5 1 20\n",
            "01234567")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 8 /* block size */,
-      8 /* max bytes */, 3600 /* max staleness */,
-      3600 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   8 /* block size */, 8 /* max bytes */,
+                   3600 /* max staleness */, 3600 /* stat cache max age */,
+                   0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
   // Pull the file's first block into the cache. This will trigger the first
   // HTTP request to GCS.
   std::unique_ptr<RandomAccessFile> rfile;
@@ -867,9 +869,9 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
       0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 2 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      0 /* matching paths cache max entries */,
+      RetryConfig(2 /* .init_delay_time_us */), kTestTimeoutConfig,
+      *kAllowedLocationsDefault, nullptr /* gcs additional header */);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -918,16 +920,16 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
                            "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            "")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -948,16 +950,16 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
 
 TEST(GcsFileSystemTest, NewWritableFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<WritableFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1013,7 +1015,7 @@ TEST(GcsFileSystemTest, NewAppendableFile) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 32 /* block size */,
       32 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
 
@@ -1041,16 +1043,16 @@ TEST(GcsFileSystemTest, NewAppendableFile) {
 
 TEST(GcsFileSystemTest, NewAppendableFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<WritableFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1075,16 +1077,16 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
                            "Range: 0-",
                            content.size() - 1, "\n", "Timeouts: 5 1 20\n"),
            content)});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<ReadOnlyMemoryRegion> region;
   TF_EXPECT_OK(fs.NewReadOnlyMemoryRegionFromFile(
@@ -1096,16 +1098,16 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
 
 TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<ReadOnlyMemoryRegion> region;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1120,16 +1122,16 @@ TEST(GcsFileSystemTest, FileExists_YesAsObject) {
       "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket/path/file1.txt"));
 }
@@ -1150,16 +1152,16 @@ TEST(GcsFileSystemTest, FileExists_YesAsFolder) {
            "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/subfolder/\" }]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket/path/subfolder"));
 }
@@ -1176,16 +1178,16 @@ TEST(GcsFileSystemTest, FileExists_YesAsBucket) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "{\"size\": \"100\"}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket1"));
   TF_EXPECT_OK(fs.FileExists("gs://bucket1/"));
@@ -1206,16 +1208,16 @@ TEST(GcsFileSystemTest, FileExists_NotAsObjectOrFolder) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "{\"items\": []}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(errors::Code::NOT_FOUND,
             fs.FileExists("gs://bucket/path/file1.txt").code());
@@ -1233,19 +1235,19 @@ TEST(GcsFileSystemTest, FileExists_NotAsBucket) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
-  EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
-            fs.FileExists("gs://bucket2/").code());
-  EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
+  EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
+            fs.FileExists("gs://bucket2/").code());
+  EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
             fs.FileExists("gs://bucket2").code());
 }
 
@@ -1279,7 +1281,7 @@ TEST(GcsFileSystemTest, FileExists_StatCache) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
       0 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
 
@@ -1306,7 +1308,7 @@ TEST(GcsFileSystemTest, FileExists_DirectoryMark) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
       0 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
 
@@ -1322,16 +1324,16 @@ TEST(GcsFileSystemTest, GetChildren_NoItems) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "{\"prefixes\": [\"path/subpath/\"]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -1350,16 +1352,16 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles) {
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/file3.txt\" }],"
       "\"prefixes\": [\"path/subpath/\"]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -1379,16 +1381,16 @@ TEST(GcsFileSystemTest, GetChildren_SelfDirectoryMarker) {
       "  { \"name\": \"path/\" },"
       "  { \"name\": \"path/file3.txt\" }],"
       "\"prefixes\": [\"path/subpath/\"]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -1407,16 +1409,16 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles_NoSlash) {
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/file3.txt\" }],"
       "\"prefixes\": [\"path/subpath/\"]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path", &children));
@@ -1432,16 +1434,16 @@ TEST(GcsFileSystemTest, GetChildren_Root) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "{}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket-a-b-c", &children));
@@ -1457,16 +1459,16 @@ TEST(GcsFileSystemTest, GetChildren_Empty) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "{}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -1498,16 +1500,16 @@ TEST(GcsFileSystemTest, GetChildren_Pagination) {
            "  { \"name\": \"path/file4.txt\" },"
            "  { \"name\": \"path/file5.txt\" }]}")});
 
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path", &children));
@@ -1525,16 +1527,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_NoWildcard) {
       "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/subpath/file2.txt\" }]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> result;
   TF_EXPECT_OK(
@@ -1553,16 +1555,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_BucketAndWildcard) {
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/subpath/file2.txt\" },"
       "  { \"name\": \"path/file3.txt\" }]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/*/*", &result));
@@ -1582,16 +1584,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_Matches) {
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/subpath/file2.txt\" },"
       "  { \"name\": \"path/file3.txt\" }]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*/file2.txt", &result));
@@ -1608,16 +1610,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_SelfDirectoryMarker) {
       "{\"items\": [ "
       "  { \"name\": \"path/\" },"
       "  { \"name\": \"path/file3.txt\" }]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*", &result));
@@ -1634,16 +1636,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_NoMatches) {
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/subpath/file2.txt\" },"
       "  { \"name\": \"path/file3.txt\" }]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*/file3.txt", &result));
@@ -1652,16 +1654,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_NoMatches) {
 
 TEST(GcsFileSystemTest, GetMatchingPaths_OnlyWildcard) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> result;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1686,16 +1688,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_Cache) {
            "  { \"name\": \"path/file1.txt\" },"
            "  { \"name\": \"path/subpath/file2.txt\" },"
            "  { \"name\": \"path/file3.txt\" }]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 3600 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   3600 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   // Repeated calls to fs.GetMatchingPaths on these patterns should not lead to
   // any additional HTTP requests to GCS.
@@ -1729,16 +1731,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_Cache_Flush) {
            "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/subpath/file2.txt\" }]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 3600 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   3600 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   // This loop should trigger the first HTTP request to GCS.
   for (int i = 0; i < 10; i++) {
@@ -1800,7 +1802,7 @@ TEST(GcsFileSystemTest, DeleteFile) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 16 /* block size */,
       16 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
 
@@ -1821,16 +1823,16 @@ TEST(GcsFileSystemTest, DeleteFile) {
 
 TEST(GcsFileSystemTest, DeleteFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
             fs.DeleteFile("gs://bucket/").code());
@@ -1871,7 +1873,7 @@ TEST(GcsFileSystemTest, DeleteFile_StatCacheRemoved) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 16 /* block size */,
       16 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
 
@@ -1894,16 +1896,16 @@ TEST(GcsFileSystemTest, DeleteDir_Empty) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "{}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket/path/"));
 }
@@ -1923,16 +1925,16 @@ TEST(GcsFileSystemTest, DeleteDir_OnlyDirMarkerLeft) {
                            "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket/path/"));
 }
@@ -1943,16 +1945,16 @@ TEST(GcsFileSystemTest, DeleteDir_BucketOnly) {
       "name%2CnextPageToken&maxResults=2\nAuth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "{}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket"));
 }
@@ -1965,16 +1967,16 @@ TEST(GcsFileSystemTest, DeleteDir_NonEmpty) {
       "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" }]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(error::Code::FAILED_PRECONDITION,
             fs.DeleteDir("gs://bucket/path/").code());
@@ -1988,16 +1990,16 @@ TEST(GcsFileSystemTest, GetFileSize) {
       "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   uint64 size;
   TF_EXPECT_OK(fs.GetFileSize("gs://bucket/file.txt", &size));
@@ -2006,16 +2008,16 @@ TEST(GcsFileSystemTest, GetFileSize) {
 
 TEST(GcsFileSystemTest, GetFileSize_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   uint64 size;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -2092,16 +2094,16 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            "")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.RenameFile("gs://bucket/path1", "gs://bucket/path2/"));
 }
@@ -2191,7 +2193,7 @@ TEST(GcsFileSystemTest, RenameFile_Object) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 16 /* block size */,
       64 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
   // Do an initial read of the source and destination files to load their
@@ -2272,7 +2274,7 @@ TEST(GcsFileSystemTest, RenameFile_Object_FlushTargetStatCache) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
       0 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
   // Do an initial stat of the destination file to load their contents into the
@@ -2332,16 +2334,16 @@ TEST(GcsFileSystemTest, RenameFile_Object_DeletionRetried) {
            "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            "", errors::NotFound("404"), 404)});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(
       fs.RenameFile("gs://bucket/path/src.txt", "gs://bucket/path/dst.txt"));
@@ -2374,16 +2376,16 @@ TEST(GcsFileSystemTest, RenameFile_Object_Incomplete) {
            "Post: yes\n"
            "Timeouts: 5 1 10\n",
            "{\"done\": false}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(
       errors::Code::UNIMPLEMENTED,
@@ -2399,16 +2401,16 @@ TEST(GcsFileSystemTest, Stat_Object) {
       "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/file.txt", &stat));
@@ -2433,16 +2435,16 @@ TEST(GcsFileSystemTest, Stat_Folder) {
            "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"subfolder/\" }]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/subfolder", &stat));
@@ -2466,16 +2468,16 @@ TEST(GcsFileSystemTest, Stat_ObjectOrFolderNotFound) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "{}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   FileStatistics stat;
   EXPECT_EQ(error::Code::NOT_FOUND, fs.Stat("gs://bucket/path", &stat).code());
@@ -2487,16 +2489,16 @@ TEST(GcsFileSystemTest, Stat_Bucket) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "{}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/", &stat));
@@ -2511,16 +2513,16 @@ TEST(GcsFileSystemTest, Stat_BucketNotFound) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "", errors::NotFound("404"), 404)});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   FileStatistics stat;
   EXPECT_EQ(error::Code::NOT_FOUND, fs.Stat("gs://bucket/", &stat).code());
@@ -2556,7 +2558,7 @@ TEST(GcsFileSystemTest, Stat_Cache) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
       0 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
 
@@ -2598,7 +2600,7 @@ TEST(GcsFileSystemTest, Stat_Cache_Flush) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
       0 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       nullptr /* gcs additional header */);
   // There should be a single HTTP request to GCS for fs.Stat in this loop.
@@ -2628,16 +2630,16 @@ TEST(GcsFileSystemTest, Stat_FilenameEndingWithSlash) {
       "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"5\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/dir/", &stat));
@@ -2660,16 +2662,16 @@ TEST(GcsFileSystemTest, IsDirectory_NotFound) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(error::Code::NOT_FOUND,
             fs.IsDirectory("gs://bucket/file.txt").code());
@@ -2691,16 +2693,16 @@ TEST(GcsFileSystemTest, IsDirectory_NotDirectoryButObject) {
            "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(error::Code::FAILED_PRECONDITION,
             fs.IsDirectory("gs://bucket/file.txt").code());
@@ -2722,16 +2724,16 @@ TEST(GcsFileSystemTest, IsDirectory_Yes) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "{\"items\": [{\"name\": \"subfolder/\"}]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/subfolder"));
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/subfolder/"));
@@ -2749,16 +2751,16 @@ TEST(GcsFileSystemTest, IsDirectory_Bucket) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "{}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket"));
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/"));
@@ -2770,16 +2772,16 @@ TEST(GcsFileSystemTest, IsDirectory_BucketNotFound) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "", errors::NotFound("404"), 404)});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(error::Code::NOT_FOUND, fs.IsDirectory("gs://bucket/").code());
 }
@@ -2812,16 +2814,16 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
                            "Timeouts: 5 1 30\n"
                            "Put body: \n",
                            "")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath"));
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath/"));
@@ -2839,16 +2841,16 @@ TEST(GcsFileSystemTest, CreateDir_Bucket) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/"));
   TF_EXPECT_OK(fs.CreateDir("gs://bucket"));
@@ -2911,16 +2913,16 @@ TEST(GcsFileSystemTest, DeleteRecursively_Ok) {
                            "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   int64 undeleted_files, undeleted_dirs;
   TF_EXPECT_OK(fs.DeleteRecursively("gs://bucket/path", &undeleted_files,
@@ -3004,16 +3006,16 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
 
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   int64 undeleted_files, undeleted_dirs;
   TF_EXPECT_OK(fs.DeleteRecursively("gs://bucket/path", &undeleted_files,
@@ -3039,16 +3041,16 @@ TEST(GcsFileSystemTest, DeleteRecursively_NotAFolder) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   int64 undeleted_files, undeleted_dirs;
   EXPECT_EQ(error::Code::NOT_FOUND,
@@ -3130,7 +3132,7 @@ TEST(GcsFileSystemTest, AdditionalRequestHeaderTest) {
       std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
       0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
       add_header /* gcs additional header */);
 
@@ -3199,16 +3201,16 @@ TEST(GcsFileSystemTest, CreateHttpRequest) {
                            "Auth Token: fake_token\n"
                            "Header Hello: world\n",
                            "{}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<HttpRequest> request;
   TF_EXPECT_OK(fs.CreateHttpRequest(&request));
@@ -3262,16 +3264,16 @@ TEST(GcsFileSystemTest, Stat_StatsRecording) {
       "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TestGcsStats stats;
   fs.SetStats(&stats);
@@ -3289,16 +3291,16 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_StatsRecording) {
       "Range: 0-5\n"
       "Timeouts: 5 1 20\n",
       "012345")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
-      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
-      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */, kTestRetryConfig,
+                   kTestTimeoutConfig, *kAllowedLocationsDefault,
+                   nullptr /* gcs additional header */);
 
   TestGcsStats stats;
   fs.SetStats(&stats);
diff --git a/tensorflow/core/platform/cloud/google_auth_provider_test.cc b/tensorflow/core/platform/cloud/google_auth_provider_test.cc
index 07b88a880f..ec31c5ee8c 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider_test.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider_test.cc
@@ -93,8 +93,8 @@ TEST_F(GoogleAuthProviderTest, EnvironmentVariable_Caching) {
 
   std::shared_ptr<HttpRequest::Factory> fakeHttpRequestFactory =
       std::make_shared<FakeHttpRequestFactory>(&requests);
-  auto metadataClient =
-      std::make_shared<ComputeEngineMetadataClient>(fakeHttpRequestFactory, 0);
+  auto metadataClient = std::make_shared<ComputeEngineMetadataClient>(
+      fakeHttpRequestFactory, RetryConfig(0 /* init_delay_time_us */));
   GoogleAuthProvider provider(std::unique_ptr<OAuthClient>(oauth_client),
                               metadataClient, &env);
   oauth_client->return_token = "fake-token";
@@ -129,8 +129,8 @@ TEST_F(GoogleAuthProviderTest, GCloudRefreshToken) {
   FakeEnv env;
   std::shared_ptr<HttpRequest::Factory> fakeHttpRequestFactory =
       std::make_shared<FakeHttpRequestFactory>(&requests);
-  auto metadataClient =
-      std::make_shared<ComputeEngineMetadataClient>(fakeHttpRequestFactory, 0);
+  auto metadataClient = std::make_shared<ComputeEngineMetadataClient>(
+      fakeHttpRequestFactory, RetryConfig(0 /* init_delay_time_us */));
 
   GoogleAuthProvider provider(std::unique_ptr<OAuthClient>(oauth_client),
                               metadataClient, &env);
@@ -178,8 +178,8 @@ TEST_F(GoogleAuthProviderTest, RunningOnGCE) {
   FakeEnv env;
   std::shared_ptr<HttpRequest::Factory> fakeHttpRequestFactory =
       std::make_shared<FakeHttpRequestFactory>(&requests);
-  auto metadataClient =
-      std::make_shared<ComputeEngineMetadataClient>(fakeHttpRequestFactory, 0);
+  auto metadataClient = std::make_shared<ComputeEngineMetadataClient>(
+      fakeHttpRequestFactory, RetryConfig(0 /* init_delay_time_us */));
   GoogleAuthProvider provider(std::unique_ptr<OAuthClient>(oauth_client),
                               metadataClient, &env);
 
@@ -206,8 +206,8 @@ TEST_F(GoogleAuthProviderTest, OverrideForTesting) {
   FakeEnv env;
   std::shared_ptr<HttpRequest::Factory> fakeHttpRequestFactory =
       std::make_shared<FakeHttpRequestFactory>(&empty_requests);
-  auto metadataClient =
-      std::make_shared<ComputeEngineMetadataClient>(fakeHttpRequestFactory, 0);
+  auto metadataClient = std::make_shared<ComputeEngineMetadataClient>(
+      fakeHttpRequestFactory, RetryConfig(0 /* init_delay_time_us */));
   GoogleAuthProvider provider(std::unique_ptr<OAuthClient>(oauth_client),
                               metadataClient, &env);
 
@@ -228,8 +228,8 @@ TEST_F(GoogleAuthProviderTest, NothingAvailable) {
   FakeEnv env;
   std::shared_ptr<HttpRequest::Factory> fakeHttpRequestFactory =
       std::make_shared<FakeHttpRequestFactory>(&requests);
-  auto metadataClient =
-      std::make_shared<ComputeEngineMetadataClient>(fakeHttpRequestFactory, 0);
+  auto metadataClient = std::make_shared<ComputeEngineMetadataClient>(
+      fakeHttpRequestFactory, RetryConfig(0 /* init_delay_time_us */));
   GoogleAuthProvider provider(std::unique_ptr<OAuthClient>(oauth_client),
                               metadataClient, &env);
 
diff --git a/tensorflow/core/platform/cloud/retrying_file_system.h b/tensorflow/core/platform/cloud/retrying_file_system.h
index 941ab7ad65..5ce6670dc7 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system.h
+++ b/tensorflow/core/platform/cloud/retrying_file_system.h
@@ -34,9 +34,9 @@ template <typename Underlying>
 class RetryingFileSystem : public FileSystem {
  public:
   RetryingFileSystem(std::unique_ptr<Underlying> base_file_system,
-                     int64 delay_microseconds = 1000000)
+                     const RetryConfig& retry_config)
       : base_file_system_(std::move(base_file_system)),
-        initial_delay_microseconds_(delay_microseconds) {}
+        retry_config_(retry_config) {}
 
   Status NewRandomAccessFile(
       const string& filename,
@@ -55,7 +55,7 @@ class RetryingFileSystem : public FileSystem {
   Status FileExists(const string& fname) override {
     return RetryingUtils::CallWithRetries(
         [this, &fname]() { return base_file_system_->FileExists(fname); },
-        initial_delay_microseconds_);
+        retry_config_);
   }
 
   Status GetChildren(const string& dir, std::vector<string>* result) override {
@@ -63,7 +63,7 @@ class RetryingFileSystem : public FileSystem {
         [this, &dir, result]() {
           return base_file_system_->GetChildren(dir, result);
         },
-        initial_delay_microseconds_);
+        retry_config_);
   }
 
   Status GetMatchingPaths(const string& pattern,
@@ -72,31 +72,31 @@ class RetryingFileSystem : public FileSystem {
         [this, &pattern, result]() {
           return base_file_system_->GetMatchingPaths(pattern, result);
         },
-        initial_delay_microseconds_);
+        retry_config_);
   }
 
   Status Stat(const string& fname, FileStatistics* stat) override {
     return RetryingUtils::CallWithRetries(
         [this, &fname, stat]() { return base_file_system_->Stat(fname, stat); },
-        initial_delay_microseconds_);
+        retry_config_);
   }
 
   Status DeleteFile(const string& fname) override {
     return RetryingUtils::DeleteWithRetries(
         [this, &fname]() { return base_file_system_->DeleteFile(fname); },
-        initial_delay_microseconds_);
+        retry_config_);
   }
 
   Status CreateDir(const string& dirname) override {
     return RetryingUtils::CallWithRetries(
         [this, &dirname]() { return base_file_system_->CreateDir(dirname); },
-        initial_delay_microseconds_);
+        retry_config_);
   }
 
   Status DeleteDir(const string& dirname) override {
     return RetryingUtils::DeleteWithRetries(
         [this, &dirname]() { return base_file_system_->DeleteDir(dirname); },
-        initial_delay_microseconds_);
+        retry_config_);
   }
 
   Status GetFileSize(const string& fname, uint64* file_size) override {
@@ -104,7 +104,7 @@ class RetryingFileSystem : public FileSystem {
         [this, &fname, file_size]() {
           return base_file_system_->GetFileSize(fname, file_size);
         },
-        initial_delay_microseconds_);
+        retry_config_);
   }
 
   Status RenameFile(const string& src, const string& target) override {
@@ -112,13 +112,13 @@ class RetryingFileSystem : public FileSystem {
         [this, &src, &target]() {
           return base_file_system_->RenameFile(src, target);
         },
-        initial_delay_microseconds_);
+        retry_config_);
   }
 
   Status IsDirectory(const string& dirname) override {
     return RetryingUtils::CallWithRetries(
         [this, &dirname]() { return base_file_system_->IsDirectory(dirname); },
-        initial_delay_microseconds_);
+        retry_config_);
   }
 
   Status DeleteRecursively(const string& dirname, int64* undeleted_files,
@@ -128,7 +128,7 @@ class RetryingFileSystem : public FileSystem {
           return base_file_system_->DeleteRecursively(dirname, undeleted_files,
                                                       undeleted_dirs);
         },
-        initial_delay_microseconds_);
+        retry_config_);
   }
 
   void FlushCaches() override { base_file_system_->FlushCaches(); }
@@ -137,7 +137,7 @@ class RetryingFileSystem : public FileSystem {
 
  private:
   std::unique_ptr<Underlying> base_file_system_;
-  const int64 initial_delay_microseconds_;
+  const RetryConfig retry_config_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(RetryingFileSystem);
 };
@@ -147,9 +147,8 @@ namespace retrying_internals {
 class RetryingRandomAccessFile : public RandomAccessFile {
  public:
   RetryingRandomAccessFile(std::unique_ptr<RandomAccessFile> base_file,
-                           int64 delay_microseconds)
-      : base_file_(std::move(base_file)),
-        initial_delay_microseconds_(delay_microseconds) {}
+                           const RetryConfig& retry_config)
+      : base_file_(std::move(base_file)), retry_config_(retry_config) {}
 
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
@@ -157,20 +156,19 @@ class RetryingRandomAccessFile : public RandomAccessFile {
         [this, offset, n, result, scratch]() {
           return base_file_->Read(offset, n, result, scratch);
         },
-        initial_delay_microseconds_);
+        retry_config_);
   }
 
  private:
   std::unique_ptr<RandomAccessFile> base_file_;
-  const int64 initial_delay_microseconds_;
+  const RetryConfig retry_config_;
 };
 
 class RetryingWritableFile : public WritableFile {
  public:
   RetryingWritableFile(std::unique_ptr<WritableFile> base_file,
-                       int64 delay_microseconds)
-      : base_file_(std::move(base_file)),
-        initial_delay_microseconds_(delay_microseconds) {}
+                       const RetryConfig& retry_config)
+      : base_file_(std::move(base_file)), retry_config_(retry_config) {}
 
   ~RetryingWritableFile() override {
     // Makes sure the retrying version of Close() is called in the destructor.
@@ -179,25 +177,24 @@ class RetryingWritableFile : public WritableFile {
 
   Status Append(StringPiece data) override {
     return RetryingUtils::CallWithRetries(
-        [this, &data]() { return base_file_->Append(data); },
-        initial_delay_microseconds_);
+        [this, &data]() { return base_file_->Append(data); }, retry_config_);
   }
   Status Close() override {
     return RetryingUtils::CallWithRetries(
-        [this]() { return base_file_->Close(); }, initial_delay_microseconds_);
+        [this]() { return base_file_->Close(); }, retry_config_);
   }
   Status Flush() override {
     return RetryingUtils::CallWithRetries(
-        [this]() { return base_file_->Flush(); }, initial_delay_microseconds_);
+        [this]() { return base_file_->Flush(); }, retry_config_);
   }
   Status Sync() override {
     return RetryingUtils::CallWithRetries(
-        [this]() { return base_file_->Sync(); }, initial_delay_microseconds_);
+        [this]() { return base_file_->Sync(); }, retry_config_);
   }
 
  private:
   std::unique_ptr<WritableFile> base_file_;
-  const int64 initial_delay_microseconds_;
+  const RetryConfig retry_config_;
 };
 
 }  // namespace retrying_internals
@@ -210,9 +207,9 @@ Status RetryingFileSystem<Underlying>::NewRandomAccessFile(
       [this, &filename, &base_file]() {
         return base_file_system_->NewRandomAccessFile(filename, &base_file);
       },
-      initial_delay_microseconds_));
+      retry_config_));
   result->reset(new retrying_internals::RetryingRandomAccessFile(
-      std::move(base_file), initial_delay_microseconds_));
+      std::move(base_file), retry_config_));
   return Status::OK();
 }
 
@@ -224,9 +221,9 @@ Status RetryingFileSystem<Underlying>::NewWritableFile(
       [this, &filename, &base_file]() {
         return base_file_system_->NewWritableFile(filename, &base_file);
       },
-      initial_delay_microseconds_));
+      retry_config_));
   result->reset(new retrying_internals::RetryingWritableFile(
-      std::move(base_file), initial_delay_microseconds_));
+      std::move(base_file), retry_config_));
   return Status::OK();
 }
 
@@ -238,9 +235,9 @@ Status RetryingFileSystem<Underlying>::NewAppendableFile(
       [this, &filename, &base_file]() {
         return base_file_system_->NewAppendableFile(filename, &base_file);
       },
-      initial_delay_microseconds_));
+      retry_config_));
   result->reset(new retrying_internals::RetryingWritableFile(
-      std::move(base_file), initial_delay_microseconds_));
+      std::move(base_file), retry_config_));
   return Status::OK();
 }
 
@@ -252,7 +249,7 @@ Status RetryingFileSystem<Underlying>::NewReadOnlyMemoryRegionFromFile(
         return base_file_system_->NewReadOnlyMemoryRegionFromFile(filename,
                                                                   result);
       },
-      initial_delay_microseconds_);
+      retry_config_);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
index 5910fef1d2..868eea096c 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
@@ -184,7 +184,8 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_ImmediateSuccess) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->random_access_file_to_return = std::move(base_file);
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   // Retrieve the wrapped random access file.
   std::unique_ptr<RandomAccessFile> random_access_file;
@@ -211,7 +212,8 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_SuccessWith3rdTry) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->random_access_file_to_return = std::move(base_file);
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   // Retrieve the wrapped random access file.
   std::unique_ptr<RandomAccessFile> random_access_file;
@@ -235,7 +237,8 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_AllRetriesFailed) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->random_access_file_to_return = std::move(base_file);
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   // Retrieve the wrapped random access file.
   std::unique_ptr<RandomAccessFile> random_access_file;
@@ -265,7 +268,8 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_NoRetriesForSomeErrors) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->random_access_file_to_return = std::move(base_file);
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   // Retrieve the wrapped random access file.
   std::unique_ptr<RandomAccessFile> random_access_file;
@@ -291,7 +295,8 @@ TEST(RetryingFileSystemTest, NewWritableFile_ImmediateSuccess) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->writable_file_to_return = std::move(base_file);
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   // Retrieve the wrapped writable file.
   std::unique_ptr<WritableFile> writable_file;
@@ -317,7 +322,8 @@ TEST(RetryingFileSystemTest, NewWritableFile_SuccessWith3rdTry) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->writable_file_to_return = std::move(base_file);
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   // Retrieve the wrapped writable file.
   std::unique_ptr<WritableFile> writable_file;
@@ -343,7 +349,8 @@ TEST(RetryingFileSystemTest, NewWritableFile_SuccessWith3rdTry_ViaDestructor) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->writable_file_to_return = std::move(base_file);
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   // Retrieve the wrapped writable file.
   std::unique_ptr<WritableFile> writable_file;
@@ -368,7 +375,8 @@ TEST(RetryingFileSystemTest, NewAppendableFile_SuccessWith3rdTry) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->writable_file_to_return = std::move(base_file);
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   // Retrieve the wrapped appendable file.
   std::unique_ptr<WritableFile> writable_file;
@@ -391,7 +399,8 @@ TEST(RetryingFileSystemTest, NewWritableFile_AllRetriesFailed) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->writable_file_to_return = std::move(base_file);
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   // Retrieve the wrapped writable file.
   std::unique_ptr<WritableFile> writable_file;
@@ -412,7 +421,8 @@ TEST(RetryingFileSystemTest,
        std::make_tuple("NewReadOnlyMemoryRegionFromFile", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   std::unique_ptr<ReadOnlyMemoryRegion> result;
   TF_EXPECT_OK(fs.NewReadOnlyMemoryRegionFromFile("filename.txt", &result));
@@ -423,7 +433,8 @@ TEST(RetryingFileSystemTest, NewReadOnlyMemoryRegionFromFile_AllRetriesFailed) {
       CreateRetriableErrors("NewReadOnlyMemoryRegionFromFile", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   std::unique_ptr<ReadOnlyMemoryRegion> result;
   const auto& status =
@@ -440,7 +451,8 @@ TEST(RetryingFileSystemTest, GetChildren_SuccessWith2ndTry) {
        std::make_tuple("GetChildren", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetChildren("gs://path", &result));
@@ -450,7 +462,8 @@ TEST(RetryingFileSystemTest, GetChildren_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("GetChildren", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   std::vector<string> result;
   const auto& status = fs.GetChildren("gs://path", &result);
@@ -466,7 +479,8 @@ TEST(RetryingFileSystemTest, GetMatchingPaths_SuccessWith2ndTry) {
        std::make_tuple("GetMatchingPaths", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://path/dir", &result));
@@ -477,7 +491,8 @@ TEST(RetryingFileSystemTest, GetMatchingPaths_AllRetriesFailed) {
       CreateRetriableErrors("GetMatchingPaths", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   std::vector<string> result;
   const auto& status = fs.GetMatchingPaths("gs://path/dir", &result);
@@ -492,7 +507,8 @@ TEST(RetryingFileSystemTest, DeleteFile_SuccessWith2ndTry) {
        std::make_tuple("DeleteFile", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.DeleteFile("gs://path/file.txt"));
@@ -502,7 +518,8 @@ TEST(RetryingFileSystemTest, DeleteFile_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("DeleteFile", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   std::vector<string> result;
   const auto& status = fs.DeleteFile("gs://path/file.txt");
@@ -517,7 +534,8 @@ TEST(RetryingFileSystemTest, CreateDir_SuccessWith2ndTry) {
        std::make_tuple("CreateDir", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.CreateDir("gs://path/newdir"));
@@ -527,7 +545,8 @@ TEST(RetryingFileSystemTest, CreateDir_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("CreateDir", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   std::vector<string> result;
   const auto& status = fs.CreateDir("gs://path/newdir");
@@ -542,7 +561,8 @@ TEST(RetryingFileSystemTest, DeleteDir_SuccessWith2ndTry) {
        std::make_tuple("DeleteDir", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.DeleteDir("gs://path/dir"));
@@ -552,7 +572,8 @@ TEST(RetryingFileSystemTest, DeleteDir_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("DeleteDir", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   std::vector<string> result;
   const auto& status = fs.DeleteDir("gs://path/dir");
@@ -568,7 +589,8 @@ TEST(RetryingFileSystemTest, GetFileSize_SuccessWith2ndTry) {
        std::make_tuple("GetFileSize", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   uint64 size;
   TF_EXPECT_OK(fs.GetFileSize("gs://path/file.txt", &size));
@@ -578,7 +600,8 @@ TEST(RetryingFileSystemTest, GetFileSize_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("GetFileSize", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   uint64 size;
   const auto& status = fs.GetFileSize("gs://path/file.txt", &size);
@@ -593,7 +616,8 @@ TEST(RetryingFileSystemTest, RenameFile_SuccessWith2ndTry) {
        std::make_tuple("RenameFile", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   TF_EXPECT_OK(fs.RenameFile("old_name", "new_name"));
 }
@@ -602,7 +626,8 @@ TEST(RetryingFileSystemTest, RenameFile_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("RenameFile", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   const auto& status = fs.RenameFile("old_name", "new_name");
   EXPECT_TRUE(
@@ -616,7 +641,8 @@ TEST(RetryingFileSystemTest, Stat_SuccessWith2ndTry) {
        std::make_tuple("Stat", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("file_name", &stat));
@@ -626,7 +652,8 @@ TEST(RetryingFileSystemTest, Stat_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("Stat", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   FileStatistics stat;
   const auto& status = fs.Stat("file_name", &stat);
@@ -639,7 +666,8 @@ TEST(RetryingFileSystemTest, FileExists_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("FileExists", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   const auto& status = fs.FileExists("file_name");
   EXPECT_TRUE(
@@ -653,7 +681,8 @@ TEST(RetryingFileSystemTest, FileExists_SuccessWith2ndTry) {
        std::make_tuple("FileExists", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   TF_EXPECT_OK(fs.FileExists("gs://path/dir"));
 }
@@ -665,7 +694,8 @@ TEST(RetryingFileSystemTest, IsDirectory_SuccessWith2ndTry) {
        std::make_tuple("IsDirectory", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   TF_EXPECT_OK(fs.IsDirectory("gs://path/dir"));
 }
@@ -674,7 +704,8 @@ TEST(RetryingFileSystemTest, IsDirectory_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("IsDirectory", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
 
   const auto& status = fs.IsDirectory("gs://path/dir");
   EXPECT_TRUE(
@@ -689,7 +720,8 @@ TEST(RetryingFileSystemTest, DeleteRecursively_SuccessWith2ndTry) {
        std::make_tuple("DeleteRecursively", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
   int64 undeleted_files, undeleted_dirs;
 
   TF_EXPECT_OK(
@@ -701,7 +733,8 @@ TEST(RetryingFileSystemTest, DeleteRecursively_AllRetriesFailed) {
       CreateRetriableErrors("DeleteRecursively", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
   int64 undeleted_files, undeleted_dirs;
 
   const auto& status =
@@ -715,7 +748,8 @@ TEST(RetryingFileSystemTest, FlushCaches) {
   ExpectedCalls none;
   bool flushed = false;
   std::unique_ptr<MockFileSystem> base_fs(new MockFileSystem(none, &flushed));
-  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(
+      std::move(base_fs), RetryConfig(0 /* init_delay_time_us */));
   fs.FlushCaches();
   EXPECT_TRUE(flushed);
 }
diff --git a/tensorflow/core/platform/cloud/retrying_utils.cc b/tensorflow/core/platform/cloud/retrying_utils.cc
index d2df422024..cb0aecdd35 100644
--- a/tensorflow/core/platform/cloud/retrying_utils.cc
+++ b/tensorflow/core/platform/cloud/retrying_utils.cc
@@ -23,11 +23,6 @@ namespace tensorflow {
 
 namespace {
 
-// In case of failure, every call will be retried kMaxRetries times.
-constexpr int kMaxRetries = 10;
-// Maximum backoff time in microseconds.
-constexpr int64 kMaximumBackoffMicroseconds = 32000000;  // 32 seconds.
-
 bool IsRetriable(error::Code code) {
   switch (code) {
     case error::UNAVAILABLE:
@@ -43,40 +38,41 @@ bool IsRetriable(error::Code code) {
 }  // namespace
 
 Status RetryingUtils::CallWithRetries(const std::function<Status()>& f,
-                                      const int64 initial_delay_microseconds) {
-  return CallWithRetries(f, initial_delay_microseconds, [](int64 micros) {
-    return Env::Default()->SleepForMicroseconds(micros);
-  });
+                                      const RetryConfig& config) {
+  return CallWithRetries(
+      f,
+      [](int64 micros) { return Env::Default()->SleepForMicroseconds(micros); },
+      config);
 }
 
 Status RetryingUtils::CallWithRetries(
-    const std::function<Status()>& f, const int64 initial_delay_microseconds,
-    const std::function<void(int64)>& sleep_usec) {
+    const std::function<Status()>& f,
+    const std::function<void(int64)>& sleep_usec, const RetryConfig& config) {
   int retries = 0;
   while (true) {
     auto status = f();
     if (!IsRetriable(status.code())) {
       return status;
     }
-    if (retries >= kMaxRetries) {
+    if (retries >= config.max_retries) {
       // Return AbortedError, so that it doesn't get retried again somewhere
       // at a higher level.
       return Status(
           error::ABORTED,
           strings::StrCat(
-              "All ", kMaxRetries,
+              "All ", config.max_retries,
               " retry attempts failed. The last failure: ", status.ToString()));
     }
     int64 delay_micros = 0;
-    if (initial_delay_microseconds > 0) {
+    if (config.init_delay_time_us > 0) {
       const int64 random_micros = random::New64() % 1000000;
-      delay_micros = std::min(initial_delay_microseconds << retries,
-                              kMaximumBackoffMicroseconds) +
+      delay_micros = std::min(config.init_delay_time_us << retries,
+                              config.max_delay_time_us) +
                      random_micros;
     }
     LOG(INFO) << "The operation failed and will be automatically retried in "
               << (delay_micros / 1000000.0) << " seconds (attempt "
-              << (retries + 1) << " out of " << kMaxRetries
+              << (retries + 1) << " out of " << config.max_retries
               << "), caused by: " << status.ToString();
     sleep_usec(delay_micros);
     retries++;
@@ -84,8 +80,7 @@ Status RetryingUtils::CallWithRetries(
 }
 
 Status RetryingUtils::DeleteWithRetries(
-    const std::function<Status()>& delete_func,
-    const int64 initial_delay_microseconds) {
+    const std::function<Status()>& delete_func, const RetryConfig& config) {
   bool is_retried = false;
   return RetryingUtils::CallWithRetries(
       [delete_func, &is_retried]() {
@@ -96,7 +91,7 @@ Status RetryingUtils::DeleteWithRetries(
         is_retried = true;
         return status;
       },
-      initial_delay_microseconds);
+      config);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/retrying_utils.h b/tensorflow/core/platform/cloud/retrying_utils.h
index 546b8d1c4a..1a7ce1b122 100644
--- a/tensorflow/core/platform/cloud/retrying_utils.h
+++ b/tensorflow/core/platform/cloud/retrying_utils.h
@@ -21,6 +21,26 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Default time before reporting failure: ~100 seconds.
+struct RetryConfig {
+  RetryConfig(int64 init_delay_time_us = 100 * 1000,
+              int64 max_delay_time_us = 32 * 1000 * 1000,
+              int max_retries = 10) {
+    this->init_delay_time_us = init_delay_time_us;
+    this->max_delay_time_us = max_delay_time_us;
+    this->max_retries = max_retries;
+  }
+
+  // In case of failure, every call will be retried max_retries times.
+  int max_retries;
+
+  // Initial backoff time
+  int64 init_delay_time_us;
+
+  // Maximum backoff time in microseconds.
+  int64 max_delay_time_us;
+};
+
 class RetryingUtils {
  public:
   /// \brief Retries the function in case of failure with exponential backoff.
@@ -31,18 +51,19 @@ class RetryingUtils {
   /// retries.
   /// If all retries failed, returns the last error status.
   static Status CallWithRetries(const std::function<Status()>& f,
-                                const int64 initial_delay_microseconds);
+                                const RetryConfig& config);
+
   /// sleep_usec is a function that sleeps for the given number of microseconds.
   static Status CallWithRetries(const std::function<Status()>& f,
-                                const int64 initial_delay_microseconds,
-                                const std::function<void(int64)>& sleep_usec);
+                                const std::function<void(int64)>& sleep_usec,
+                                const RetryConfig& config);
   /// \brief A retrying wrapper for a function that deletes a resource.
   ///
   /// The function takes care of the scenario when a delete operation
   /// returns a failure but succeeds under the hood: if a retry returns
   /// NOT_FOUND, the whole operation is considered a success.
   static Status DeleteWithRetries(const std::function<Status()>& delete_func,
-                                  const int64 initial_delay_microseconds);
+                                  const RetryConfig& config);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/retrying_utils_test.cc b/tensorflow/core/platform/cloud/retrying_utils_test.cc
index 1b6527618a..75fe8a98f4 100644
--- a/tensorflow/core/platform/cloud/retrying_utils_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_utils_test.cc
@@ -30,7 +30,8 @@ TEST(RetryingUtilsTest, CallWithRetries_RetryDelays) {
   };
   std::function<Status()> f = []() { return errors::Unavailable("Failed."); };
 
-  const auto& status = RetryingUtils::CallWithRetries(f, 500000L, sleep);
+  const auto& status = RetryingUtils::CallWithRetries(
+      f, sleep, RetryConfig(500000 /* init_delay_time_us */));
   EXPECT_EQ(errors::Code::ABORTED, status.code());
   EXPECT_TRUE(str_util::StrContains(
       status.error_message(),
@@ -60,8 +61,10 @@ TEST(RetryingUtilsTest, CallWithRetries_NotFoundIsNotRetried) {
     results.erase(results.begin());
     return result;
   };
-  EXPECT_EQ(errors::Code::NOT_FOUND,
-            RetryingUtils::CallWithRetries(f, 0).code());
+  EXPECT_EQ(
+      errors::Code::NOT_FOUND,
+      RetryingUtils::CallWithRetries(f, RetryConfig(0 /* init_delay_time_us */))
+          .code());
 }
 
 TEST(RetryingUtilsTest, CallWithRetries_ImmediateSuccess) {
@@ -74,7 +77,8 @@ TEST(RetryingUtilsTest, CallWithRetries_ImmediateSuccess) {
     results.erase(results.begin());
     return result;
   };
-  TF_EXPECT_OK(RetryingUtils::CallWithRetries(f, 1.0, sleep));
+  TF_EXPECT_OK(RetryingUtils::CallWithRetries(
+      f, sleep, RetryConfig(1L /* init_delay_time_us */)));
 }
 
 TEST(RetryingUtilsTest, CallWithRetries_EventualSuccess) {
@@ -86,7 +90,8 @@ TEST(RetryingUtilsTest, CallWithRetries_EventualSuccess) {
     results.erase(results.begin());
     return result;
   };
-  TF_EXPECT_OK(RetryingUtils::CallWithRetries(f, 0));
+  TF_EXPECT_OK(RetryingUtils::CallWithRetries(
+      f, RetryConfig(0 /* init_delay_time_us */)));
 }
 
 TEST(RetryingUtilsTest, DeleteWithRetries_ImmediateSuccess) {
@@ -96,7 +101,8 @@ TEST(RetryingUtilsTest, DeleteWithRetries_ImmediateSuccess) {
     delete_results.erase(delete_results.begin());
     return result;
   };
-  TF_EXPECT_OK(RetryingUtils::DeleteWithRetries(delete_func, 0));
+  TF_EXPECT_OK(RetryingUtils::DeleteWithRetries(
+      delete_func, RetryConfig(0 /* init_delay_time_us */)));
 }
 
 TEST(RetryingUtilsTest, DeleteWithRetries_EventualSuccess) {
@@ -106,7 +112,8 @@ TEST(RetryingUtilsTest, DeleteWithRetries_EventualSuccess) {
     delete_results.erase(delete_results.begin());
     return result;
   };
-  TF_EXPECT_OK(RetryingUtils::DeleteWithRetries(delete_func, 0));
+  TF_EXPECT_OK(RetryingUtils::DeleteWithRetries(
+      delete_func, RetryConfig(0 /* init_delay_time_us */)));
 }
 
 TEST(RetryingUtilsTest, DeleteWithRetries_PermissionDeniedNotRetried) {
@@ -118,7 +125,9 @@ TEST(RetryingUtilsTest, DeleteWithRetries_PermissionDeniedNotRetried) {
     return result;
   };
   EXPECT_EQ(errors::Code::PERMISSION_DENIED,
-            RetryingUtils::DeleteWithRetries(delete_func, 0).code());
+            RetryingUtils::DeleteWithRetries(
+                delete_func, RetryConfig(0 /* init_delay_time_us */))
+                .code());
 }
 
 TEST(RetryingUtilsTest, DeleteWithRetries_SuccessThroughFileNotFound) {
@@ -129,7 +138,8 @@ TEST(RetryingUtilsTest, DeleteWithRetries_SuccessThroughFileNotFound) {
     delete_results.erase(delete_results.begin());
     return result;
   };
-  TF_EXPECT_OK(RetryingUtils::DeleteWithRetries(delete_func, 0));
+  TF_EXPECT_OK(RetryingUtils::DeleteWithRetries(
+      delete_func, RetryConfig(0 /* init_delay_time_us */)));
 }
 
 TEST(RetryingUtilsTest, DeleteWithRetries_FirstNotFoundReturnedAsIs) {
@@ -140,7 +150,9 @@ TEST(RetryingUtilsTest, DeleteWithRetries_FirstNotFoundReturnedAsIs) {
     return result;
   };
   EXPECT_EQ(error::NOT_FOUND,
-            RetryingUtils::DeleteWithRetries(delete_func, 0).code());
+            RetryingUtils::DeleteWithRetries(
+                delete_func, RetryConfig(0 /* init_delay_time_us */))
+                .code());
 }
 
 }  // namespace
-- 
GitLab


From 84a051e7d0cd1406c1bb846efc677c8aa3fc896e Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Mon, 1 Oct 2018 11:12:03 -0700
Subject: [PATCH 232/570] Fix typo.

PiperOrigin-RevId: 215246174
---
 tensorflow/python/autograph/CONTRIBUTING.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/python/autograph/CONTRIBUTING.md b/tensorflow/python/autograph/CONTRIBUTING.md
index 1ded5ba5f6..f3587a4384 100644
--- a/tensorflow/python/autograph/CONTRIBUTING.md
+++ b/tensorflow/python/autograph/CONTRIBUTING.md
@@ -9,8 +9,6 @@ In preparation for TF 2.0, we moved the code base of AutoGraph from
 does not impact functionality, and AutoGraph will remain accessible under
 `tensorflow.contrib.autograph` until `tensorflow.contrib` is retired.
 
-When 
-
 ## TensorFlow Code of Conduct
 Please review and follow the [TensorFlow Code of Conduct](../../CODE_OF_CONDUCT.md).
 
-- 
GitLab


From 2bbf05148ad94928c1c828d40e479afdf34e2ef8 Mon Sep 17 00:00:00 2001
From: Christopher Olston <olston@google.com>
Date: Mon, 1 Oct 2018 11:24:41 -0700
Subject: [PATCH 233/570] Automated rollback of commit
 6a787235b95dd3040fc5ff7fb7104585e746c66a

PiperOrigin-RevId: 215248737
---
 tensorflow/core/kernels/batching_util/BUILD | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD
index 039b0db144..0d53240330 100644
--- a/tensorflow/core/kernels/batching_util/BUILD
+++ b/tensorflow/core/kernels/batching_util/BUILD
@@ -12,11 +12,6 @@ cc_library(
     name = "periodic_function_dynamic",
     srcs = ["periodic_function.cc"],
     hdrs = ["periodic_function.h"],
-    visibility = [
-        "//learning/serving:__subpackages__",
-        "//tensorflow:internal",
-        "//tensorflow_serving:__subpackages__",
-    ],
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:protos_all_cc",
@@ -25,11 +20,6 @@ cc_library(
 
 cc_library(
     name = "periodic_function",
-    visibility = [
-        "//learning/serving:__subpackages__",
-        "//tensorflow:internal",
-        "//tensorflow_serving:__subpackages__",
-    ],
     deps = [
         ":periodic_function_dynamic",
         "//tensorflow/core:lib",
@@ -198,11 +188,6 @@ cc_library(
     testonly = 1,
     srcs = ["fake_clock_env.cc"],
     hdrs = ["fake_clock_env.h"],
-    visibility = [
-        "//learning/serving:__subpackages__",
-        "//tensorflow:internal",
-        "//tensorflow_serving:__subpackages__",
-    ],
     deps = [
         "//tensorflow/core:lib",
         "//tensorflow/core:tensorflow",
-- 
GitLab


From a9b01e8a31a02188bc81349c103f136095f322ac Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 11:26:02 -0700
Subject: [PATCH 234/570] internal change only

PiperOrigin-RevId: 215248985
---
 tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 8e6e9aa0cd..1c5ea2d997 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -237,7 +237,8 @@ void StartMonitoring(const tensorflow::string& service_addr, int duration_ms,
     MonitorResponse response;
     TF_QCHECK_OK(FromGrpcStatus(stub->Monitor(&context, request, &response)));
 
-    std::cout << "Xprof Monitoring Results (Sample " << query + 1 << "):\n\n"
+    std::cout << "Cloud TPU Monitoring Results (Sample " << query + 1
+              << "):\n\n"
               << response.data() << std::flush;
   }
 }
-- 
GitLab


From f0f301f05fb1f1965c966ef57cc390e48d966f12 Mon Sep 17 00:00:00 2001
From: Scott Zhu <scottzhu@google.com>
Date: Mon, 1 Oct 2018 11:29:30 -0700
Subject: [PATCH 235/570] Add deprecation notice for BasicRNNCell, which will
 be replaced by keras.SimpleRNNCell.

PiperOrigin-RevId: 215249611
---
 tensorflow/python/kernel_tests/rnn_test.py    |  39 ++++
 tensorflow/python/ops/rnn_cell_impl.py        |   4 +-
 ...orflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt | 202 ------------------
 .../golden/v2/tensorflow.nn.rnn_cell.pbtxt    |   4 -
 4 files changed, 42 insertions(+), 207 deletions(-)
 delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt

diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index 05ad9f6336..2f6963f6b8 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -535,6 +535,45 @@ class RNNTest(test.TestCase):
     self.assertAllClose(tf_out, k_out)
     self.assertAllClose(tf_state, k_state)
 
+  def testSimpleRNNCellAndBasicRNNCellComparison(self):
+    input_shape = 10
+    output_shape = 5
+    timestep = 4
+    batch = 20
+    (x_train, _), _ = testing_utils.get_test_data(
+        train_samples=batch,
+        test_samples=0,
+        input_shape=(timestep, input_shape),
+        num_classes=output_shape)
+    fix_weights_generator = keras.layers.SimpleRNNCell(output_shape)
+    fix_weights_generator.build((None, input_shape))
+    # The SimpleRNNCell contains 3 weights: kernel, recurrent_kernel, and bias
+    # The BasicRNNCell contains 2 weight: kernel and bias, where kernel is
+    # zipped [kernel, recurrent_kernel] in SimpleRNNCell.
+    keras_weights = fix_weights_generator.get_weights()
+    kernel, recurrent_kernel, bias = keras_weights
+    tf_weights = [np.concatenate((kernel, recurrent_kernel)), bias]
+
+    with self.test_session(graph=ops_lib.Graph()) as sess:
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape))
+      cell = keras.layers.SimpleRNNCell(output_shape)
+      k_out, k_state = rnn.dynamic_rnn(
+          cell, inputs, dtype=dtypes.float32)
+      cell.set_weights(keras_weights)
+      [k_out, k_state] = sess.run([k_out, k_state], {inputs: x_train})
+    with self.test_session(graph=ops_lib.Graph()) as sess:
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape))
+      cell = rnn_cell_impl.BasicRNNCell(output_shape)
+      tf_out, tf_state = rnn.dynamic_rnn(
+          cell, inputs, dtype=dtypes.float32)
+      cell.set_weights(tf_weights)
+      [tf_out, tf_state] = sess.run([tf_out, tf_state], {inputs: x_train})
+
+    self.assertAllClose(tf_out, k_out)
+    self.assertAllClose(tf_state, k_state)
+
   def testBasicLSTMCellInterchangeWithLSTMCell(self):
     with self.session(graph=ops_lib.Graph()) as sess:
       basic_cell = rnn_cell_impl.BasicLSTMCell(1)
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index c2751e529a..dd4f3d7a99 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -370,7 +370,7 @@ class LayerRNNCell(RNNCell):
                                      *args, **kwargs)
 
 
-@tf_export("nn.rnn_cell.BasicRNNCell")
+@tf_export(v1=["nn.rnn_cell.BasicRNNCell"])
 class BasicRNNCell(LayerRNNCell):
   """The most basic RNN cell.
 
@@ -393,6 +393,8 @@ class BasicRNNCell(LayerRNNCell):
       `trainable` etc when constructing the cell from configs of get_config().
   """
 
+  @deprecated(None, "This class is equivalent as tf.keras.layers.SimpleRNNCell,"
+                    " and will be replaced by that in Tensorflow 2.0.")
   def __init__(self,
                num_units,
                activation=None,
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
deleted file mode 100644
index a4483fefa2..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ /dev/null
@@ -1,202 +0,0 @@
-path: "tensorflow.nn.rnn_cell.BasicRNNCell"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.BasicRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "zero_state"
-    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
index 64697e8a02..24767e250f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
@@ -4,10 +4,6 @@ tf_module {
     name: "BasicLSTMCell"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "BasicRNNCell"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "DeviceWrapper"
     mtype: "<type \'type\'>"
-- 
GitLab


From 7cabc6be4e32dfb7f42c7f5e33549984bfdb68a3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 11:44:17 -0700
Subject: [PATCH 236/570] Allow zero number of inputs in XRT execute operation.

PiperOrigin-RevId: 215252408
---
 tensorflow/compiler/xrt/ops/xrt_execute_op.cc |  2 +-
 tensorflow/compiler/xrt/tests/raw_api_test.cc | 41 +++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xrt/ops/xrt_execute_op.cc b/tensorflow/compiler/xrt/ops/xrt_execute_op.cc
index fda4c31298..40ec1b0ba9 100644
--- a/tensorflow/compiler/xrt/ops/xrt_execute_op.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_execute_op.cc
@@ -21,7 +21,7 @@ limitations under the License.
 namespace tensorflow {
 
 REGISTER_OP("XRTExecute")
-    .Attr("Ninputs: int")
+    .Attr("Ninputs: int >= 0")
     .Input("computation_handle: int64")
     .Input("execution_config: string")
     .Input("input_handles: Ninputs * int64")
diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc
index 2952feb16a..f590fbf0d9 100644
--- a/tensorflow/compiler/xrt/tests/raw_api_test.cc
+++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc
@@ -108,6 +108,14 @@ bool CompareLiteralToLiteralProto(const xla::Literal& a,
   return equal;
 }
 
+xla::XlaComputation OnePlusTwo() {
+  xla::XlaBuilder builder("OnePlusTwo");
+  auto c0 = xla::ConstantR0(&builder, 1.0f);
+  auto c1 = xla::ConstantR0(&builder, 2.0f);
+  xla::Add(c0, c1);
+  return builder.Build().ValueOrDie();
+}
+
 xla::XlaComputation AddAndScale() {
   xla::XlaBuilder builder("AddAndScale");
   auto p0 = xla::Parameter(&builder, 0,
@@ -346,6 +354,39 @@ TEST(RawApiTest, CompileAndExecute) {
   EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
 }
 
+TEST(RawApiTest, CompileAndExecuteZeroArg) {
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  auto shapes = config->mutable_program_shape();
+  *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {});
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(true);
+  e.set_release_compilation_handle(true);
+  StoreComputationSnapshot(OnePlusTwo(), c.mutable_hlo_snapshot());
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto computation =
+      ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString());
+  auto c_handle = ops::XRTCompile(root, computation);
+  auto result = ops::XRTExecute(root, c_handle, e_config,
+                                std::initializer_list<Input>({}));
+  auto read_back = ops::XRTReadLiteralAndRelease(root, result);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back}, &outputs));
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+
+  auto expected = xla::LiteralUtil::CreateR0<float>(3.0f);
+  EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
+}
+
 TEST(RawApiTest, CompileAndExecuteReturnTuple) {
   xrt::XLAAllocation p0;
   p0.set_device_ordinal(0);
-- 
GitLab


From f1fd53748b99532b2572b8909efcd4f5c06ce28d Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 1 Oct 2018 11:53:27 -0700
Subject: [PATCH 237/570] Updating function and class tf_export decorators for
 endpoints according to https://github.com/tensorflow/community/pull/16. In
 addition to the changes in the doc, I made the following updates (these
 changes make sense to me and I didn't notice them when compiling the doc): *
 deprecate saved_model.builder.SavedModelBuilder - replaced with
 saved_model.SavedModelBuilder * deprecate python_io.tf_record_iterator -
 replaced with io.tf_record_iterator * deprecate python_io.TFRecordWriter -
 replaced with io.TFRecordWriter * move reduce_join to tf.string

PiperOrigin-RevId: 215253944
---
 tensorflow/python/framework/dtypes.py         |   4 +-
 tensorflow/python/framework/errors_impl.py    |   6 +-
 tensorflow/python/framework/graph_io.py       |   2 +-
 tensorflow/python/framework/importer.py       |   2 +-
 tensorflow/python/framework/random_seed.py    |   6 +-
 tensorflow/python/framework/sparse_tensor.py  |   2 +-
 tensorflow/python/lib/io/tf_record.py         |  13 +-
 tensorflow/python/ops/array_ops.py            |  44 ++--
 .../python/ops/candidate_sampling_ops.py      |   8 +-
 tensorflow/python/ops/check_ops.py            |  63 ++++--
 tensorflow/python/ops/clip_ops.py             |   8 +-
 tensorflow/python/ops/confusion_matrix.py     |   4 +-
 tensorflow/python/ops/control_flow_ops.py     |   2 +-
 tensorflow/python/ops/data_flow_ops.py        |  17 +-
 tensorflow/python/ops/init_ops.py             |   5 +
 tensorflow/python/ops/linalg_ops.py           |  15 +-
 tensorflow/python/ops/lookup_ops.py           |   2 +-
 tensorflow/python/ops/manip_ops.py            |   4 +-
 tensorflow/python/ops/math_ops.py             | 145 ++++++++------
 tensorflow/python/ops/nn_impl.py              |   6 +-
 tensorflow/python/ops/nn_ops.py               |   8 +-
 tensorflow/python/ops/numerics.py             |   4 +-
 tensorflow/python/ops/parsing_ops.py          |  18 +-
 tensorflow/python/ops/random_ops.py           |  19 +-
 tensorflow/python/ops/sparse_ops.py           | 107 ++++++----
 tensorflow/python/ops/special_math_ops.py     |   4 +-
 tensorflow/python/ops/string_ops.py           |   7 +-
 tensorflow/python/saved_model/builder_impl.py |   7 +-
 tensorflow/python/saved_model/loader_impl.py  |   8 +-
 tensorflow/python/saved_model/main_op_impl.py |   5 +-
 .../saved_model/signature_def_utils_impl.py   |  27 ++-
 tensorflow/python/saved_model/utils_impl.py   |  10 +-
 .../tools/api/generator/api_init_files.bzl    |   1 +
 .../tools/api/generator/api_init_files_v1.bzl |   1 +
 tensorflow/python/training/input.py           |   3 +-
 .../api/golden/v1/tensorflow.debugging.pbtxt  |  96 +++++++++
 .../golden/v1/tensorflow.dtypes.-d-type.pbtxt |  77 +++++++
 .../api/golden/v1/tensorflow.dtypes.pbtxt     |  20 ++
 .../api/golden/v1/tensorflow.graph_util.pbtxt |   4 +
 .../api/golden/v1/tensorflow.image.pbtxt      |   4 +
 .../golden/v1/tensorflow.initializers.pbtxt   |   4 +
 .../v1/tensorflow.io.-fixed-len-feature.pbtxt |  27 +++
 ...rflow.io.-fixed-len-sequence-feature.pbtxt |  31 +++
 ...tensorflow.io.-padding-f-i-f-o-queue.pbtxt |  66 ++++++
 .../v1/tensorflow.io.-priority-queue.pbtxt    |  66 ++++++
 .../golden/v1/tensorflow.io.-queue-base.pbtxt |  65 ++++++
 .../tensorflow.io.-random-shuffle-queue.pbtxt |  66 ++++++
 .../v1/tensorflow.io.-sparse-feature.pbtxt    |  35 ++++
 ...flow.io.-t-f-record-compression-type.pbtxt |  20 ++
 .../tensorflow.io.-t-f-record-options.pbtxt   |  17 ++
 .../v1/tensorflow.io.-t-f-record-writer.pbtxt |  21 ++
 .../v1/tensorflow.io.-var-len-feature.pbtxt   |  19 ++
 .../tools/api/golden/v1/tensorflow.io.pbtxt   |  84 ++++++++
 .../api/golden/v1/tensorflow.linalg.pbtxt     |  12 ++
 .../tools/api/golden/v1/tensorflow.math.pbtxt | 188 ++++++++++++++++++
 .../tools/api/golden/v1/tensorflow.nn.pbtxt   |  12 ++
 .../tools/api/golden/v1/tensorflow.pbtxt      |   8 +
 .../golden/v1/tensorflow.quantization.pbtxt   |   4 +
 .../api/golden/v1/tensorflow.random.pbtxt     |  47 +++++
 .../v1/tensorflow.saved_model.-builder.pbtxt  |  21 ++
 .../golden/v1/tensorflow.saved_model.pbtxt    |  44 ++++
 ...arse.-sparse-conditional-accumulator.pbtxt |  46 +++++
 .../v1/tensorflow.sparse.-sparse-tensor.pbtxt |  54 +++++
 .../api/golden/v1/tensorflow.sparse.pbtxt     | 112 +++++++++++
 .../api/golden/v1/tensorflow.strings.pbtxt    |   4 +
 .../api/golden/v1/tensorflow.train.pbtxt      |   4 +
 .../api/golden/v2/tensorflow.debugging.pbtxt  |  96 +++++++++
 .../golden/v2/tensorflow.dtypes.-d-type.pbtxt |  77 +++++++
 .../api/golden/v2/tensorflow.dtypes.pbtxt     |  20 ++
 .../api/golden/v2/tensorflow.graph_util.pbtxt |   4 +
 .../api/golden/v2/tensorflow.image.pbtxt      |   4 +
 .../golden/v2/tensorflow.initializers.pbtxt   |   4 +
 .../v2/tensorflow.io.-fixed-len-feature.pbtxt |  27 +++
 ...rflow.io.-fixed-len-sequence-feature.pbtxt |  31 +++
 ...tensorflow.io.-padding-f-i-f-o-queue.pbtxt |  66 ++++++
 .../v2/tensorflow.io.-priority-queue.pbtxt    |  66 ++++++
 .../golden/v2/tensorflow.io.-queue-base.pbtxt |  65 ++++++
 .../tensorflow.io.-random-shuffle-queue.pbtxt |  66 ++++++
 .../v2/tensorflow.io.-sparse-feature.pbtxt    |  35 ++++
 ...flow.io.-t-f-record-compression-type.pbtxt |  20 ++
 .../tensorflow.io.-t-f-record-options.pbtxt   |  17 ++
 .../v2/tensorflow.io.-t-f-record-writer.pbtxt |  21 ++
 .../v2/tensorflow.io.-var-len-feature.pbtxt   |  19 ++
 .../tools/api/golden/v2/tensorflow.io.pbtxt   |  84 ++++++++
 .../api/golden/v2/tensorflow.linalg.pbtxt     |  12 ++
 .../tools/api/golden/v2/tensorflow.math.pbtxt | 188 ++++++++++++++++++
 .../tools/api/golden/v2/tensorflow.nn.pbtxt   |  12 ++
 .../tools/api/golden/v2/tensorflow.pbtxt      |   8 +
 .../golden/v2/tensorflow.quantization.pbtxt   |   4 +
 .../api/golden/v2/tensorflow.random.pbtxt     |  47 +++++
 .../v2/tensorflow.saved_model.-builder.pbtxt  |  21 ++
 .../golden/v2/tensorflow.saved_model.pbtxt    |  44 ++++
 ...arse.-sparse-conditional-accumulator.pbtxt |  46 +++++
 .../v2/tensorflow.sparse.-sparse-tensor.pbtxt |  54 +++++
 .../api/golden/v2/tensorflow.sparse.pbtxt     | 112 +++++++++++
 .../api/golden/v2/tensorflow.strings.pbtxt    |   4 +
 .../api/golden/v2/tensorflow.train.pbtxt      |   4 +
 97 files changed, 2926 insertions(+), 217 deletions(-)
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.dtypes.-d-type.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.io.-fixed-len-feature.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.io.-fixed-len-sequence-feature.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.io.-padding-f-i-f-o-queue.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.io.-priority-queue.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.io.-queue-base.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.io.-random-shuffle-queue.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.io.-sparse-feature.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.io.-t-f-record-compression-type.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.io.-t-f-record-options.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.io.-t-f-record-writer.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.io.-var-len-feature.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-conditional-accumulator.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.dtypes.-d-type.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.io.-fixed-len-feature.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.io.-fixed-len-sequence-feature.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.io.-padding-f-i-f-o-queue.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.io.-priority-queue.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.io.-queue-base.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.io.-random-shuffle-queue.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.io.-sparse-feature.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.io.-t-f-record-compression-type.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.io.-t-f-record-options.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.io.-t-f-record-writer.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.io.-var-len-feature.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.saved_model.-builder.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-conditional-accumulator.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt

diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index c3f70df7d8..64d3b42d89 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -26,7 +26,7 @@ from tensorflow.python.util.tf_export import tf_export
 _np_bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
 
 
-@tf_export("DType")
+@tf_export("dtypes.DType", "DType")
 class DType(object):
   """Represents the type of the elements in a `Tensor`.
 
@@ -658,7 +658,7 @@ _PYTHON_TO_TF = {
 }
 
 
-@tf_export("as_dtype")
+@tf_export("dtypes.as_dtype", "as_dtype")
 def as_dtype(type_value):
   """Converts the given `type_value` to a `DType`.
 
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index 5af71f2cfb..8b303fa8a9 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -25,11 +25,13 @@ from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("OpError", "errors.OpError")
+@tf_export("errors.OpError", "OpError")
+@deprecation.deprecated_endpoints("OpError")
 class OpError(Exception):
   """A generic error that is raised when TensorFlow execution fails.
 
@@ -72,7 +74,7 @@ class OpError(Exception):
     or `Recv` op, there will be no corresponding
     `tf.Operation`
     object.  In that case, this will return `None`, and you should
-    instead use the `tf.OpError.node_def` to
+    instead use the `tf.errors.OpError.node_def` to
     discover information about the op.
 
     Returns:
diff --git a/tensorflow/python/framework/graph_io.py b/tensorflow/python/framework/graph_io.py
index be30b16f5f..47e1344eae 100644
--- a/tensorflow/python/framework/graph_io.py
+++ b/tensorflow/python/framework/graph_io.py
@@ -27,7 +27,7 @@ from tensorflow.python.lib.io import file_io
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('train.write_graph')
+@tf_export('io.write_graph', 'train.write_graph')
 def write_graph(graph_or_graph_def, logdir, name, as_text=True):
   """Writes a graph proto to a file.
 
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index e48e67c8a1..c6595918ae 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -329,7 +329,7 @@ def _SetDefaultAttrValues(node_def, op_def):
         node_def.attr[key].CopyFrom(attr_def.default_value)
 
 
-@tf_export('import_graph_def')
+@tf_export('graph_util.import_graph_def', 'import_graph_def')
 @deprecated_args(None, 'Please file an issue at '
                  'https://github.com/tensorflow/tensorflow/issues if you depend'
                  ' on this feature.', 'op_dict')
diff --git a/tensorflow/python/framework/random_seed.py b/tensorflow/python/framework/random_seed.py
index 2f9504889a..6f9f347a99 100644
--- a/tensorflow/python/framework/random_seed.py
+++ b/tensorflow/python/framework/random_seed.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -33,7 +34,8 @@ def _truncate_seed(seed):
   return seed % _MAXINT32  # Truncate to fit into 32-bit integer
 
 
-@tf_export('get_seed')
+@tf_export('random.get_seed', 'get_seed')
+@deprecation.deprecated_endpoints('get_seed')
 def get_seed(op_seed):
   """Returns the local seeds an operation should use given an op-specific seed.
 
@@ -80,7 +82,7 @@ def get_seed(op_seed):
   return seeds
 
 
-@tf_export('set_random_seed')
+@tf_export('random.set_random_seed', 'set_random_seed')
 def set_random_seed(seed):
   """Sets the graph-level random seed.
 
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index d1bdd9b80a..41ef2e11d1 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -33,7 +33,7 @@ _override_helper = ops._override_helper
 # pylint: enable=protected-access
 
 
-@tf_export("SparseTensor")
+@tf_export("sparse.SparseTensor", "SparseTensor")
 class SparseTensor(_TensorLike):
   """Represents a sparse tensor.
 
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index cce71a2bab..9ab683d96a 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -22,10 +22,12 @@ from __future__ import print_function
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("python_io.TFRecordCompressionType")
+@tf_export("io.TFRecordCompressionType", "python_io.TFRecordCompressionType")
+@deprecation.deprecated_endpoints("python_io.TFRecordCompressionType")
 class TFRecordCompressionType(object):
   """The type of compression for the record."""
   NONE = 0
@@ -33,7 +35,8 @@ class TFRecordCompressionType(object):
   GZIP = 2
 
 
-@tf_export("python_io.TFRecordOptions")
+@tf_export("io.TFRecordOptions", "python_io.TFRecordOptions")
+@deprecation.deprecated_endpoints("python_io.TFRecordOptions")
 class TFRecordOptions(object):
   """Options used for manipulating TFRecord files."""
   compression_type_map = {
@@ -143,7 +146,8 @@ class TFRecordOptions(object):
     return options
 
 
-@tf_export("python_io.tf_record_iterator")
+@tf_export("io.tf_record_iterator", "python_io.tf_record_iterator")
+@deprecation.deprecated_endpoints("python_io.tf_record_iterator")
 def tf_record_iterator(path, options=None):
   """An iterator that read the records from a TFRecords file.
 
@@ -175,7 +179,8 @@ def tf_record_iterator(path, options=None):
     reader.Close()
 
 
-@tf_export("python_io.TFRecordWriter")
+@tf_export("io.TFRecordWriter", "python_io.TFRecordWriter")
+@deprecation.deprecated_endpoints("python_io.TFRecordWriter")
 class TFRecordWriter(object):
   """A class to write records to a TFRecords file.
 
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index a7f57e94e3..9f5149d5ac 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1204,7 +1204,8 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
     return _apply_mask_1d(tensor, mask, axis)
 
 
-@tf_export("sparse_mask")
+@tf_export("sparse.mask", "sparse_mask")
+@deprecation.deprecated_endpoints("sparse_mask")
 def sparse_mask(a, mask_indices, name=None):
   """Masks elements of `IndexedSlices`.
 
@@ -1226,7 +1227,7 @@ def sparse_mask(a, mask_indices, name=None):
   # `b` will be the subset of `a` slices at its second and third indices, so
   # we want to mask its first and last indices (which are at absolute
   # indices 12, 45)
-  b = tf.sparse_mask(a, [12, 45])
+  b = tf.sparse.mask(a, [12, 45])
 
   b.indices  # [26, 37]
   tf.shape(b.values)  # [2, 10]
@@ -1382,7 +1383,7 @@ def transpose(a, perm=None, name="transpose", conjugate=False):
                     [10, 11, 12]]])
 
   # Take the transpose of the matrices in dimension-0
-  # (this common operation has a shorthand `matrix_transpose`)
+  # (this common operation has a shorthand `linalg.transpose`)
   tf.transpose(x, perm=[0, 2, 1])  # [[[1,  4],
                                    #   [2,  5],
                                    #   [3,  6]],
@@ -1421,7 +1422,8 @@ def transpose(a, perm=None, name="transpose", conjugate=False):
 
 
 # pylint: disable=invalid-name
-@tf_export("matrix_transpose", "linalg.transpose")
+@tf_export("linalg.transpose", "matrix_transpose")
+@deprecation.deprecated_endpoints("matrix_transpose")
 def matrix_transpose(a, name="matrix_transpose", conjugate=False):
   """Transposes last two dimensions of tensor `a`.
 
@@ -1429,19 +1431,19 @@ def matrix_transpose(a, name="matrix_transpose", conjugate=False):
 
   ```python
   x = tf.constant([[1, 2, 3], [4, 5, 6]])
-  tf.matrix_transpose(x)  # [[1, 4],
+  tf.linalg.transpose(x)  # [[1, 4],
                           #  [2, 5],
                           #  [3, 6]]
 
   x = tf.constant([[1 + 1j, 2 + 2j, 3 + 3j],
                    [4 + 4j, 5 + 5j, 6 + 6j]])
-  tf.matrix_transpose(x, conjugate=True)  # [[1 - 1j, 4 - 4j],
+  tf.linalg.transpose(x, conjugate=True)  # [[1 - 1j, 4 - 4j],
                                           #  [2 - 2j, 5 - 5j],
                                           #  [3 - 3j, 6 - 6j]]
 
   # Matrix with two batch dimensions.
   # x.shape is [1, 2, 3, 4]
-  # tf.matrix_transpose(x) is shape [1, 2, 4, 3]
+  # tf.linalg.transpose(x) is shape [1, 2, 4, 3]
   ```
 
   Note that `tf.matmul` provides kwargs allowing for transpose of arguments.
@@ -1452,14 +1454,14 @@ def matrix_transpose(a, name="matrix_transpose", conjugate=False):
   tf.matmul(matrix, b, transpose_b=True)
 
   # Inefficient!
-  tf.matmul(matrix, tf.matrix_transpose(b))
+  tf.matmul(matrix, tf.linalg.transpose(b))
   ```
 
   @compatibility(numpy)
   In `numpy` transposes are memory-efficient constant time operations as they
   simply return a new view of the same data with adjusted `strides`.
 
-  TensorFlow does not support strides, `matrix_transposes` return a new tensor
+  TensorFlow does not support strides, `linalg.transposes` return a new tensor
   with the items permuted.
   @end_compatibility
 
@@ -1467,7 +1469,7 @@ def matrix_transpose(a, name="matrix_transpose", conjugate=False):
     a: A `Tensor` with `rank >= 2`.
     name: A name for the operation (optional).
     conjugate: Optional bool. Setting it to `True` is mathematically equivalent
-      to tf.conj(tf.matrix_transpose(input)).
+      to tf.conj(tf.linalg.transpose(input)).
 
   Returns:
     A transposed batch matrix `Tensor`.
@@ -1756,7 +1758,8 @@ def _normalize_sparse_shape(shape, name):
   return (ops.convert_to_tensor(shape, dtype=dtypes.int64, name=name), rank)
 
 
-@tf_export("sparse_placeholder")
+@tf_export("sparse.placeholder", "sparse_placeholder")
+@deprecation.deprecated_endpoints("sparse_placeholder")
 def sparse_placeholder(dtype, shape=None, name=None):
   """Inserts a placeholder for a sparse tensor that will be always fed.
 
@@ -1767,8 +1770,8 @@ def sparse_placeholder(dtype, shape=None, name=None):
   For example:
 
   ```python
-  x = tf.sparse_placeholder(tf.float32)
-  y = tf.sparse_reduce_sum(x)
+  x = tf.sparse.placeholder(tf.float32)
+  y = tf.sparse.reduce_sum(x)
 
   with tf.Session() as sess:
     print(sess.run(y))  # ERROR: will fail because x was not fed.
@@ -2250,7 +2253,8 @@ def required_space_to_batch_paddings(input_shape,
     return result_paddings, result_crops
 
 
-@tf_export("space_to_batch")
+@tf_export("nn.space_to_batch", "space_to_batch")
+@deprecation.deprecated_endpoints("space_to_batch")
 def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=redefined-builtin
   result = space_to_batch_nd(
       input,
@@ -2264,7 +2268,8 @@ def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=r
 space_to_batch.__doc__ = gen_array_ops.space_to_batch.__doc__
 
 
-@tf_export("space_to_depth")
+@tf_export("nn.space_to_depth", "space_to_depth")
+@deprecation.deprecated_endpoints("space_to_depth")
 def space_to_depth(input, block_size, name=None, data_format="NHWC"):  # pylint: disable=redefined-builtin
   return gen_array_ops.space_to_depth(input, block_size, data_format, name=name)
 
@@ -2272,7 +2277,8 @@ def space_to_depth(input, block_size, name=None, data_format="NHWC"):  # pylint:
 space_to_depth.__doc__ = gen_array_ops.space_to_depth.__doc__
 
 
-@tf_export("depth_to_space")
+@tf_export("nn.depth_to_space", "depth_to_space")
+@deprecation.deprecated_endpoints("depth_to_space")
 def depth_to_space(input, block_size, name=None, data_format="NHWC"):  # pylint: disable=redefined-builtin
   return gen_array_ops.depth_to_space(input, block_size, data_format, name=name)
 
@@ -2747,7 +2753,8 @@ def batch_gather(params, indices, name=None):
 @tf_export("quantize_v2")
 @deprecation.deprecated(
     "2017-10-25",
-    "`tf.quantize_v2` is deprecated, please use `tf.quantize` instead.")
+    "`tf.quantize_v2` is deprecated, please use `tf.quantization.quantize` "
+    "instead.")  # pylint: disable=missing-docstring
 def quantize_v2(input,  # pylint: disable=redefined-builtin
                 min_range,
                 max_range,
@@ -2769,7 +2776,8 @@ quantize_v2.__doc__ = """Please use `tf.quantize` instead."""
 
 # We want to expose tf.quantize instead of tf.quantize_v2; we can deprecate
 # tf.quantize_v2 in next version of TensorFlow.
-@tf_export("quantize")
+@tf_export("quantization.quantize", "quantize")
+@deprecation.deprecated_endpoints("quantize")
 def quantize(input,  # pylint: disable=redefined-builtin
              min_range,
              max_range,
diff --git a/tensorflow/python/ops/candidate_sampling_ops.py b/tensorflow/python/ops/candidate_sampling_ops.py
index 9ea1ea9c92..98dde995c9 100644
--- a/tensorflow/python/ops/candidate_sampling_ops.py
+++ b/tensorflow/python/ops/candidate_sampling_ops.py
@@ -23,10 +23,12 @@ from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import gen_candidate_sampling_ops
 from tensorflow.python.ops import math_ops  # pylint: disable=unused-import
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('nn.uniform_candidate_sampler')
+@tf_export('random.uniform_candidate_sampler', 'nn.uniform_candidate_sampler')
+@deprecation.deprecated_endpoints('nn.uniform_candidate_sampler')
 def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
                               range_max, seed=None, name=None):
   """Samples a set of classes using a uniform base distribution.
@@ -82,7 +84,9 @@ def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
       seed2=seed2, name=name)
 
 
-@tf_export('nn.log_uniform_candidate_sampler')
+@tf_export('random.log_uniform_candidate_sampler',
+           'nn.log_uniform_candidate_sampler')
+@deprecation.deprecated_endpoints('nn.log_uniform_candidate_sampler')
 def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
                                   range_max, seed=None, name=None):
   """Samples a set of classes using a log-uniform (Zipfian) base distribution.
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index c3cf6e61f2..d607f1d9fb 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -36,6 +36,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 NUMERIC_TYPES = frozenset(
@@ -91,7 +92,8 @@ def _shape_and_dtype_str(tensor):
   return 'shape=%s dtype=%s' % (tensor.shape, tensor.dtype.name)
 
 
-@tf_export('assert_proper_iterable')
+@tf_export('debugging.assert_proper_iterable', 'assert_proper_iterable')
+@deprecation.deprecated_endpoints('assert_proper_iterable')
 def assert_proper_iterable(values):
   """Static assert that values is a "proper" iterable.
 
@@ -119,7 +121,8 @@ def assert_proper_iterable(values):
         'Expected argument "values" to be iterable.  Found: %s' % type(values))
 
 
-@tf_export('assert_negative')
+@tf_export('debugging.assert_negative', 'assert_negative')
+@deprecation.deprecated_endpoints('assert_negative')
 def assert_negative(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x < 0` holds element-wise.
 
@@ -160,7 +163,8 @@ def assert_negative(x, data=None, summarize=None, message=None, name=None):
     return assert_less(x, zero, data=data, summarize=summarize)
 
 
-@tf_export('assert_positive')
+@tf_export('debugging.assert_positive', 'assert_positive')
+@deprecation.deprecated_endpoints('assert_positive')
 def assert_positive(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x > 0` holds element-wise.
 
@@ -200,7 +204,8 @@ def assert_positive(x, data=None, summarize=None, message=None, name=None):
     return assert_less(zero, x, data=data, summarize=summarize)
 
 
-@tf_export('assert_non_negative')
+@tf_export('debugging.assert_non_negative', 'assert_non_negative')
+@deprecation.deprecated_endpoints('assert_non_negative')
 def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x >= 0` holds element-wise.
 
@@ -242,7 +247,8 @@ def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
     return assert_less_equal(zero, x, data=data, summarize=summarize)
 
 
-@tf_export('assert_non_positive')
+@tf_export('debugging.assert_non_positive', 'assert_non_positive')
+@deprecation.deprecated_endpoints('assert_non_positive')
 def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x <= 0` holds element-wise.
 
@@ -284,7 +290,7 @@ def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
     return assert_less_equal(x, zero, data=data, summarize=summarize)
 
 
-@tf_export('assert_equal')
+@tf_export('debugging.assert_equal', 'assert_equal')
 def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x == y` holds element-wise.
 
@@ -384,7 +390,8 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('assert_none_equal')
+@tf_export('debugging.assert_none_equal', 'assert_none_equal')
+@deprecation.deprecated_endpoints('assert_none_equal')
 def assert_none_equal(
     x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x != y` holds for all elements.
@@ -435,7 +442,8 @@ def assert_none_equal(
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('assert_near')
+@tf_export('debugging.assert_near', 'assert_near')
+@deprecation.deprecated_endpoints('assert_near')
 def assert_near(
     x, y, rtol=None, atol=None, data=None, summarize=None, message=None,
     name=None):
@@ -513,7 +521,7 @@ def assert_near(
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('assert_less')
+@tf_export('debugging.assert_less', 'assert_less')
 def assert_less(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x < y` holds element-wise.
 
@@ -561,7 +569,8 @@ def assert_less(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('assert_less_equal')
+@tf_export('debugging.assert_less_equal', 'assert_less_equal')
+@deprecation.deprecated_endpoints('assert_less_equal')
 def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x <= y` holds element-wise.
 
@@ -609,7 +618,7 @@ def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('assert_greater')
+@tf_export('debugging.assert_greater', 'assert_greater')
 def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x > y` holds element-wise.
 
@@ -657,7 +666,8 @@ def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('assert_greater_equal')
+@tf_export('debugging.assert_greater_equal', 'assert_greater_equal')
+@deprecation.deprecated_endpoints('assert_greater_equal')
 def assert_greater_equal(x, y, data=None, summarize=None, message=None,
                          name=None):
   """Assert the condition `x >= y` holds element-wise.
@@ -755,7 +765,7 @@ def _assert_rank_condition(
   return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('assert_rank')
+@tf_export('debugging.assert_rank', 'assert_rank')
 def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
   """Assert `x` has rank equal to `rank`.
 
@@ -817,7 +827,8 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
   return assert_op
 
 
-@tf_export('assert_rank_at_least')
+@tf_export('debugging.assert_rank_at_least', 'assert_rank_at_least')
+@deprecation.deprecated_endpoints('assert_rank_at_least')
 def assert_rank_at_least(
     x, rank, data=None, summarize=None, message=None, name=None):
   """Assert `x` has rank equal to `rank` or higher.
@@ -948,7 +959,8 @@ def _assert_ranks_condition(
   return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('assert_rank_in')
+@tf_export('debugging.assert_rank_in', 'assert_rank_in')
+@deprecation.deprecated_endpoints('assert_rank_in')
 def assert_rank_in(
     x, ranks, data=None, summarize=None, message=None, name=None):
   """Assert `x` has rank in `ranks`.
@@ -1010,7 +1022,8 @@ def assert_rank_in(
   return assert_op
 
 
-@tf_export('assert_integer')
+@tf_export('debugging.assert_integer', 'assert_integer')
+@deprecation.deprecated_endpoints('assert_integer')
 def assert_integer(x, message=None, name=None):
   """Assert that `x` is of integer dtype.
 
@@ -1048,7 +1061,8 @@ def assert_integer(x, message=None, name=None):
     return control_flow_ops.no_op('statically_determined_was_integer')
 
 
-@tf_export('assert_type')
+@tf_export('debugging.assert_type', 'assert_type')
+@deprecation.deprecated_endpoints('assert_type')
 def assert_type(tensor, tf_type, message=None, name=None):
   """Statically asserts that the given `Tensor` is of the specified type.
 
@@ -1095,12 +1109,14 @@ def _get_diff_for_monotonic_comparison(x):
   return control_flow_ops.cond(is_shorter_than_two, short_result, diff)
 
 
-@tf_export('is_numeric_tensor')
+@tf_export('debugging.is_numeric_tensor', 'is_numeric_tensor')
+@deprecation.deprecated_endpoints('is_numeric_tensor')
 def is_numeric_tensor(tensor):
   return isinstance(tensor, ops.Tensor) and tensor.dtype in NUMERIC_TYPES
 
 
-@tf_export('is_non_decreasing')
+@tf_export('debugging.is_non_decreasing', 'is_non_decreasing')
+@deprecation.deprecated_endpoints('is_non_decreasing')
 def is_non_decreasing(x, name=None):
   """Returns `True` if `x` is non-decreasing.
 
@@ -1127,7 +1143,8 @@ def is_non_decreasing(x, name=None):
     return math_ops.reduce_all(math_ops.less_equal(zero, diff))
 
 
-@tf_export('is_strictly_increasing')
+@tf_export('debugging.is_strictly_increasing', 'is_strictly_increasing')
+@deprecation.deprecated_endpoints('is_strictly_increasing')
 def is_strictly_increasing(x, name=None):
   """Returns `True` if `x` is strictly increasing.
 
@@ -1202,7 +1219,8 @@ def _assert_same_base_type(items, expected_type=None):
     return expected_type
 
 
-@tf_export('assert_same_float_dtype')
+@tf_export('debugging.assert_same_float_dtype', 'assert_same_float_dtype')
+@deprecation.deprecated_endpoints('assert_same_float_dtype')
 def assert_same_float_dtype(tensors=None, dtype=None):
   """Validate and return float type based on `tensors` and `dtype`.
 
@@ -1231,7 +1249,8 @@ def assert_same_float_dtype(tensors=None, dtype=None):
   return dtype
 
 
-@tf_export('assert_scalar')
+@tf_export('debugging.assert_scalar', 'assert_scalar')
+@deprecation.deprecated_endpoints('assert_scalar')
 def assert_scalar(tensor, name=None):
   with ops.name_scope(name, 'assert_scalar', [tensor]) as name_scope:
     tensor = ops.convert_to_tensor(tensor, name=name_scope)
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index 29468431b3..45516068f4 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import numerics
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -76,8 +77,8 @@ def clip_by_value(t, clip_value_min, clip_value_max,
 
   return t_max
   # TODO(scottzhu): switch to use new implmentation in 2 weeks.
-    # return gen_math_ops.clip_by_value(
-    #     t, clip_value_min, clip_value_max, name=name)
+  # return gen_math_ops.clip_by_value(
+  #     t, clip_value_min, clip_value_max, name=name)
 
 
 # TODO(scottzhu): switch to use new implmentation in 2 weeks.
@@ -159,7 +160,8 @@ def clip_by_norm(t, clip_norm, axes=None, name=None):
   return tclip
 
 
-@tf_export("global_norm")
+@tf_export("linalg.global_norm", "global_norm")
+@deprecation.deprecated_endpoints("global_norm")
 def global_norm(t_list, name=None):
   """Computes the global norm of multiple tensors.
 
diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py
index c09154129f..8259142456 100644
--- a/tensorflow/python/ops/confusion_matrix.py
+++ b/tensorflow/python/ops/confusion_matrix.py
@@ -26,6 +26,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -89,7 +90,8 @@ def remove_squeezable_dimensions(
     return labels, predictions
 
 
-@tf_export('confusion_matrix')
+@tf_export('train.confusion_matrix', 'confusion_matrix')
+@deprecation.deprecated_endpoints('confusion_matrix')
 def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
                      name=None, weights=None):
   """Computes the confusion matrix from predictions and labels.
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 9d7d31df22..8ad71fe00c 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -106,7 +106,7 @@ def _summarize_eager(tensor, summarize=None):
 
 # Assert and Print are special symbols in python, so we must
 # use an upper-case version of them.
-@tf_export("Assert")
+@tf_export("debugging.Assert", "Assert")
 @tf_should_use.should_use_result
 def Assert(condition, data, summarize=None, name=None):
   """Asserts that the given condition is true.
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 69c0fcbbee..97b6f3bd9c 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -39,6 +39,7 @@ from tensorflow.python.ops import resource_variable_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_data_flow_ops import *
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 # pylint: enable=wildcard-import
@@ -112,7 +113,8 @@ def _shape_common(s1, s2):
 
 
 # pylint: disable=protected-access
-@tf_export("QueueBase")
+@tf_export("io.QueueBase", "QueueBase")
+@deprecation.deprecated_endpoints("QueueBase")
 class QueueBase(object):
   """Base class for queue implementations.
 
@@ -604,7 +606,8 @@ def _shared_name(shared_name):
   return shared_name
 
 
-@tf_export("RandomShuffleQueue")
+@tf_export("io.RandomShuffleQueue", "RandomShuffleQueue")
+@deprecation.deprecated_endpoints("RandomShuffleQueue")
 class RandomShuffleQueue(QueueBase):
   """A queue implementation that dequeues elements in a random order.
 
@@ -746,7 +749,8 @@ class FIFOQueue(QueueBase):
     super(FIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
 
-@tf_export("PaddingFIFOQueue")
+@tf_export("io.PaddingFIFOQueue", "PaddingFIFOQueue")
+@deprecation.deprecated_endpoints("PaddingFIFOQueue")
 class PaddingFIFOQueue(QueueBase):
   """A FIFOQueue that supports batching variable-sized tensors by padding.
 
@@ -820,7 +824,8 @@ class PaddingFIFOQueue(QueueBase):
     super(PaddingFIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
 
-@tf_export("PriorityQueue")
+@tf_export("io.PriorityQueue", "PriorityQueue")
+@deprecation.deprecated_endpoints("PriorityQueue")
 class PriorityQueue(QueueBase):
   """A queue implementation that dequeues elements in prioritized order.
 
@@ -1300,7 +1305,9 @@ class ConditionalAccumulator(ConditionalAccumulatorBase):
     return out
 
 
-@tf_export("SparseConditionalAccumulator")
+@tf_export("sparse.SparseConditionalAccumulator",
+           "SparseConditionalAccumulator")
+@deprecation.deprecated_endpoints("SparseConditionalAccumulator")
 class SparseConditionalAccumulator(ConditionalAccumulatorBase):
   """A conditional accumulator for aggregating sparse gradients.
 
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index fff3d9b930..65bb77b474 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -43,6 +43,7 @@ from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.deprecation import  deprecated_arg_values
 from tensorflow.python.util.tf_export import tf_export
@@ -341,6 +342,7 @@ class TruncatedNormal(Initializer):
 
 @tf_export("initializers.uniform_unit_scaling",
            "uniform_unit_scaling_initializer")
+@deprecation.deprecated_endpoints("uniform_unit_scaling_initializer")
 class UniformUnitScaling(Initializer):
   """Initializer that generates tensors without scaling variance.
 
@@ -401,6 +403,7 @@ class UniformUnitScaling(Initializer):
 
 @tf_export("keras.initializers.VarianceScaling",
            "initializers.variance_scaling", "variance_scaling_initializer")
+@deprecation.deprecated_endpoints("variance_scaling_initializer")
 class VarianceScaling(Initializer):
   """Initializer capable of adapting its scale to the shape of weights tensors.
 
@@ -494,6 +497,7 @@ class VarianceScaling(Initializer):
 
 @tf_export("keras.initializers.Orthogonal", "initializers.orthogonal",
            "orthogonal_initializer", "keras.initializers.orthogonal")
+@deprecation.deprecated_endpoints("orthogonal_initializer")
 class Orthogonal(Initializer):
   """Initializer that generates an orthogonal matrix.
 
@@ -1149,6 +1153,7 @@ class GlorotUniform(VarianceScaling):
 
 @tf_export("glorot_normal_initializer", "keras.initializers.glorot_normal",
            "initializers.glorot_normal")
+@deprecation.deprecated_endpoints("glorot_normal_initializer")
 class GlorotNormal(VarianceScaling):
   """The Glorot normal initializer, also called Xavier normal initializer.
 
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index f4a93560be..bf4354fa73 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -80,6 +80,7 @@ def _RegularizedGramianCholesky(matrix, l2_regularizer, first_kind):
 
 
 @tf_export('cholesky_solve', 'linalg.cholesky_solve')
+@deprecation.deprecated_endpoints('cholesky_solve')
 def cholesky_solve(chol, rhs, name=None):
   """Solves systems of linear eqns `A X = RHS`, given Cholesky factorizations.
 
@@ -167,7 +168,8 @@ def eye(num_rows,
                              name=name)
 
 
-@tf_export('matrix_solve_ls', 'linalg.lstsq')
+@tf_export('linalg.lstsq', 'matrix_solve_ls')
+@deprecation.deprecated_endpoints('matrix_solve_ls')
 def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None):
   r"""Solves one or more linear least-squares problems.
 
@@ -220,7 +222,7 @@ def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None):
       squares sense.
 
   Raises:
-    NotImplementedError: matrix_solve_ls is currently disabled for complex128
+    NotImplementedError: linalg.lstsq is currently disabled for complex128
     and l2_regularizer != 0 due to poor accuracy.
   """
 
@@ -303,7 +305,8 @@ def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None):
         matrix, rhs, l2_regularizer, fast=fast, name=name)
 
 
-@tf_export('self_adjoint_eig', 'linalg.eigh')
+@tf_export('linalg.eigh', 'self_adjoint_eig')
+@deprecation.deprecated_endpoints('self_adjoint_eig')
 def self_adjoint_eig(tensor, name=None):
   """Computes the eigen decomposition of a batch of self-adjoint matrices.
 
@@ -325,12 +328,13 @@ def self_adjoint_eig(tensor, name=None):
   return e, v
 
 
-@tf_export('self_adjoint_eigvals', 'linalg.eigvalsh')
+@tf_export('linalg.eigvalsh', 'self_adjoint_eigvals')
+@deprecation.deprecated_endpoints('self_adjoint_eigvals')
 def self_adjoint_eigvals(tensor, name=None):
   """Computes the eigenvalues of one or more self-adjoint matrices.
 
   Note: If your program backpropagates through this function, you should replace
-  it with a call to tf.self_adjoint_eig (possibly ignoring the second output) to
+  it with a call to tf.linalg.eigvalsh (possibly ignoring the second output) to
   avoid computing the eigen decomposition twice. This is because the
   eigenvectors are used to compute the gradient w.r.t. the eigenvalues. See
   _SelfAdjointEigV2Grad in linalg_grad.py.
@@ -348,6 +352,7 @@ def self_adjoint_eigvals(tensor, name=None):
 
 
 @tf_export('svd', 'linalg.svd')
+@deprecation.deprecated_endpoints('svd')
 def svd(tensor, full_matrices=False, compute_uv=True, name=None):
   r"""Computes the singular value decompositions of one or more matrices.
 
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 5443699ddd..cffaa983d4 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -59,7 +59,7 @@ def initialize_all_tables(name="init_all_tables"):
   return tables_initializer(name)
 
 
-@tf_export("tables_initializer")
+@tf_export("initializers.tables_initializer", "tables_initializer")
 def tables_initializer(name="init_all_tables"):
   """Returns an Op that initializes all tables of the default graph.
 
diff --git a/tensorflow/python/ops/manip_ops.py b/tensorflow/python/ops/manip_ops.py
index 6633565a64..d9d0728287 100644
--- a/tensorflow/python/ops/manip_ops.py
+++ b/tensorflow/python/ops/manip_ops.py
@@ -19,11 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.ops import gen_manip_ops as _gen_manip_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access
-@tf_export('manip.roll')
+@tf_export('roll', 'manip.roll')
+@deprecation.deprecated_endpoints('manip.roll')
 def roll(input, shift, axis):  # pylint: disable=redefined-builtin
   return _gen_manip_ops.roll(input, shift, axis)
 
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index f57abf6704..83b8b5a3a4 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -70,7 +70,7 @@ def _set_doc(doc):
 
 
 # pylint: disable=redefined-builtin
-@tf_export("argmax")
+@tf_export("math.argmax", "argmax")
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "dimension")
 @_set_doc(
@@ -88,7 +88,7 @@ def argmax(input,
   return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
 
 
-@tf_export("argmin")
+@tf_export("math.argmin", "argmin")
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "dimension")
 @_set_doc(
@@ -111,7 +111,7 @@ def argmin(input,
 
 # pylint: disable=anomalous-backslash-in-string,protected-access
 # pylint: disable=g-docstring-has-escape
-@tf_export("abs")
+@tf_export("math.abs", "abs")
 def abs(x, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the absolute value of a tensor.
 
@@ -186,7 +186,7 @@ class DivideDelegateWithName(object):
     return _div_python2(self.x, y, self.name)
 
 
-@tf_export("divide")
+@tf_export("math.divide", "divide")
 def divide(x, y, name=None):
   """Computes Python style division of `x` by `y`."""
 
@@ -198,7 +198,7 @@ def divide(x, y, name=None):
     return x / y
 
 
-@tf_export("multiply")
+@tf_export("math.multiply", "multiply")
 def multiply(x, y, name=None):
   return gen_math_ops.mul(x, y, name)
 
@@ -218,7 +218,7 @@ _mul.__doc__ = (
     gen_math_ops.mul.__doc__ + ("" if _mul.__doc__ is None else _mul.__doc__))
 
 
-@tf_export("subtract")
+@tf_export("math.subtract", "subtract")
 def subtract(x, y, name=None):
   return gen_math_ops.sub(x, y, name)
 
@@ -239,7 +239,7 @@ _sub.__doc__ = (
 
 
 # pylint: disable=g-docstring-has-escape
-@tf_export("negative")
+@tf_export("math.negative", "negative")
 def negative(x, name=None):
   """Computes numerical negative value element-wise.
 
@@ -288,7 +288,7 @@ def _neg(x, name=None):
 # pylint: enable=g-docstring-has-escape
 
 
-@tf_export("sign")
+@tf_export("math.sign", "sign")
 def sign(x, name=None):
   """Returns an element-wise indication of the sign of a number.
 
@@ -319,7 +319,7 @@ def sign(x, name=None):
       return gen_math_ops.sign(x, name=name)
 
 
-@tf_export("square")
+@tf_export("math.square", "square")
 def square(x, name=None):
   r"""Computes square of x element-wise.
 
@@ -342,7 +342,7 @@ def square(x, name=None):
       return gen_math_ops.square(x, name=name)
 
 
-@tf_export("sqrt")
+@tf_export("math.sqrt", "sqrt")
 def sqrt(x, name=None):
   r"""Computes square root of x element-wise.
 
@@ -365,7 +365,8 @@ def sqrt(x, name=None):
       return gen_math_ops.sqrt(x, name=name)
 
 
-@tf_export("erf")
+@tf_export("math.erf", "erf")
+@deprecation.deprecated_endpoints("erf")
 def erf(x, name=None):
   """Computes the Gauss error function of `x` element-wise.
 
@@ -386,7 +387,7 @@ def erf(x, name=None):
       return gen_math_ops.erf(x, name=name)
 
 
-@tf_export("scalar_mul")
+@tf_export("math.scalar_mul", "scalar_mul")
 def scalar_mul(scalar, x):
   """Multiplies a scalar times a `Tensor` or `IndexedSlices` object.
 
@@ -416,7 +417,7 @@ def scalar_mul(scalar, x):
     raise ValueError("Only scalar multiply works, got shape %s" % shape)
 
 
-@tf_export("pow")
+@tf_export("math.pow", "pow")
 def pow(x, y, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the power of one value to another.
 
@@ -444,7 +445,7 @@ def pow(x, y, name=None):  # pylint: disable=redefined-builtin
 
 
 # pylint: disable=redefined-builtin,redefined-outer-name
-@tf_export("complex")
+@tf_export("dtypes.complex", "complex")
 def complex(real, imag, name=None):
   r"""Converts two real numbers to a complex number.
 
@@ -486,7 +487,8 @@ def complex(real, imag, name=None):
     return gen_math_ops._complex(real, imag, Tout=Tout, name=name)
 
 
-@tf_export("real")
+@tf_export("math.real", "real")
+@deprecation.deprecated_endpoints("real")
 def real(input, name=None):
   r"""Returns the real part of a complex (or real) tensor.
 
@@ -517,7 +519,8 @@ def real(input, name=None):
       return input
 
 
-@tf_export("imag")
+@tf_export("math.imag", "imag")
+@deprecation.deprecated_endpoints("imag")
 def imag(input, name=None):
   r"""Returns the imaginary part of a complex (or real) tensor.
 
@@ -547,7 +550,8 @@ def imag(input, name=None):
       return array_ops.zeros_like(input)
 
 
-@tf_export("angle")
+@tf_export("math.angle", "angle")
+@deprecation.deprecated_endpoints("angle")
 def angle(input, name=None):
   r"""Returns the element-wise argument of a complex (or real) tensor.
 
@@ -586,7 +590,7 @@ def angle(input, name=None):
 # pylint: enable=redefined-outer-name,redefined-builtin
 
 
-@tf_export("round")
+@tf_export("math.round", "round")
 def round(x, name=None):  # pylint: disable=redefined-builtin
   """Rounds the values of a tensor to the nearest integer, element-wise.
 
@@ -613,7 +617,7 @@ def round(x, name=None):  # pylint: disable=redefined-builtin
     return gen_math_ops.round(x, name=name)
 
 
-@tf_export("cast")
+@tf_export("dtypes.cast", "cast")
 def cast(x, dtype, name=None):
   """Casts a tensor to a new type.
 
@@ -676,7 +680,7 @@ def cast(x, dtype, name=None):
     return x
 
 
-@tf_export("saturate_cast")
+@tf_export("dtypes.saturate_cast", "saturate_cast")
 def saturate_cast(value, dtype, name=None):
   """Performs a safe saturating cast of `value` to `dtype`.
 
@@ -995,7 +999,7 @@ def _div_python2(x, y, name=None):
       return gen_math_ops.floor_div(x, y, name=name)
 
 
-@tf_export("truediv")
+@tf_export("math.truediv", "truediv")
 def truediv(x, y, name=None):
   """Divides x / y elementwise (using Python 3 division operator semantics).
 
@@ -1006,7 +1010,7 @@ def truediv(x, y, name=None):
   arguments are cast to floating types first.   This op is generated by normal
   `x / y` division in Python 3 and in Python 2.7 with
   `from __future__ import division`.  If you want integer division that rounds
-  down, use `x // y` or `tf.floordiv`.
+  down, use `x // y` or `tf.math.floordiv`.
 
   `x` and `y` must have the same numeric type.  If the inputs are floating
   point, the output will have the same type.  If the inputs are integral, the
@@ -1078,7 +1082,8 @@ mod = gen_math_ops.floor_mod
 
 # TODO(aselle): Deprecate this once all internal functionality uses
 # tf.truncatediv
-@tf_export("floordiv")
+@tf_export("math.floordiv", "floordiv")
+@deprecation.deprecated_endpoints("floordiv")
 def floordiv(x, y, name=None):
   """Divides `x / y` elementwise, rounding toward the most negative integer.
 
@@ -1151,7 +1156,8 @@ _OverrideBinaryOperatorHelper(gen_math_ops.floor_mod, "mod")
 _OverrideBinaryOperatorHelper(pow, "pow")
 
 
-@tf_export("logical_xor")
+@tf_export("math.logical_xor", "logical_xor")
+@deprecation.deprecated_endpoints("logical_xor")
 def logical_xor(x, y, name="LogicalXor"):
   """x ^ y = (x | y) & ~(x & y)."""
   # TODO(alemi) Make this a cwise op if people end up relying on it.
@@ -1277,7 +1283,7 @@ def _may_reduce_to_scalar(keepdims, axis, reduction_indices, output):
   return output
 
 
-@tf_export("reduce_sum")
+@tf_export("math.reduce_sum", "reduce_sum")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_sum(input_tensor,
@@ -1339,7 +1345,7 @@ def reduce_sum(input_tensor,
                                    name=name))
 
 
-@tf_export("count_nonzero")
+@tf_export("math.count_nonzero", "count_nonzero")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def count_nonzero(input_tensor,
@@ -1417,7 +1423,7 @@ def count_nonzero(input_tensor,
         dtype=dtype)
 
 
-@tf_export("reduce_mean")
+@tf_export("math.reduce_mean", "reduce_mean")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_mean(input_tensor,
@@ -1489,7 +1495,7 @@ def reduce_mean(input_tensor,
                                    name=name))
 
 
-@tf_export("reduce_prod")
+@tf_export("math.reduce_prod", "reduce_prod")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_prod(input_tensor,
@@ -1539,7 +1545,7 @@ def reduce_prod(input_tensor,
                                    name=name))
 
 
-@tf_export("reduce_min")
+@tf_export("math.reduce_min", "reduce_min")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_min(input_tensor,
@@ -1588,7 +1594,7 @@ def reduce_min(input_tensor,
                                    name=name))
 
 
-@tf_export("reduce_max")
+@tf_export("math.reduce_max", "reduce_max")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_max(input_tensor,
@@ -1637,7 +1643,7 @@ def reduce_max(input_tensor,
                                    name=name))
 
 
-@tf_export("reduce_all")
+@tf_export("math.reduce_all", "reduce_all")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_all(input_tensor,
@@ -1695,7 +1701,7 @@ def reduce_all(input_tensor,
                                    name=name))
 
 
-@tf_export("reduce_any")
+@tf_export("math.reduce_any", "reduce_any")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_any(input_tensor,
@@ -1753,7 +1759,7 @@ def reduce_any(input_tensor,
                                    name=name))
 
 
-@tf_export("reduce_logsumexp")
+@tf_export("math.reduce_logsumexp", "reduce_logsumexp")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_logsumexp(input_tensor,
@@ -1827,7 +1833,8 @@ def reduce_logsumexp(input_tensor,
     return _may_reduce_to_scalar(keepdims, axis, reduction_indices, result)
 
 
-@tf_export("trace", "linalg.trace")
+@tf_export("linalg.trace", "trace")
+@deprecation.deprecated_endpoints("trace")
 def trace(x, name=None):
   """Compute the trace of a tensor `x`.
 
@@ -1841,12 +1848,12 @@ def trace(x, name=None):
 
   ```python
   x = tf.constant([[1, 2], [3, 4]])
-  tf.trace(x)  # 5
+  tf.linalg.trace(x)  # 5
 
   x = tf.constant([[1, 2, 3],
                    [4, 5, 6],
                    [7, 8, 9]])
-  tf.trace(x)  # 15
+  tf.linalg.trace(x)  # 15
 
   x = tf.constant([[[1, 2, 3],
                     [4, 5, 6],
@@ -1854,7 +1861,7 @@ def trace(x, name=None):
                    [[-1, -2, -3],
                     [-4, -5, -6],
                     [-7, -8, -9]]])
-  tf.trace(x)  # [15, -15]
+  tf.linalg.trace(x)  # [15, -15]
   ```
 
   Args:
@@ -1869,7 +1876,7 @@ def trace(x, name=None):
     return reduce_sum(array_ops.matrix_diag_part(x), [-1], name=name)
 
 
-@tf_export("matmul")
+@tf_export("linalg.matmul", "matmul")
 def matmul(a,
            b,
            transpose_a=False,
@@ -2131,7 +2138,7 @@ def _as_indexed_slices_list(inputs, optimize=True):
   return casted_outputs
 
 
-@tf_export("add_n")
+@tf_export("math.add_n", "add_n")
 def add_n(inputs, name=None):
   """Adds all input tensors element-wise.
 
@@ -2166,14 +2173,15 @@ def add_n(inputs, name=None):
   return gen_math_ops.add_n(inputs, name=name)
 
 
-@tf_export("accumulate_n")
+@tf_export("math.accumulate_n", "accumulate_n")
+@deprecation.deprecated_endpoints("accumulate_n")
 def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   """Returns the element-wise sum of a list of tensors.
 
   Optionally, pass `shape` and `tensor_dtype` for shape and type checking,
   otherwise, these are inferred.
 
-  `tf.accumulate_n` performs the same operation as `tf.add_n`, but does not
+  `tf.math.accumulate_n` performs the same operation as `tf.add_n`, but does not
   wait for all of its inputs to be ready before beginning to sum. This can
   save memory if inputs are ready at different times, since minimum temporary
   storage is proportional to the output size rather than the inputs size.
@@ -2185,10 +2193,10 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   ```python
   a = tf.constant([[1, 2], [3, 4]])
   b = tf.constant([[5, 0], [0, 6]])
-  tf.accumulate_n([a, b, a])  # [[7, 4], [6, 14]]
+  tf.math.accumulate_n([a, b, a])  # [[7, 4], [6, 14]]
 
   # Explicitly pass shape and type
-  tf.accumulate_n([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)
+  tf.math.accumulate_n([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)
                                                                  # [[7,  4],
                                                                  #  [6, 14]]
   ```
@@ -2252,7 +2260,7 @@ def _accumulate_n_grad(op, grad):
   return [grad] * len(op.inputs)
 
 
-@tf_export("nn.sigmoid", "sigmoid")
+@tf_export("math.sigmoid", "nn.sigmoid", "sigmoid")
 def sigmoid(x, name=None):
   """Computes sigmoid of `x` element-wise.
 
@@ -2275,7 +2283,8 @@ def sigmoid(x, name=None):
     return gen_math_ops.sigmoid(x, name=name)
 
 
-@tf_export("log_sigmoid")
+@tf_export("math.log_sigmoid", "log_sigmoid")
+@deprecation.deprecated_endpoints("log_sigmoid")
 def log_sigmoid(x, name=None):
   """Computes log sigmoid of `x` element-wise.
 
@@ -2294,7 +2303,7 @@ def log_sigmoid(x, name=None):
     return gen_math_ops.neg(gen_nn_ops.softplus(-x), name=name)
 
 
-@tf_export("nn.tanh", "tanh")
+@tf_export("math.tanh", "nn.tanh", "tanh")
 def tanh(x, name=None):
   """Computes hyperbolic tangent of `x` element-wise.
 
@@ -2315,7 +2324,8 @@ def tanh(x, name=None):
       return gen_math_ops.tanh(x, name=name)
 
 
-@tf_export("bincount")
+@tf_export("math.bincount", "bincount")
+@deprecation.deprecated_endpoints("bincount")
 def bincount(arr,
              weights=None,
              minlength=None,
@@ -2362,7 +2372,7 @@ def bincount(arr,
   return gen_math_ops.bincount(arr, output_size, weights)
 
 
-@tf_export("cumsum")
+@tf_export("math.cumsum", "cumsum")
 def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
   """Compute the cumulative sum of the tensor `x` along `axis`.
 
@@ -2414,7 +2424,8 @@ def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
         x, axis, exclusive=exclusive, reverse=reverse, name=name)
 
 
-@tf_export("cumprod")
+@tf_export("math.cumprod", "cumprod")
+@deprecation.deprecated_endpoints("cumprod")
 def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
   """Compute the cumulative product of the tensor `x` along `axis`.
 
@@ -2422,7 +2433,7 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
   first element of the input is identical to the first element of the output:
 
   ```python
-  tf.cumprod([a, b, c])  # [a, a * b, a * b * c]
+  tf.math.cumprod([a, b, c])  # [a, a * b, a * b * c]
   ```
 
   By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
@@ -2430,21 +2441,21 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
   instead:
 
   ```python
-  tf.cumprod([a, b, c], exclusive=True)  # [1, a, a * b]
+  tf.math.cumprod([a, b, c], exclusive=True)  # [1, a, a * b]
   ```
 
   By setting the `reverse` kwarg to `True`, the cumprod is performed in the
   opposite direction:
 
   ```python
-  tf.cumprod([a, b, c], reverse=True)  # [a * b * c, b * c, c]
+  tf.math.cumprod([a, b, c], reverse=True)  # [a * b * c, b * c, c]
   ```
 
   This is more efficient than using separate `tf.reverse` ops.
   The `reverse` and `exclusive` kwargs can also be combined:
 
   ```python
-  tf.cumprod([a, b, c], exclusive=True, reverse=True)  # [b * c, c, 1]
+  tf.math.cumprod([a, b, c], exclusive=True, reverse=True)  # [b * c, c, 1]
   ```
 
   Args:
@@ -2466,7 +2477,8 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
         x, axis, exclusive=exclusive, reverse=reverse, name=name)
 
 
-@tf_export("conj")
+@tf_export("math.conj", "conj")
+@deprecation.deprecated_endpoints("conj")
 def conj(x, name=None):
   r"""Returns the complex conjugate of a complex number.
 
@@ -2480,7 +2492,7 @@ def conj(x, name=None):
   For example:
 
       # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-      tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+      tf.math.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
 
   If `x` is real, it is returned unchanged.
 
@@ -2566,7 +2578,8 @@ def _unsorted_segment_N(data, segment_ids, num_segments):
   return gen_math_ops.maximum(N, 1)
 
 
-@tf_export("unsorted_segment_mean")
+@tf_export("math.unsorted_segment_mean", "unsorted_segment_mean")
+@deprecation.deprecated_endpoints("unsorted_segment_mean")
 def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
   r"""Computes the mean along segments of a tensor.
 
@@ -2608,7 +2621,8 @@ def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
     return summed / N
 
 
-@tf_export("unsorted_segment_sqrt_n")
+@tf_export("math.unsorted_segment_sqrt_n", "unsorted_segment_sqrt_n")
+@deprecation.deprecated_endpoints("unsorted_segment_sqrt_n")
 def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
   r"""Computes the sum along segments of a tensor divided by the sqrt(N).
 
@@ -2653,7 +2667,8 @@ def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
     return summed / gen_math_ops.sqrt(N)
 
 
-@tf_export("sparse_segment_sum")
+@tf_export("sparse.segment_sum", "sparse_segment_sum")
+@deprecation.deprecated_endpoints("sparse_segment_sum")
 def sparse_segment_sum(data, indices, segment_ids, name=None,
                        num_segments=None):
   r"""Computes the sum along sparse segments of a tensor.
@@ -2674,16 +2689,16 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
   c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 
   # Select two rows, one segment.
-  tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+  tf.sparse.segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
   # => [[0 0 0 0]]
 
   # Select two rows, two segment.
-  tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+  tf.sparse.segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
   # => [[ 1  2  3  4]
   #     [-1 -2 -3 -4]]
 
   # With missing segment ids.
-  tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 2]),
+  tf.sparse.segment_sum(c, tf.constant([0, 1]), tf.constant([0, 2]),
                         num_segments=4)
   # => [[ 1  2  3  4]
   #     [ 0  0  0  0]
@@ -2691,7 +2706,7 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
   #     [ 0  0  0  0]]
 
   # Select all rows, two segments.
-  tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+  tf.sparse.segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
   # => [[0 0 0 0]
   #     [5 6 7 8]]
 
@@ -2726,7 +2741,8 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
         data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
-@tf_export("sparse_segment_mean")
+@tf_export("sparse.segment_mean", "sparse_segment_mean")
+@deprecation.deprecated_endpoints("sparse_segment_mean")
 def sparse_segment_mean(data,
                         indices,
                         segment_ids,
@@ -2771,7 +2787,8 @@ def sparse_segment_mean(data,
         data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
-@tf_export("sparse_segment_sqrt_n")
+@tf_export("sparse.segment_sqrt_n", "sparse_segment_sqrt_n")
+@deprecation.deprecated_endpoints("sparse_segment_sqrt_n")
 def sparse_segment_sqrt_n(data,
                           indices,
                           segment_ids,
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 2a1919e66f..453848fc00 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -328,7 +328,7 @@ def swish(features):
   return features * math_ops.sigmoid(features)
 
 
-@tf_export("nn.l2_normalize")
+@tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize")
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
   """Normalizes along dimension `axis` using an L2 norm.
@@ -360,7 +360,7 @@ def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
     return math_ops.multiply(x, x_inv_norm, name=name)
 
 
-@tf_export("nn.zero_fraction")
+@tf_export("math.zero_fraction", "nn.zero_fraction")
 def zero_fraction(value, name=None):
   """Returns the fraction of zeros in `value`.
 
@@ -689,7 +689,7 @@ def moments(
     # Compute true mean while keeping the dims for proper broadcasting.
     mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean")
     # sample variance, not unbiased variance
-    # Note: stop_gradient does not change the gradient that gets 
+    # Note: stop_gradient does not change the gradient that gets
     #       backpropagated to the mean from the variance calculation,
     #       because that gradient is zero
     variance = math_ops.reduce_mean(
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 9ef177e97b..fd71e7cc39 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1692,7 +1692,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   return output
 
 
-@tf_export("nn.softmax")
+@tf_export("nn.softmax", "math.softmax")
 @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def softmax(logits, axis=None, name=None, dim=None):
   """Computes softmax activations.
@@ -1722,7 +1722,7 @@ def softmax(logits, axis=None, name=None, dim=None):
   return _softmax(logits, gen_nn_ops.softmax, axis, name)
 
 
-@tf_export("nn.log_softmax")
+@tf_export("nn.log_softmax", "math.log_softmax")
 @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def log_softmax(logits, axis=None, name=None, dim=None):
   """Computes log softmax activations.
@@ -2329,7 +2329,7 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
     return ret
 
 
-@tf_export("nn.top_k")
+@tf_export("math.top_k", "nn.top_k")
 def top_k(input, k=1, sorted=True, name=None):  # pylint: disable=redefined-builtin
   """Finds values and indices of the `k` largest entries for the last dimension.
 
@@ -2644,7 +2644,7 @@ def erosion2d(value, kernel, strides, rates, padding, name=None):
             name=name))
 
 
-@tf_export("nn.in_top_k")
+@tf_export("math.in_top_k", "nn.in_top_k")
 def in_top_k(predictions, targets, k, name=None):
   r"""Says whether the targets are in the top `K` predictions.
 
diff --git a/tensorflow/python/ops/numerics.py b/tensorflow/python/ops/numerics.py
index 8fcbd7d834..002e87b411 100644
--- a/tensorflow/python/ops/numerics.py
+++ b/tensorflow/python/ops/numerics.py
@@ -24,10 +24,12 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("verify_tensor_all_finite")
+@tf_export("debugging.assert_all_finite", "verify_tensor_all_finite")
+@deprecation.deprecated_endpoints("verify_tensor_all_finite")
 def verify_tensor_all_finite(t, msg, name=None):
   """Assert that the tensor does not contain any NaN's or Inf's.
 
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index b3e03a0135..ff50fe0d09 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -36,6 +36,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops.gen_parsing_ops import *
 # pylint: enable=wildcard-import,undefined-variable
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -45,7 +46,7 @@ ops.NotDifferentiable("SerializeTensor")
 ops.NotDifferentiable("StringToNumber")
 
 
-@tf_export("VarLenFeature")
+@tf_export("io.VarLenFeature", "VarLenFeature")
 class VarLenFeature(collections.namedtuple("VarLenFeature", ["dtype"])):
   """Configuration for parsing a variable-length input feature.
 
@@ -55,7 +56,7 @@ class VarLenFeature(collections.namedtuple("VarLenFeature", ["dtype"])):
   pass
 
 
-@tf_export("SparseFeature")
+@tf_export("io.SparseFeature", "SparseFeature")
 class SparseFeature(
     collections.namedtuple(
         "SparseFeature",
@@ -130,7 +131,7 @@ class SparseFeature(
         cls, index_key, value_key, dtype, size, already_sorted)
 
 
-@tf_export("FixedLenFeature")
+@tf_export("io.FixedLenFeature", "FixedLenFeature")
 class FixedLenFeature(collections.namedtuple(
     "FixedLenFeature", ["shape", "dtype", "default_value"])):
   """Configuration for parsing a fixed-length input feature.
@@ -150,7 +151,7 @@ class FixedLenFeature(collections.namedtuple(
         cls, shape, dtype, default_value)
 
 
-@tf_export("FixedLenSequenceFeature")
+@tf_export("io.FixedLenSequenceFeature", "FixedLenSequenceFeature")
 class FixedLenSequenceFeature(collections.namedtuple(
     "FixedLenSequenceFeature",
     ["shape", "dtype", "allow_missing", "default_value"])):
@@ -360,7 +361,7 @@ def _prepend_none_dimension(features):
     return features
 
 
-@tf_export("parse_example")
+@tf_export("io.parse_example", "parse_example")
 def parse_example(serialized, features, name=None, example_names=None):
   # pylint: disable=line-too-long
   """Parses `Example` protos into a `dict` of tensors.
@@ -761,7 +762,7 @@ def _process_raw_parameters(names, dense_defaults, sparse_keys, sparse_types,
           dense_shapes_as_proto, dense_shapes)
 
 
-@tf_export("parse_single_example")
+@tf_export("io.parse_single_example", "parse_single_example")
 def parse_single_example(serialized, features, name=None, example_names=None):
   """Parses a single `Example` proto.
 
@@ -1244,7 +1245,7 @@ def _parse_sequence_example_raw(serialized,
 
 # TODO(sundberg): rewrite this method to call the batch version, which is more
 # efficient especially for large inputs.
-@tf_export("parse_single_sequence_example")
+@tf_export("io.parse_single_sequence_example", "parse_single_sequence_example")
 def parse_single_sequence_example(
     serialized, context_features=None, sequence_features=None,
     example_name=None, name=None):
@@ -1564,7 +1565,8 @@ def _parse_single_sequence_example_raw(serialized,
 
 
 # Swap `name` and `na_value` for backward compatibility.
-@tf_export("decode_csv")
+@tf_export("io.decode_csv", "decode_csv")
+@deprecation.deprecated_endpoints("decode_csv")
 def decode_csv(records,
                record_defaults,
                field_delim=",",
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index 4baf506385..c2eb9dfc5d 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import math_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_random_ops import *
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 # pylint: enable=wildcard-import
@@ -43,7 +44,7 @@ def _ShapeTensor(shape):
   return ops.convert_to_tensor(shape, dtype=dtype, name="shape")
 
 
-@tf_export("random_normal")
+@tf_export("random.normal", "random_normal")
 def random_normal(shape,
                   mean=0.0,
                   stddev=1.0,
@@ -136,7 +137,7 @@ def parameterized_truncated_normal(shape,
     return rnd
 
 
-@tf_export("truncated_normal")
+@tf_export("random.truncated_normal", "truncated_normal")
 def truncated_normal(shape,
                      mean=0.0,
                      stddev=1.0,
@@ -181,7 +182,7 @@ ops.NotDifferentiable("ParameterizedTruncatedNormal")
 ops.NotDifferentiable("TruncatedNormal")
 
 
-@tf_export("random_uniform")
+@tf_export("random.uniform", "random_uniform")
 def random_uniform(shape,
                    minval=0,
                    maxval=None,
@@ -246,7 +247,7 @@ def random_uniform(shape,
 ops.NotDifferentiable("RandomUniform")
 
 
-@tf_export("random_shuffle")
+@tf_export("random.shuffle", "random_shuffle")
 def random_shuffle(value, seed=None, name=None):
   """Randomly shuffles a tensor along its first dimension.
 
@@ -277,7 +278,7 @@ def random_shuffle(value, seed=None, name=None):
       value, seed=seed1, seed2=seed2, name=name)
 
 
-@tf_export("random_crop")
+@tf_export("image.random_crop", "random_crop")
 def random_crop(value, size, seed=None, name=None):
   """Randomly crops a tensor to a given size.
 
@@ -320,7 +321,7 @@ def random_crop(value, size, seed=None, name=None):
     return array_ops.slice(value, offset, size, name=name)
 
 
-@tf_export("multinomial")
+@tf_export("random.multinomial", "multinomial")
 def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
   """Draws samples from a multinomial distribution.
 
@@ -356,7 +357,8 @@ def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
 ops.NotDifferentiable("Multinomial")
 
 
-@tf_export("random_gamma")
+@tf_export("random.gamma", "random_gamma")
+@deprecation.deprecated_endpoints("random_gamma")
 def random_gamma(shape,
                  alpha,
                  beta=None,
@@ -439,7 +441,8 @@ def random_gamma(shape,
             shape, alpha_broadcast, seed=seed1, seed2=seed2) / beta)
 
 
-@tf_export("random_poisson")
+@tf_export("random.poisson", "random_poisson")
+@deprecation.deprecated_endpoints("random_poisson")
 def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None):
   """Draws `shape` samples from each of the given Poisson distribution(s).
 
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 400a42a3c0..7e3dbdbad4 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -185,7 +185,8 @@ def sparse_eye(num_rows,
 
 
 # pylint: disable=protected-access
-@tf_export("sparse_concat")
+@tf_export("sparse.concat", "sparse_concat")
+@deprecation.deprecated_endpoints("sparse_concat")
 @deprecation.deprecated_args(
     None, "concat_dim is deprecated, use axis instead", "concat_dim")
 def sparse_concat(axis,
@@ -317,7 +318,8 @@ def sparse_concat(axis,
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
-@tf_export("sparse_add")
+@tf_export("sparse.add", "sparse_add")
+@deprecation.deprecated_endpoints("sparse_add")
 def sparse_add(a, b, thresh=0):
   """Adds two tensors, at least one of each is a `SparseTensor`.
 
@@ -557,7 +559,8 @@ def sparse_dense_cwise_add(sp_t, dense_t):
   return sparse_tensor.SparseTensor(sp_t.indices, result, sp_t.dense_shape)
 
 
-@tf_export("sparse_reorder")
+@tf_export("sparse.reorder", "sparse_reorder")
+@deprecation.deprecated_endpoints("sparse_reorder")
 def sparse_reorder(sp_input, name=None):
   """Reorders a `SparseTensor` into the canonical, row-major ordering.
 
@@ -607,7 +610,8 @@ def sparse_reorder(sp_input, name=None):
   return sparse_tensor.SparseTensor(reordered_ind, reordered_val, dense_shape)
 
 
-@tf_export("sparse_reshape")
+@tf_export("sparse.reshape", "sparse_reshape")
+@deprecation.deprecated_endpoints("sparse_reshape")
 def sparse_reshape(sp_input, shape, name=None):
   """Reshapes a `SparseTensor` to represent values in a new dense shape.
 
@@ -700,7 +704,8 @@ class KeywordRequired(object):
     return "KeywordRequired()"
 
 
-@tf_export("sparse_split")
+@tf_export("sparse.split", "sparse_split")
+@deprecation.deprecated_endpoints("sparse_split")
 @deprecation.deprecated_args(
     None, "split_dim is deprecated, use axis instead", "split_dim")
 def sparse_split(keyword_required=KeywordRequired(),
@@ -773,7 +778,8 @@ def sparse_split(keyword_required=KeywordRequired(),
   return sparse_tensors
 
 
-@tf_export("sparse_slice")
+@tf_export("sparse.slice", "sparse_slice")
+@deprecation.deprecated_endpoints("sparse_slice")
 def sparse_slice(sp_input, start, size, name=None):
   """Slice a `SparseTensor` based on the `start` and `size.
 
@@ -785,11 +791,11 @@ def sparse_slice(sp_input, start, size, name=None):
 
   Graphically the output tensors are:
 
-      sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+      sparse.slice([0, 0], [2, 4]) = shape = [2, 4]
       [    a  ]
       [b c    ]
 
-      sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+      sparse.slice([0, 4], [2, 3]) = shape = [2, 3]
       [ d e  ]
       [      ]
 
@@ -823,6 +829,9 @@ def sparse_slice(sp_input, start, size, name=None):
 
 
 @tf_export("sparse_to_dense")
+@deprecation.deprecated(
+    None,
+    "Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.")
 def sparse_to_dense(sparse_indices,
                     output_shape,
                     sparse_values,
@@ -878,7 +887,8 @@ def sparse_to_dense(sparse_indices,
       name=name)
 
 
-@tf_export("sparse_reduce_max")
+@tf_export("sparse.reduce_max", "sparse_reduce_max")
+@deprecation.deprecated_endpoints("sparse_reduce_max")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def sparse_reduce_max(sp_input, axis=None, keepdims=None,
@@ -912,16 +922,16 @@ def sparse_reduce_max(sp_input, axis=None, keepdims=None,
   # 'x' represents [[1, ?, 2]
   #                 [?, 3, ?]]
   # where ? is implicitly-zero.
-  tf.sparse_reduce_max(x) ==> 3
-  tf.sparse_reduce_max(x, 0) ==> [1, 3, 2]
-  tf.sparse_reduce_max(x, 1) ==> [2, 3]  # Can also use -1 as the axis.
-  tf.sparse_reduce_max(x, 1, keepdims=True) ==> [[2], [3]]
-  tf.sparse_reduce_max(x, [0, 1]) ==> 3
+  tf.sparse.reduce_max(x) ==> 3
+  tf.sparse.reduce_max(x, 0) ==> [1, 3, 2]
+  tf.sparse.reduce_max(x, 1) ==> [2, 3]  # Can also use -1 as the axis.
+  tf.sparse.reduce_max(x, 1, keepdims=True) ==> [[2], [3]]
+  tf.sparse.reduce_max(x, [0, 1]) ==> 3
 
   # 'y' represents [[-7, ?]
   #                 [ 4, 3]
   #                 [ ?, ?]
-  tf.sparse_reduce_max(x, 1) ==> [-7, 4, 0]
+  tf.sparse.reduce_max(x, 1) ==> [-7, 4, 0]
   ```
 
   Args:
@@ -945,7 +955,8 @@ def sparse_reduce_max(sp_input, axis=None, keepdims=None,
       math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims)
 
 
-@tf_export("sparse_reduce_max_sparse")
+@tf_export("sparse.reduce_max_sparse", "sparse_reduce_max_sparse")
+@deprecation.deprecated_endpoints("sparse_reduce_max_sparse")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def sparse_reduce_max_sparse(sp_input,
@@ -995,7 +1006,8 @@ def sparse_reduce_max_sparse(sp_input,
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
-@tf_export("sparse_reduce_sum")
+@tf_export("sparse.reduce_sum", "sparse_reduce_sum")
+@deprecation.deprecated_endpoints("sparse_reduce_sum")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
@@ -1021,11 +1033,11 @@ def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
   # 'x' represents [[1, ?, 1]
   #                 [?, 1, ?]]
   # where ? is implicitly-zero.
-  tf.sparse_reduce_sum(x) ==> 3
-  tf.sparse_reduce_sum(x, 0) ==> [1, 1, 1]
-  tf.sparse_reduce_sum(x, 1) ==> [2, 1]  # Can also use -1 as the axis.
-  tf.sparse_reduce_sum(x, 1, keepdims=True) ==> [[2], [1]]
-  tf.sparse_reduce_sum(x, [0, 1]) ==> 3
+  tf.sparse.reduce_sum(x) ==> 3
+  tf.sparse.reduce_sum(x, 0) ==> [1, 1, 1]
+  tf.sparse.reduce_sum(x, 1) ==> [2, 1]  # Can also use -1 as the axis.
+  tf.sparse.reduce_sum(x, 1, keepdims=True) ==> [[2], [1]]
+  tf.sparse.reduce_sum(x, [0, 1]) ==> 3
   ```
 
   Args:
@@ -1049,7 +1061,8 @@ def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
       math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims)
 
 
-@tf_export("sparse_reduce_sum_sparse")
+@tf_export("sparse.reduce_sum_sparse", "sparse_reduce_sum_sparse")
+@deprecation.deprecated_endpoints("sparse_reduce_sum_sparse")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def sparse_reduce_sum_sparse(sp_input,
@@ -1099,7 +1112,8 @@ def sparse_reduce_sum_sparse(sp_input,
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
-@tf_export("sparse_tensor_to_dense")
+@tf_export("sparse.to_dense", "sparse_tensor_to_dense")
+@deprecation.deprecated_endpoints("sparse_tensor_to_dense")
 def sparse_tensor_to_dense(sp_input,
                            default_value=0,
                            validate_indices=True,
@@ -1151,7 +1165,8 @@ def sparse_tensor_to_dense(sp_input,
       name=name)
 
 
-@tf_export("sparse_to_indicator")
+@tf_export("sparse.to_indicator", "sparse_to_indicator")
+@deprecation.deprecated_endpoints("sparse_to_indicator")
 def sparse_to_indicator(sp_input, vocab_size, name=None):
   """Converts a `SparseTensor` of ids into a dense bool indicator tensor.
 
@@ -1214,7 +1229,8 @@ def sparse_to_indicator(sp_input, vocab_size, name=None):
         sp_new, default_value=False, validate_indices=False, name=name)
 
 
-@tf_export("sparse_merge")
+@tf_export("sparse.merge", "sparse_merge")
+@deprecation.deprecated_endpoints("sparse_merge")
 def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
                  already_sorted=False):
   """Combines a batch of feature ids and values into a single `SparseTensor`.
@@ -1358,7 +1374,8 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
         sorted_result.indices, sorted_result.values, new_shape)
 
 
-@tf_export("sparse_retain")
+@tf_export("sparse.retain", "sparse_retain")
+@deprecation.deprecated_endpoints("sparse_retain")
 def sparse_retain(sp_input, to_retain):
   """Retains specified non-empty values within a `SparseTensor`.
 
@@ -1402,7 +1419,8 @@ def sparse_retain(sp_input, to_retain):
                                     array_ops.identity(sp_input.dense_shape))
 
 
-@tf_export("sparse_reset_shape")
+@tf_export("sparse.reset_shape", "sparse_reset_shape")
+@deprecation.deprecated_endpoints("sparse_reset_shape")
 def sparse_reset_shape(sp_input, new_shape=None):
   """Resets the shape of a `SparseTensor` with indices and values unchanged.
 
@@ -1503,7 +1521,8 @@ def sparse_reset_shape(sp_input, new_shape=None):
   return sparse_tensor.SparseTensor(in_indices, in_values, output_shape_tensor)
 
 
-@tf_export("sparse_fill_empty_rows")
+@tf_export("sparse.fill_empty_rows", "sparse_fill_empty_rows")
+@deprecation.deprecated_endpoints("sparse_fill_empty_rows")
 def sparse_fill_empty_rows(sp_input, default_value, name=None):
   """Fills empty rows in the input 2-D `SparseTensor` with a default value.
 
@@ -1567,7 +1586,8 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None):
         dense_shape=sp_input.dense_shape), empty_row_indicator)
 
 
-@tf_export("serialize_sparse")
+@tf_export("io.serialize_sparse", "serialize_sparse")
+@deprecation.deprecated_endpoints("serialize_sparse")
 def serialize_sparse(sp_input, name=None, out_type=dtypes.string):
   """Serialize a `SparseTensor` into a 3-vector (1-D `Tensor`) object.
 
@@ -1593,7 +1613,8 @@ def serialize_sparse(sp_input, name=None, out_type=dtypes.string):
       out_type=out_type)
 
 
-@tf_export("serialize_many_sparse")
+@tf_export("io.serialize_many_sparse", "serialize_many_sparse")
+@deprecation.deprecated_endpoints("serialize_many_sparse")
 def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string):
   """Serialize `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor`.
 
@@ -1694,7 +1715,8 @@ def deserialize_sparse(serialized_sparse, dtype, rank=None, name=None):
   return sparse_tensor.SparseTensor(output_indices, output_values, output_shape)
 
 
-@tf_export("deserialize_many_sparse")
+@tf_export("io.deserialize_many_sparse", "deserialize_many_sparse")
+@deprecation.deprecated_endpoints("deserialize_many_sparse")
 def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
   """Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 
@@ -1712,7 +1734,7 @@ def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
 
   The input `SparseTensor` objects' indices are assumed ordered in
   standard lexicographic order.  If this is not the case, after this
-  step run `sparse_reorder` to restore index ordering.
+  step run `sparse.reorder` to restore index ordering.
 
   For example, if the serialized input is a `[2, 3]` matrix representing two
   original `SparseTensor` objects:
@@ -1764,7 +1786,8 @@ def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
   return sparse_tensor.SparseTensor(output_indices, output_values, output_shape)
 
 
-@tf_export("sparse_tensor_dense_matmul")
+@tf_export("sparse.matmul", "sparse_tensor_dense_matmul")
+@deprecation.deprecated_endpoints("sparse_tensor_dense_matmul")
 def sparse_tensor_dense_matmul(sp_a,
                                b,
                                adjoint_a=False,
@@ -1777,7 +1800,7 @@ def sparse_tensor_dense_matmul(sp_a,
   following input format is recommended for optimal behavior:
 
   * If `adjoint_a == false`: `A` should be sorted in lexicographically
-    increasing order.  Use `sparse_reorder` if you're not sure.
+    increasing order.  Use `sparse.reorder` if you're not sure.
   * If `adjoint_a == true`: `A` should be sorted in order of increasing
     dimension 1 (i.e., "column major" order instead of "row major" order).
 
@@ -1981,7 +2004,8 @@ def sparse_tensor_dense_matmul(sp_a,
         adjoint_b=adjoint_b)
 
 
-@tf_export("sparse_softmax")
+@tf_export("sparse.softmax", "sparse_softmax")
+@deprecation.deprecated_endpoints("sparse_softmax")
 def sparse_softmax(sp_input, name=None):
   """Applies softmax to a batched N-D `SparseTensor`.
 
@@ -2036,7 +2060,8 @@ def sparse_softmax(sp_input, name=None):
                                       sp_input.dense_shape)
 
 
-@tf_export("sparse_maximum")
+@tf_export("sparse.maximum", "sparse_maximum")
+@deprecation.deprecated_endpoints("sparse_maximum")
 def sparse_maximum(sp_a, sp_b, name=None):
   """Returns the element-wise max of two SparseTensors.
 
@@ -2073,7 +2098,8 @@ def sparse_maximum(sp_a, sp_b, name=None):
   return sparse_tensor.SparseTensor(out_indices, out_values, sp_a.dense_shape)
 
 
-@tf_export("sparse_minimum")
+@tf_export("sparse.minimum", "sparse_minimum")
+@deprecation.deprecated_endpoints("sparse_minimum")
 def sparse_minimum(sp_a, sp_b, name=None):
   """Returns the element-wise min of two SparseTensors.
 
@@ -2110,7 +2136,8 @@ def sparse_minimum(sp_a, sp_b, name=None):
   return sparse_tensor.SparseTensor(out_indices, out_values, sp_a.dense_shape)
 
 
-@tf_export("sparse_transpose")
+@tf_export("sparse.transpose", "sparse_transpose")
+@deprecation.deprecated_endpoints("sparse_transpose")
 def sparse_transpose(sp_input, perm=None, name=None):
   """Transposes a `SparseTensor`
 
@@ -2259,7 +2286,7 @@ def _take_many_sparse_from_tensors_map(sparse_map_op,
 
   The input `SparseTensor` objects' indices are assumed ordered in
   standard lexicographic order.  If this is not the case, after this
-  step run `sparse_reorder` to restore index ordering.
+  step run `sparse.reorder` to restore index ordering.
 
   For example, if the serialized input is a `[2, 3]` matrix representing two
   original `SparseTensor` objects:
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index 9a10abfcf7..cfab943896 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -29,11 +29,13 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(b/27419586) Change docstring for required dtype of x once int allowed
-@tf_export('lbeta')
+@tf_export('math.lbeta', 'lbeta')
+@deprecation.deprecated_endpoints('lbeta')
 def lbeta(x, name=None):
   r"""Computes \\(ln(|Beta(x)|)\\), reducing along the last dimension.
 
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 046a48d192..e83c08f643 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -310,8 +310,9 @@ def _reduce_join_reduction_dims(x, axis, reduction_indices):
     return math_ops.range(array_ops.rank(x) - 1, -1, -1)
 
 
-@tf_export("reduce_join")
-def reduce_join(inputs, axis=None,
+@tf_export("strings.reduce_join", "reduce_join")
+@deprecation.deprecated_endpoints("reduce_join")
+def reduce_join(inputs, axis=None,  # pylint: disable=missing-docstring
                 keep_dims=False,
                 separator="",
                 name=None,
@@ -329,6 +330,8 @@ def reduce_join(inputs, axis=None,
 
 reduce_join.__doc__ = deprecation.rewrite_argument_docstring(
     gen_string_ops.reduce_join.__doc__, "reduction_indices", "axis")
+reduce_join.__doc__ = reduce_join.__doc__.replace("tf.reduce_join(",
+                                                  "tf.strings.reduce_join(")
 
 
 # This wrapper provides backwards compatibility for code that predates the
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index 8e7f123a85..8bf057f69d 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -36,10 +36,13 @@ from tensorflow.python.saved_model import utils_impl as saved_model_utils
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_endpoints
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("saved_model.builder.SavedModelBuilder")
+@tf_export("saved_model.Builder",
+           "saved_model.builder.SavedModelBuilder")
+@deprecated_endpoints("saved_model.builder.SavedModelBuilder")
 class SavedModelBuilder(object):
   """Builds the `SavedModel` protocol buffer and saves variables and assets.
 
@@ -61,7 +64,7 @@ class SavedModelBuilder(object):
   Typical usage for the `SavedModelBuilder`:
   ```python
   ...
-  builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
+  builder = tf.saved_model.Builder(export_dir)
 
   with tf.Session(graph=tf.Graph()) as sess:
     ...
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index e8536108e8..895644a030 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -34,6 +34,7 @@ from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import utils_impl as saved_model_utils
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -144,7 +145,10 @@ def _get_main_op_tensor(
   return main_op_tensor
 
 
-@tf_export("saved_model.loader.maybe_saved_model_directory")
+@tf_export("saved_model.maybe_saved_model_directory",
+           "saved_model.loader.maybe_saved_model_directory")
+@deprecation.deprecated_endpoints(
+    "saved_model.loader.maybe_saved_model_directory")
 def maybe_saved_model_directory(export_dir):
   """Checks whether the provided export directory could contain a SavedModel.
 
@@ -165,7 +169,7 @@ def maybe_saved_model_directory(export_dir):
   return file_io.file_exists(txt_path) or file_io.file_exists(pb_path)
 
 
-@tf_export("saved_model.loader.load")
+@tf_export("saved_model.load", "saved_model.loader.load")
 def load(sess, tags, export_dir, import_scope=None, **saver_kwargs):
   """Loads the model from a SavedModel as specified by tags.
 
diff --git a/tensorflow/python/saved_model/main_op_impl.py b/tensorflow/python/saved_model/main_op_impl.py
index 631ee63729..ad4511b28e 100644
--- a/tensorflow/python/saved_model/main_op_impl.py
+++ b/tensorflow/python/saved_model/main_op_impl.py
@@ -22,6 +22,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -42,7 +43,9 @@ def main_op():
 
 
 # TODO(sukritiramesh): Integrate with Saver for complete restore functionality.
-@tf_export('saved_model.main_op.main_op_with_restore')
+@tf_export('saved_model.main_op_with_restore',
+           'saved_model.main_op.main_op_with_restore')
+@deprecation.deprecated_endpoints('saved_model.main_op.main_op_with_restore')
 def main_op_with_restore(restore_op_name):
   """Returns a main op to init variables, tables and restore the graph.
 
diff --git a/tensorflow/python/saved_model/signature_def_utils_impl.py b/tensorflow/python/saved_model/signature_def_utils_impl.py
index 37f927f381..a1034416e9 100644
--- a/tensorflow/python/saved_model/signature_def_utils_impl.py
+++ b/tensorflow/python/saved_model/signature_def_utils_impl.py
@@ -24,10 +24,14 @@ from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.framework import ops
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import utils
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('saved_model.signature_def_utils.build_signature_def')
+@tf_export('saved_model.build_signature_def',
+           'saved_model.signature_def_utils.build_signature_def')
+@deprecation.deprecated_endpoints(
+    'saved_model.signature_def_utils.build_signature_def')
 def build_signature_def(inputs=None, outputs=None, method_name=None):
   """Utility function to build a SignatureDef protocol buffer.
 
@@ -53,7 +57,10 @@ def build_signature_def(inputs=None, outputs=None, method_name=None):
   return signature_def
 
 
-@tf_export('saved_model.signature_def_utils.regression_signature_def')
+@tf_export('saved_model.regression_signature_def',
+           'saved_model.signature_def_utils.regression_signature_def')
+@deprecation.deprecated_endpoints(
+    'saved_model.signature_def_utils.regression_signature_def')
 def regression_signature_def(examples, predictions):
   """Creates regression signature from given examples and predictions.
 
@@ -95,7 +102,10 @@ def regression_signature_def(examples, predictions):
   return signature_def
 
 
-@tf_export('saved_model.signature_def_utils.classification_signature_def')
+@tf_export('saved_model.classification_signature_def',
+           'saved_model.signature_def_utils.classification_signature_def')
+@deprecation.deprecated_endpoints(
+    'saved_model.signature_def_utils.classification_signature_def')
 def classification_signature_def(examples, classes, scores):
   """Creates classification signature from given examples and predictions.
 
@@ -148,7 +158,10 @@ def classification_signature_def(examples, classes, scores):
   return signature_def
 
 
-@tf_export('saved_model.signature_def_utils.predict_signature_def')
+@tf_export('saved_model.predict_signature_def',
+           'saved_model.signature_def_utils.predict_signature_def')
+@deprecation.deprecated_endpoints(
+    'saved_model.signature_def_utils.predict_signature_def')
 def predict_signature_def(inputs, outputs):
   """Creates prediction signature from given inputs and outputs.
 
@@ -239,7 +252,10 @@ def _supervised_signature_def(
   return signature_def
 
 
-@tf_export('saved_model.signature_def_utils.is_valid_signature')
+@tf_export('saved_model.is_valid_signature',
+           'saved_model.signature_def_utils.is_valid_signature')
+@deprecation.deprecated_endpoints(
+    'saved_model.signature_def_utils.is_valid_signature')
 def is_valid_signature(signature_def):
   """Determine whether a SignatureDef can be served by TensorFlow Serving."""
   if signature_def is None:
@@ -313,4 +329,3 @@ def _is_valid_classification_signature(signature_def):
     return False
 
   return True
-
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index 06d09325c8..0bba7b6fac 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -27,13 +27,16 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.saved_model import constants
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
 # TensorInfo helpers.
 
 
-@tf_export("saved_model.utils.build_tensor_info")
+@tf_export("saved_model.build_tensor_info",
+           "saved_model.utils.build_tensor_info")
+@deprecation.deprecated_endpoints("saved_model.utils.build_tensor_info")
 def build_tensor_info(tensor):
   """Utility function to build TensorInfo proto.
 
@@ -57,7 +60,10 @@ def build_tensor_info(tensor):
   return tensor_info
 
 
-@tf_export("saved_model.utils.get_tensor_from_tensor_info")
+@tf_export("saved_model.get_tensor_from_tensor_info",
+           "saved_model.utils.get_tensor_from_tensor_info")
+@deprecation.deprecated_endpoints(
+    "saved_model.utils.get_tensor_from_tensor_info")
 def get_tensor_from_tensor_info(tensor_info, graph=None, import_scope=None):
   """Returns the Tensor or SparseTensor described by a TensorInfo proto.
 
diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl
index 92446e2f8f..5ce5410e0b 100644
--- a/tensorflow/python/tools/api/generator/api_init_files.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files.bzl
@@ -69,6 +69,7 @@ TENSORFLOW_API_INIT_FILES = [
     "profiler/__init__.py",
     "python_io/__init__.py",
     "quantization/__init__.py",
+    "random/__init__.py",
     "resource_loader/__init__.py",
     "strings/__init__.py",
     "saved_model/__init__.py",
diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
index bc2f3516d1..587eb232f5 100644
--- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
@@ -69,6 +69,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "profiler/__init__.py",
     "python_io/__init__.py",
     "quantization/__init__.py",
+    "random/__init__.py",
     "resource_loader/__init__.py",
     "strings/__init__.py",
     "saved_model/__init__.py",
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 9d9db70890..eb131ac9f7 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -56,7 +56,8 @@ _restore_sparse = sparse_ops._take_many_sparse_from_tensors_map
 # pylint: enable=protected-access
 
 
-@tf_export("train.match_filenames_once")
+@tf_export("io.match_filenames_once", "train.match_filenames_once")
+@deprecation.deprecated_endpoints("train.match_filenames_once")
 def match_filenames_once(pattern, name=None):
   """Save the list of files matching pattern, so it is only computed once.
 
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
index d9efe97821..ab6287f8cd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
@@ -1,5 +1,89 @@
 path: "tensorflow.debugging"
 tf_module {
+  member_method {
+    name: "Assert"
+    argspec: "args=[\'condition\', \'data\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_all_finite"
+    argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "assert_equal"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_greater"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_greater_equal"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_integer"
+    argspec: "args=[\'x\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_less"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_less_equal"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_near"
+    argspec: "args=[\'x\', \'y\', \'rtol\', \'atol\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_negative"
+    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_non_negative"
+    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_non_positive"
+    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_none_equal"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_positive"
+    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_proper_iterable"
+    argspec: "args=[\'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "assert_rank"
+    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_rank_at_least"
+    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_rank_in"
+    argspec: "args=[\'x\', \'ranks\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_same_float_dtype"
+    argspec: "args=[\'tensors\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_scalar"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "assert_type"
+    argspec: "args=[\'tensor\', \'tf_type\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "check_numerics"
     argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -16,4 +100,16 @@ tf_module {
     name: "is_nan"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "is_non_decreasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_numeric_tensor"
+    argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_strictly_increasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.dtypes.-d-type.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.dtypes.-d-type.pbtxt
new file mode 100644
index 0000000000..423eca32a2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.dtypes.-d-type.pbtxt
@@ -0,0 +1,77 @@
+path: "tensorflow.dtypes.DType"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "as_datatype_enum"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "as_numpy_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "base_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_bool"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_complex"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_floating"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_integer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_numpy_compatible"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_quantized"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_unsigned"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "limits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "max"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "min"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "real_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "size"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'type_enum\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
index 98e1feed00..ea23feca84 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
@@ -1,7 +1,27 @@
 path: "tensorflow.dtypes"
 tf_module {
+  member {
+    name: "DType"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "as_dtype"
+    argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "as_string"
     argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
   }
+  member_method {
+    name: "cast"
+    argspec: "args=[\'x\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "complex"
+    argspec: "args=[\'real\', \'imag\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "saturate_cast"
+    argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.graph_util.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.graph_util.pbtxt
index eeabf845dc..162ee76ee7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.graph_util.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.graph_util.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "extract_sub_graph"
     argspec: "args=[\'graph_def\', \'dest_nodes\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "import_graph_def"
+    argspec: "args=[\'graph_def\', \'input_map\', \'return_elements\', \'name\', \'op_dict\', \'producer_op_list\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "must_run_on_cpu"
     argspec: "args=[\'node\', \'pin_variables_on_cpu\'], varargs=None, keywords=None, defaults=[\'False\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
index 5c46dc5ee7..0a231f1b65 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
@@ -148,6 +148,10 @@ tf_module {
     name: "random_contrast"
     argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "random_crop"
+    argspec: "args=[\'value\', \'size\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "random_flip_left_right"
     argspec: "args=[\'image\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.initializers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.pbtxt
index d499c67d89..19ca62122e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.initializers.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.initializers.pbtxt
@@ -72,6 +72,10 @@ tf_module {
     name: "local_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "tables_initializer"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], "
+  }
   member_method {
     name: "variables"
     argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'init\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.-fixed-len-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.-fixed-len-feature.pbtxt
new file mode 100644
index 0000000000..cd0e51c8c7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.-fixed-len-feature.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.io.FixedLenFeature"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenFeature\'>"
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenFeature\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "default_value"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.-fixed-len-sequence-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.-fixed-len-sequence-feature.pbtxt
new file mode 100644
index 0000000000..8a38f25fdf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.-fixed-len-sequence-feature.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.io.FixedLenSequenceFeature"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenSequenceFeature\'>"
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenSequenceFeature\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "allow_missing"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "default_value"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.-padding-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.-padding-f-i-f-o-queue.pbtxt
new file mode 100644
index 0000000000..85306fdcac
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.-padding-f-i-f-o-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.io.PaddingFIFOQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PaddingFIFOQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'dtypes\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'padding_fifo_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.-priority-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.-priority-queue.pbtxt
new file mode 100644
index 0000000000..02d8037b34
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.-priority-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.io.PriorityQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PriorityQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'types\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'priority_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.-queue-base.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.-queue-base.pbtxt
new file mode 100644
index 0000000000..a30481a0ea
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.-queue-base.pbtxt
@@ -0,0 +1,65 @@
+path: "tensorflow.io.QueueBase"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtypes\', \'shapes\', \'names\', \'queue_ref\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.-random-shuffle-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.-random-shuffle-queue.pbtxt
new file mode 100644
index 0000000000..82cbf9884f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.-random-shuffle-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.io.RandomShuffleQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.RandomShuffleQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'min_after_dequeue\', \'dtypes\', \'shapes\', \'names\', \'seed\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'random_shuffle_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.-sparse-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.-sparse-feature.pbtxt
new file mode 100644
index 0000000000..216947b4ed
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.-sparse-feature.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.io.SparseFeature"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.SparseFeature\'>"
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.SparseFeature\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "already_sorted"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "index_key"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_key"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.-t-f-record-compression-type.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.-t-f-record-compression-type.pbtxt
new file mode 100644
index 0000000000..b598f73d7e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.-t-f-record-compression-type.pbtxt
@@ -0,0 +1,20 @@
+path: "tensorflow.io.TFRecordCompressionType"
+tf_class {
+  is_instance: "<class \'tensorflow.python.lib.io.tf_record.TFRecordCompressionType\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GZIP"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "ZLIB"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.-t-f-record-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.-t-f-record-options.pbtxt
new file mode 100644
index 0000000000..bfbf37ccf4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.-t-f-record-options.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.io.TFRecordOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.lib.io.tf_record.TFRecordOptions\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "compression_type_map"
+    mtype: "<type \'dict\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'compression_type\', \'flush_mode\', \'input_buffer_size\', \'output_buffer_size\', \'window_bits\', \'compression_level\', \'compression_method\', \'mem_level\', \'compression_strategy\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_compression_type_string"
+    argspec: "args=[\'cls\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.-t-f-record-writer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.-t-f-record-writer.pbtxt
new file mode 100644
index 0000000000..6fd443f6d7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.-t-f-record-writer.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.io.TFRecordWriter"
+tf_class {
+  is_instance: "<class \'tensorflow.python.lib.io.tf_record.TFRecordWriter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'record\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.-var-len-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.-var-len-feature.pbtxt
new file mode 100644
index 0000000000..fd835dbfbb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.-var-len-feature.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.io.VarLenFeature"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.VarLenFeature\'>"
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.VarLenFeature\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
index 8938cf217b..dccf136788 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
@@ -1,5 +1,49 @@
 path: "tensorflow.io"
 tf_module {
+  member {
+    name: "FixedLenFeature"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FixedLenSequenceFeature"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PaddingFIFOQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PriorityQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "QueueBase"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RandomShuffleQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseFeature"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFRecordCompressionType"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFRecordOptions"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFRecordWriter"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "VarLenFeature"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "decode_base64"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -8,6 +52,10 @@ tf_module {
     name: "decode_compressed"
     argspec: "args=[\'bytes\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
   }
+  member_method {
+    name: "decode_csv"
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\', \'None\'], "
+  }
   member_method {
     name: "decode_json_example"
     argspec: "args=[\'json_examples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -16,18 +64,38 @@ tf_module {
     name: "decode_raw"
     argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
+  member_method {
+    name: "deserialize_many_sparse"
+    argspec: "args=[\'serialized_sparse\', \'dtype\', \'rank\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "encode_base64"
     argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "match_filenames_once"
+    argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "matching_files"
     argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "parse_example"
+    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "parse_sequence_example"
     argspec: "args=[\'serialized\', \'context_features\', \'sequence_features\', \'example_names\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "parse_single_example"
+    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "parse_single_sequence_example"
+    argspec: "args=[\'serialized\', \'context_features\', \'sequence_features\', \'example_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "parse_tensor"
     argspec: "args=[\'serialized\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -36,8 +104,24 @@ tf_module {
     name: "read_file"
     argspec: "args=[\'filename\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "serialize_many_sparse"
+    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "serialize_sparse"
+    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "tf_record_iterator"
+    argspec: "args=[\'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "write_file"
     argspec: "args=[\'filename\', \'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "write_graph"
+    argspec: "args=[\'graph_or_graph_def\', \'logdir\', \'name\', \'as_text\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
index d979116887..6ac95d96da 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
@@ -108,10 +108,18 @@ tf_module {
     name: "eye"
     argspec: "args=[\'num_rows\', \'num_columns\', \'batch_shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "global_norm"
+    argspec: "args=[\'t_list\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "inv"
     argspec: "args=[\'input\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "l2_normalize"
+    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
+  }
   member_method {
     name: "logdet"
     argspec: "args=[\'matrix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -124,6 +132,10 @@ tf_module {
     name: "lstsq"
     argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], "
   }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "norm"
     argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
index 72856466ec..459b9e3684 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.math"
 tf_module {
+  member_method {
+    name: "abs"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "accumulate_n"
+    argspec: "args=[\'inputs\', \'shape\', \'tensor_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "acos"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -12,6 +20,22 @@ tf_module {
     name: "add"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_n"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "angle"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "argmax"
+    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
+  }
+  member_method {
+    name: "argmin"
+    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
+  }
   member_method {
     name: "asin"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -52,10 +76,18 @@ tf_module {
     name: "betainc"
     argspec: "args=[\'a\', \'b\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "bincount"
+    argspec: "args=[\'arr\', \'weights\', \'minlength\', \'maxlength\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int32\'>\"], "
+  }
   member_method {
     name: "ceil"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "conj"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "cos"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -64,14 +96,34 @@ tf_module {
     name: "cosh"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "count_nonzero"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cumprod"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "cumsum"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "digamma"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "divide"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "erf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "erfc"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -88,6 +140,10 @@ tf_module {
     name: "floor"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "floordiv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "greater"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -104,10 +160,26 @@ tf_module {
     name: "igammac"
     argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "imag"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "in_top_k"
+    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "invert_permutation"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "l2_normalize"
+    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "lbeta"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -128,6 +200,14 @@ tf_module {
     name: "log1p"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "log_sigmoid"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "log_softmax"
+    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "logical_and"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -140,6 +220,10 @@ tf_module {
     name: "logical_or"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "logical_xor"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'LogicalXor\'], "
+  }
   member_method {
     name: "maximum"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -148,6 +232,14 @@ tf_module {
     name: "minimum"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "multiply"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "negative"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "not_equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -160,18 +252,66 @@ tf_module {
     name: "polyval"
     argspec: "args=[\'coeffs\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "pow"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "real"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "reciprocal"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "reduce_all"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_any"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_logsumexp"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_max"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_mean"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_min"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_prod"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_sum"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "rint"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "round"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "rsqrt"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "scalar_mul"
+    argspec: "args=[\'scalar\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "segment_max"
     argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -192,6 +332,14 @@ tf_module {
     name: "segment_sum"
     argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sigmoid"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sign"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "sin"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -200,6 +348,10 @@ tf_module {
     name: "sinh"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "softmax"
+    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "softplus"
     argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -208,18 +360,46 @@ tf_module {
     name: "softsign"
     argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sqrt"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "square"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "squared_difference"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "subtract"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "tan"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "tanh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "top_k"
+    argspec: "args=[\'input\', \'k\', \'sorted\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "truediv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_max"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unsorted_segment_mean"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_min"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -228,6 +408,10 @@ tf_module {
     name: "unsorted_segment_prod"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unsorted_segment_sqrt_n"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_sum"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -240,6 +424,10 @@ tf_module {
     name: "xlogy"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "zero_fraction"
+    argspec: "args=[\'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "zeta"
     argspec: "args=[\'x\', \'q\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
index d9e5b0d0fc..9b28ce5746 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
@@ -100,6 +100,10 @@ tf_module {
     name: "ctc_loss"
     argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], "
   }
+  member_method {
+    name: "depth_to_space"
+    argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
+  }
   member_method {
     name: "depthwise_conv2d"
     argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -304,6 +308,14 @@ tf_module {
     name: "softsign"
     argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "space_to_batch"
+    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "space_to_depth"
+    argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
+  }
   member_method {
     name: "sparse_softmax_cross_entropy_with_logits"
     argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 509ceff9df..a268529c1f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -496,6 +496,10 @@ tf_module {
     name: "quint8"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "random"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "random_normal_initializer"
     mtype: "<type \'type\'>"
@@ -1744,6 +1748,10 @@ tf_module {
     name: "rint"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "roll"
+    argspec: "args=[\'input\', \'shift\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "round"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt
index 6d865efed0..77c92aeb0d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt
@@ -28,6 +28,10 @@ tf_module {
     name: "fake_quant_with_min_max_vars_per_channel_gradient"
     argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "quantize"
+    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'HALF_AWAY_FROM_ZERO\', \'None\'], "
+  }
   member_method {
     name: "quantized_concat"
     argspec: "args=[\'concat_dim\', \'values\', \'input_mins\', \'input_maxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
new file mode 100644
index 0000000000..a568dd4cd8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
@@ -0,0 +1,47 @@
+path: "tensorflow.random"
+tf_module {
+  member_method {
+    name: "gamma"
+    argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_seed"
+    argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "log_uniform_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "multinomial"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "normal"
+    argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "poisson"
+    argspec: "args=[\'lam\', \'shape\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_random_seed"
+    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "shuffle"
+    argspec: "args=[\'value\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "truncated_normal"
+    argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "uniform"
+    argspec: "args=[\'shape\', \'minval\', \'maxval\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "uniform_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt
new file mode 100644
index 0000000000..67457de070
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.saved_model.Builder"
+tf_class {
+  is_instance: "<class \'tensorflow.python.saved_model.builder_impl.SavedModelBuilder\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'export_dir\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_meta_graph"
+    argspec: "args=[\'self\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\', \'saver\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "add_meta_graph_and_variables"
+    argspec: "args=[\'self\', \'sess\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\', \'saver\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'as_text\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
index e1a0385092..3f4965fc69 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.saved_model"
 tf_module {
+  member {
+    name: "Builder"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "builder"
     mtype: "<type \'module\'>"
@@ -32,6 +36,46 @@ tf_module {
     name: "utils"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "build_signature_def"
+    argspec: "args=[\'inputs\', \'outputs\', \'method_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "build_tensor_info"
+    argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "classification_signature_def"
+    argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_tensor_from_tensor_info"
+    argspec: "args=[\'tensor_info\', \'graph\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "is_valid_signature"
+    argspec: "args=[\'signature_def\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load"
+    argspec: "args=[\'sess\', \'tags\', \'export_dir\', \'import_scope\'], varargs=None, keywords=saver_kwargs, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "main_op_with_restore"
+    argspec: "args=[\'restore_op_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "maybe_saved_model_directory"
+    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict_signature_def"
+    argspec: "args=[\'inputs\', \'outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "regression_signature_def"
+    argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "simple_save"
     argspec: "args=[\'session\', \'export_dir\', \'inputs\', \'outputs\', \'legacy_init_op\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-conditional-accumulator.pbtxt
new file mode 100644
index 0000000000..cd97716c9d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-conditional-accumulator.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.sparse.SparseConditionalAccumulator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.SparseConditionalAccumulator\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "accumulator_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\', \'shared_name\', \'name\', \'reduction_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'sparse_conditional_accumulator\', \'MEAN\'], "
+  }
+  member_method {
+    name: "apply_grad"
+    argspec: "args=[\'self\', \'grad_indices\', \'grad_values\', \'grad_shape\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "apply_indexed_slices_grad"
+    argspec: "args=[\'self\', \'grad\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "num_accumulated"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "set_global_step"
+    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "take_grad"
+    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "take_indexed_slices_grad"
+    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt
new file mode 100644
index 0000000000..02e59a63e1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt
@@ -0,0 +1,54 @@
+path: "tensorflow.sparse.SparseTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
+  is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dense_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "indices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'indices\', \'values\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "consumers"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "eval"
+    argspec: "args=[\'self\', \'feed_dict\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'cls\', \'sparse_tensor_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_shape"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
index ba9e651b34..32bd8d5f8e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
@@ -1,5 +1,21 @@
 path: "tensorflow.sparse"
 tf_module {
+  member {
+    name: "SparseConditionalAccumulator"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseTensor"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "add"
+    argspec: "args=[\'a\', \'b\', \'thresh\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "concat"
+    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "cross"
     argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -16,4 +32,100 @@ tf_module {
     name: "eye"
     argspec: "args=[\'num_rows\', \'num_columns\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "fill_empty_rows"
+    argspec: "args=[\'sp_input\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "mask"
+    argspec: "args=[\'a\', \'mask_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "maximum"
+    argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "merge"
+    argspec: "args=[\'sp_ids\', \'sp_values\', \'vocab_size\', \'name\', \'already_sorted\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+  }
+  member_method {
+    name: "minimum"
+    argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "placeholder"
+    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_max"
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_max_sparse"
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_sum"
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_sum_sparse"
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reorder"
+    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reset_shape"
+    argspec: "args=[\'sp_input\', \'new_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reshape"
+    argspec: "args=[\'sp_input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "retain"
+    argspec: "args=[\'sp_input\', \'to_retain\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "segment_mean"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "segment_sqrt_n"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "segment_sum"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "slice"
+    argspec: "args=[\'sp_input\', \'start\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "softmax"
+    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "split"
+    argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'sp_input\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "to_indicator"
+    argspec: "args=[\'sp_input\', \'vocab_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "transpose"
+    argspec: "args=[\'sp_input\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index 312e94b41d..ebdaf57231 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -12,6 +12,10 @@ tf_module {
     name: "length"
     argspec: "args=[\'input\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
   }
+  member_method {
+    name: "reduce_join"
+    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\'], "
+  }
   member_method {
     name: "regex_full_match"
     argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
index 9f35395284..45c81fdd3b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
@@ -272,6 +272,10 @@ tf_module {
     name: "checkpoint_exists"
     argspec: "args=[\'checkpoint_prefix\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "confusion_matrix"
+    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'dtype\', \'name\', \'weights\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\", \'None\', \'None\'], "
+  }
   member_method {
     name: "cosine_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
index d9efe97821..ab6287f8cd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
@@ -1,5 +1,89 @@
 path: "tensorflow.debugging"
 tf_module {
+  member_method {
+    name: "Assert"
+    argspec: "args=[\'condition\', \'data\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_all_finite"
+    argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "assert_equal"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_greater"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_greater_equal"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_integer"
+    argspec: "args=[\'x\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_less"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_less_equal"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_near"
+    argspec: "args=[\'x\', \'y\', \'rtol\', \'atol\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_negative"
+    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_non_negative"
+    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_non_positive"
+    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_none_equal"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_positive"
+    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_proper_iterable"
+    argspec: "args=[\'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "assert_rank"
+    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_rank_at_least"
+    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_rank_in"
+    argspec: "args=[\'x\', \'ranks\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_same_float_dtype"
+    argspec: "args=[\'tensors\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_scalar"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "assert_type"
+    argspec: "args=[\'tensor\', \'tf_type\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "check_numerics"
     argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -16,4 +100,16 @@ tf_module {
     name: "is_nan"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "is_non_decreasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_numeric_tensor"
+    argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_strictly_increasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.-d-type.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.-d-type.pbtxt
new file mode 100644
index 0000000000..423eca32a2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.-d-type.pbtxt
@@ -0,0 +1,77 @@
+path: "tensorflow.dtypes.DType"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "as_datatype_enum"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "as_numpy_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "base_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_bool"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_complex"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_floating"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_integer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_numpy_compatible"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_quantized"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_unsigned"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "limits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "max"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "min"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "real_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "size"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'type_enum\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
index 98e1feed00..ea23feca84 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
@@ -1,7 +1,27 @@
 path: "tensorflow.dtypes"
 tf_module {
+  member {
+    name: "DType"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "as_dtype"
+    argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "as_string"
     argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
   }
+  member_method {
+    name: "cast"
+    argspec: "args=[\'x\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "complex"
+    argspec: "args=[\'real\', \'imag\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "saturate_cast"
+    argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt
index eeabf845dc..162ee76ee7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "extract_sub_graph"
     argspec: "args=[\'graph_def\', \'dest_nodes\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "import_graph_def"
+    argspec: "args=[\'graph_def\', \'input_map\', \'return_elements\', \'name\', \'op_dict\', \'producer_op_list\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "must_run_on_cpu"
     argspec: "args=[\'node\', \'pin_variables_on_cpu\'], varargs=None, keywords=None, defaults=[\'False\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
index 5c46dc5ee7..0a231f1b65 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
@@ -148,6 +148,10 @@ tf_module {
     name: "random_contrast"
     argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "random_crop"
+    argspec: "args=[\'value\', \'size\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "random_flip_left_right"
     argspec: "args=[\'image\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
index e3c63fe737..d49181714f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
@@ -64,4 +64,8 @@ tf_module {
     name: "lecun_uniform"
     argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "tables_initializer"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-fixed-len-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.-fixed-len-feature.pbtxt
new file mode 100644
index 0000000000..cd0e51c8c7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.-fixed-len-feature.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.io.FixedLenFeature"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenFeature\'>"
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenFeature\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "default_value"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-fixed-len-sequence-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.-fixed-len-sequence-feature.pbtxt
new file mode 100644
index 0000000000..8a38f25fdf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.-fixed-len-sequence-feature.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.io.FixedLenSequenceFeature"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenSequenceFeature\'>"
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenSequenceFeature\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "allow_missing"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "default_value"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-padding-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.-padding-f-i-f-o-queue.pbtxt
new file mode 100644
index 0000000000..85306fdcac
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.-padding-f-i-f-o-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.io.PaddingFIFOQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PaddingFIFOQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'dtypes\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'padding_fifo_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-priority-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.-priority-queue.pbtxt
new file mode 100644
index 0000000000..02d8037b34
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.-priority-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.io.PriorityQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PriorityQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'types\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'priority_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-queue-base.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.-queue-base.pbtxt
new file mode 100644
index 0000000000..a30481a0ea
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.-queue-base.pbtxt
@@ -0,0 +1,65 @@
+path: "tensorflow.io.QueueBase"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtypes\', \'shapes\', \'names\', \'queue_ref\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-random-shuffle-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.-random-shuffle-queue.pbtxt
new file mode 100644
index 0000000000..82cbf9884f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.-random-shuffle-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.io.RandomShuffleQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.RandomShuffleQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'min_after_dequeue\', \'dtypes\', \'shapes\', \'names\', \'seed\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'random_shuffle_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-sparse-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.-sparse-feature.pbtxt
new file mode 100644
index 0000000000..216947b4ed
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.-sparse-feature.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.io.SparseFeature"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.SparseFeature\'>"
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.SparseFeature\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "already_sorted"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "index_key"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_key"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-t-f-record-compression-type.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.-t-f-record-compression-type.pbtxt
new file mode 100644
index 0000000000..b598f73d7e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.-t-f-record-compression-type.pbtxt
@@ -0,0 +1,20 @@
+path: "tensorflow.io.TFRecordCompressionType"
+tf_class {
+  is_instance: "<class \'tensorflow.python.lib.io.tf_record.TFRecordCompressionType\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GZIP"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "ZLIB"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-t-f-record-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.-t-f-record-options.pbtxt
new file mode 100644
index 0000000000..bfbf37ccf4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.-t-f-record-options.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.io.TFRecordOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.lib.io.tf_record.TFRecordOptions\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "compression_type_map"
+    mtype: "<type \'dict\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'compression_type\', \'flush_mode\', \'input_buffer_size\', \'output_buffer_size\', \'window_bits\', \'compression_level\', \'compression_method\', \'mem_level\', \'compression_strategy\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_compression_type_string"
+    argspec: "args=[\'cls\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-t-f-record-writer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.-t-f-record-writer.pbtxt
new file mode 100644
index 0000000000..6fd443f6d7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.-t-f-record-writer.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.io.TFRecordWriter"
+tf_class {
+  is_instance: "<class \'tensorflow.python.lib.io.tf_record.TFRecordWriter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'record\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-var-len-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.-var-len-feature.pbtxt
new file mode 100644
index 0000000000..fd835dbfbb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.-var-len-feature.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.io.VarLenFeature"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.VarLenFeature\'>"
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.VarLenFeature\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
index 8938cf217b..dccf136788 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
@@ -1,5 +1,49 @@
 path: "tensorflow.io"
 tf_module {
+  member {
+    name: "FixedLenFeature"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FixedLenSequenceFeature"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PaddingFIFOQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PriorityQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "QueueBase"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RandomShuffleQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseFeature"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFRecordCompressionType"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFRecordOptions"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFRecordWriter"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "VarLenFeature"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "decode_base64"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -8,6 +52,10 @@ tf_module {
     name: "decode_compressed"
     argspec: "args=[\'bytes\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
   }
+  member_method {
+    name: "decode_csv"
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\', \'None\'], "
+  }
   member_method {
     name: "decode_json_example"
     argspec: "args=[\'json_examples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -16,18 +64,38 @@ tf_module {
     name: "decode_raw"
     argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
+  member_method {
+    name: "deserialize_many_sparse"
+    argspec: "args=[\'serialized_sparse\', \'dtype\', \'rank\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "encode_base64"
     argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "match_filenames_once"
+    argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "matching_files"
     argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "parse_example"
+    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "parse_sequence_example"
     argspec: "args=[\'serialized\', \'context_features\', \'sequence_features\', \'example_names\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "parse_single_example"
+    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "parse_single_sequence_example"
+    argspec: "args=[\'serialized\', \'context_features\', \'sequence_features\', \'example_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "parse_tensor"
     argspec: "args=[\'serialized\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -36,8 +104,24 @@ tf_module {
     name: "read_file"
     argspec: "args=[\'filename\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "serialize_many_sparse"
+    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "serialize_sparse"
+    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "tf_record_iterator"
+    argspec: "args=[\'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "write_file"
     argspec: "args=[\'filename\', \'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "write_graph"
+    argspec: "args=[\'graph_or_graph_def\', \'logdir\', \'name\', \'as_text\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
index d979116887..6ac95d96da 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
@@ -108,10 +108,18 @@ tf_module {
     name: "eye"
     argspec: "args=[\'num_rows\', \'num_columns\', \'batch_shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "global_norm"
+    argspec: "args=[\'t_list\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "inv"
     argspec: "args=[\'input\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "l2_normalize"
+    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
+  }
   member_method {
     name: "logdet"
     argspec: "args=[\'matrix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -124,6 +132,10 @@ tf_module {
     name: "lstsq"
     argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], "
   }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "norm"
     argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
index 72856466ec..459b9e3684 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.math"
 tf_module {
+  member_method {
+    name: "abs"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "accumulate_n"
+    argspec: "args=[\'inputs\', \'shape\', \'tensor_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "acos"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -12,6 +20,22 @@ tf_module {
     name: "add"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_n"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "angle"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "argmax"
+    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
+  }
+  member_method {
+    name: "argmin"
+    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
+  }
   member_method {
     name: "asin"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -52,10 +76,18 @@ tf_module {
     name: "betainc"
     argspec: "args=[\'a\', \'b\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "bincount"
+    argspec: "args=[\'arr\', \'weights\', \'minlength\', \'maxlength\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int32\'>\"], "
+  }
   member_method {
     name: "ceil"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "conj"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "cos"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -64,14 +96,34 @@ tf_module {
     name: "cosh"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "count_nonzero"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cumprod"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "cumsum"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "digamma"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "divide"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "erf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "erfc"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -88,6 +140,10 @@ tf_module {
     name: "floor"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "floordiv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "greater"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -104,10 +160,26 @@ tf_module {
     name: "igammac"
     argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "imag"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "in_top_k"
+    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "invert_permutation"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "l2_normalize"
+    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "lbeta"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -128,6 +200,14 @@ tf_module {
     name: "log1p"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "log_sigmoid"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "log_softmax"
+    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "logical_and"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -140,6 +220,10 @@ tf_module {
     name: "logical_or"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "logical_xor"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'LogicalXor\'], "
+  }
   member_method {
     name: "maximum"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -148,6 +232,14 @@ tf_module {
     name: "minimum"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "multiply"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "negative"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "not_equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -160,18 +252,66 @@ tf_module {
     name: "polyval"
     argspec: "args=[\'coeffs\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "pow"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "real"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "reciprocal"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "reduce_all"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_any"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_logsumexp"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_max"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_mean"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_min"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_prod"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_sum"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "rint"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "round"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "rsqrt"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "scalar_mul"
+    argspec: "args=[\'scalar\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "segment_max"
     argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -192,6 +332,14 @@ tf_module {
     name: "segment_sum"
     argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sigmoid"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sign"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "sin"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -200,6 +348,10 @@ tf_module {
     name: "sinh"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "softmax"
+    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "softplus"
     argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -208,18 +360,46 @@ tf_module {
     name: "softsign"
     argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sqrt"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "square"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "squared_difference"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "subtract"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "tan"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "tanh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "top_k"
+    argspec: "args=[\'input\', \'k\', \'sorted\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "truediv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_max"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unsorted_segment_mean"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_min"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -228,6 +408,10 @@ tf_module {
     name: "unsorted_segment_prod"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unsorted_segment_sqrt_n"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_sum"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -240,6 +424,10 @@ tf_module {
     name: "xlogy"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "zero_fraction"
+    argspec: "args=[\'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "zeta"
     argspec: "args=[\'x\', \'q\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
index d9e5b0d0fc..9b28ce5746 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
@@ -100,6 +100,10 @@ tf_module {
     name: "ctc_loss"
     argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], "
   }
+  member_method {
+    name: "depth_to_space"
+    argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
+  }
   member_method {
     name: "depthwise_conv2d"
     argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -304,6 +308,14 @@ tf_module {
     name: "softsign"
     argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "space_to_batch"
+    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "space_to_depth"
+    argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
+  }
   member_method {
     name: "sparse_softmax_cross_entropy_with_logits"
     argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index d2dc8bc85f..5b3ea75bce 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -456,6 +456,10 @@ tf_module {
     name: "quint8"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "random"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "random_normal_initializer"
     mtype: "<type \'type\'>"
@@ -1608,6 +1612,10 @@ tf_module {
     name: "rint"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "roll"
+    argspec: "args=[\'input\', \'shift\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "round"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt
index 6d865efed0..77c92aeb0d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt
@@ -28,6 +28,10 @@ tf_module {
     name: "fake_quant_with_min_max_vars_per_channel_gradient"
     argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "quantize"
+    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'HALF_AWAY_FROM_ZERO\', \'None\'], "
+  }
   member_method {
     name: "quantized_concat"
     argspec: "args=[\'concat_dim\', \'values\', \'input_mins\', \'input_maxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
new file mode 100644
index 0000000000..a568dd4cd8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
@@ -0,0 +1,47 @@
+path: "tensorflow.random"
+tf_module {
+  member_method {
+    name: "gamma"
+    argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_seed"
+    argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "log_uniform_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "multinomial"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "normal"
+    argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "poisson"
+    argspec: "args=[\'lam\', \'shape\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_random_seed"
+    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "shuffle"
+    argspec: "args=[\'value\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "truncated_normal"
+    argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "uniform"
+    argspec: "args=[\'shape\', \'minval\', \'maxval\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "uniform_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.-builder.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.-builder.pbtxt
new file mode 100644
index 0000000000..67457de070
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.-builder.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.saved_model.Builder"
+tf_class {
+  is_instance: "<class \'tensorflow.python.saved_model.builder_impl.SavedModelBuilder\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'export_dir\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_meta_graph"
+    argspec: "args=[\'self\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\', \'saver\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "add_meta_graph_and_variables"
+    argspec: "args=[\'self\', \'sess\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\', \'saver\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'as_text\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
index e1a0385092..3f4965fc69 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.saved_model"
 tf_module {
+  member {
+    name: "Builder"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "builder"
     mtype: "<type \'module\'>"
@@ -32,6 +36,46 @@ tf_module {
     name: "utils"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "build_signature_def"
+    argspec: "args=[\'inputs\', \'outputs\', \'method_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "build_tensor_info"
+    argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "classification_signature_def"
+    argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_tensor_from_tensor_info"
+    argspec: "args=[\'tensor_info\', \'graph\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "is_valid_signature"
+    argspec: "args=[\'signature_def\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load"
+    argspec: "args=[\'sess\', \'tags\', \'export_dir\', \'import_scope\'], varargs=None, keywords=saver_kwargs, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "main_op_with_restore"
+    argspec: "args=[\'restore_op_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "maybe_saved_model_directory"
+    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict_signature_def"
+    argspec: "args=[\'inputs\', \'outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "regression_signature_def"
+    argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "simple_save"
     argspec: "args=[\'session\', \'export_dir\', \'inputs\', \'outputs\', \'legacy_init_op\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-conditional-accumulator.pbtxt
new file mode 100644
index 0000000000..cd97716c9d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-conditional-accumulator.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.sparse.SparseConditionalAccumulator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.SparseConditionalAccumulator\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "accumulator_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\', \'shared_name\', \'name\', \'reduction_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'sparse_conditional_accumulator\', \'MEAN\'], "
+  }
+  member_method {
+    name: "apply_grad"
+    argspec: "args=[\'self\', \'grad_indices\', \'grad_values\', \'grad_shape\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "apply_indexed_slices_grad"
+    argspec: "args=[\'self\', \'grad\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "num_accumulated"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "set_global_step"
+    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "take_grad"
+    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "take_indexed_slices_grad"
+    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt
new file mode 100644
index 0000000000..02e59a63e1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt
@@ -0,0 +1,54 @@
+path: "tensorflow.sparse.SparseTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
+  is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dense_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "indices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'indices\', \'values\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "consumers"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "eval"
+    argspec: "args=[\'self\', \'feed_dict\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'cls\', \'sparse_tensor_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_shape"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
index ba9e651b34..32bd8d5f8e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
@@ -1,5 +1,21 @@
 path: "tensorflow.sparse"
 tf_module {
+  member {
+    name: "SparseConditionalAccumulator"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseTensor"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "add"
+    argspec: "args=[\'a\', \'b\', \'thresh\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "concat"
+    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "cross"
     argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -16,4 +32,100 @@ tf_module {
     name: "eye"
     argspec: "args=[\'num_rows\', \'num_columns\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "fill_empty_rows"
+    argspec: "args=[\'sp_input\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "mask"
+    argspec: "args=[\'a\', \'mask_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "maximum"
+    argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "merge"
+    argspec: "args=[\'sp_ids\', \'sp_values\', \'vocab_size\', \'name\', \'already_sorted\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+  }
+  member_method {
+    name: "minimum"
+    argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "placeholder"
+    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_max"
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_max_sparse"
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_sum"
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_sum_sparse"
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reorder"
+    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reset_shape"
+    argspec: "args=[\'sp_input\', \'new_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reshape"
+    argspec: "args=[\'sp_input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "retain"
+    argspec: "args=[\'sp_input\', \'to_retain\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "segment_mean"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "segment_sqrt_n"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "segment_sum"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "slice"
+    argspec: "args=[\'sp_input\', \'start\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "softmax"
+    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "split"
+    argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'sp_input\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "to_indicator"
+    argspec: "args=[\'sp_input\', \'vocab_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "transpose"
+    argspec: "args=[\'sp_input\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index 312e94b41d..ebdaf57231 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -12,6 +12,10 @@ tf_module {
     name: "length"
     argspec: "args=[\'input\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
   }
+  member_method {
+    name: "reduce_join"
+    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\'], "
+  }
   member_method {
     name: "regex_full_match"
     argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
index cb6da5088b..7e980fe44d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
@@ -252,6 +252,10 @@ tf_module {
     name: "checkpoint_exists"
     argspec: "args=[\'checkpoint_prefix\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "confusion_matrix"
+    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'dtype\', \'name\', \'weights\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\", \'None\', \'None\'], "
+  }
   member_method {
     name: "cosine_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
-- 
GitLab


From 694367b574dcaf5ac90f3e42b8dee8fa51ca9f38 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 11:58:17 -0700
Subject: [PATCH 238/570] Automated rollback of commit
 cb98ceba9cff8c10ee3c7e89dc8925c88b28118e

PiperOrigin-RevId: 215254762
---
 tensorflow/core/grappler/optimizers/meta_optimizer.cc | 4 ++--
 tensorflow/core/protobuf/rewriter_config.proto        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index a5f851fb1a..c3d70a1fdf 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -139,7 +139,7 @@ Status MetaOptimizer::InitializeOptimizers(
   if (cfg_.remapping() != RewriterConfig::OFF) {
     optimizers->push_back(MakeUnique<Remapper>(cfg_.remapping()));
   }
-  if (cfg_.pin_to_host_optimization() != RewriterConfig::OFF) {
+  if (cfg_.pin_to_host_optimization() == RewriterConfig::ON) {
     optimizers->push_back(MakeUnique<PinToHostOptimizer>());
   }
   if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
@@ -527,7 +527,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
          cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
          cfg.debug_stripper() == RewriterConfig::ON ||
          cfg.scoped_allocator_optimization() == RewriterConfig::ON ||
-         cfg.pin_to_host_optimization() != RewriterConfig::OFF ||
+         cfg.pin_to_host_optimization() == RewriterConfig::ON ||
          !cfg.optimizers().empty() || !cfg.custom_optimizers().empty();
 }
 
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 8e0448d536..8c31468ff5 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -75,7 +75,7 @@ message RewriterConfig {
   // Try to allocate some independent Op outputs contiguously in order to
   // merge or eliminate downstream Ops (off by default).
   Toggle scoped_allocator_optimization = 15;
-  // Force small ops onto the CPU (default is ON).
+  // Force small ops onto the CPU (default is OFF).
   Toggle pin_to_host_optimization = 18;
   // Disable the entire meta optimizer (off by default).
   bool disable_meta_optimizer = 19;
-- 
GitLab


From c4b3ce081b8abfae5560814ec445f0169cb4c368 Mon Sep 17 00:00:00 2001
From: Scott Zhu <scottzhu@google.com>
Date: Mon, 1 Oct 2018 12:03:53 -0700
Subject: [PATCH 239/570] Add new attributes for the defun forward/backward
 functions.

PiperOrigin-RevId: 215255826
---
 tensorflow/python/eager/function.py      | 39 ++++++++++++++++++------
 tensorflow/python/eager/function_test.py | 15 +++++++++
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index dd3e1a3723..60a4f018cd 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import collections
 import functools
+import re
 import sys
 import threading
 import weakref
@@ -61,9 +62,15 @@ cond_v2_impl._function = sys.modules[__name__]  # pylint: disable=protected-acce
 # This is to avoid a circular dependency with gradients_impl
 gradients_impl._function = sys.modules[__name__]  # pylint: disable=protected-access
 
+FORWARD_FUNCTION_ATTRIBUTE_NAME = "forward_function_name"
+BACKWARD_FUNCTION_ATTRIBUTE_NAME = "backward_function_name"
 
 # TODO(scottzhu): Update this to allow arbitrary attribute names in future.
-WHITELIST_FUNCTION_ATTRIBUTE_PREFIX = "experimental_"
+WHITELIST_FUNCTION_ATTRIBUTE_REGEX = [
+    "experimental_.*",
+    FORWARD_FUNCTION_ATTRIBUTE_NAME,
+    BACKWARD_FUNCTION_ATTRIBUTE_NAME
+]
 
 
 def _create_substitute_placeholder(value, name=None, dtype=None):
@@ -140,10 +147,11 @@ def _parse_func_attrs(attributes):
   """
   attrs = {}
   for key, value in attributes.items():
-    if not key.startswith(WHITELIST_FUNCTION_ATTRIBUTE_PREFIX):
+    if not any([re.match(reg, key)
+                for reg in WHITELIST_FUNCTION_ATTRIBUTE_REGEX]):
       raise ValueError("Attribute name is not whitelisted. "
                        "Whitelisted: prefix %s, got: %s" %
-                       (WHITELIST_FUNCTION_ATTRIBUTE_PREFIX, key))
+                       (WHITELIST_FUNCTION_ATTRIBUTE_REGEX, key))
 
     if isinstance(value, attr_value_pb2.AttrValue):
       attrs[key] = value
@@ -154,7 +162,7 @@ def _parse_func_attrs(attributes):
       attrs[key] = attr_value_pb2.AttrValue(i=value)
     elif isinstance(value, float):
       attrs[key] = attr_value_pb2.AttrValue(f=value)
-    elif isinstance(value, str):
+    elif isinstance(value, (str, bytes)):
       attrs[key] = attr_value_pb2.AttrValue(s=compat.as_bytes(value))
     else:
       raise ValueError("Unsupported attribute type for %s with type %s" %
@@ -705,6 +713,7 @@ class Function(object):
   def _construct_backprop_function(self):
     """Constructs the backprop function object for this function."""
     backwards_graph = FuncGraph(_backward_name(self._func_graph.name))
+    forward_function_name = _forward_name(self._func_graph.name)
     with backwards_graph.as_default():
       gradients_wrt_outputs = [
           graph_placeholder(x.dtype, x.shape) for x in self._func_graph.outputs
@@ -715,11 +724,11 @@ class Function(object):
           grad_ys=gradients_wrt_outputs,
           src_graph=self._func_graph)
 
-    self._forward_function = _EagerDefinedFunction(
-        _forward_name(
-            self._func_graph.name), self._func_graph, self._func_graph.inputs,
-        self._func_graph.outputs + list(backwards_graph.captures.keys()),
-        self._attrs)
+    backwards_graph_captures = list(backwards_graph.captures.keys())
+
+    backward_function_attr = _parse_func_attrs(
+        {FORWARD_FUNCTION_ATTRIBUTE_NAME: forward_function_name})
+    backward_function_attr.update(self._attrs)
 
     # The ordering of `backwards_graph.inputs` is important: inputs of
     # `self._backward_graph_function` correspond to outputs of
@@ -732,7 +741,17 @@ class Function(object):
         grad for grad in _flatten(gradients_wrt_inputs) if grad is not None)
     backwards_graph.structured_outputs = gradients_wrt_inputs
     self._backward_graph_function = Function(
-        backwards_graph, attrs=self._attrs)
+        backwards_graph, attrs=backward_function_attr)
+
+    forward_function_attr = _parse_func_attrs({
+        BACKWARD_FUNCTION_ATTRIBUTE_NAME:
+            self._backward_graph_function._inference_function.name})  # pylint: disable=protected-access
+    forward_function_attr.update(self._attrs)
+
+    self._forward_function = _EagerDefinedFunction(
+        forward_function_name, self._func_graph, self._func_graph.inputs,
+        self._func_graph.outputs + backwards_graph_captures,
+        forward_function_attr)
 
   def _backprop_call(self, args):
     """Calls the forward function and records the result on a tape.
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 34a2648e26..afe3ba9893 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -1687,6 +1687,21 @@ class FunctionTest(test.TestCase):
           self.assertRegexpMatches(captured_function_names[i],
                                    expected_func_name_regex[i])
 
+        # Check the forward and backward function has the correct attributes.
+        self.assertEquals(
+            functions[1].definition.attr['backward_function_name'].s,
+            functions[2].name)
+        self.assertEquals(
+            functions[2].definition.attr['forward_function_name'].s,
+            functions[1].name)
+
+        self.assertEquals(
+            functions[4].definition.attr['backward_function_name'].s,
+            functions[5].name)
+        self.assertEquals(
+            functions[5].definition.attr['forward_function_name'].s,
+            functions[4].name)
+
         sq = defun_matmul(t, t)
         double = add(t, t)
         self.assertAllEqual(sq.eval().reshape(-1), [7, 10, 15, 22])
-- 
GitLab


From f0c219d095f38f7ce6febfb68d4f84d64aa1829a Mon Sep 17 00:00:00 2001
From: Youlong Cheng <ylc@google.com>
Date: Mon, 1 Oct 2018 12:28:32 -0700
Subject: [PATCH 240/570]  Expose tpu_host_placement_function().

PiperOrigin-RevId: 215259803
---
 tensorflow/contrib/tpu/python/tpu/tpu_context.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index 7cfb6c38fa..da6bdf67d6 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -154,6 +154,20 @@ class TPUContext(object):
     # as far as model is replicated to all cores in the system.
     return self._internal_ctx.device_for_replica(replica_id)
 
+  @property
+  def tpu_host_placement_function(self):
+    """Returns the TPU host place function.
+
+    The place function takes host_id as the input and returns the TF device
+    for the correspoding host.
+    """
+
+    def _placement_function(host_id):
+      """Return the host device given host_id."""
+      return self._internal_ctx.tpu_host_placement_function(host_id=host_id)
+
+    return _placement_function
+
 
 class _InternalTPUContext(object):
   """A context holds immutable states of TPU computation.
-- 
GitLab


From 5c8c48df7fd4ccbe4a9dec035fdec6b02a5d6016 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 12:54:56 -0700
Subject: [PATCH 241/570] Internal build specification change

PiperOrigin-RevId: 215263951
---
 tensorflow/core/BUILD | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 57819cec70..0aae29d10c 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -271,6 +271,12 @@ proto_library(
     visibility = ["//visibility:public"],
 )
 
+java_proto_library(
+    name = "example_java_proto",
+    visibility = ["//visibility:public"],
+    deps = [":example_protos"],
+)
+
 closure_proto_library(
     name = "example_protos_closure",
     visibility = ["//visibility:public"],
-- 
GitLab


From 3648cb0198690d551ea5c8eefcf706c8fa67f4f0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 13:07:12 -0700
Subject: [PATCH 242/570] Add option to initialize the TPU system.

PiperOrigin-RevId: 215266241
---
 tensorflow/python/tools/saved_model_cli.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index 3dbccd1409..2fcb0fa029 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -267,7 +267,8 @@ def scan_meta_graph_def(meta_graph_def):
 
 def run_saved_model_with_feed_dict(saved_model_dir, tag_set, signature_def_key,
                                    input_tensor_key_feed_dict, outdir,
-                                   overwrite_flag, worker=None, tf_debug=False):
+                                   overwrite_flag, worker=None, init_tpu=False,
+                                   tf_debug=False):
   """Runs SavedModel and fetch all outputs.
 
   Runs the input dictionary through the MetaGraphDef within a SavedModel
@@ -287,6 +288,8 @@ def run_saved_model_with_feed_dict(saved_model_dir, tag_set, signature_def_key,
         the same name exists.
     worker: If provided, the session will be run on the worker.  Valid worker
         specification is a bns or gRPC path.
+    init_tpu: If true, the TPU system will be initialized after the session
+        is created.
     tf_debug: A boolean flag to use TensorFlow Debugger (TFDBG) to observe the
         intermediate Tensor values and runtime GraphDefs while running the
         SavedModel.
@@ -328,6 +331,12 @@ def run_saved_model_with_feed_dict(saved_model_dir, tag_set, signature_def_key,
   ]
 
   with session.Session(worker, graph=ops_lib.Graph()) as sess:
+    if init_tpu:
+      print('Initializing TPU System ...')
+      # This is needed for freshly started worker, or if the job
+      # restarts after a preemption.
+      sess.run(tf.contrib.tpu.initialize_system())
+
     loader.load(sess, tag_set.split(','), saved_model_dir)
 
     if tf_debug:
@@ -632,7 +641,7 @@ def run(args):
   run_saved_model_with_feed_dict(args.dir, args.tag_set, args.signature_def,
                                  tensor_key_feed_dict, args.outdir,
                                  args.overwrite, worker=args.worker,
-                                 tf_debug=args.tf_debug)
+                                 init_tpu=args.init_tpu, tf_debug=args.tf_debug)
 
 
 def scan(args):
@@ -775,6 +784,12 @@ def create_parser():
       default=None,
       help='if specified, a Session will be run on the worker. '
            'Valid worker specification is a bns or gRPC path.')
+  parser_run.add_argument(
+      '--init_tpu',
+      action='store_true',
+      default=None,
+      help='if specified, tpu.initialize_system will be called on the Session. '
+           'This option should be only used if the worker is a TPU job.')
   parser_run.set_defaults(func=run)
 
   # scan command
-- 
GitLab


From 3c6e6885f32e7638ece306dad3a5081b06137bdc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 13:08:10 -0700
Subject: [PATCH 243/570] Check in and refactor the OVIC detector benchmarker.

PiperOrigin-RevId: 215266415
---
 tensorflow/contrib/lite/java/ovic/BUILD       |  61 +++++-
 .../contrib/lite/java/ovic/demo/app/BUILD     |   5 +-
 .../demo/app/OvicBenchmarkerActivity.java     |  77 +++++---
 .../demo/app/res/layout/activity_main.xml     |  27 ++-
 .../java/ovic/demo/app/res/values/strings.xml |   3 +-
 .../java/org/tensorflow/ovic/BoundingBox.java |  68 +++++++
 .../org/tensorflow/ovic/OvicBenchmarker.java  | 152 ++++++---------
 ...ult.java => OvicClassificationResult.java} |  12 +-
 .../org/tensorflow/ovic/OvicClassifier.java   |  10 +-
 .../ovic/OvicClassifierBenchmarker.java       | 142 ++++++++++++++
 .../tensorflow/ovic/OvicDetectionResult.java  |  91 +++++++++
 .../org/tensorflow/ovic/OvicDetector.java     | 184 ++++++++++++++++++
 .../ovic/OvicDetectorBenchmarker.java         | 160 +++++++++++++++
 .../org/tensorflow/ovic/OvicValidator.java    |   2 +-
 .../tensorflow/ovic/OvicClassifierTest.java   |   6 +-
 .../org/tensorflow/ovic/OvicDetectorTest.java | 149 ++++++++++++++
 .../contrib/lite/java/ovic/src/testdata/BUILD |   5 +-
 .../java/ovic/src/testdata/coco_labels.txt    |  91 +++++++++
 18 files changed, 1101 insertions(+), 144 deletions(-)
 create mode 100644 tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/BoundingBox.java
 rename tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/{OvicSingleImageResult.java => OvicClassificationResult.java} (83%)
 create mode 100644 tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java
 create mode 100644 tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectionResult.java
 create mode 100644 tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetector.java
 create mode 100644 tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java
 create mode 100644 tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicDetectorTest.java
 create mode 100644 tensorflow/contrib/lite/java/ovic/src/testdata/coco_labels.txt

diff --git a/tensorflow/contrib/lite/java/ovic/BUILD b/tensorflow/contrib/lite/java/ovic/BUILD
index bb0be04ca2..ea9b9ed4b6 100644
--- a/tensorflow/contrib/lite/java/ovic/BUILD
+++ b/tensorflow/contrib/lite/java/ovic/BUILD
@@ -9,6 +9,7 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/java:build_defs.bzl", "JAVACOPTS")
 
+# Build targets for OVIC classification.
 java_test(
     name = "OvicClassifierTest",
     size = "medium",
@@ -45,8 +46,9 @@ android_library(
     name = "ovicbenchmarkerlib",
     srcs = [
         "src/main/java/org/tensorflow/ovic/OvicBenchmarker.java",
+        "src/main/java/org/tensorflow/ovic/OvicClassificationResult.java",
         "src/main/java/org/tensorflow/ovic/OvicClassifier.java",
-        "src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java",
+        "src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java",
     ],
     manifest = "//tensorflow/contrib/lite/java:AndroidManifest.xml",
     tags = ["no_oss"],
@@ -60,8 +62,8 @@ android_library(
 java_library(
     name = "ovicbenchmarkerlib_java",
     srcs = [
+        "src/main/java/org/tensorflow/ovic/OvicClassificationResult.java",
         "src/main/java/org/tensorflow/ovic/OvicClassifier.java",
-        "src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java",
     ],
     javacopts = JAVACOPTS,
     tags = ["no_oss"],
@@ -73,3 +75,58 @@ java_library(
         "@org_checkerframework_qual",
     ],
 )
+
+# Build targets for OVIC detection.
+java_test(
+    name = "OvicDetectorTest",
+    size = "medium",
+    srcs = ["src/test/java/org/tensorflow/ovic/OvicDetectorTest.java"],
+    data = [
+        "//tensorflow/contrib/lite/java/ovic/src/testdata:coco_labels.txt",
+        "//tensorflow/contrib/lite/java/ovic/src/testdata:ovic_testdata",
+        "@tflite_mobilenet_ssd_quant//:detect.tflite",
+    ],
+    javacopts = JAVACOPTS,
+    tags = ["no_oss"],
+    test_class = "org.tensorflow.ovic.OvicDetectorTest",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/lite/java/ovic:ovicdetectionbenchmarkerlib_java",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+android_library(
+    name = "ovicdetectionbenchmarkerlib",
+    srcs = [
+        "src/main/java/org/tensorflow/ovic/BoundingBox.java",
+        "src/main/java/org/tensorflow/ovic/OvicBenchmarker.java",
+        "src/main/java/org/tensorflow/ovic/OvicDetectionResult.java",
+        "src/main/java/org/tensorflow/ovic/OvicDetector.java",
+        "src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java",
+    ],
+    manifest = "//tensorflow/contrib/lite/java:AndroidManifest.xml",
+    deps = [
+        "//tensorflow/contrib/lite/java:tensorflowlite",
+        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@org_checkerframework_qual",
+    ],
+)
+
+java_library(
+    name = "ovicdetectionbenchmarkerlib_java",
+    srcs = [
+        "src/main/java/org/tensorflow/ovic/BoundingBox.java",
+        "src/main/java/org/tensorflow/ovic/OvicDetectionResult.java",
+        "src/main/java/org/tensorflow/ovic/OvicDetector.java",
+    ],
+    javacopts = JAVACOPTS,
+    deps = [
+        "//tensorflow/contrib/lite/java:libtensorflowlite_jni.so",
+        "//tensorflow/contrib/lite/java:tensorflowlite_java",
+        "//tensorflow/contrib/lite/java/src/main/native",
+        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@org_checkerframework_qual",
+    ],
+)
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/BUILD b/tensorflow/contrib/lite/java/ovic/demo/app/BUILD
index 058240aada..f567358ea3 100644
--- a/tensorflow/contrib/lite/java/ovic/demo/app/BUILD
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/BUILD
@@ -10,8 +10,10 @@ android_binary(
     ],
     aapt_version = "aapt",
     assets = [
-        "//tensorflow/contrib/lite/java/ovic/src/testdata:ovic_testdata",
+        "//tensorflow/contrib/lite/java/ovic/src/testdata:coco_labels.txt",
         "//tensorflow/contrib/lite/java/ovic/src/testdata:labels.txt",
+        "//tensorflow/contrib/lite/java/ovic/src/testdata:ovic_testdata",
+        "@tflite_mobilenet_ssd_quant//:detect.tflite",
     ],
     assets_dir = "",
     custom_package = "ovic.demo.app",
@@ -25,6 +27,7 @@ android_binary(
     deps = [
         "//tensorflow/contrib/lite/java:tensorflowlite",
         "//tensorflow/contrib/lite/java/ovic:ovicbenchmarkerlib",
+        "//tensorflow/contrib/lite/java/ovic:ovicdetectionbenchmarkerlib",
         "@androidsdk//com.android.support:support-v13-25.2.0",
         "@androidsdk//com.android.support:support-v4-25.2.0",
     ],
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java b/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java
index 4adf94aeb6..48c29ecebe 100644
--- a/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java
@@ -35,19 +35,18 @@ import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.text.DecimalFormat;
 import org.tensorflow.ovic.OvicBenchmarker;
-import org.tensorflow.ovic.OvicSingleImageResult;
-
+import org.tensorflow.ovic.OvicClassifierBenchmarker;
+import org.tensorflow.ovic.OvicDetectorBenchmarker;
 
 /** Class that benchmark image classifier models. */
 public class OvicBenchmarkerActivity extends Activity {
   /** Tag for the {@link Log}. */
   private static final String TAG = "OvicBenchmarkerActivity";
 
-  /** Name of the label file stored in Assets. */
-  private static final String LABEL_PATH = "labels.txt";
-
-  private static final String TEST_IMAGE_PATH = "test_image_224.jpg";
-  private static final String MODEL_PATH = "float_model.lite";
+  /** Name of the task-dependent data files stored in Assets. */
+  private static String labelPath = null;
+  private static String testImagePath = null;
+  private static String modelPath = null;
   /**
    * Each bottom press will launch a benchmarking experiment. The experiment stops when either the
    * total native latency reaches WALL_TIME or the number of iterations reaches MAX_ITERATIONS,
@@ -66,8 +65,6 @@ public class OvicBenchmarkerActivity extends Activity {
   private MappedByteBuffer model = null;
   private InputStream labelInputStream = null;
   private OvicBenchmarker benchmarker;
-  /** Inference result of each iteration. */
-  OvicSingleImageResult iterResult = null;
 
   private TextView textView = null;
   // private Button startButton = null;
@@ -83,21 +80,31 @@ public class OvicBenchmarkerActivity extends Activity {
   }
 
   private Bitmap loadTestBitmap() throws IOException {
-    InputStream imageStream = getAssets().open(TEST_IMAGE_PATH);
+    InputStream imageStream = getAssets().open(testImagePath);
     return BitmapFactory.decodeStream(imageStream);
   }
 
-  public void initializeTest() throws IOException {
+  public void initializeTest(boolean benchmarkClassification) throws IOException {
     Log.i(TAG, "Initializing benchmarker.");
-    benchmarker = new OvicBenchmarker(WALL_TIME);
+    if (benchmarkClassification) {
+      benchmarker = new OvicClassifierBenchmarker(WALL_TIME);
+      labelPath = "labels.txt";
+      testImagePath = "test_image_224.jpg";
+      modelPath = "quantized_model.lite";
+    } else {  // Benchmarking detection.
+      benchmarker = new OvicDetectorBenchmarker(WALL_TIME);
+      labelPath = "coco_labels.txt";
+      testImagePath = "test_image_224.jpg";
+      modelPath = "detect.tflite";
+    }
     AssetManager am = getAssets();
-    AssetFileDescriptor fileDescriptor = am.openFd(MODEL_PATH);
+    AssetFileDescriptor fileDescriptor = am.openFd(modelPath);
     FileInputStream modelInputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
     FileChannel fileChannel = modelInputStream.getChannel();
     long startOffset = fileDescriptor.getStartOffset();
     long declaredLength = fileDescriptor.getDeclaredLength();
     model = fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
-    labelInputStream = am.open(LABEL_PATH);
+    labelInputStream = am.open(labelPath);
   }
 
   public Boolean doTestIteration() throws IOException, InterruptedException {
@@ -117,24 +124,44 @@ public class OvicBenchmarkerActivity extends Activity {
     Log.i(TAG, "Going to do test iter.");
     // Start testing.
     Bitmap testImageBitmap = loadTestBitmap();
-    iterResult = benchmarker.doTestIteration(testImageBitmap);
-    testImageBitmap.recycle();
-    if (iterResult == null) {
+    try {
+      if (!benchmarker.processBitmap(testImageBitmap)) {
+        throw new RuntimeException("Failed to run test.");
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+      throw e;
+    } finally {
+      testImageBitmap.recycle();
+    }
+    String iterResultString = benchmarker.getLastResultString();
+    if (iterResultString == null) {
       throw new RuntimeException("Inference failed to produce a result.");
     }
-    Log.i(TAG, iterResult.toString());
+    Log.i(TAG, iterResultString);
     return true;
   }
 
-  public void startPressed(View view) throws IOException {
-    Log.i(TAG, "Start pressed");
+  public void detectPressed(View view) throws IOException {
+    benchmarkSession(false);
+  }
+  public void classifyPressed(View view) throws IOException {
+    benchmarkSession(true);
+  }
+
+  private void benchmarkSession(boolean benchmarkClassification) throws IOException {
     try {
-      initializeTest();
+      initializeTest(benchmarkClassification);
     } catch (IOException e) {
       Log.e(TAG, "Can't initialize benchmarker.", e);
       throw e;
     }
     String displayText = "";
+    if (benchmarkClassification) {
+      displayText = "Classification benchmark: ";
+    } else {
+      displayText = "Detection benchmark: ";
+    }
     try {
       setProcessorAffinity(BIG_CORE_MASK);
     } catch (IOException e) {
@@ -144,7 +171,6 @@ public class OvicBenchmarkerActivity extends Activity {
     Log.i(TAG, "Successfully initialized benchmarker.");
     int testIter = 0;
     Boolean iterSuccess = false;
-    double totalLatency = 0.0f;
     while (testIter < MAX_ITERATIONS) {
       try {
         iterSuccess = doTestIteration();
@@ -153,23 +179,22 @@ public class OvicBenchmarkerActivity extends Activity {
         throw e;
       } catch (InterruptedException e) {
         Log.e(TAG, "Interrupted at iteration " + testIter);
+        displayText += e.getMessage() + "\n";
       }
       if (!iterSuccess) {
         break;
       }
       testIter++;
-      totalLatency += (double) iterResult.latency;
     }
-    ;
     Log.i(TAG, "Benchmarking finished");
 
     if (textView != null) {
       if (testIter > 0) {
         textView.setText(
             displayText
-                + MODEL_PATH
+                + modelPath
                 + ": Average latency="
-                + df2.format(totalLatency / testIter)
+                + df2.format(benchmarker.getTotalRunTime() / testIter)
                 + "ms after "
                 + testIter
                 + " runs.");
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/layout/activity_main.xml b/tensorflow/contrib/lite/java/ovic/demo/app/res/layout/activity_main.xml
index e9d83bae54..1bce60ff7d 100644
--- a/tensorflow/contrib/lite/java/ovic/demo/app/res/layout/activity_main.xml
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/res/layout/activity_main.xml
@@ -30,14 +30,14 @@
     android:layout_height="wrap_content"
     android:text="@string/initial_status_msg"
     android:id="@+id/textView"
-    android:layout_above="@+id/button_start"
+    android:layout_above="@+id/button_clf_start"
     android:layout_alignParentTop="true"/>
 
   <Button
     android:layout_width="wrap_content"
     android:layout_height="wrap_content"
-    android:text="@string/start_label"
-    android:id="@id/button_start"
+    android:text="@string/start_clf_label"
+    android:id="@id/button_clf_start"
     android:layout_alignParentBottom="true"
     android:layout_alignParentLeft="true"
     android:background="@drawable/start_button_color"
@@ -49,6 +49,25 @@
     android:textColor="#ffffff"
     android:enabled="true"
     style="?android:attr/buttonBarButtonStyle"
-    android:onClick="startPressed"/>
+    android:onClick="classifyPressed"/>
+
+  <Button
+    android:layout_width="wrap_content"
+    android:layout_height="wrap_content"
+    android:text="@string/start_det_label"
+    android:id="@+id/button_det_start"
+    android:layout_alignParentBottom="true"
+    android:layout_alignParentRight="true"
+    android:layout_toRightOf="@id/button_clf_start"
+    android:background="@drawable/start_button_color"
+    android:padding="10dp"
+    android:layout_marginRight="100dp"
+    android:layout_marginLeft="30dp"
+    android:layout_marginTop="10dp"
+    android:foreground="#000000"
+    android:textColor="#ffffff"
+    android:enabled="true"
+    style="?android:attr/buttonBarButtonStyle"
+    android:onClick="detectPressed"/>
 
 </RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/values/strings.xml b/tensorflow/contrib/lite/java/ovic/demo/app/res/values/strings.xml
index d26beb1d27..53525908d3 100644
--- a/tensorflow/contrib/lite/java/ovic/demo/app/res/values/strings.xml
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/res/values/strings.xml
@@ -17,6 +17,7 @@
 <resources>
     <string name="app_name" translatable="false">Benchmarker</string>
 
-    <string name="start_label" translatable="false">Start</string>
+    <string name="start_clf_label" translatable="false">Clf</string>
+    <string name="start_det_label" translatable="false">Det</string>
     <string name="initial_status_msg" translatable="false"> Press start to run the benchmarks.</string>
 </resources>
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/BoundingBox.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/BoundingBox.java
new file mode 100644
index 0000000000..9bf7d005d2
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/BoundingBox.java
@@ -0,0 +1,68 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+/** Class for holding a detection bounding box with category and confidence. */
+public class BoundingBox {
+  // Upper left point.
+  public float x1;
+  public float y1;
+
+  // Lower right point.
+  public float x2;
+  public float y2;
+
+  // The area of the box
+  public float area;
+
+  // The object category
+  public int category;
+
+  // The confidence of the detection
+  public float score;
+
+  public BoundingBox(float x1, float y1, float x2, float y2, int category, float score) {
+    this.x1 = x1;
+    this.y1 = y1;
+    this.x2 = x2;
+    this.y2 = y2;
+    this.category = category;
+    this.score = score;
+    // -1 stands for area not initialized
+    this.area = -1;
+  }
+
+  // The intersection area of two bounding boxes
+  public float intersect(BoundingBox bbx) {
+    return Math.max(0, Math.min(x2, bbx.x2) - Math.max(x1, bbx.x1))
+        * Math.max(0, Math.min(y2, bbx.y2) - Math.max(y1, bbx.y1));
+  }
+
+  // The union area of two bounding boxes
+  public float union(BoundingBox bbx) {
+    return bbx.getArea() + this.getArea() - this.intersect(bbx);
+  }
+
+  public float getArea() {
+    if (area < 0) {
+      area = (x2 - x1) * (y2 - y1);
+    }
+    return area;
+  }
+
+  public float computeIoU(BoundingBox bbx) {
+    return (float) (this.intersect(bbx) * 1.0 / this.union(bbx));
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
index 4cda258bee..15d9511f50 100644
--- a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
@@ -20,11 +20,10 @@ import android.util.Log;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
 import java.nio.MappedByteBuffer;
 
 /**
- * Class that benchmarks image classifier models.
+ * Base class that benchmarks image models.
  *
  * <p>===================== General workflow =======================
  *
@@ -33,37 +32,40 @@ import java.nio.MappedByteBuffer;
  * benchmarker.getReadyToTest(labelInputStream, model);
  * while (!benchmarker.shouldStop()) {
  *   Bitmap bitmap = ...
- *   benchmarker.doTestIteration(bitmap);
+ *   imgId = ...
+ *   benchmarker.processBitmap(bitmap, imgId);
  * }
  * }</pre>
  */
-public class OvicBenchmarker {
+public abstract class OvicBenchmarker {
   /** Tag for the {@link Log}. */
   private static final String TAG = "OvicBenchmarker";
 
-  /** Evaluation transformation parameters. */
-  private static final float CENTRAL_FRACTION = 0.875f;
-
   /** Dimensions of inputs. */
-  private static final int DIM_BATCH_SIZE = 1;
-  private static final int DIM_PIXEL_SIZE = 3;
-  private int imgHeight = 224;
-  private int imgWidth = 224;
+  protected static final int DIM_BATCH_SIZE = 1;
+  protected static final int DIM_PIXEL_SIZE = 3;
+  protected int imgHeight = 224;
+  protected int imgWidth = 224;
+
+  /** Preprocess parameters (only used when input is float). */
+  protected static final float IMAGE_MEAN = 127.5f;
+  protected static final float IMAGE_STD = 127.5f;
+
+  /** Whether input is float or quantized. */
+  protected Boolean quantizedInput = null;
 
   /* Preallocated buffers for storing image data in. */
-  private int[] intValues = null;
+  protected int[] intValues = null;
 
   /** A ByteBuffer to hold image data, to be feed into classifier as inputs. */
-  private ByteBuffer imgData = null;
-
-  private OvicClassifier classifier;
+  protected ByteBuffer imgData = null;
 
   /** Total runtime in ms. */
-  private double totalRuntime = 0.0;
+  protected double totalRuntime = 0.0;
   /** Total allowed runtime in ms. */
-  private double wallTime = 20000 * 30.0;
-
-  private Boolean benchmarkStarted = null;
+  protected double wallTime = 20000 * 30.0;
+  /** Record whether benchmark has started (used to skip the first image). */
+  protected boolean benchmarkStarted = false;
 
   /**
    * Initializes an {@link OvicBenchmarker}
@@ -76,6 +78,11 @@ public class OvicBenchmarker {
     this.wallTime = wallTime;
   }
 
+  /** Return the cumulative latency of all runs so far. */
+  public double getTotalRunTime() {
+    return totalRuntime;
+  }
+
   /** Check whether the benchmarker should stop. */
   public Boolean shouldStop() {
     if (totalRuntime >= wallTime) {
@@ -90,105 +97,62 @@ public class OvicBenchmarker {
     return false;
   }
 
-  /** Check whether the benchmarker is ready to start classifying images. */
-  public Boolean readyToTest() {
-    return (classifier != null);
-  }
+  /** Abstract class for checking whether the benchmarker is ready to start processing images */
+  public abstract boolean readyToTest();
 
   /**
-   * Getting the benchmarker ready for classifying images.
+   * Abstract class for getting the benchmarker ready.
    *
    * @param labelInputStream: an {@link InputStream} specifying where the list of labels should be
    *     read from.
    * @param model: a {@link MappedByteBuffer} model to benchmark.
    */
-  public void getReadyToTest(InputStream labelInputStream, MappedByteBuffer model) {
-    try {
-      Log.i(TAG, "Creating classifier.");
-      classifier = new OvicClassifier(labelInputStream, model);
-      int [] inputDims = classifier.getInputDims();
-      imgHeight = inputDims[1];
-      imgWidth = inputDims[2];
-      // Only accept QUANTIZED_UINT8 input.
-      imgData = ByteBuffer.allocateDirect(DIM_BATCH_SIZE * imgHeight * imgWidth * DIM_PIXEL_SIZE);
-      imgData.order(ByteOrder.nativeOrder());
-      intValues = new int[imgHeight * imgWidth];
-    } catch (Exception e) {
-        Log.e(TAG, e.getMessage());
-        Log.e(TAG, "Failed to initialize ImageNet classifier for the benchmarker.");
-    }
-  }
-
-  /** Return how many classes are predicted per image. */
-  public int getNumPredictions() {
-    return classifier.getNumPredictions();
-  }
+  public abstract void getReadyToTest(InputStream labelInputStream, MappedByteBuffer model);
 
   /**
    * Perform test on a single bitmap image.
    *
-   * @param bitmap: a {@link Bitmap} image to classify.
+   * @param bitmap: a {@link Bitmap} image to process.
+   * @param imageId: an ID uniquely representing the image.
    */
-  public OvicSingleImageResult doTestIteration(Bitmap bitmap)
-      throws IOException, InterruptedException {
-    if (shouldStop() || !readyToTest()) {
-      return null;
-    }
-    OvicSingleImageResult iterResult = null;
-    try {
-      Log.i(TAG, "Converting bitmap.");
-      convertBitmapToInput(bitmap);
-      Log.i(TAG, "Classifying image.");
-      iterResult = classifier.classifyByteBuffer(imgData);
-    } catch (RuntimeException e) {
-      Log.e(TAG, e.getMessage());
-      Log.e(TAG, "Failed to classify image.");
-    }
-    if (iterResult == null || iterResult.latency == null) {
-      throw new RuntimeException("Classification result or timing is invalid.");
-    }
-    Log.d(TAG, "Native inference latency: " + iterResult.latency);
-    Log.i(TAG, iterResult.toString());
+  public abstract boolean processBitmap(Bitmap bitmap, int imageId)
+      throws IOException, InterruptedException;
 
-    if (!benchmarkStarted) {  // Skip the first image to discount warming-up time.
-      benchmarkStarted = true;
-    } else {
-      totalRuntime += (double) iterResult.latency;
-    }
-    return iterResult;
+  /** Perform test on a single bitmap image without an image ID. */
+  public boolean processBitmap(Bitmap bitmap) throws IOException, InterruptedException {
+    return processBitmap(bitmap, /* imageId = */ 0);
   }
 
+  /** Returns the last inference results as string. */
+  public abstract String getLastResultString();
+
   /**
-   * Writes Image data into a {@link ByteBuffer}.
-   *
-   * @param bitmap: a {@link Bitmap} source image.
-   */
-  private void convertBitmapToInput(Bitmap bitmap) throws RuntimeException {
-    if (imgData == null) {
+   * Loads input buffer from intValues into ByteBuffer for the interpreter.
+   * Input buffer must be loaded in intValues and output will be placed in imgData.
+  */
+  protected void loadsInputToByteBuffer() {
+    if (imgData == null || intValues == null || quantizedInput == null) {
       throw new RuntimeException("Benchmarker is not yet ready to test.");
     }
-    imgData.rewind();
-    // Perform transformations corresponding to evaluation mode.
-    float width = (float) bitmap.getWidth();
-    float height = (float) bitmap.getHeight();
-    int stWidth = Math.round((width - width * CENTRAL_FRACTION) / 2);
-    int stHeight = Math.round((height - height * CENTRAL_FRACTION) / 2);
-    int newWidth = Math.round(width - stWidth * 2);
-    int newHeight = Math.round(height - stHeight * 2);
-    bitmap = Bitmap.createBitmap(bitmap, stWidth, stHeight, newWidth, newHeight);
-    bitmap = Bitmap.createScaledBitmap(bitmap, imgWidth, imgHeight, true);
-    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
-
     // Convert the image to ByteBuffer.
+    imgData.rewind();
     int pixel = 0;
     long startTime = SystemClock.uptimeMillis();
 
     for (int i = 0; i < imgHeight; ++i) {
       for (int j = 0; j < imgWidth; ++j) {
-        final int val = intValues[pixel++];
-        imgData.put((byte) ((val >> 16) & 0xFF));
-        imgData.put((byte) ((val >> 8) & 0xFF));
-        imgData.put((byte) (val & 0xFF));
+        final int pixelValue = intValues[pixel++];
+        if (quantizedInput) {
+          // Quantized model
+          imgData.put((byte) ((pixelValue >> 16) & 0xFF));
+          imgData.put((byte) ((pixelValue >> 8) & 0xFF));
+          imgData.put((byte) (pixelValue & 0xFF));
+        } else {
+          // Float model
+          imgData.putFloat((((pixelValue >> 16) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
+          imgData.putFloat((((pixelValue >> 8) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
+          imgData.putFloat(((pixelValue & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
+        }
       }
     }
     long endTime = SystemClock.uptimeMillis();
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassificationResult.java
similarity index 83%
rename from tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java
rename to tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassificationResult.java
index 4af9a65c2f..5ab804e6ee 100644
--- a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassificationResult.java
@@ -1,4 +1,4 @@
-/*Copyright 2018 Google LLC
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,17 +17,17 @@ package org.tensorflow.ovic;
 import java.util.ArrayList;
 
 /** Result class for inference run on a single image. */
-public class OvicSingleImageResult {
+public class OvicClassificationResult {
 
   /** Top K classes and probabilities. */
-  public ArrayList<String> topKClasses;
-  public ArrayList<Float> topKProbs;
-  public ArrayList<Integer> topKIndices;
+  public final ArrayList<String> topKClasses;
+  public final ArrayList<Float> topKProbs;
+  public final ArrayList<Integer> topKIndices;
 
   /** Latency (ms). */
   public Long latency;
 
-  OvicSingleImageResult() {
+  OvicClassificationResult() {
     topKClasses = new ArrayList<>();
     topKProbs = new ArrayList<>();
     topKIndices = new ArrayList<>();
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
index fd610b054f..d8a54c1f3b 100644
--- a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
@@ -31,7 +31,7 @@ import java.util.PriorityQueue;
 import org.tensorflow.lite.Interpreter;
 import org.tensorflow.lite.TestHelper;
 
-/** Benchmark ImageNet Classifier with Tensorflow Lite. */
+/** Class for running ImageNet classification with a TfLite model. */
 public class OvicClassifier {
 
   /** Tag for the {@link Log}. */
@@ -106,7 +106,7 @@ public class OvicClassifier {
 
   /** Classifies a {@link ByteBuffer} image. */
   // @throws RuntimeException if model is uninitialized.
-  public OvicSingleImageResult classifyByteBuffer(ByteBuffer imgData) {
+  public OvicClassificationResult classifyByteBuffer(ByteBuffer imgData) {
     if (tflite == null) {
       throw new RuntimeException(TAG + ": ImageNet classifier has not been initialized; Failed.");
     }
@@ -122,7 +122,7 @@ public class OvicClassifier {
         labelProbArray[0][i] = (inferenceOutputArray[0][i] & 0xff) / 255.0f;
       }
     }
-    OvicSingleImageResult iterResult = computeTopKLabels();
+    OvicClassificationResult iterResult = computeTopKLabels();
     iterResult.latency = getLastNativeInferenceLatencyMilliseconds();
     return iterResult;
   }
@@ -174,7 +174,7 @@ public class OvicClassifier {
   }
 
   /** Computes top-K labels. */
-  private OvicSingleImageResult computeTopKLabels() {
+  private OvicClassificationResult computeTopKLabels() {
     if (labelList == null) {
       throw new RuntimeException("Label file has not been loaded.");
     }
@@ -184,7 +184,7 @@ public class OvicClassifier {
         sortedLabels.poll();
       }
     }
-    OvicSingleImageResult singleImageResult = new OvicSingleImageResult();
+    OvicClassificationResult singleImageResult = new OvicClassificationResult();
     if (sortedLabels.size() != RESULTS_TO_SHOW) {
       throw new RuntimeException(
           "Number of returned labels does not match requirement: "
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java
new file mode 100644
index 0000000000..0cdd0f7bec
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java
@@ -0,0 +1,142 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import android.graphics.Bitmap;
+import android.util.Log;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+
+/** Class that benchmarks image classifier models. */
+public final class OvicClassifierBenchmarker extends OvicBenchmarker {
+  /** Tag for the {@link Log}. */
+  private static final String TAG = "OvicClassifierBenchmarker";
+
+  /** ImageNet preprocessing parameters. */
+  private static final float CENTRAL_FRACTION = 0.875f;
+  private OvicClassifier classifier;
+  private OvicClassificationResult iterResult = null;
+
+  public OvicClassifierBenchmarker(double wallTime) {
+    super(wallTime);
+  }
+
+  /** Test if the classifier is ready for benchmarking. */
+  @Override
+  public boolean readyToTest() {
+    return (classifier != null);
+  }
+
+  /**
+   * Getting the benchmarker ready for classifying images.
+   *
+   * @param labelInputStream: an {@link InputStream} specifying where the list of labels should be
+   *     read from.
+   * @param model: a {@link MappedByteBuffer} model to benchmark.
+   */
+  @Override
+   public void getReadyToTest(InputStream labelInputStream, MappedByteBuffer model) {
+    try {
+      Log.i(TAG, "Creating classifier.");
+      classifier = new OvicClassifier(labelInputStream, model);
+      int [] inputDims = classifier.getInputDims();
+      imgHeight = inputDims[1];
+      imgWidth = inputDims[2];
+      quantizedInput = true;
+      // Only accept QUANTIZED_UINT8 input.
+      imgData = ByteBuffer.allocateDirect(DIM_BATCH_SIZE * imgHeight * imgWidth * DIM_PIXEL_SIZE);
+      imgData.order(ByteOrder.nativeOrder());
+      intValues = new int[imgHeight * imgWidth];
+    } catch (Exception e) {
+        Log.e(TAG, e.getMessage());
+        Log.e(TAG, "Failed to initialize ImageNet classifier for the benchmarker.");
+    }
+  }
+
+  /**
+   * Perform classification on a single bitmap image.
+   *
+   * @param bitmap: a {@link Bitmap} image to process.
+   * @param imageId: an ID uniquely representing the image.
+   */
+  @Override
+  public boolean processBitmap(Bitmap bitmap, int imageId)
+      throws IOException, InterruptedException {
+    if (shouldStop() || !readyToTest()) {
+      return false;
+    }
+    try {
+      Log.i(TAG, "Converting bitmap.");
+      convertBitmapToInput(bitmap);
+      Log.i(TAG, "Classifying image: " + imageId);
+      iterResult = classifier.classifyByteBuffer(imgData);
+    } catch (RuntimeException e) {
+      Log.e(TAG, e.getMessage());
+      Log.e(TAG, "Failed to classify image.");
+    }
+    if (iterResult == null || iterResult.latency == null) {
+      throw new RuntimeException("Classification result or timing is invalid.");
+    }
+    Log.d(TAG, "Native inference latency: " + iterResult.latency);
+    Log.i(TAG, iterResult.toString());
+
+    if (!benchmarkStarted) {  // Skip the first image to discount warming-up time.
+      benchmarkStarted = true;
+    } else {
+      totalRuntime += ((double) iterResult.latency);
+    }
+    return true;
+  }
+
+  /** Return how many classes are predicted per image. */
+  public int getNumPredictions() {
+    return classifier.getNumPredictions();
+  }
+
+  public OvicClassificationResult getLastClassificationResult() {
+    return iterResult;
+  }
+
+  @Override
+  public String getLastResultString() {
+    if (iterResult == null) {
+      return null;
+    } else {
+      return iterResult.toString();
+    }
+  }
+
+  /**
+   * Preprocess bitmap according to ImageNet protocol then writes result into a {@link ByteBuffer}.
+   *
+   * @param bitmap: a {@link Bitmap} source image.
+   */
+  private void convertBitmapToInput(Bitmap bitmap) {
+    // Perform transformations corresponding to evaluation mode.
+    float width = (float) bitmap.getWidth();
+    float height = (float) bitmap.getHeight();
+    int stWidth = Math.round((width - width * CENTRAL_FRACTION) / 2);
+    int stHeight = Math.round((height - height * CENTRAL_FRACTION) / 2);
+    int newWidth = Math.round(width - stWidth * 2);
+    int newHeight = Math.round(height - stHeight * 2);
+    bitmap = Bitmap.createBitmap(bitmap, stWidth, stHeight, newWidth, newHeight);
+    bitmap = Bitmap.createScaledBitmap(bitmap, imgWidth, imgHeight, true);
+    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+    loadsInputToByteBuffer();
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectionResult.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectionResult.java
new file mode 100644
index 0000000000..cf2902a5cb
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectionResult.java
@@ -0,0 +1,91 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import java.util.ArrayList;
+
+/** Result class for inference run on a single image. */
+public class OvicDetectionResult {
+
+  // Top K classes and probabilities.
+  public final ArrayList<BoundingBox> detections;
+  // Latency (ms).
+  public Long latency = -1L;
+  // id of the image.
+  public int id = -1;
+  // Number of valid detections (separately maintained, maybe different from detections.size()).
+  public int count = 0;
+
+  // Create OvicDetectionResult object with pre-filled capacity. Note that detections.size() will
+  // be equal to capacity after this call.
+  OvicDetectionResult(int capacity) {
+    detections = new ArrayList<BoundingBox>(capacity);
+    for (int i = 0; i < capacity; i++) {
+      detections.add(new BoundingBox(-1.0f, -1.0f, -1.0f, -1.0f, -1, -1.0f));
+    }
+  }
+
+  public void resetTo(Long latency, int id) {
+    count = 0;
+    this.latency = latency;
+    this.id = id;
+  }
+
+  public void addBox(float x1, float y1, float x2, float y2, int category, float score) {
+    detections.get(count).x1 = x1;
+    detections.get(count).y1 = y1;
+    detections.get(count).x2 = x2;
+    detections.get(count).y2 = y2;
+    detections.get(count).category = category;
+    detections.get(count).score = score;
+    count += 1;
+  }
+
+  public void scaleUp(double scaleFactorWidth, double scaleFactorHeight) {
+    for (BoundingBox box : detections) {
+      box.x1 = (float) (box.x1 * scaleFactorWidth);
+      box.y1 = (float) (box.y1 * scaleFactorHeight);
+      box.x2 = (float) (box.x2 * scaleFactorWidth);
+      box.y2 = (float) (box.y2 * scaleFactorHeight);
+    }
+  }
+
+  @Override
+  public String toString() {
+    String textToShow = latency + "ms";
+    int k = 0;
+    for (BoundingBox box : detections) {
+      textToShow +=
+          "\nPrediction ["
+              + k
+              + "] = Class "
+              + box.category
+              + " ("
+              + box.x1
+              + ", "
+              + box.y1
+              + ", "
+              + box.x2
+              + ", "
+              + box.y2
+              + ") : "
+              + box.score;
+      k++;
+    }
+
+
+    return textToShow;
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetector.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetector.java
new file mode 100644
index 0000000000..56836a79e5
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetector.java
@@ -0,0 +1,184 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.tensorflow.lite.Interpreter;
+import org.tensorflow.lite.TestHelper;
+
+/** Class for running COCO detection with a TfLite model. */
+public class OvicDetector implements AutoCloseable {
+
+  /** Tag for the {@link Log}. */
+  private static final String TAG = "OvicDetector";
+
+  /** An instance of the driver class to run model inference with Tensorflow Lite. */
+  private Interpreter tflite;
+
+  /** Labels corresponding to the output of the vision model. */
+  private final List<String> labelList;
+
+  /** Define the output format. */
+  private final Boolean inputIsFloat;
+
+  /** Number of detections per image. 10 for demo, 100 for the actual competition. */
+  private static final int NUM_RESULTS = 10;
+
+  /** The output arrays for the mobilenet SSD. */
+  private float[][][] outputLocations;
+  private float[][] outputClasses;
+  private float[][] outputScores;
+  private float[] numDetections;
+  private Map<Integer, Object> outputMap;
+
+  /** Input resolution. */
+  private final int[] inputDims;
+
+  /** Final result. */
+  public OvicDetectionResult result = null;
+
+  OvicDetector(InputStream labelInputStream, MappedByteBuffer model) throws IOException {
+    // Load the label list.
+    labelList = loadLabelList(labelInputStream);
+
+    // Create the TfLite interpreter.
+    tflite = new Interpreter(model, new Interpreter.Options().setNumThreads(1));
+    inputDims = TestHelper.getInputDims(tflite, 0);
+    inputIsFloat = TestHelper.getInputDataType(tflite, 0).equals("float");
+    if (inputDims.length != 4) {
+      throw new RuntimeException("The model's input dimensions must be 4 (BWHC).");
+    }
+    if (inputDims[0] != 1) {
+      throw new RuntimeException(
+          "The model must have a batch size of 1, got " + inputDims[0] + " instead.");
+    }
+    if (inputDims[3] != 3) {
+      throw new RuntimeException(
+          "The model must have three color channels, got " + inputDims[3] + " instead.");
+    }
+    // Check the resolution.
+    int minSide = Math.min(inputDims[1], inputDims[2]);
+    int maxSide = Math.max(inputDims[1], inputDims[2]);
+    if (minSide <= 0 || maxSide > 1000) {
+      throw new RuntimeException("The model's resolution must be between (0, 1000].");
+    }
+
+    // Initialize the input array and result arrays. The input images are stored in a list of
+    // Object. Since this function anaylzed one image per time, there is only 1 item.
+    // The output is fomulated as a map of int -> Object. The output arrays are added to the map.
+    outputLocations = new float[1][NUM_RESULTS][4];
+    outputClasses = new float[1][NUM_RESULTS];
+    outputScores = new float[1][NUM_RESULTS];
+    numDetections = new float[1];
+    outputMap = new HashMap<>();
+    outputMap.put(0, outputLocations);
+    outputMap.put(1, outputClasses);
+    outputMap.put(2, outputScores);
+    outputMap.put(3, numDetections);
+    // Preallocate the result. This will be where inference result is stored after each
+    // detectByteBuffer call.
+    result = new OvicDetectionResult(NUM_RESULTS);
+  }
+
+  public Boolean quantizedInput() {
+    return !inputIsFloat;
+  }
+
+  /** Reads label list from Assets. */
+  private static List<String> loadLabelList(InputStream labelInputStream) throws IOException {
+    List<String> labelList = new ArrayList<>();
+    try (BufferedReader reader =
+        new BufferedReader(new InputStreamReader(labelInputStream, StandardCharsets.UTF_8))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        labelList.add(line);
+      }
+    }
+    return labelList;
+  }
+
+  /**
+   * The interface to run the detection. This method currently only support float mobilenet_ssd
+   * model. The quantized models will be added in the future.
+   *
+   * @param imgData The image buffer in ByteBuffer format.
+   * @return boolean indicator of whether detection was a success. If success, the detection results
+   *  is available in the result member variable.
+   *     See OvicDetectionResult.java for details.
+   */
+  boolean detectByteBuffer(ByteBuffer imgData, int imageId) {
+    if (tflite == null) {
+      throw new RuntimeException(TAG + ": Detector has not been initialized; Failed.");
+    }
+    if (inputIsFloat == null) {
+      throw new RuntimeException(TAG + ": Detector input type has not been resolved.");
+    }
+
+    Object[] inputArray = {imgData};
+    tflite.runForMultipleInputsOutputs(inputArray, outputMap);
+
+    Long latency = getLastNativeInferenceLatencyMilliseconds();
+
+    // Update the results.
+    result.resetTo(latency, imageId);
+    for (int i = 0; i < NUM_RESULTS; i++) {
+      result.addBox(outputLocations[0][i][1] * inputDims[1],
+              outputLocations[0][i][0] * inputDims[1],
+              outputLocations[0][i][3] * inputDims[2],
+              outputLocations[0][i][2] * inputDims[2],
+              Math.round(outputClasses[0][i] + 1 /* Label offset */),
+              outputScores[0][i]);
+    }
+    return true;  // Marks that the result is available.
+  }
+
+  /*
+   * Get native inference latency of last image detection run.
+   *  @throws RuntimeException if model is uninitialized.
+   *  @return The inference latency in millisecond.
+   */
+  public Long getLastNativeInferenceLatencyMilliseconds() {
+    if (tflite == null) {
+      throw new RuntimeException(TAG + ": ImageNet classifier has not been initialized; Failed.");
+    }
+    Long latency = tflite.getLastNativeInferenceDurationNanoseconds();
+    return (latency == null) ? null : (Long) (latency / 1000000);
+  }
+
+  public int[] getInputDims() {
+    return inputDims;
+  }
+
+  public List<String> getLabels() {
+    return labelList;
+  }
+
+  /** Closes tflite to release resources. */
+  @Override
+  public void close() {
+    tflite.close();
+    tflite = null;
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java
new file mode 100644
index 0000000000..1a4e193ff2
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java
@@ -0,0 +1,160 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import android.graphics.Bitmap;
+import android.util.Log;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+
+/**
+ * Class that benchmarks object detection models.
+ */
+public final class OvicDetectorBenchmarker extends OvicBenchmarker {
+  /** Tag for the {@link Log}. */
+  private static final String TAG = "OvicDetectorBenchmarker";
+
+  public double scaleFactorWidth = 1.0f;
+  public double scaleFactorHeight = 1.0f;
+  private Bitmap scaledBitmap = null;  // Preallocate bitmap for scaling.
+
+  private OvicDetector detector;
+
+  /**
+   * Initializes an {@link OvicDetectionBenchmarker}
+   *
+   * @param wallTime: a double number specifying the total amount of time to benchmark.
+   */
+  public OvicDetectorBenchmarker(double wallTime) {
+    super(wallTime);
+  }
+
+  /** Check to see if the detector is ready to test. */
+  @Override
+  public boolean readyToTest() {
+    return (detector != null);
+  }
+
+  /**
+   * Getting the benchmarker ready for detecting images.
+   *
+   * @param labelInputStream: an {@link InputStream} specifying where the list of labels should be
+   *     read from.
+   * @param model: a {@link MappedByteBuffer} model to benchmark.
+   */
+  @Override
+  public void getReadyToTest(InputStream labelInputStream, MappedByteBuffer model) {
+    try {
+      Log.i(TAG, "Creating detector.");
+      detector = new OvicDetector(labelInputStream, model);
+      quantizedInput = detector.quantizedInput();
+      int[] inputDims = detector.getInputDims();
+      imgHeight = inputDims[1];
+      imgWidth = inputDims[2];
+      if (quantizedInput) {
+        imgData = ByteBuffer.allocateDirect(DIM_BATCH_SIZE * imgHeight * imgWidth * DIM_PIXEL_SIZE);
+      } else {
+        imgData =
+            ByteBuffer.allocateDirect(DIM_BATCH_SIZE * imgHeight * imgWidth * DIM_PIXEL_SIZE * 4);
+      }
+      imgData.order(ByteOrder.nativeOrder());
+      intValues = new int[imgHeight * imgWidth];
+      benchmarkStarted = false;
+    } catch (Exception e) {
+      Log.e(TAG, e.getMessage());
+      Log.e(TAG, "Failed to initialize COCO detector for the benchmarker.", e);
+    }
+  }
+
+  /**
+   * Perform detection on a single ByteBuffer {@link ByteBuffer} image. The image must have the
+   * same dimension that the model expects.
+   *
+   * @param image: a {@link ByteBuffer} image to process.
+   * @param imageId: an ID uniquely representing the image.
+   */
+  public boolean processBuffer(ByteBuffer image, int imageId) {
+    if (!readyToTest()) {
+      return false;
+    }
+    try {
+      if (!detector.detectByteBuffer(image, imageId)) {
+        return false;
+      }
+    } catch (RuntimeException e) {
+      Log.e(TAG, e.getMessage());
+      return false;
+    }
+
+    if (!benchmarkStarted) { // Skip the first image to discount warming-up time.
+      benchmarkStarted = true;
+    } else {
+      totalRuntime += ((double) detector.result.latency);
+    }
+    return true;  // Indicating that result is ready.
+  }
+
+  /**
+   * Perform detection on a single bitmap image.
+   *
+   * @param bitmap: a {@link Bitmap} image to process.
+   * @param imageId: an ID uniquely representing the image.
+   */
+  @Override
+  public boolean processBitmap(Bitmap bitmap, int imageId)
+      throws IOException, InterruptedException {
+    if (shouldStop() || !readyToTest()) {
+      return false;
+    }
+    convertBitmapToInput(bitmap);  // Scale bitmap if needed, store result in imgData.
+    if (!processBuffer(imgData, imageId)) {
+      return false;
+    }
+    // Scale results back to original image coordinates.
+    detector.result.scaleUp(scaleFactorWidth, scaleFactorHeight);
+    return true;  // Indicating that result is ready.
+  }
+
+  public OvicDetectionResult getLastDetectionResult() {
+    return detector.result;
+  }
+
+  @Override
+  public String getLastResultString() {
+    if (detector.result == null) {
+      return null;
+    }
+    return detector.result.toString();
+  }
+
+  /**
+   * Preprocess bitmap image into {@link ByteBuffer} format for the detector.
+   *
+   * @param bitmap: a {@link Bitmap} source image.
+   */
+  private void convertBitmapToInput(Bitmap bitmap) {
+    int originalWidth = bitmap.getWidth();
+    int originalHeight = bitmap.getHeight();
+    scaledBitmap = Bitmap.createScaledBitmap(bitmap, imgWidth, imgHeight, true);
+    scaleFactorWidth = originalWidth * 1.0 / imgWidth;
+    scaleFactorHeight = originalHeight * 1.0 / imgHeight;
+    scaledBitmap.getPixels(intValues, 0, imgWidth, 0, 0, imgWidth, imgHeight);
+    scaledBitmap.recycle();
+    loadsInputToByteBuffer();
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java
index a504ec74a9..baa14baf92 100644
--- a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java
@@ -51,7 +51,7 @@ public class OvicValidator {
       MappedByteBuffer model = loadModelFile(modelFile);
       OvicClassifier classifier = new OvicClassifier(labelsInputStream, model);
       ByteBuffer imgData = createByteBufferForClassifier(classifier);
-      OvicSingleImageResult testResult = classifier.classifyByteBuffer(imgData);
+      OvicClassificationResult testResult = classifier.classifyByteBuffer(imgData);
       if (testResult.topKClasses.isEmpty()) {
         throw new RuntimeException("Failed to return top K predictions.");
       }
diff --git a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
index 1587c3c56f..99e874ca78 100644
--- a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
+++ b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
@@ -1,4 +1,4 @@
-/*Copyright 2018 Google LLC
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -43,7 +43,7 @@ public final class OvicClassifierTest {
   private MappedByteBuffer lowResModel = null;
   private ByteBuffer testImage = null;
   private ByteBuffer lowResTestImage = null;
-  private OvicSingleImageResult testResult = null;
+  private OvicClassificationResult testResult = null;
   private static final String LABELS_PATH =
       "tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt";
   private static final String QUANTIZED_MODEL_PATH =
@@ -147,7 +147,7 @@ public final class OvicClassifierTest {
     return imgData;
   }
 
-  private static void assertCorrectTopK(OvicSingleImageResult testResult) {
+  private static void assertCorrectTopK(OvicClassificationResult testResult) {
     assertThat(testResult.topKClasses.size() > 0).isTrue();
     Boolean topKAccurate = false;
     // Assert that the correct class is in the top K.
diff --git a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicDetectorTest.java b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicDetectorTest.java
new file mode 100644
index 0000000000..4681e26052
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicDetectorTest.java
@@ -0,0 +1,149 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import java.awt.Graphics2D;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import javax.imageio.ImageIO;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit test for {@link org.tensorflow.ovic.OvicDetector}. */
+@RunWith(JUnit4.class)
+public final class OvicDetectorTest {
+  private OvicDetector detector = null;
+  private InputStream labelsInputStream = null;
+  private MappedByteBuffer model = null;
+  private ByteBuffer testImage = null;
+
+  private static final float IMAGE_MEAN = 128f;
+  private static final float IMAGE_STD = 128f;
+
+  private Boolean quantizedInput = null;
+  private static final String LABELS_PATH =
+      "tensorflow/contrib/lite/java/ovic/src/testdata/coco_labels.txt";
+  private static final String MODEL_PATH =
+      "external/tflite_mobilenet_ssd_quant/detect.tflite";
+  private static final String TEST_IMAGE_PATH =
+      "external/tflite_ovic_testdata/test_image_224.jpg";
+  private static final int GROUNDTRUTH = 1 /* Person */;
+
+  @Before
+  public void setUp() {
+    try {
+      // load models.
+      model = loadModelFile(MODEL_PATH);
+
+      // Load label files;
+      File labelsfile = new File(LABELS_PATH);
+      labelsInputStream = new FileInputStream(labelsfile);
+
+      // Create detector.
+      detector = new OvicDetector(labelsInputStream, model);
+      quantizedInput = detector.quantizedInput();
+
+      // Load test image and convert into byte buffer.
+      File imageFile = new File(TEST_IMAGE_PATH);
+      BufferedImage rawimg = ImageIO.read(imageFile);
+      int[] inputDims = detector.getInputDims();
+      BufferedImage img = new BufferedImage(inputDims[1], inputDims[2], rawimg.getType());
+      Graphics2D g = img.createGraphics();
+      g.drawImage(rawimg, 0, 0, inputDims[1], inputDims[2], null);
+      g.dispose();
+      testImage = toByteBuffer(img);
+    } catch (IOException e) {
+      System.out.println(e.getMessage());
+    }
+
+    System.out.println("Successfully setup");
+  }
+
+  private static MappedByteBuffer loadModelFile(String modelFilePath) throws IOException {
+    File modelfile = new File(modelFilePath);
+    FileInputStream inputStream = new FileInputStream(modelfile);
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = 0L;
+    long declaredLength = fileChannel.size();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+
+  private ByteBuffer toByteBuffer(BufferedImage image) {
+    ByteBuffer imgData;
+    if (quantizedInput) {
+      imgData = ByteBuffer.allocateDirect(image.getHeight() * image.getWidth() * 3);
+    } else {
+      imgData = ByteBuffer.allocateDirect(image.getHeight() * image.getWidth() * 12);
+    }
+    imgData.order(ByteOrder.nativeOrder());
+    for (int y = 0; y < image.getHeight(); y++) {
+      for (int x = 0; x < image.getWidth(); x++) {
+        int pixelValue = image.getRGB(x, y);
+        if (quantizedInput) {
+          // Quantized model
+          imgData.put((byte) ((pixelValue >> 16) & 0xFF));
+          imgData.put((byte) ((pixelValue >> 8) & 0xFF));
+          imgData.put((byte) (pixelValue & 0xFF));
+        } else {
+          // Float model
+          imgData.putFloat((((pixelValue >> 16) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
+          imgData.putFloat((((pixelValue >> 8) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
+          imgData.putFloat(((pixelValue & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
+        }
+      }
+    }
+    return imgData;
+  }
+
+  @Test
+  public void ovicDetector_detectSuccess() throws Exception {
+    assertThat(detector.detectByteBuffer(testImage, 1)).isTrue();
+    assertThat(detector.result != null).isTrue();
+  }
+
+  @Test
+  public void ovicDetector_simpleBatchTest() throws Exception {
+    final int numRepeats = 5;
+    for (int i = 0; i < numRepeats; i++) {
+      assertThat(detector.detectByteBuffer(testImage, 1)).isTrue();
+      OvicDetectionResult result = detector.result;
+      Boolean detectWithinTop5 = false;
+      for (int j = 0; j < Math.min(5, result.count); j++) {
+        if (result.detections.get(j).category == GROUNDTRUTH) {
+          detectWithinTop5 = true;
+          break;
+        }
+      }
+      if (!detectWithinTop5) {
+        System.out.println("---------------- Image " + i + " ---------------------");
+        System.out.println("Expect category " + GROUNDTRUTH);
+        System.out.println("Detection results: ");
+        System.out.println(result.toString());
+      }
+      assertThat(detectWithinTop5).isTrue();
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/testdata/BUILD b/tensorflow/contrib/lite/java/ovic/src/testdata/BUILD
index 1021ea30dd..051aa2204e 100644
--- a/tensorflow/contrib/lite/java/ovic/src/testdata/BUILD
+++ b/tensorflow/contrib/lite/java/ovic/src/testdata/BUILD
@@ -14,6 +14,9 @@ filegroup(
 )
 
 exports_files(
-    ["labels.txt"],
+    [
+        "labels.txt",
+        "coco_labels.txt",
+    ],
     visibility = ["//visibility:public"],
 )
diff --git a/tensorflow/contrib/lite/java/ovic/src/testdata/coco_labels.txt b/tensorflow/contrib/lite/java/ovic/src/testdata/coco_labels.txt
new file mode 100644
index 0000000000..d91f535b1a
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/testdata/coco_labels.txt
@@ -0,0 +1,91 @@
+person
+bicycle
+car
+motorcycle
+airplane
+bus
+train
+truck
+boat
+traffic light
+fire hydrant
+empty
+stop sign
+parking meter
+bench
+bird
+cat
+dog
+horse
+sheep
+cow
+elephant
+bear
+zebra
+giraffe
+empty
+backpack
+umbrella
+empty
+empty
+handbag
+tie
+suitcase
+frisbee
+skis
+snowboard
+sports ball
+kite
+baseball bat
+baseball glove
+skateboard
+surfboard
+tennis racket
+bottle
+empty
+wine glasses
+cup
+fork
+knife
+spoon
+bowl
+banana
+apple
+sandwich
+orange
+broccoli
+carrot
+hot dog
+pizza
+donut
+cake
+chair
+couch
+potted plant
+bed
+empty
+dining table
+empty
+empty
+toilet
+empty
+tv
+laptop
+mouse
+remote
+keyboard
+cell phone
+microwave
+oven
+toaster
+sink
+refrigerator
+empty
+book
+clock
+vase
+scissors
+teddy bear
+hair drier
+toothbrush
+empty
-- 
GitLab


From 1630584951975479dee852cf6f7603fe6819fde1 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 1 Oct 2018 13:28:17 -0700
Subject: [PATCH 244/570] Fixes possible out-of-bounds access by strided slice.

PiperOrigin-RevId: 215269882
---
 tensorflow/core/kernels/strided_slice_op.cc      | 2 +-
 tensorflow/python/kernel_tests/array_ops_test.py | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index f0575de4d9..3e8a4c5b72 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -149,7 +149,7 @@ class StridedSliceOp : public OpKernel {
       // NDIM and T
       if (is_simple_slice && std::is_same<Device, CPUDevice>::value &&
           input_dims == 2 && processing_shape.dims() == 2 &&
-          final_shape.dims() == 2) {
+          final_shape.dims() == 2 && new_axis_mask == 0) {
         MemCpyFunctor<T> functor;
         if (functor.Copy(input, begin, end, result)) {
           return;
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index c5547b19be..dcc594789e 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -615,6 +615,14 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       _ = checker[:, 0]
       _ = checker[:, :, 0]
 
+  def testBothNewAxisAndShrink(self):
+    with self.test_session(use_gpu=True):
+      ones = array_ops.placeholder(shape=[2, 2], dtype=dtypes.int16)
+      self.assertAllEqual(
+          ones[array_ops.newaxis, :, 0].eval(
+              feed_dict={ones: [[1, 1], [1, 1]]}),
+          [[1, 1]])
+
   def testTensorIndexing(self):
     with self.test_session(use_gpu=True):
       raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
-- 
GitLab


From c86f5941359526b91d85daf844e94ff5d39b2d6c Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 1 Oct 2018 13:40:30 -0700
Subject: [PATCH 245/570] Make cond_v2 If op lowering work in a defun + eager.

Prior to this change, the lowering pass assumed that the If op
functions would be available in the If op's graph. If the If op is
defined in a defun and then called via eager execution, the functions
will be in the eager context, but not in the defun's graph. This
change makes the lowering pass correctly use the function library
passed in by the caller via GraphOptimizationPassOptions.

PiperOrigin-RevId: 215271990
---
 tensorflow/core/common_runtime/lower_if_op.cc | 43 ++++++++++++-------
 tensorflow/core/common_runtime/lower_if_op.h  |  5 ++-
 .../core/common_runtime/lower_if_op_test.cc   |  4 +-
 .../kernel_tests/control_flow_ops_py_test.py  | 22 ++++++++++
 4 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/tensorflow/core/common_runtime/lower_if_op.cc b/tensorflow/core/common_runtime/lower_if_op.cc
index dfce7c23e7..a02084f223 100644
--- a/tensorflow/core/common_runtime/lower_if_op.cc
+++ b/tensorflow/core/common_runtime/lower_if_op.cc
@@ -38,11 +38,12 @@ class CondBuilder {
  public:
   enum Branch { kElseBranch = 0, kThenBranch = 1 };
 
-  // Create a CondBuilder to create the lowering of If op.  that has then and
+  // Create a CondBuilder to create the lowered form of `if_op` with then and
   // else functions named `then_fn_name` and `else_fn_name` respectively in the
-  // given graph.
+  // `graph`. The functions should be available in `flib`.
   CondBuilder(Node* if_op, const string& then_fn_name,
-              const string& else_fn_name, Graph* graph);
+              const string& else_fn_name, const FunctionLibraryDefinition& flib,
+              Graph* graph);
 
   // Constructs the basic conditional control flow using switch and merge nodes.
   Status CreatePivotNodes();
@@ -89,6 +90,7 @@ class CondBuilder {
   Node* then_call_node_;
   Node* else_call_node_;
   Graph* graph_;
+  const FunctionLibraryDefinition& flib_;
   string name_;
 
   NodeBuilder then_call_builder_;
@@ -96,9 +98,11 @@ class CondBuilder {
 };
 
 CondBuilder::CondBuilder(Node* if_op, const string& then_fn_name,
-                         const string& else_fn_name, Graph* graph)
+                         const string& else_fn_name,
+                         const FunctionLibraryDefinition& flib, Graph* graph)
     : if_op_(if_op),
       graph_(graph),
+      flib_(flib),
       name_(if_op->name()),
       then_call_builder_(NewName("then"), then_fn_name, graph->op_registry()),
       else_call_builder_(NewName("else"), else_fn_name, graph->op_registry()) {
@@ -193,15 +197,15 @@ Status CondBuilder::AddOutputs() {
   return Status::OK();
 }
 
-Status InlineCallInGraph(Node* n, Graph* g) {
-  const auto& lib = g->flib_def();
-  const FunctionDef* fdef = lib.Find(n->type_string());
+Status InlineCallInGraph(Node* n, const FunctionLibraryDefinition& flib,
+                         Graph* g) {
+  const FunctionDef* fdef = flib.Find(n->type_string());
   CHECK(fdef != nullptr);
   FunctionBody* fbody;
   TF_RETURN_IF_ERROR(
-      FunctionDefToBodyHelper(*fdef, n->attrs(), &lib,
-                              [&lib](const string& op, const OpDef** sig) {
-                                return lib.LookUpOpDef(op, sig);
+      FunctionDefToBodyHelper(*fdef, n->attrs(), &flib,
+                              [&flib](const string& op, const OpDef** sig) {
+                                return flib.LookUpOpDef(op, sig);
                               },
                               &fbody));
   // TODO(jpienaar): Improve this interface to make the need to delete it
@@ -219,8 +223,8 @@ Status CondBuilder::BuildLoweredIfOutput() {
 }
 
 Status CondBuilder::InlineCallNodes() {
-  TF_RETURN_IF_ERROR(InlineCallInGraph(then_call_node_, graph_));
-  TF_RETURN_IF_ERROR(InlineCallInGraph(else_call_node_, graph_));
+  TF_RETURN_IF_ERROR(InlineCallInGraph(then_call_node_, flib_, graph_));
+  TF_RETURN_IF_ERROR(InlineCallInGraph(else_call_node_, flib_, graph_));
   return Status::OK();
 }
 
@@ -240,6 +244,12 @@ Status LowerIfOpPass::Run(const GraphOptimizationPassOptions& options) {
     return errors::Internal("Lowering If op requires a graph to be available.");
   }
 
+  FunctionLibraryDefinition* flib = options.flib_def;
+  if (flib == nullptr) {
+    return errors::Internal(
+        "Lowering If op requires a FunctionLibraryDefinition to be available.");
+  }
+
   // Match all the nodes that need to be rewritten.
   gtl::InlinedVector<Node*, 2> matches;
   for (Node* n : g->op_nodes()) {
@@ -251,12 +261,14 @@ Status LowerIfOpPass::Run(const GraphOptimizationPassOptions& options) {
     }
   }
   for (Node* n : matches) {
-    TF_RETURN_IF_ERROR(RewriteNode(n, g));
+    TF_RETURN_IF_ERROR(RewriteNode(n, *flib, g));
   }
   return Status::OK();
 }
 
-Status LowerIfOpPass::RewriteNode(Node* n, Graph* g) {
+Status LowerIfOpPass::RewriteNode(Node* n,
+                                  const FunctionLibraryDefinition& flib,
+                                  Graph* g) {
   const AttrValue* then_attr = n->attrs().Find("then_branch");
   if (then_attr == nullptr) {
     return errors::InvalidArgument("Then branch function missing");
@@ -266,7 +278,8 @@ Status LowerIfOpPass::RewriteNode(Node* n, Graph* g) {
     return errors::InvalidArgument("Else branch function missing");
   }
 
-  CondBuilder cb(n, then_attr->func().name(), else_attr->func().name(), g);
+  CondBuilder cb(n, then_attr->func().name(), else_attr->func().name(), flib,
+                 g);
   TF_RETURN_IF_ERROR(cb.CreatePivotNodes());
   TF_RETURN_IF_ERROR(cb.AddInputs());
   TF_RETURN_IF_ERROR(cb.AddOutputs());
diff --git a/tensorflow/core/common_runtime/lower_if_op.h b/tensorflow/core/common_runtime/lower_if_op.h
index a9ef39ae5c..5ab1123e3f 100644
--- a/tensorflow/core/common_runtime/lower_if_op.h
+++ b/tensorflow/core/common_runtime/lower_if_op.h
@@ -29,8 +29,9 @@ class LowerIfOpPass : public GraphOptimizationPass {
   Status Run(const GraphOptimizationPassOptions& options) override;
 
  private:
-  // Rewrite the given If node `n` in graph `g` to use the switch-merge form.
-  Status RewriteNode(Node* n, Graph* g);
+  // Rewrite the given If node `n` in graph `g` to use the switch-merge
+  // form. `flib` should contain the branch functions referenced by `n`.
+  Status RewriteNode(Node* n, const FunctionLibraryDefinition& flib, Graph* g);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/lower_if_op_test.cc b/tensorflow/core/common_runtime/lower_if_op_test.cc
index 319a617b32..044a355d06 100644
--- a/tensorflow/core/common_runtime/lower_if_op_test.cc
+++ b/tensorflow/core/common_runtime/lower_if_op_test.cc
@@ -36,9 +36,7 @@ namespace tensorflow {
 namespace {
 
 Status Rewrite(std::unique_ptr<Graph>* graph) {
-  FunctionDefLibrary flib;
-  FunctionLibraryDefinition flib_def((*graph)->op_registry(), flib);
-
+  FunctionLibraryDefinition flib_def((*graph)->flib_def());
   GraphOptimizationPassOptions opt_options;
   opt_options.graph = graph;
   opt_options.flib_def = &flib_def;
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index d91a848e01..ae61be614e 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -31,6 +31,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import device_lib
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -3414,6 +3415,27 @@ class EagerTest(test.TestCase):
       self.assertAllEqual(r.numpy(), 10)
       self.assertFalse(isinstance(r, list))
 
+  def testCondInDefun(self):
+    if "GPU" in [d.device_type for d in device_lib.list_local_devices()]:
+      return unittest.skip("b/113346829 (gpu failure)")
+
+    with context.eager_mode():
+
+      @eager_function.defun
+      def foo(pred):
+        # TODO(b/111124878): this only needs to output one element.
+        fn1 = lambda: (constant_op.constant(10), constant_op.constant(100))
+        fn2 = lambda: (constant_op.constant(20), constant_op.constant(200))
+        return control_flow_ops.cond(constant_op.constant(pred), fn1, fn2)
+
+      r = foo(True)
+      self.assertAllEqual(r[0].numpy(), 10)
+      self.assertNotIsInstance(r, list)
+
+      r = foo(False)
+      self.assertAllEqual(r[0].numpy(), 20)
+      self.assertFalse(isinstance(r, list))
+
   def testWhileLoop(self):
     with context.eager_mode():
       tensor = constant_op.constant([1, 2, 3, 4, 5])
-- 
GitLab


From 44acd839c57494860666c799afd24360f1df3bed Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 13:42:40 -0700
Subject: [PATCH 246/570] Fix reported cuDNN default version during
 configuration.

PiperOrigin-RevId: 215272308
---
 configure.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.py b/configure.py
index 2de2365ff3..57d9574d1f 100644
--- a/configure.py
+++ b/configure.py
@@ -884,7 +884,7 @@ def set_tf_cudnn_version(environ_cp):
   """Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION."""
   ask_cudnn_version = (
       'Please specify the cuDNN version you want to use. '
-      '[Leave empty to default to cuDNN %s.0]: ') % _DEFAULT_CUDNN_VERSION
+      '[Leave empty to default to cuDNN %s]: ') % _DEFAULT_CUDNN_VERSION
 
   for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
     tf_cudnn_version = get_from_env_or_user_or_default(
-- 
GitLab


From 3039a4694e22674b502257ae34b0a5b614a631f3 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Mon, 1 Oct 2018 13:43:49 -0700
Subject: [PATCH 247/570] [XLA] Migrate from gtl::FlatMap to
 absl::flat_hash_map

PiperOrigin-RevId: 215272497
---
 tensorflow/compiler/jit/BUILD                 |  5 +++
 tensorflow/compiler/jit/deadness_analysis.cc  | 22 ++++++------
 .../compiler/jit/deadness_analysis_internal.h |  4 +--
 tensorflow/compiler/jit/kernels/BUILD         |  1 +
 tensorflow/compiler/jit/kernels/xla_ops.cc    |  3 +-
 .../jit/mark_for_compilation_pass_test.cc     | 11 +++---
 .../jit/resource_operation_safety_analysis.cc |  1 -
 .../compiler/jit/xla_compilation_cache.h      |  6 ++--
 tensorflow/compiler/tf2xla/BUILD              |  2 ++
 .../tf2xla/resource_operation_table.cc        | 14 ++++----
 .../tf2xla/resource_operation_table_test.cc   |  3 +-
 tensorflow/compiler/xla/client/BUILD          |  1 +
 tensorflow/compiler/xla/client/xla_builder.h  |  4 +--
 tensorflow/compiler/xla/service/BUILD         | 33 +++++++++++++++++
 .../compiler/xla/service/allocation_tracker.h |  5 +--
 .../xla/service/batchnorm_expander.cc         |  1 -
 .../xla/service/bfloat16_propagation.h        |  6 ++--
 .../compiler/xla/service/buffer_assignment.cc | 30 ++++++++--------
 .../compiler/xla/service/buffer_assignment.h  | 23 ++++++------
 .../compiler/xla/service/buffer_liveness.h    |  1 -
 tensorflow/compiler/xla/service/call_graph.h  |  6 ++--
 .../compiler/xla/service/copy_insertion.cc    | 11 +++---
 tensorflow/compiler/xla/service/cpu/BUILD     |  4 +++
 .../xla/service/cpu/cpu_layout_assignment.cc  |  3 +-
 .../compiler/xla/service/cpu/cpu_runtime.cc   |  4 +--
 .../compiler/xla/service/cpu/ir_emitter.cc    |  4 +--
 .../compiler/xla/service/cpu/ir_emitter.h     | 10 +++---
 .../service/cpu/target_machine_features.cc    |  1 +
 .../xla/service/cpu/target_machine_features.h |  5 ++-
 tensorflow/compiler/xla/service/defuser.cc    |  3 +-
 .../compiler/xla/service/dfs_hlo_visitor.h    |  1 -
 tensorflow/compiler/xla/service/gpu/BUILD     |  2 ++
 .../xla/service/gpu/gpu_executable.cc         |  3 +-
 .../compiler/xla/service/gpu/gpu_executable.h |  4 +--
 .../xla/service/gpu/stream_assignment.h       |  4 +--
 .../compiler/xla/service/heap_simulator.cc    | 17 +++++----
 .../compiler/xla/service/heap_simulator.h     | 21 ++++++-----
 .../xla/service/heap_simulator_test.cc        |  4 +--
 .../xla/service/hlo_alias_analysis.cc         |  7 ++--
 .../compiler/xla/service/hlo_alias_analysis.h |  3 +-
 .../compiler/xla/service/hlo_clone_context.h  | 12 +++----
 .../compiler/xla/service/hlo_computation.cc   | 11 +++---
 .../compiler/xla/service/hlo_computation.h    | 10 +++---
 .../compiler/xla/service/hlo_domain_map.cc    |  5 +--
 .../compiler/xla/service/hlo_domain_map.h     |  9 +++--
 .../compiler/xla/service/hlo_instruction.cc   | 13 +++----
 .../compiler/xla/service/hlo_instruction.h    |  8 ++---
 .../compiler/xla/service/hlo_instructions.cc  |  6 ++--
 .../xla/service/hlo_memory_scheduler.cc       | 35 +++++++++----------
 .../xla/service/hlo_memory_scheduler.h        | 11 +++---
 .../xla/service/hlo_memory_scheduler_test.cc  |  5 +--
 tensorflow/compiler/xla/service/hlo_module.cc |  5 +--
 .../xla/service/hlo_module_group_metadata.h   | 14 ++++----
 .../xla/service/hlo_module_group_util.h       |  4 +--
 tensorflow/compiler/xla/service/hlo_opcode.cc |  4 +--
 .../compiler/xla/service/hlo_ordering.h       |  8 ++---
 .../compiler/xla/service/hlo_pass_pipeline.cc |  3 +-
 .../compiler/xla/service/hlo_reachability.h   |  4 +--
 .../xla/service/hlo_rematerialization.cc      | 11 +++---
 .../xla/service/hlo_rematerialization.h       |  4 +--
 .../compiler/xla/service/hlo_schedule.cc      | 19 +++++-----
 .../compiler/xla/service/hlo_schedule.h       |  6 ++--
 .../compiler/xla/service/hlo_verifier.cc      |  6 ++--
 .../xla/service/indexed_array_analysis.cc     |  3 +-
 .../xla/service/indexed_array_analysis.h      |  4 +--
 .../xla/service/instruction_fusion.cc         |  8 ++---
 .../compiler/xla/service/instruction_fusion.h |  5 +--
 .../compiler/xla/service/layout_assignment.h  |  6 ++--
 tensorflow/compiler/xla/service/llvm_ir/BUILD |  1 +
 .../xla/service/llvm_ir/alias_analysis.h      | 10 +++---
 .../xla/service/multi_output_fusion.cc        |  2 +-
 .../xla/service/multi_output_fusion.h         |  3 +-
 .../compiler/xla/service/name_uniquer.h       |  4 +--
 .../xla/service/reduce_precision_insertion.h  |  1 -
 .../xla/service/tuple_points_to_analysis.h    |  1 -
 .../service/while_loop_constant_sinking.cc    |  1 -
 .../while_loop_invariant_code_motion.cc       |  8 ++---
 .../xla/service/while_loop_simplifier.cc      |  6 ++--
 tensorflow/compiler/xla/tests/BUILD           |  1 +
 .../xla/tests/xla_hlo_profile_test.cc         |  8 ++---
 80 files changed, 319 insertions(+), 259 deletions(-)

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 5bf4af1014..29b60d1dbe 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -258,6 +258,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:variable_ops",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -323,6 +324,7 @@ cc_library(
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
@@ -400,6 +402,7 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:bounds_check",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
@@ -471,6 +474,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -509,6 +513,7 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler/optimizers/data:graph_utils",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index 25e2e9a7af..e63d4b7792 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/deadness_analysis.h"
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/jit/deadness_analysis_internal.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -420,15 +421,15 @@ class PredicateFactory {
     }
   };
 
-  gtl::FlatMap<SignatureForAndOr, std::unique_ptr<Predicate>,
-               HashSignatureForAndOr>
+  absl::flat_hash_map<SignatureForAndOr, std::unique_ptr<Predicate>,
+                      HashSignatureForAndOr>
       interned_and_or_instances_;
-  gtl::FlatMap<SignatureForNot, std::unique_ptr<Predicate>>
+  absl::flat_hash_map<SignatureForNot, std::unique_ptr<Predicate>>
       interned_not_instances_;
-  gtl::FlatMap<SignatureForAndRec, std::unique_ptr<Predicate>>
+  absl::flat_hash_map<SignatureForAndRec, std::unique_ptr<Predicate>>
       interned_and_rec_instances_;
-  gtl::FlatMap<SignatureForSymbol, std::unique_ptr<Predicate>,
-               HashSignatureForSymbol>
+  absl::flat_hash_map<SignatureForSymbol, std::unique_ptr<Predicate>,
+                      HashSignatureForSymbol>
       interned_symbol_instances_;
 };
 
@@ -572,7 +573,8 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis {
   Status PopulateWithReversePostOrder(absl::Span<Node* const> rpo);
   bool HasInputsWithMismatchingDeadness(const Node& node) override;
   void Print() const override;
-  gtl::FlatMap<TensorId, string, TensorId::Hasher> PredicateMapAsString() const;
+  absl::flat_hash_map<TensorId, string, TensorId::Hasher> PredicateMapAsString()
+      const;
 
  private:
   enum class EdgeKind { kDataAndControl, kDataOnly, kControlOnly };
@@ -614,7 +616,7 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis {
   Status HandleNode(Node* n, std::vector<bool>* should_revisit);
 
   const Graph& graph_;
-  gtl::FlatMap<TensorId, Predicate*, TensorId::Hasher> predicate_map_;
+  absl::flat_hash_map<TensorId, Predicate*, TensorId::Hasher> predicate_map_;
   PredicateFactory predicate_factory_;
   bool vlog_;
 };
@@ -977,9 +979,9 @@ DeadnessAnalysis::~DeadnessAnalysis() {}
   return Status::OK();
 }
 
-gtl::FlatMap<TensorId, string, TensorId::Hasher>
+absl::flat_hash_map<TensorId, string, TensorId::Hasher>
 DeadnessAnalysisImpl::PredicateMapAsString() const {
-  gtl::FlatMap<TensorId, string, TensorId::Hasher> result;
+  absl::flat_hash_map<TensorId, string, TensorId::Hasher> result;
   std::vector<TensorId> tensor_ids;
   for (const auto& kv_pair : predicate_map_) {
     CHECK(result.insert({kv_pair.first, kv_pair.second->ToString()}).second);
diff --git a/tensorflow/compiler/jit/deadness_analysis_internal.h b/tensorflow/compiler/jit/deadness_analysis_internal.h
index 3df2679c62..354782374a 100644
--- a/tensorflow/compiler/jit/deadness_analysis_internal.h
+++ b/tensorflow/compiler/jit/deadness_analysis_internal.h
@@ -16,15 +16,15 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_INTERNAL_H_
 #define TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_INTERNAL_H_
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/graph/tensor_id.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace tensorflow {
 namespace deadness_analysis_internal {
 
 // Returns a map describing the predicate each Tensor was mapped to.  For
 // testing purposes only.
-using PredicateMapTy = gtl::FlatMap<TensorId, string, TensorId::Hasher>;
+using PredicateMapTy = absl::flat_hash_map<TensorId, string, TensorId::Hasher>;
 Status ComputePredicates(const Graph& graph, PredicateMapTy* out_predicate_map);
 
 // Returns a map describing the predicate each Tensor was mapped to.  For
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 0839f1cb3d..26cb3af9d6 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -26,6 +26,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/kernels:variable_ops",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
     ],
     alwayslink = 1,
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc
index a85006eb03..cfd27a6510 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.cc
+++ b/tensorflow/compiler/jit/kernels/xla_ops.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/kernels/xla_ops.h"
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
@@ -163,7 +164,7 @@ class XlaExecutableClosureStore {
  private:
   mutex mutex_;
   int64 key_counter_ GUARDED_BY(mutex_);
-  gtl::FlatMap<KeyT, XlaExecutableClosure> closures_ GUARDED_BY(mutex_);
+  absl::flat_hash_map<KeyT, XlaExecutableClosure> closures_ GUARDED_BY(mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaExecutableClosureStore);
 };
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 4f9145b479..2a80c745e3 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h"
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/match.h"
 #include "tensorflow/cc/framework/ops.h"
@@ -61,10 +62,10 @@ std::unordered_map<string, string> GetClusters(const Graph& graph) {
   return ids;
 }
 
-gtl::FlatMap<string, std::vector<string>> GetClusterSets(
+absl::flat_hash_map<string, std::vector<string>> GetClusterSets(
     const Graph& g, std::vector<string>* cluster_names = nullptr) {
   CHECK(cluster_names == nullptr || cluster_names->empty());
-  gtl::FlatMap<string, std::vector<string>> cluster_sets;
+  absl::flat_hash_map<string, std::vector<string>> cluster_sets;
   for (const auto& p : GetClusters(g)) {
     cluster_sets[p.second].push_back(p.first);
   }
@@ -566,7 +567,7 @@ TEST(XlaCompilationTest, ResourcesClusteringAllowed) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   TF_EXPECT_OK(root.ToGraph(graph.get()));
   TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
-  gtl::FlatMap<string, std::vector<string>> cluster_sets =
+  absl::flat_hash_map<string, std::vector<string>> cluster_sets =
       GetClusterSets(*graph);
   ASSERT_EQ(cluster_sets.size(), 1);
   std::vector<string> expected_clustered_nodes = {"AssignmentW", "ReadR",
@@ -586,7 +587,7 @@ TEST(XlaCompilationTest, ResourcesClusteringDisallowed) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   TF_EXPECT_OK(root.ToGraph(graph.get()));
   TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
-  gtl::FlatMap<string, std::vector<string>> cluster_sets =
+  absl::flat_hash_map<string, std::vector<string>> cluster_sets =
       GetClusterSets(*graph);
   ASSERT_EQ(cluster_sets.size(), 1);
   std::vector<string> expected_clustered_nodes = {"AssignmentW",
@@ -616,7 +617,7 @@ TEST(XlaCompilationTest, ChainOfOps) {
   TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
 
   std::vector<string> cluster_names;
-  gtl::FlatMap<string, std::vector<string>> cluster_sets =
+  absl::flat_hash_map<string, std::vector<string>> cluster_sets =
       GetClusterSets(*graph, &cluster_names);
 
   ASSERT_EQ(cluster_sets.size(), 2);
diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc
index 56e35c0059..657bb409db 100644
--- a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc
+++ b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc
@@ -89,7 +89,6 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/tensor_id.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/util/ptr_util.h"
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index 10ad87e38c..17c0321c1e 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_XLA_COMPILATION_CACHE_H_
 #define TENSORFLOW_COMPILER_JIT_XLA_COMPILATION_CACHE_H_
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
@@ -24,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 
@@ -152,7 +152,7 @@ class XlaCompilationCache : public ResourceBase {
   };
 
   mutex compile_cache_mu_;
-  gtl::FlatMap<Signature, std::unique_ptr<Entry>, Signature::Hash> cache_
+  absl::flat_hash_map<Signature, std::unique_ptr<Entry>, Signature::Hash> cache_
       GUARDED_BY(compile_cache_mu_);
 
   struct CompileStats {
@@ -165,7 +165,7 @@ class XlaCompilationCache : public ResourceBase {
   mutex compile_stats_mu_;
 
   // Maps cluster names to compilation statistics for said cluster.
-  gtl::FlatMap<string, CompileStats> compile_stats_
+  absl::flat_hash_map<string, CompileStats> compile_stats_
       GUARDED_BY(compile_stats_mu_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaCompilationCache);
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index ba1e3b2b4f..3f631f91ec 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -635,6 +635,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:ops",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -649,6 +650,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.cc b/tensorflow/compiler/tf2xla/resource_operation_table.cc
index 20f2ce2919..72b240996f 100644
--- a/tensorflow/compiler/tf2xla/resource_operation_table.cc
+++ b/tensorflow/compiler/tf2xla/resource_operation_table.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/resource_operation_table.h"
 #include "absl/algorithm/container.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "absl/container/flat_hash_map.h"
 
 namespace tensorflow {
 /*static*/ absl::string_view XlaResourceOpInfo::XlaResourceOpKindToString(
@@ -30,9 +30,9 @@ namespace tensorflow {
   }
 }
 
-static gtl::FlatMap<absl::string_view, XlaResourceOpInfo>*
+static absl::flat_hash_map<absl::string_view, XlaResourceOpInfo>*
 CreateResourceOpInfoMap() {
-  auto* result = new gtl::FlatMap<absl::string_view, XlaResourceOpInfo>;
+  auto* result = new absl::flat_hash_map<absl::string_view, XlaResourceOpInfo>;
 
   auto add = [&](absl::string_view op, XlaResourceOpKind op_kind,
                  XlaResourceKind resource_kind) {
@@ -103,15 +103,15 @@ CreateResourceOpInfoMap() {
   return result;
 }
 
-static const gtl::FlatMap<absl::string_view, XlaResourceOpInfo>&
+static const absl::flat_hash_map<absl::string_view, XlaResourceOpInfo>&
 GetStaticResourceOpInfoMap() {
-  static gtl::FlatMap<absl::string_view, XlaResourceOpInfo>* op_info_map =
-      CreateResourceOpInfoMap();
+  static absl::flat_hash_map<absl::string_view, XlaResourceOpInfo>*
+      op_info_map = CreateResourceOpInfoMap();
   return *op_info_map;
 }
 
 const XlaResourceOpInfo* GetResourceOpInfoForOp(absl::string_view op) {
-  const gtl::FlatMap<absl::string_view, XlaResourceOpInfo>& op_infos =
+  const absl::flat_hash_map<absl::string_view, XlaResourceOpInfo>& op_infos =
       GetStaticResourceOpInfoMap();
   auto it = op_infos.find(op);
   return it == op_infos.end() ? nullptr : &it->second;
diff --git a/tensorflow/compiler/tf2xla/resource_operation_table_test.cc b/tensorflow/compiler/tf2xla/resource_operation_table_test.cc
index a85ef040a7..956f597301 100644
--- a/tensorflow/compiler/tf2xla/resource_operation_table_test.cc
+++ b/tensorflow/compiler/tf2xla/resource_operation_table_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/resource_operation_table.h"
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -33,7 +34,7 @@ bool HasResourceInputOrOutput(const OpDef& op_def) {
 }
 
 TEST(ResourceOperationTableTest, HaveAllResourceOps) {
-  gtl::FlatMap<string, bool> known_resource_ops;
+  absl::flat_hash_map<string, bool> known_resource_ops;
   for (absl::string_view known_resource_op :
        resource_op_table_internal::GetKnownResourceOps()) {
     ASSERT_TRUE(
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index f825f67b44..1191cff109 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -220,6 +220,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:shape_inference",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index 1da6ddd318..b7295e8a53 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <type_traits>
 #include <utility>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/padding.h"
@@ -34,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stacktrace.h"
@@ -1027,7 +1027,7 @@ class XlaBuilder {
 
   // A map from XlaOp::Handle to the index in the instructions_ vector where the
   // instruction is held.
-  tensorflow::gtl::FlatMap<int64, int64> handle_to_index_;
+  absl::flat_hash_map<int64, int64> handle_to_index_;
 
   // The embedded computations used by this computation. Each computation was
   // the entry computation of some XlaComputation, the key is the unique id of
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index e800cf470c..8da6364786 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -146,6 +146,7 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -250,6 +251,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -333,6 +335,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -395,6 +398,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -485,6 +489,7 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -903,6 +908,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
@@ -952,6 +958,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:types",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -987,6 +994,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
     ],
@@ -1034,6 +1042,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -1087,6 +1096,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
     ],
@@ -1125,6 +1135,7 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -1146,6 +1157,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -1196,6 +1208,7 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:optional",
     ],
@@ -1216,6 +1229,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
@@ -1260,6 +1274,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -1280,6 +1295,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -1304,6 +1320,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -1330,6 +1347,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -1385,6 +1403,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
@@ -1640,6 +1659,7 @@ cc_library(
         ":while_loop_analysis",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
     ],
@@ -1671,6 +1691,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -2203,6 +2224,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
@@ -2263,6 +2285,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -2319,6 +2342,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -2345,6 +2369,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -2416,6 +2441,7 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
@@ -2460,6 +2486,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -2588,6 +2615,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -2701,6 +2729,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -3147,6 +3176,7 @@ cc_library(
         ":hlo_pass_pipeline",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -3269,6 +3299,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
     ],
 )
@@ -3298,6 +3329,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
     ],
 )
@@ -3354,6 +3386,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index a7d8927cf7..af227fe4da 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -110,7 +111,7 @@ class AllocationTracker {
 
   // A map from device memory opaque value to allocation. One such map is
   // maintained per device ordinal.
-  using AllocationMap = tensorflow::gtl::FlatMap<const void*, Allocation>;
+  using AllocationMap = absl::flat_hash_map<const void*, Allocation>;
 
   tensorflow::mutex mutex_;
 
@@ -146,7 +147,7 @@ class AllocationTracker {
   // non-owning "view" into a tuple's sub-buffers.  The sub-buffers are then
   // free'd when both the view *and* the original tuple are Unregistered.  This
   // refcounting is managed in opaque_to_allocation_map_.
-  tensorflow::gtl::FlatMap<int64, std::vector<std::unique_ptr<ShapedBuffer>>>
+  absl::flat_hash_map<int64, std::vector<std::unique_ptr<ShapedBuffer>>>
       handle_to_shaped_buffers_ GUARDED_BY(mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(AllocationTracker);
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index 30d33e0d35..f70f6ddfec 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.h b/tensorflow/compiler/xla/service/bfloat16_propagation.h
index 6a62439f88..c74326f631 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.h
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/bfloat16_support.h"
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -186,7 +187,7 @@ class BFloat16Propagation : public HloModulePass {
 
   // Mapping from each HloComputation to the number of callers to it in the
   // module. Populated at the beginning of this pass.
-  tensorflow::gtl::FlatMap<const HloComputation*, int64> caller_counts_;
+  absl::flat_hash_map<const HloComputation*, int64> caller_counts_;
 
   // We first store the potential F32-to-BF16 changes to changes_to_bf16_, which
   // are subject to further adjustment, then finally applied to the HLOs. This
@@ -195,8 +196,7 @@ class BFloat16Propagation : public HloModulePass {
   //
   // For each HloInstruction, changes_to_bf16_ stores the affected buffers in
   // the output as a map from in-place pointers to subshapes to shape indices.
-  tensorflow::gtl::FlatMap<HloInstruction*,
-                           tensorflow::gtl::FlatMap<Shape*, ShapeIndex>>
+  absl::flat_hash_map<HloInstruction*, absl::flat_hash_map<Shape*, ShapeIndex>>
       changes_to_bf16_;
 
   // Whether the last processed HLO module has been changed by this pass.
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 34a7be0e9c..3efa0b1dad 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <ostream>
 #include <utility>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -41,9 +42,9 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using absl::flat_hash_map;
 using absl::StrAppend;
 using absl::StrAppendFormat;
-using ::tensorflow::gtl::FlatMap;
 using ::tensorflow::gtl::FlatSet;
 using ::tensorflow::strings::HumanReadableNumBytes;
 
@@ -519,7 +520,8 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation,
 // BufferAllocation.
 void BufferAssignment::CombineTempAllocations() {
   VLOG(1) << "CombineTempAllocations()";
-  FlatMap<LogicalBuffer::Color, BufferAllocation, LogicalBuffer::Color::Hasher>
+  flat_hash_map<LogicalBuffer::Color, BufferAllocation,
+                LogicalBuffer::Color::Hasher>
       combined_allocation_map;
 
   // Move all temp allocations into a single run at the end of the allocations
@@ -582,7 +584,8 @@ void BufferAssignment::CombineTempAllocations() {
   }
 
   // Update allocation indices to their new positions.
-  allocation_index_for_buffer_.clear_no_resize();
+  allocation_index_for_buffer_.erase(allocation_index_for_buffer_.begin(),
+                                     allocation_index_for_buffer_.end());
   for (size_t index = 0; index < allocations_.size(); ++index) {
     BufferAllocation* allocation = &allocations_[index];
     allocation->set_index(index);
@@ -814,7 +817,7 @@ Status BufferAssigner::AssignBuffersForComputation(
     const HloComputation* computation, bool is_thread_local,
     const FlatSet<const LogicalBuffer*>& colocated_buffers,
     const FlatSet<BufferAllocation::Index>& colocated_allocations,
-    FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>*
+    flat_hash_map<const HloComputation*, FlatSet<const LogicalBuffer*>>*
         buffers_to_assign_sequentially,
     BufferAssignment* assignment) {
   // Buffers are sorted and assigned to BufferAllocations in decreasing order of
@@ -833,7 +836,7 @@ Status BufferAssigner::AssignBuffersForComputation(
 
   // Generate a post order sort of instructions for sorting of the
   // LogicalBuffers.
-  FlatMap<const HloInstruction*, int> post_order_position;
+  flat_hash_map<const HloInstruction*, int> post_order_position;
   int position = 0;
   for (auto* instruction : computation->MakeInstructionPostOrder()) {
     post_order_position.emplace(instruction, position);
@@ -1043,12 +1046,12 @@ Status BufferAssigner::AssignBuffersForComputation(
   return Status::OK();
 }
 
-FlatMap<LogicalBuffer::Color, FlatSet<const LogicalBuffer*>,
-        LogicalBuffer::Color::Hasher>
+flat_hash_map<LogicalBuffer::Color, FlatSet<const LogicalBuffer*>,
+              LogicalBuffer::Color::Hasher>
 BufferAssigner::SplitBuffersByColor(
     const FlatSet<const LogicalBuffer*>& buffers) {
-  FlatMap<LogicalBuffer::Color, FlatSet<const LogicalBuffer*>,
-          LogicalBuffer::Color::Hasher>
+  flat_hash_map<LogicalBuffer::Color, FlatSet<const LogicalBuffer*>,
+                LogicalBuffer::Color::Hasher>
       color_map;
   for (auto buffer : buffers) {
     color_map[buffer->color()].insert(buffer);
@@ -1057,7 +1060,7 @@ BufferAssigner::SplitBuffersByColor(
 }
 
 Status BufferAssigner::AssignBuffersWithSequentialOrdering(
-    const FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>&
+    const flat_hash_map<const HloComputation*, FlatSet<const LogicalBuffer*>>&
         buffers_to_assign_sequentially,
     bool run_whole_module_heap_simulation, BufferAssignment* assignment) {
   // Run the sequence of instructions through the heap simulator.  The heuristic
@@ -1155,9 +1158,8 @@ std::vector<const LogicalBuffer*> ComputePeakMemoryLogicalBuffers(
     const BufferAllocation& allocation, const HeapSimulatorTrace& heap_trace) {
   // Create a map from LogicalBuffer::Id to LogicalBuffer* for the logical
   // buffers in this allocation.
-  tensorflow::gtl::FlatMap<LogicalBuffer::Id, const LogicalBuffer*>
-      id_to_buffer;
-  tensorflow::gtl::FlatMap<const LogicalBuffer*, int64> buffer_sizes;
+  absl::flat_hash_map<LogicalBuffer::Id, const LogicalBuffer*> id_to_buffer;
+  absl::flat_hash_map<const LogicalBuffer*, int64> buffer_sizes;
   for (const auto& pair : allocation.assigned_buffers()) {
     const LogicalBuffer* buffer = pair.first;
     const BufferAllocation::OffsetSize& offset_size = pair.second;
@@ -1679,7 +1681,7 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
 
   // First assign buffers for global computatations. Temporary buffers for
   // sequential computations are collected in 'buffers_to_assign_sequentially'.
-  FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>
+  flat_hash_map<const HloComputation*, FlatSet<const LogicalBuffer*>>
       buffers_to_assign_sequentially;
   for (auto* computation : global_computations) {
     TF_RETURN_IF_ERROR(AssignBuffersForComputation(
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 24ba7c16f5..9ba40617a3 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/heap_simulator.h"
@@ -33,7 +34,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -148,7 +148,7 @@ class BufferAllocation {
 
   // Access to the logical buffers assigned to this allocation, and their
   // associated logical offsets and sizes.
-  const tensorflow::gtl::FlatMap<const LogicalBuffer*, OffsetSize>&
+  const absl::flat_hash_map<const LogicalBuffer*, OffsetSize>&
   assigned_buffers() const {
     return assigned_buffers_;
   }
@@ -323,7 +323,7 @@ class BufferAllocation {
 
   // Mapping from the set of buffers assigned to this allocation to their
   // logical offsets and sizes.
-  tensorflow::gtl::FlatMap<const LogicalBuffer*, OffsetSize> assigned_buffers_;
+  absl::flat_hash_map<const LogicalBuffer*, OffsetSize> assigned_buffers_;
 
   int64 fragmentation_bytes_ = 0;
   std::vector<HeapSimulatorTrace> heap_traces_;
@@ -500,7 +500,7 @@ class BufferAssignment {
   int64 temp_allocation_total_size_ = 0;
 
   // Maps Buffers to the index of the BufferAllocation which holds the buffer.
-  tensorflow::gtl::FlatMap<const LogicalBuffer*, BufferAllocation::Index>
+  absl::flat_hash_map<const LogicalBuffer*, BufferAllocation::Index>
       allocation_index_for_buffer_;
 
   const HloModule* module_;
@@ -557,8 +557,8 @@ class BufferAssigner {
       const tensorflow::gtl::FlatSet<const LogicalBuffer*>& colocated_buffers,
       const tensorflow::gtl::FlatSet<BufferAllocation::Index>&
           colocated_allocations,
-      tensorflow::gtl::FlatMap<const HloComputation*,
-                               tensorflow::gtl::FlatSet<const LogicalBuffer*>>*
+      absl::flat_hash_map<const HloComputation*,
+                          tensorflow::gtl::FlatSet<const LogicalBuffer*>>*
           buffers_to_assign_sequentially,
       BufferAssignment* assignment);
 
@@ -568,9 +568,8 @@ class BufferAssigner {
   // 'run_whole_module_heap_simulation' is true, the heap simulation will be run
   // assuming all global computations are sequentially ordered.
   Status AssignBuffersWithSequentialOrdering(
-      const tensorflow::gtl::FlatMap<
-          const HloComputation*,
-          tensorflow::gtl::FlatSet<const LogicalBuffer*>>&
+      const absl::flat_hash_map<const HloComputation*,
+                                tensorflow::gtl::FlatSet<const LogicalBuffer*>>&
           buffers_to_assign_sequentially,
       bool run_whole_module_heap_simulation, BufferAssignment* assignment);
 
@@ -624,9 +623,9 @@ class BufferAssigner {
 
   // Split a set of buffers into several sets, each of which contains buffers
   // colored with the same color.
-  tensorflow::gtl::FlatMap<LogicalBuffer::Color,
-                           tensorflow::gtl::FlatSet<const LogicalBuffer*>,
-                           LogicalBuffer::Color::Hasher>
+  absl::flat_hash_map<LogicalBuffer::Color,
+                      tensorflow::gtl::FlatSet<const LogicalBuffer*>,
+                      LogicalBuffer::Color::Hasher>
   SplitBuffersByColor(
       const tensorflow::gtl::FlatSet<const LogicalBuffer*>& buffers);
 
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.h b/tensorflow/compiler/xla/service/buffer_liveness.h
index cdd3cf4032..2911bbcfbf 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.h
+++ b/tensorflow/compiler/xla/service/buffer_liveness.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index 3af2ab5edf..0c2e9b99db 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -20,10 +20,10 @@ limitations under the License.
 
 #include <ostream>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
@@ -157,7 +157,7 @@ class CallGraphNode {
 
   // The map from instruction to index in callsites_ for looking up the callsite
   // (if any) associated with a particular instruction in this computation.
-  tensorflow::gtl::FlatMap<const HloInstruction*, int64> callsite_instructions_;
+  absl::flat_hash_map<const HloInstruction*, int64> callsite_instructions_;
 
   // The call sites in other computations which call this computation.
   std::vector<CallSite> caller_callsites_;
@@ -267,7 +267,7 @@ class CallGraph {
 
   // Map from HLO computation to the index of the corresponding call graph node
   // in nodes_.
-  tensorflow::gtl::FlatMap<const HloComputation*, int64> node_indices_;
+  absl::flat_hash_map<const HloComputation*, int64> node_indices_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index b65dfef9c9..7f78412924 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
@@ -31,7 +32,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -432,7 +432,7 @@ class CopyRemover {
       // Construct a list for each HLO buffer in the alias analysis. Maintain a
       // map from HloValue to the respective list element representing that
       // value. The map is used to construct the copy info map below.
-      tensorflow::gtl::FlatMap<const HloValue*, ValueNode*> value_to_node;
+      absl::flat_hash_map<const HloValue*, ValueNode*> value_to_node;
       for (const HloBuffer& buffer : alias_analysis.buffers()) {
         // Verify values contained in the buffer are strictly ordered. This
         // should always be the case after adding copies to eliminate
@@ -480,7 +480,7 @@ class CopyRemover {
     // respective ValueNode representing that value.
     void AddValueList(
         absl::Span<const HloValue* const> values,
-        tensorflow::gtl::FlatMap<const HloValue*, ValueNode*>* value_to_node) {
+        absl::flat_hash_map<const HloValue*, ValueNode*>* value_to_node) {
       ValueNode* tail = nullptr;
       ValueNode* head = nullptr;
       for (const HloValue* value : values) {
@@ -516,8 +516,7 @@ class CopyRemover {
     // respective ValueNode.
     void CreateCopyMap(
         const HloModule& module,
-        const tensorflow::gtl::FlatMap<const HloValue*, ValueNode*>&
-            value_to_node) {
+        const absl::flat_hash_map<const HloValue*, ValueNode*>& value_to_node) {
       for (HloComputation* computation : module.computations()) {
         for (HloInstruction* instruction : computation->instructions()) {
           // Add copies with unambiguous source values to the map. Copies with
@@ -916,7 +915,7 @@ class CopyRemover {
       ValueNode* src = nullptr;
       ValueNode* dest = nullptr;
     };
-    tensorflow::gtl::FlatMap<const HloInstruction*, CopyNodes> copy_map_;
+    absl::flat_hash_map<const HloInstruction*, CopyNodes> copy_map_;
   };
 
   HloModule* module_;
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index b7103118ac..6a83909a3b 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -290,6 +290,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter",
         "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
@@ -309,6 +310,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@llvm//:analysis",
         "@llvm//:target",
     ],
@@ -471,6 +473,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
         "//tensorflow/stream_executor",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
     ],
@@ -762,6 +765,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:computation_layout",
         "//tensorflow/compiler/xla/service:layout_assignment",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
index bfecbd6e01..c291bf2d1b 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <numeric>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
@@ -38,7 +39,7 @@ using absl::nullopt;
 using absl::optional;
 
 using ShouldMakeOperandColMajorCache =
-    tensorflow::gtl::FlatMap<const HloInstruction*, bool>;
+    absl::flat_hash_map<const HloInstruction*, bool>;
 }  // namespace
 
 static bool ShouldMakeAllUsersColMajor(const HloInstruction* instruction) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 20cf855735..a9febe891b 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <functional>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/synchronization/mutex.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
@@ -30,8 +31,7 @@ namespace cpu {
 namespace runtime {
 
 XfeedManager* GetXfeedManager(int device_ordinal) {
-  static tensorflow::gtl::FlatMap<int, XfeedManager*>* managers =
-      new tensorflow::gtl::FlatMap<int, XfeedManager*>();
+  static auto* managers = new absl::flat_hash_map<int, XfeedManager*>();
   static absl::Mutex* mutex = new absl::Mutex();
 
   absl::MutexLock lock(mutex);
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index c3e8020783..953a75c35f 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/logging.h"
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
@@ -67,7 +68,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
@@ -1398,7 +1398,7 @@ static bool ReductionPreservesLayout(const HloInstruction& reduce) {
   //
   // So if we reduce f32[A,B,C,D] on dimensions 1 and 2, this map contains
   // [0->0, 3->1].
-  gtl::FlatMap<int64, int64> unreduced_dim_map;
+  absl::flat_hash_map<int64, int64> unreduced_dim_map;
 
   gtl::FlatSet<int64> reduced_dims(reduce.dimensions().begin(),
                                    reduce.dimensions().end());
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index daafef4eb3..586f27b104 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "llvm/ADT/Triple.h"
@@ -47,7 +48,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -427,7 +427,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   // Maps the buffer allocation slices for the parameters to the computation
   // being compiled to their parameter numbers.  Only relevant for thread local
   // computations.
-  tensorflow::gtl::FlatMap<BufferAllocation::Index, int64>
+  absl::flat_hash_map<BufferAllocation::Index, int64>
       computation_parameter_allocations_;
 
   // Maps HLO instructions to their index into the profile counter array.
@@ -567,11 +567,11 @@ class IrEmitter : public DfsHloVisitorWithDefault,
     }
   };
 
-  tensorflow::gtl::FlatMap<const Literal*, llvm::Constant*,
-                           LiteralPtrHashFunctor, LiteralPtrEqualityFunctor>
+  absl::flat_hash_map<const Literal*, llvm::Constant*, LiteralPtrHashFunctor,
+                      LiteralPtrEqualityFunctor>
       emitted_literals_;
 
-  tensorflow::gtl::FlatMap<BufferAllocation::Index, llvm::Constant*>
+  absl::flat_hash_map<BufferAllocation::Index, llvm::Constant*>
       constant_buffer_to_global_;
 
   std::vector<const HloComputation*> thread_local_computations_;
diff --git a/tensorflow/compiler/xla/service/cpu/target_machine_features.cc b/tensorflow/compiler/xla/service/cpu/target_machine_features.cc
index a0cd8ee2d2..5cdac203af 100644
--- a/tensorflow/compiler/xla/service/cpu/target_machine_features.cc
+++ b/tensorflow/compiler/xla/service/cpu/target_machine_features.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 namespace cpu {
diff --git a/tensorflow/compiler/xla/service/cpu/target_machine_features.h b/tensorflow/compiler/xla/service/cpu/target_machine_features.h
index 8b00ae9e47..a383b4a4a0 100644
--- a/tensorflow/compiler/xla/service/cpu/target_machine_features.h
+++ b/tensorflow/compiler/xla/service/cpu/target_machine_features.h
@@ -16,10 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TARGET_MACHINE_FEATURES_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TARGET_MACHINE_FEATURES_H_
 
+#include "absl/container/flat_hash_map.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 namespace cpu {
@@ -97,8 +97,7 @@ class LLVMTargetMachineFeatures : public TargetMachineFeatures {
   // This is mutated from within `GetTargetTransformInfoFor` which is
   // semantically a getter (and thus `const`); and is therefore declared
   // mutable.  Making this mutable is okay because it has cache semantics.
-  mutable tensorflow::gtl::FlatMap<const llvm::Function*,
-                                   llvm::TargetTransformInfo>
+  mutable absl::flat_hash_map<const llvm::Function*, llvm::TargetTransformInfo>
       target_transform_info_cache_;
   llvm::TargetMachine* target_machine_;
 };
diff --git a/tensorflow/compiler/xla/service/defuser.cc b/tensorflow/compiler/xla/service/defuser.cc
index d124f74d19..661539cccb 100644
--- a/tensorflow/compiler/xla/service/defuser.cc
+++ b/tensorflow/compiler/xla/service/defuser.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -48,7 +49,7 @@ Status Defuse(HloInstruction* fusion_instruction) {
       fusion_instruction->fused_instructions_computation();
 
   // A map from fused instruction to its defused clone.
-  tensorflow::gtl::FlatMap<const HloInstruction*, HloInstruction*>
+  absl::flat_hash_map<const HloInstruction*, HloInstruction*>
       defused_instructions;
   // Initialize map to contain the fusion instruction parameters mapping
   // to the operands of the fusion instruction.
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 5761573791..68d01d75a2 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 51968d13d4..e65d3fa332 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -91,6 +91,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_reachability",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -357,6 +358,7 @@ cc_library(
         "//tensorflow/core/platform/default/build_config:cufft_plugin",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",  # build_cleaner: keep
         "//tensorflow/stream_executor",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 31a9f9b1be..5742632782 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
@@ -197,7 +198,7 @@ GpuExecutable::ResolveConstantGlobals(se::StreamExecutor* executor) {
   }
   module_spec.AddCudaPtxInMemory(ptx().c_str());
 
-  tensorflow::gtl::FlatMap<int64, se::DeviceMemoryBase> globals;
+  absl::flat_hash_map<int64, se::DeviceMemoryBase> globals;
   se::ModuleHandle module_handle;
   executor->LoadModule(module_spec, &module_handle);
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 38b0f8f15b..0e276282e4 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 #include <string>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
@@ -35,7 +36,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
@@ -101,7 +101,7 @@ class GpuExecutable : public Executable {
   const PointsToSet& GetRootPointsToSet() const;
 
   using BufferAllocToDeviceMemoryMap =
-      tensorflow::gtl::FlatMap<BufferAllocation::Index, se::DeviceMemoryBase>;
+      absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;
 
   // Loads the PTX or CUBIN for this executable into `executor` and resolves the
   // globals corresponding to constant buffers.  Returns a map mapping buffer
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.h b/tensorflow/compiler/xla/service/gpu/stream_assignment.h
index c2df83aaa4..52d38b6f20 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.h
@@ -16,9 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_ASSIGNMENT_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_ASSIGNMENT_H_
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 namespace gpu {
@@ -34,7 +34,7 @@ class StreamAssignment {
 
  private:
   int stream_count_ = 1;  // At least the main stream.
-  tensorflow::gtl::FlatMap<const HloInstruction*, int> hlo_to_stream_number_;
+  absl::flat_hash_map<const HloInstruction*, int> hlo_to_stream_number_;
 };
 
 // Assigns GPU streams to instructions in `module`.
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 2bd04259c0..147776c8c4 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -18,13 +18,14 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
 
-using tensorflow::gtl::FlatMap;
+using absl::flat_hash_map;
 using tensorflow::gtl::FlatSet;
 
 /*static*/
@@ -56,7 +57,7 @@ StatusOr<int64> HeapSimulator::MinimumMemoryForComputation(
     const HloComputation& computation, const HloInstructionSequence& sequence,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>*
+    const absl::flat_hash_map<const HloComputation*, int64>*
         memory_by_computation) {
   TF_ASSIGN_OR_RETURN(
       HeapSimulator::Result result,
@@ -88,7 +89,7 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     const HloInstructionSequence& instruction_sequence,
     const TuplePointsToAnalysis& points_to_analysis,
     const BufferValue::SizeFunction& size_fn, const Options& options,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>*
+    const absl::flat_hash_map<const HloComputation*, int64>*
         memory_by_computation) {
   HeapSimulator heap(std::move(algorithm), size_fn, options,
                      /*schedule=*/nullptr, memory_by_computation);
@@ -115,8 +116,10 @@ Status HeapSimulator::RunComputation(
   // 'used_buffers' is the reverse map - it tracks which buffers were used by an
   // instruction, so that we can remove the instructions from a buffer's live
   // set after they are visited.
-  FlatMap<const BufferValue*, FlatSet<const HloInstruction*>> live_buffers;
-  FlatMap<const HloInstruction*, FlatSet<const BufferValue*>> used_buffers;
+  flat_hash_map<const BufferValue*, FlatSet<const HloInstruction*>>
+      live_buffers;
+  flat_hash_map<const HloInstruction*, FlatSet<const BufferValue*>>
+      used_buffers;
   auto add_user_to_buffer = [this, &live_buffers, &used_buffers](
                                 const HloInstruction* user,
                                 const BufferValue* buffer) {
@@ -345,7 +348,7 @@ HeapSimulator::HeapSimulator(
     std::unique_ptr<HeapAlgorithm> algorithm,
     const BufferValue::SizeFunction& size_fn, const Options& options,
     const HloSchedule* schedule,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>*
+    const absl::flat_hash_map<const HloComputation*, int64>*
         memory_by_computation)
     : no_fragmentation_stats_(absl::make_unique<NoFragmentationStatsHeap>()),
       algorithm_(std::move(algorithm)),
@@ -536,7 +539,7 @@ void NoFragmentationStatsHeap::Alloc(const BufferValue* buffer, int64 size,
 
 void NoFragmentationStatsHeap::AccountForSubcomputationMemory(
     const HloInstruction* instruction,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+    const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
   // We only count the memory usage of the largest subcomputation, instead of
   // adding them all, because subcomputations won't execute in parallel.
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index 7d6dcc0dc9..a5bb3f81f7 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
 #include "tensorflow/compiler/xla/service/buffer_value_containers.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
@@ -30,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
@@ -58,7 +58,7 @@ class HeapSimulator {
   // Result represents the result of the heap simulation.
   struct Result {
     // The assignment of buffers to chunks.
-    tensorflow::gtl::FlatMap<const BufferValue*, Chunk> chunk_map;
+    absl::flat_hash_map<const BufferValue*, Chunk> chunk_map;
 
     // The total size in bytes of the heap, containing all assigned chunks.
     int64 heap_size = 0;
@@ -100,7 +100,7 @@ class HeapSimulator {
       const HloComputation& computation, const HloInstructionSequence& sequence,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_function,
-      const tensorflow::gtl::FlatMap<const HloComputation*, int64>*
+      const absl::flat_hash_map<const HloComputation*, int64>*
           memory_by_computation = nullptr);
 
   // Run the heap simulation with the given algorithm, assuming the given
@@ -130,7 +130,7 @@ class HeapSimulator {
       const TuplePointsToAnalysis& points_to_analysis,
       const BufferValue::SizeFunction& size_fn,
       const Options& options = Options(),
-      const tensorflow::gtl::FlatMap<const HloComputation*, int64>*
+      const absl::flat_hash_map<const HloComputation*, int64>*
           memory_by_computation = nullptr);
 
  private:
@@ -140,7 +140,7 @@ class HeapSimulator {
   HeapSimulator(std::unique_ptr<HeapAlgorithm> algorithm,
                 const BufferValue::SizeFunction& size_fn,
                 const Options& options, const HloSchedule* schedule = nullptr,
-                const tensorflow::gtl::FlatMap<const HloComputation*, int64>*
+                const absl::flat_hash_map<const HloComputation*, int64>*
                     memory_by_computation = nullptr);
   ~HeapSimulator();
 
@@ -172,7 +172,7 @@ class HeapSimulator {
   // handle subcomputations. It would be good to unify the handling of
   // subcomputations, but it's not clear how.
   const HloSchedule* schedule_;
-  const tensorflow::gtl::FlatMap<const HloComputation*, int64>*
+  const absl::flat_hash_map<const HloComputation*, int64>*
       memory_by_computation_;
 
   // In addition to Alloc and Free, the heap simulator exposes a concept of
@@ -193,7 +193,7 @@ class HeapSimulator {
     const BufferValue* canonical = nullptr;
     int64 refcount = 0;
   };
-  tensorflow::gtl::FlatMap<const BufferValue*, std::shared_ptr<SharedGroup>>
+  absl::flat_hash_map<const BufferValue*, std::shared_ptr<SharedGroup>>
       shared_buffers_;
 
   // Hold some sets for error-checking the sequence of Alloc and Free calls.
@@ -235,7 +235,7 @@ class HeapAlgorithm {
   // analysis, it's not worth making major changes to HeapSimulator now.
   virtual void AccountForSubcomputationMemory(
       const HloInstruction* instruction,
-      const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+      const absl::flat_hash_map<const HloComputation*, int64>&
           memory_by_computation) {}
 
   // Free de-allocates a previously allocated buffer.
@@ -262,7 +262,7 @@ class NoFragmentationStatsHeap : public HeapAlgorithm {
 
   void AccountForSubcomputationMemory(
       const HloInstruction* instruction,
-      const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+      const absl::flat_hash_map<const HloComputation*, int64>&
           memory_by_computation) override;
 
   void Free(const BufferValue* buffer, int64 size) override;
@@ -382,8 +382,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm {
     // Free time of the buffer.
     int64 end;
   };
-  tensorflow::gtl::FlatMap<const BufferValue*, BufferInterval>
-      buffer_intervals_;
+  absl::flat_hash_map<const BufferValue*, BufferInterval> buffer_intervals_;
 };
 
 // A heap algorithm that chooses the best results from other algorithms added to
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index 191fbf8194..ea0bced923 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
@@ -31,7 +32,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 namespace {
@@ -174,7 +174,7 @@ class HeapSimulatorTracker {
 
     // Construct the module sequence grouped by computation.
     HloSchedule schedule(module_.get());
-    tensorflow::gtl::FlatMap<const HloInstruction*, int> reverse_position;
+    absl::flat_hash_map<const HloInstruction*, int> reverse_position;
     for (int i = 0; i < full_module_sequence.size(); ++i) {
       const HloInstruction* instruction = full_module_sequence[i];
       schedule.GetOrCreateSequence(instruction->parent())
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index 0986da65cb..b6e1f52cf5 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -290,13 +291,11 @@ class BufferValueMap {
   const HloDataflowAnalysis& dataflow_;
 
   // A map containing the set of values contained in each buffer.
-  tensorflow::gtl::FlatMap<BufferNumber,
-                           tensorflow::gtl::FlatSet<const HloValue*>>
+  absl::flat_hash_map<BufferNumber, tensorflow::gtl::FlatSet<const HloValue*>>
       buffers_;
 
   // A map indicating which buffer each value is contained in.
-  tensorflow::gtl::FlatMap<const HloValue*, BufferNumber>
-      value_to_buffer_number_;
+  absl::flat_hash_map<const HloValue*, BufferNumber> value_to_buffer_number_;
 
   // The buffer number of the next buffer to be created.
   BufferNumber next_buffer_number_ = 0;
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.h b/tensorflow/compiler/xla/service/hlo_alias_analysis.h
index e345804537..372f99ff01 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/hlo_buffer.h"
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
@@ -110,7 +111,7 @@ class HloAliasAnalysis {
   std::unique_ptr<HloDataflowAnalysis> dataflow_analysis_;
 
   // A map indicating which buffer a value is contained in.
-  tensorflow::gtl::FlatMap<const HloValue*, HloBuffer*> value_to_buffer_;
+  absl::flat_hash_map<const HloValue*, HloBuffer*> value_to_buffer_;
 
   // A lazily constructed vector containing all HloBuffers sorted by
   // HloBuffer::Id.
diff --git a/tensorflow/compiler/xla/service/hlo_clone_context.h b/tensorflow/compiler/xla/service/hlo_clone_context.h
index 658643b427..24910ca07b 100644
--- a/tensorflow/compiler/xla/service/hlo_clone_context.h
+++ b/tensorflow/compiler/xla/service/hlo_clone_context.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 
@@ -73,12 +73,12 @@ class HloCloneContext {
     return FindOrDie(computations_, old_computation);
   }
 
-  const tensorflow::gtl::FlatMap<const HloInstruction*, HloInstruction*>&
+  const absl::flat_hash_map<const HloInstruction*, HloInstruction*>&
   cloned_instructions() const {
     return instructions_;
   }
 
-  const tensorflow::gtl::FlatMap<const HloComputation*, HloComputation*>&
+  const absl::flat_hash_map<const HloComputation*, HloComputation*>&
   cloned_computations() const {
     return computations_;
   }
@@ -86,10 +86,8 @@ class HloCloneContext {
  private:
   HloModule* module_;
   string suffix_;
-  tensorflow::gtl::FlatMap<const HloInstruction*, HloInstruction*>
-      instructions_;
-  tensorflow::gtl::FlatMap<const HloComputation*, HloComputation*>
-      computations_;
+  absl::flat_hash_map<const HloInstruction*, HloInstruction*> instructions_;
+  absl::flat_hash_map<const HloComputation*, HloComputation*> computations_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 4613d6762e..257dd5876f 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <sstream>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
@@ -297,7 +298,7 @@ void ComputeComputationPostOrder(
 void HloComputation::ComputeInstructionPostOrder(
     const HloComputation::ChannelDependencyMap& channel_dependency_map,
     std::vector<HloInstruction*>* post_order, HloInstruction* root,
-    tensorflow::gtl::FlatMap<HloInstruction*, VisitState>* visited) const {
+    absl::flat_hash_map<HloInstruction*, VisitState>* visited) const {
   std::vector<HloInstruction*> dfs_stack;
   dfs_stack.push_back(root);
   while (!dfs_stack.empty()) {
@@ -394,7 +395,7 @@ std::vector<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
   std::vector<HloInstruction*> post_order;
   post_order.reserve(instruction_count());
   std::vector<HloInstruction*> trace_instructions;
-  tensorflow::gtl::FlatMap<HloInstruction*, VisitState> visited;
+  absl::flat_hash_map<HloInstruction*, VisitState> visited;
   for (auto& instruction : instructions_) {
     if (instruction->opcode() == HloOpcode::kTrace) {
       // Trace instructions aren't handled by the DFS visitor. Add trace
@@ -505,9 +506,9 @@ HloComputationProto HloComputation::ToProto() const {
 /* static */ StatusOr<std::unique_ptr<HloComputation>>
 HloComputation::CreateFromProto(
     const HloComputationProto& proto,
-    const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map) {
-  tensorflow::gtl::FlatMap<int64, HloInstruction*> instruction_map;
-  tensorflow::gtl::FlatMap<HloInstruction*, int64> to_proto_id;
+    const absl::flat_hash_map<int64, HloComputation*>& computation_map) {
+  absl::flat_hash_map<int64, HloInstruction*> instruction_map;
+  absl::flat_hash_map<HloInstruction*, int64> to_proto_id;
   std::vector<std::unique_ptr<HloInstruction>> instructions;
   int64 parameter_count = 0;
   for (const HloInstructionProto& instruction_proto : proto.instructions()) {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index 936a53bd7e..af929ac009 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/iterator_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -40,7 +41,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -188,7 +188,7 @@ class HloComputation {
   //     calls.
   static StatusOr<std::unique_ptr<HloComputation>> CreateFromProto(
       const HloComputationProto& proto,
-      const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map);
+      const absl::flat_hash_map<int64, HloComputation*>& computation_map);
 
   // Gets the instructions in this computation.
   //
@@ -414,14 +414,14 @@ class HloComputation {
   // cross-replica-sum the union of the dependencies for all participating
   // instructions.
   using ChannelDependencyMap =
-      tensorflow::gtl::FlatMap<int64, absl::InlinedVector<HloInstruction*, 1>>;
+      absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>;
   ChannelDependencyMap ComputeChannelDependencies() const;
 
   enum VisitState { kVisiting, kVisited };
   void ComputeInstructionPostOrder(
       const HloComputation::ChannelDependencyMap& channel_dependency_map,
       std::vector<HloInstruction*>* post_order, HloInstruction* root,
-      tensorflow::gtl::FlatMap<HloInstruction*, VisitState>* visited) const;
+      absl::flat_hash_map<HloInstruction*, VisitState>* visited) const;
 
   string name_;
   int64 unique_id_;
@@ -439,7 +439,7 @@ class HloComputation {
   // instruction pointer to location in the list for fast lookup.
   using InstructionList = std::list<std::unique_ptr<HloInstruction>>;
   InstructionList instructions_;
-  tensorflow::gtl::FlatMap<const HloInstruction*, InstructionList::iterator>
+  absl::flat_hash_map<const HloInstruction*, InstructionList::iterator>
       instruction_iterators_;
 
   std::vector<HloInstruction*> param_instructions_;
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc
index 113fd18eae..159c39d557 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <algorithm>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -106,8 +107,8 @@ Status HloDomainMap::PopulateDomainMetadataMap() {
   auto equal = [](const DomainMetadata* a, const DomainMetadata* b) {
     return a->Matches(*b);
   };
-  tensorflow::gtl::FlatMap<const DomainMetadata*, int64, decltype(hash),
-                           decltype(equal)>
+  absl::flat_hash_map<const DomainMetadata*, int64, decltype(hash),
+                      decltype(equal)>
       domain_metadata(1024, hash, equal);
 
   for (auto& domain : instruction_domains_) {
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.h b/tensorflow/compiler/xla/service/hlo_domain_map.h
index 56b557d7ce..8584bc021d 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.h
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.h
@@ -19,13 +19,13 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
@@ -77,8 +77,7 @@ class HloDomainMap {
  private:
   // Map used for representing instruction ordering, i.e.
   // order_map[a] < order_map[b] means a must be ordered before b.
-  using InstructionOrderMap =
-      tensorflow::gtl::FlatMap<const HloInstruction*, int64>;
+  using InstructionOrderMap = absl::flat_hash_map<const HloInstruction*, int64>;
 
   HloDomainMap(string domain_kind) : domain_kind_(std::move(domain_kind)) {}
 
@@ -120,8 +119,8 @@ class HloDomainMap {
 
   string domain_kind_;
   std::vector<std::unique_ptr<DomainMetadata::Domain>> instruction_domains_;
-  tensorflow::gtl::FlatMap<HloInstruction*, int64> instruction_to_domain_;
-  tensorflow::gtl::FlatMap<HloInstruction*, int64> domain_metadata_id_;
+  absl::flat_hash_map<HloInstruction*, int64> instruction_to_domain_;
+  absl::flat_hash_map<HloInstruction*, int64> domain_metadata_id_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 23787dbc8a..5d5c9c7e58 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/ascii.h"
@@ -43,7 +44,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/human_readable_json.h"
@@ -59,8 +59,8 @@ using absl::StrJoin;
 /* static */
 StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     const HloInstructionProto& proto,
-    const tensorflow::gtl::FlatMap<int64, HloInstruction*>& instruction_map,
-    const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map) {
+    const absl::flat_hash_map<int64, HloInstruction*>& instruction_map,
+    const absl::flat_hash_map<int64, HloComputation*>& computation_map) {
   TF_RET_CHECK(!proto.opcode().empty());
   TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(proto.opcode()));
   TF_RET_CHECK(proto.has_shape());
@@ -266,7 +266,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << "Expect 1 called computation for fusion instruction but sees "
           << proto.called_computation_ids_size();
       const int64 fusion_id = proto.called_computation_ids(0);
-      auto* fused_computation = FindPtrOrNull(computation_map, fusion_id);
+      auto* fused_computation =
+          tensorflow::gtl::FindPtrOrNull(computation_map, fusion_id);
       TF_RET_CHECK(fused_computation != nullptr)
           << "No fusion computation with id " << fusion_id;
       instruction = CreateFusion(proto.shape(), fusion_kind, all_operands(),
@@ -2661,14 +2662,14 @@ class HloInstruction::FusionReusesParamElements {
   // the value of this parameter, which would save stack space but not allow us
   // to finish early if we find a reuse.
   static UseKind Compute(int64 i, const HloInstruction& hlo) {
-    tensorflow::gtl::FlatMap<const HloInstruction*, UseKind> memoization_cache;
+    absl::flat_hash_map<const HloInstruction*, UseKind> memoization_cache;
     return ComputeInternal(i, hlo, &memoization_cache);
   }
 
  private:
   static UseKind ComputeInternal(
       int64 i, const HloInstruction& hlo,
-      tensorflow::gtl::FlatMap<const HloInstruction*, UseKind>* cache) {
+      absl::flat_hash_map<const HloInstruction*, UseKind>* cache) {
     if (auto hlo_param = DynCast<HloParameterInstruction>(&hlo)) {
       if (hlo_param->parameter_number() == i) {
         return UseKind::kUse;
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 009bd3bab3..1bfdc88abc 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
@@ -50,7 +51,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/iterator_range.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -247,7 +247,7 @@ class CanonicalNameMap {
 
  private:
   int64 index;
-  tensorflow::gtl::FlatMap<string, string> canonical_name_map;
+  absl::flat_hash_map<string, string> canonical_name_map;
 };
 
 // HLO instructions are the atomic unit of the high-level compiler's IR.
@@ -350,8 +350,8 @@ class HloInstruction {
   //     calls.
   static StatusOr<std::unique_ptr<HloInstruction>> CreateFromProto(
       const HloInstructionProto& proto,
-      const tensorflow::gtl::FlatMap<int64, HloInstruction*>& instruction_map,
-      const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map);
+      const absl::flat_hash_map<int64, HloInstruction*>& instruction_map,
+      const absl::flat_hash_map<int64, HloComputation*>& computation_map);
 
   // Creates a parameter-retrieving instruction.
   static std::unique_ptr<HloInstruction> CreateParameter(int64 parameter_number,
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index ad45a82941..1bc168c8b7 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <deque>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/escaping.h"
 #include "absl/strings/str_cat.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/window_util.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 namespace {
@@ -1099,7 +1099,7 @@ void HloFusionInstruction::MergeFusionInstructionIntoMultiOutput(
   // Note that we add the unfused instructions to this->parent_ computation.
   // This is necessary because the unique_id needs for an instruction and
   // it's only added when inserting to the computation.
-  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> old_to_new;
+  absl::flat_hash_map<HloInstruction*, HloInstruction*> old_to_new;
   std::vector<HloInstruction*> unfused_instructions;
   auto computation_to_merge =
       instruction_to_merge->fused_instructions_computation();
@@ -1392,7 +1392,7 @@ std::unique_ptr<HloInstruction> HloFusionInstruction::CloneWithNewOperandsImpl(
 }
 
 Status HloFusionInstruction::DeduplicateFusionOperands() {
-  tensorflow::gtl::FlatMap<const HloInstruction*, int> operand_indices;
+  absl::flat_hash_map<const HloInstruction*, int> operand_indices;
   std::vector<int> operands_to_remove;
   for (int i = 0; i < operand_count(); ++i) {
     auto emplace_result = operand_indices.emplace(operand(i), i);
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
index 6a4e766788..1c2b2868fd 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/heap_simulator.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
@@ -74,7 +75,7 @@ class ListScheduler {
       const HloComputation& computation,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_function,
-      const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+      const absl::flat_hash_map<const HloComputation*, int64>&
           memory_by_computation) {
     ListScheduler scheduler(computation, points_to_analysis, size_function,
                             memory_by_computation);
@@ -99,7 +100,7 @@ class ListScheduler {
   ListScheduler(const HloComputation& computation,
                 const TuplePointsToAnalysis& points_to_analysis,
                 const LogicalBuffer::SizeFunction& size_function,
-                const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+                const absl::flat_hash_map<const HloComputation*, int64>&
                     memory_by_computation)
       : computation_(computation),
         points_to_analysis_(points_to_analysis),
@@ -234,8 +235,7 @@ class ListScheduler {
 
     // Populate the ready list with instructions which have no operands or
     // control predecessors.
-    tensorflow::gtl::FlatMap<const HloInstruction*, int64>
-        unscheduled_pred_count;
+    absl::flat_hash_map<const HloInstruction*, int64> unscheduled_pred_count;
     for (auto* instruction : computation_.instructions()) {
       // TODO(b/34466113): Replace this and above with successors() or
       // predecessors() when these methods are added to HloInstruction.
@@ -251,8 +251,8 @@ class ListScheduler {
     std::multimap<Priority, ReadyListEntry> ready_queue;
 
     // Map of ready instructions to their iterators in ready_queue.
-    tensorflow::gtl::FlatMap<const HloInstruction*,
-                             std::multimap<Priority, ReadyListEntry>::iterator>
+    absl::flat_hash_map<const HloInstruction*,
+                        std::multimap<Priority, ReadyListEntry>::iterator>
         ready_instructions;
 
     auto add_to_ready_queue = [&](HloInstruction* inst) {
@@ -347,12 +347,11 @@ class ListScheduler {
   // Computations are analyzed in post-order. When scheduling an instruction
   // that includes subcomputations, such as a while loop, we use this map to
   // look up the memory needed by subcomputations.
-  const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+  const absl::flat_hash_map<const HloComputation*, int64>&
       memory_by_computation_;
 
   // A map containing the LogicalBuffers that each instruction uses.
-  tensorflow::gtl::FlatMap<const HloInstruction*,
-                           std::vector<const LogicalBuffer*>>
+  absl::flat_hash_map<const HloInstruction*, std::vector<const LogicalBuffer*>>
       buffer_uses_;
 
   // A map containing the count of unscheduled HLOs which using a particular
@@ -379,7 +378,7 @@ StatusOr<HloInstructionSequence> ScheduleComputationHelper(
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const MemorySchedulerAlgorithm& algorithm,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+    const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
   VLOG(2) << "Computation: " << computation.name();
   if (algorithm) {
@@ -396,13 +395,13 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+    const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
   // These variables are a hack to prevent overflows.
   int64 cumulative_total_size = 0;
   int64 total_hlos = computation.parent()->instruction_count();
-  tensorflow::gtl::FlatMap<const HloInstruction*, int64> extra_users;
-  tensorflow::gtl::FlatMap<const HloInstruction*, int64> total_sizes;
+  absl::flat_hash_map<const HloInstruction*, int64> extra_users;
+  absl::flat_hash_map<const HloInstruction*, int64> total_sizes;
   for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) {
     if (ListScheduler::IgnoreInstruction(*hlo)) {
       extra_users[hlo] = 0;
@@ -467,7 +466,7 @@ StatusOr<HloInstructionSequence> ListMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+    const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
   return ListScheduler::Run(computation, points_to_analysis, size_function,
                             memory_by_computation);
@@ -477,7 +476,7 @@ StatusOr<HloInstructionSequence> PostOrderMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+    const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
   return HloInstructionSequence(computation.MakeInstructionPostOrder());
 }
@@ -486,7 +485,7 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+    const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
   // We try a few schedulers and choose whichever returns a lower min-memory,
   // not accounting for fragmentation.
@@ -549,7 +548,7 @@ StatusOr<HloSchedule> ScheduleModule(
   HloSchedule schedule(&module);
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(&module));
-  tensorflow::gtl::FlatMap<const HloComputation*, int64> memory_by_computation;
+  absl::flat_hash_map<const HloComputation*, int64> memory_by_computation;
   for (const auto* computation : module.MakeComputationPostOrder()) {
     if (!computation->IsFusionComputation()) {
       TF_ASSIGN_OR_RETURN(HloInstructionSequence computation_sequence,
@@ -577,7 +576,7 @@ StatusOr<HloInstructionSequence> ScheduleComputation(
   CHECK(!computation.IsFusionComputation());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(computation.parent()));
-  tensorflow::gtl::FlatMap<const HloComputation*, int64> empty_map;
+  absl::flat_hash_map<const HloComputation*, int64> empty_map;
   return ScheduleComputationHelper(computation, *points_to_analysis,
                                    size_function, nullptr, empty_map);
 }
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
index 9964c6fdd7..a4c1d3db81 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
@@ -37,7 +38,7 @@ namespace xla {
 typedef std::function<StatusOr<HloInstructionSequence>(
     const HloComputation&, const TuplePointsToAnalysis&,
     const LogicalBuffer::SizeFunction&,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&)>
+    const absl::flat_hash_map<const HloComputation*, int64>&)>
     MemorySchedulerAlgorithm;
 
 // List scheduler
@@ -45,7 +46,7 @@ StatusOr<HloInstructionSequence> ListMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+    const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation);
 
 // DFS-order scheduler
@@ -53,7 +54,7 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+    const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation);
 
 // Naive Post Order scheduler
@@ -61,7 +62,7 @@ StatusOr<HloInstructionSequence> PostOrderMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+    const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation);
 
 // The default scheduling algorithm. Runs both the list scheduler
@@ -71,7 +72,7 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
-    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+    const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation);
 
 // Returns an HloSchedule which seeks to minimize the memory required for
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
index 1b9e9bfc77..5a9fccc7dd 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <string>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/heap_simulator.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
@@ -247,7 +248,7 @@ TEST_F(HloSchedulingTest, ListAccountsForSubcomputations) {
   EXPECT_TRUE(ordering.ExecutesBefore(bcast, add));
   EXPECT_TRUE(ordering.ExecutesBefore(transpose, add));
 
-  tensorflow::gtl::FlatMap<const HloComputation*, int64> memory_by_computation;
+  absl::flat_hash_map<const HloComputation*, int64> memory_by_computation;
   memory_by_computation[cond_computation] = 17;
   memory_by_computation[body_computation] = 16;
   std::unique_ptr<TuplePointsToAnalysis> points_to_analysis =
@@ -409,7 +410,7 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
   EXPECT_EQ(module->entry_computation()->instruction_count(),
             schedule.sequence(module->entry_computation()).size());
 
-  tensorflow::gtl::FlatMap<const HloComputation*, int64> memory_by_computation;
+  absl::flat_hash_map<const HloComputation*, int64> memory_by_computation;
   memory_by_computation[cond_computation] = 17;
   memory_by_computation[body_computation] = 16;
   std::unique_ptr<TuplePointsToAnalysis> points_to_analysis =
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index b3949f3a6d..9359e9a8be 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <utility>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -285,8 +286,8 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
       << ShapeUtil::HumanStringWithLayout(expected_program_shape.result())
       << ", actual: " << ShapeUtil::HumanStringWithLayout(result_shape);
 
-  tensorflow::gtl::FlatMap<int64, HloComputation*> computation_map;
-  tensorflow::gtl::FlatMap<HloComputation*, int64> to_proto_id;
+  absl::flat_hash_map<int64, HloComputation*> computation_map;
+  absl::flat_hash_map<HloComputation*, int64> to_proto_id;
   std::vector<std::unique_ptr<HloComputation>> computations;
   HloComputation* entry = nullptr;
   for (const HloComputationProto& computation_proto : proto.computations()) {
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index 278d94cdd3..0311b73207 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -30,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -250,25 +250,25 @@ class HloModuleGroupMetadata {
   std::vector<std::unique_ptr<std::vector<HloInstruction*>>> companion_sets_;
 
   // Map from each companion while instruction to the index into companion_set_.
-  tensorflow::gtl::FlatMap<const HloInstruction*, int64> companion_set_index_;
+  absl::flat_hash_map<const HloInstruction*, int64> companion_set_index_;
 
   // Map from computation to the instruction using it (a kWhile, kConditional).
-  tensorflow::gtl::FlatMap<const HloComputation*, TrackedInstruction>
+  absl::flat_hash_map<const HloComputation*, TrackedInstruction>
       tracked_instructions_;
 
   // Maps tracked instructions (kWhile, kConditional, kCall, ...) to the set of
   // communicating instructions within the proper called computation(s).
-  tensorflow::gtl::FlatMap<HloInstruction*, std::vector<HloInstruction*>>
+  absl::flat_hash_map<HloInstruction*, std::vector<HloInstruction*>>
       tracked_instructions_comms_;
 
   // All channels in the module.
   std::vector<Channel> channels_;
 
   // Map from channel ids to the index in channels_.
-  tensorflow::gtl::FlatMap<int64, int64> channel_id_map_;
+  absl::flat_hash_map<int64, int64> channel_id_map_;
 
   // Map from all-reduce ids to the all reduce instructions.
-  tensorflow::gtl::FlatMap<int64, std::vector<HloInstruction*>> all_reduce_map_;
+  absl::flat_hash_map<int64, std::vector<HloInstruction*>> all_reduce_map_;
 
   // The maximum channel id used in the module group.
   int64 max_channel_id_ = -1;
@@ -276,7 +276,7 @@ class HloModuleGroupMetadata {
   // The modules that this metadata was built from.
   const std::vector<HloModule*>& modules_;
 
-  tensorflow::gtl::FlatMap<HloModule*, std::unique_ptr<TuplePointsToAnalysis>>
+  absl::flat_hash_map<HloModule*, std::unique_ptr<TuplePointsToAnalysis>>
       points_to_analyses_;
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.h b/tensorflow/compiler/xla/service/hlo_module_group_util.h
index 309c23045d..f21b44bcd9 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_util.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 
@@ -87,7 +87,7 @@ class HloModuleGroupUtil {
   // * visit_state: map from each instruction to its visit state.
   // * visit_function: function called when each instruction group.
   // * root: the root instruction of the traversal.
-  using VisitStates = tensorflow::gtl::FlatMap<HloInstruction*, VisitState>;
+  using VisitStates = absl::flat_hash_map<HloInstruction*, VisitState>;
   Status VisitTopologicalOrder(VisitStates* visit_state,
                                const VisitFunction& visit_function,
                                HloInstruction* root);
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.cc b/tensorflow/compiler/xla/service/hlo_opcode.cc
index 2d4e38589f..4551a1c2e2 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.cc
+++ b/tensorflow/compiler/xla/service/hlo_opcode.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 
@@ -31,7 +31,7 @@ string HloOpcodeString(HloOpcode opcode) {
 }
 
 StatusOr<HloOpcode> StringToHloOpcode(const string& opcode_name) {
-  static auto* opcode_map = new tensorflow::gtl::FlatMap<string, HloOpcode>({
+  static auto* opcode_map = new absl::flat_hash_map<string, HloOpcode>({
 #define STRING_TO_OPCODE_ENTRY(enum_name, opcode_name, ...) \
   {opcode_name, HloOpcode::enum_name},
       HLO_OPCODE_LIST(STRING_TO_OPCODE_ENTRY)
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.h b/tensorflow/compiler/xla/service/hlo_ordering.h
index b0361c3f02..66313492eb 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.h
+++ b/tensorflow/compiler/xla/service/hlo_ordering.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/hlo_value.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 
@@ -120,8 +120,8 @@ class PredecessorHloOrdering : public HloOrdering {
   // predecessors. An instruction is an element of its own predecessor set.
   //
   // Subclasses should fill this in to define the desired ordering.
-  tensorflow::gtl::FlatMap<const HloComputation*,
-                           std::unique_ptr<HloReachabilityMap>>
+  absl::flat_hash_map<const HloComputation*,
+                      std::unique_ptr<HloReachabilityMap>>
       predecessors_;
 };
 
@@ -204,7 +204,7 @@ class SequentialHloOrdering : public HloOrdering {
   // this map so more than one instruction may have the same position
   // value. This is not a problem because ExecutesBefore also verifies
   // instructions are in the same computation.
-  tensorflow::gtl::FlatMap<const HloInstruction*, int> order_position_;
+  absl::flat_hash_map<const HloInstruction*, int> order_position_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 8c2f928ca1..59fd01cb58 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <functional>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
@@ -98,7 +99,7 @@ void HloPassPipeline::MaybeDumpHlo(const HloModule& module,
   if (!proto_dump_path.empty()) {
     static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
     static auto* const module_id_to_pass_number =
-        new tensorflow::gtl::FlatMap<int64, int64>();
+        new absl::flat_hash_map<int64, int64>();
 
     tensorflow::mutex_lock lock(mu);
     const int64 pass_number = (*module_id_to_pass_number)[module.unique_id()]++;
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h
index b66a2aa4bd..5a5f01f8fd 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.h
+++ b/tensorflow/compiler/xla/service/hlo_reachability.h
@@ -19,11 +19,11 @@ limitations under the License.
 #include <list>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -154,7 +154,7 @@ class HloReachabilityMap {
 
   // Dense assignment from HloInstruction* to number. These numbers index
   // into the bit_vectors_ vector and into the bits within a BitVector.
-  tensorflow::gtl::FlatMap<const HloInstruction*, int> indices_;
+  absl::flat_hash_map<const HloInstruction*, int> indices_;
 
   // Bitvectors holding the reachability to each instruction. The bit vector for
   // instruction X includes ones for each instruction which X is reachable from.
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index a438671936..abdd9a9212 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <set>
 #include <string>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -75,7 +76,7 @@ bool IsRematerializable(const HloInstruction* instruction) {
 // cache before, and eventually calling the IsRematerializable() API.
 bool CanBeRematerialized(
     const HloInstruction* instruction,
-    tensorflow::gtl::FlatMap<const HloInstruction*, bool>* remat_able) {
+    absl::flat_hash_map<const HloInstruction*, bool>* remat_able) {
   auto it = remat_able->find(instruction);
   if (it != remat_able->end()) {
     return it->second;
@@ -268,7 +269,7 @@ class InstructionList {
   Item* first_;
 
   // Item for each instruction.
-  tensorflow::gtl::FlatMap<const HloInstruction*, Item*> item_map_;
+  absl::flat_hash_map<const HloInstruction*, Item*> item_map_;
 };
 
 // Return the items which use the given LogicalBuffer. Sets
@@ -503,7 +504,7 @@ MemoryUsageTracker::MemoryUsageTracker(
   PointsToSet::BufferSet live_out_set =
       points_to_analysis.GetPointsToSet(computation_->root_instruction())
           .CreateFlattenedSet();
-  tensorflow::gtl::FlatMap<const LogicalBuffer*, BufferId>
+  absl::flat_hash_map<const LogicalBuffer*, BufferId>
       logical_buffer_to_buffer_id;
 
   for (auto* item = instruction_list_.first(); item != nullptr;
@@ -854,7 +855,7 @@ int64 RematerializationCost(const HloInstruction* instruction,
 Item* PickRematerializationCandidate(
     const MemoryUsageTracker& memory_tracker,
     const InstructionList& instruction_list, int64 memory_limit_bytes,
-    tensorflow::gtl::FlatMap<const HloInstruction*, bool>* remat_able) {
+    absl::flat_hash_map<const HloInstruction*, bool>* remat_able) {
   Item* best_item = nullptr;
   int64 best_cost = 0;
 
@@ -983,7 +984,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
   tensorflow::gtl::FlatSet<const HloInstruction*> remat_move_instructions;
 
   // The map from instructions to their rematerializable status.
-  tensorflow::gtl::FlatMap<const HloInstruction*, bool> remat_able;
+  absl::flat_hash_map<const HloInstruction*, bool> remat_able;
 
   // The peak memory of the computation at any point in the instruction
   // sequence.
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 7330d73c09..5a02e3a8bb 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -15,6 +15,7 @@
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REMATERIALIZATION_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REMATERIALIZATION_H_
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -115,8 +116,7 @@ class HloRematerialization : public HloModulePass {
   // computations called from sequential context
   // (CallContext::kSequential). These values are updated as rematerialization
   // occurs.
-  tensorflow::gtl::FlatMap<const HloComputation*, int64>
-      computation_peak_memory_;
+  absl::flat_hash_map<const HloComputation*, int64> computation_peak_memory_;
 
   std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_;
 
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.cc b/tensorflow/compiler/xla/service/hlo_schedule.cc
index 3fc5dbeb02..7c5c98f04e 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/hlo_schedule.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <queue>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -30,7 +31,7 @@ namespace xla {
 
 /* static */ StatusOr<HloSchedule> HloSchedule::CreateFromProto(
     const HloModule* module, const HloScheduleProto& proto) {
-  tensorflow::gtl::FlatMap<int64, const HloComputation*> id_to_computation;
+  absl::flat_hash_map<int64, const HloComputation*> id_to_computation;
   for (const HloComputation* computation : module->computations()) {
     id_to_computation[computation->unique_id()] = computation;
   }
@@ -44,7 +45,7 @@ namespace xla {
         << "No computation exists in HLO module with id " << computation_id;
     const HloComputation* computation = comp_it->second;
 
-    tensorflow::gtl::FlatMap<int64, const HloInstruction*> id_to_instruction;
+    absl::flat_hash_map<int64, const HloInstruction*> id_to_instruction;
     for (const HloInstruction* instruction : computation->instructions()) {
       id_to_instruction[instruction->unique_id()] = instruction;
     }
@@ -112,7 +113,7 @@ Status HloSchedule::UpdateComputationSchedule(
     const HloComputation* computation) {
   // Map from unique ID to HloInstruction pointer for instructions in the
   // computation.
-  tensorflow::gtl::FlatMap<int, const HloInstruction*> id_to_instruction;
+  absl::flat_hash_map<int, const HloInstruction*> id_to_instruction;
   for (const HloInstruction* instruction : computation->instructions()) {
     InsertOrDie(&id_to_instruction, instruction->unique_id(), instruction);
   }
@@ -126,15 +127,13 @@ Status HloSchedule::UpdateComputationSchedule(
   // Map from HloInstruction X to newly added instructions (instruction is in
   // computation, but not in schedule) which use X. If an instruction is not in
   // the map, then it has no users which are newly added instructions.
-  tensorflow::gtl::FlatMap<const HloInstruction*,
-                           std::vector<const HloInstruction*>>
+  absl::flat_hash_map<const HloInstruction*, std::vector<const HloInstruction*>>
       new_instruction_uses;
 
   // For each newly added instruction, this is the count of the instruction's
   // operands that have not yet been scheduled. When this value reaches zero,
   // then the instruction may be placed in the schedule.
-  tensorflow::gtl::FlatMap<const HloInstruction*, int>
-      unscheduled_operand_count;
+  absl::flat_hash_map<const HloInstruction*, int> unscheduled_operand_count;
 
   // Create a worklist of newly added instructions which are ready to be added
   // to the schedule. Initialize worklist with those that have zero operands.
@@ -217,9 +216,9 @@ Status HloSchedule::Update() {
     }
     for (auto it = sequences_.begin(); it != sequences_.end();) {
       if (nonfusion_computations_ids.count(it->first) == 0) {
-        it = sequences_.erase(it);
+        sequences_.erase(it++);
       } else {
-        it++;
+        ++it;
       }
     }
   }
@@ -254,7 +253,7 @@ Status HloSchedule::Verify() const {
   // For each computation verify the set of instructions is the same and that
   // each dependency and control edge is honored.
   for (const HloComputation* computation : nonfusion_computations) {
-    tensorflow::gtl::FlatMap<const HloInstruction*, int> instruction_position;
+    absl::flat_hash_map<const HloInstruction*, int> instruction_position;
     int pos = 0;
     for (const HloInstruction* instruction :
          sequence(computation).instructions()) {
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.h b/tensorflow/compiler/xla/service/hlo_schedule.h
index 270fe6039f..0a714101ee 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.h
+++ b/tensorflow/compiler/xla/service/hlo_schedule.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -103,8 +104,7 @@ class HloSchedule {
 
   // Returns a map from HloComputation unique ID to instruction sequence. The
   // map contains all sequences in the schedule.
-  const tensorflow::gtl::FlatMap<int64, HloInstructionSequence>& sequences()
-      const {
+  const absl::flat_hash_map<int64, HloInstructionSequence>& sequences() const {
     return sequences_;
   }
 
@@ -148,7 +148,7 @@ class HloSchedule {
   // A map from computation unique ID to instruction sequence. Unique IDs are
   // used rather than HloComputation pointers because HLO pointers are not
   // unique across HLO transformations because pointers may be recycled.
-  tensorflow::gtl::FlatMap<int64, HloInstructionSequence> sequences_;
+  absl::flat_hash_map<int64, HloInstructionSequence> sequences_;
 };
 
 std::ostream& operator<<(std::ostream& out, const HloSchedule& schedule);
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 6eb6658904..a7727824fe 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <set>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
@@ -23,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 
@@ -993,7 +993,7 @@ Status CheckSameIsHostTransfer(const HloInstruction* instr1,
 
 // Checks various invariants of send and recv instructions.
 Status VerifySendsAndRecvs(const HloModule& module) {
-  tensorflow::gtl::FlatMap<int64, const HloInstruction*> host_channels;
+  absl::flat_hash_map<int64, const HloInstruction*> host_channels;
   // Host send/recv instructions must have their own unique channel.
   auto check_unique_host_channel = [&](const HloInstruction* instruction) {
     const HloSendRecvInstruction* sendrecv =
@@ -1061,7 +1061,7 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
   TF_RETURN_IF_ERROR(VerifyHloStructure(module));
   TF_RETURN_IF_ERROR(VerifySendsAndRecvs(*module));
 
-  tensorflow::gtl::FlatMap<string, const HloInstruction*> instructions;
+  absl::flat_hash_map<string, const HloInstruction*> instructions;
 
   for (auto* computation : module->computations()) {
     for (const auto& instruction : computation->instructions()) {
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
index 06f0e1ed25..7ee789276d 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/indexed_array_analysis.h"
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
@@ -95,7 +96,7 @@ Status IndexedArrayAnalysis::TraverseAndPopulateCache(
   absl::InlinedVector<const HloInstruction*, 4> stack;
 
   enum DfsState { kDiscovered, kVisited };
-  gtl::FlatMap<const HloInstruction*, DfsState> dfs_state_map;
+  absl::flat_hash_map<const HloInstruction*, DfsState> dfs_state_map;
 
   stack.push_back(root);
   InsertOrDie(&dfs_state_map, root, kDiscovered);
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.h b/tensorflow/compiler/xla/service/indexed_array_analysis.h
index 3e238f97a0..e5aa67fd85 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis.h
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.h
@@ -18,10 +18,10 @@ limitations under the License.
 
 #include <type_traits>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace xla {
@@ -360,7 +360,7 @@ class IndexedArrayAnalysis {
 
   std::vector<std::unique_ptr<Array>> owned_tensors_;
   std::vector<Literal> owned_literals_;
-  tensorflow::gtl::FlatMap<const HloInstruction*, Array*> cache_;
+  absl::flat_hash_map<const HloInstruction*, Array*> cache_;
 };
 
 // A pass that prints all non-trivial results returned by IndexedArrayAnalysis.
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index e884122fcb..5a99c40df4 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -22,11 +22,11 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -189,7 +189,7 @@ bool InstructionFusion::EffectivelyAtMostUnary(HloInstruction* hlo) {
 bool InstructionFusion::CanFuseOnAllPaths(
     HloInstruction* producer, HloInstruction* consumer,
     const HloInstructionSet& do_not_fuse,
-    tensorflow::gtl::FlatMap<std::pair<HloInstruction*, HloInstruction*>, bool>*
+    absl::flat_hash_map<std::pair<HloInstruction*, HloInstruction*>, bool>*
         result_cache) {
   if (consumer == producer) {
     return true;
@@ -241,7 +241,7 @@ InstructionFusion::ComputeGloballyUnfusible(
   // fusing operations that require duplication later depending on
   // is_expensive_().
   HloInstructionSet do_not_duplicate;
-  tensorflow::gtl::FlatMap<std::pair<HloInstruction*, HloInstruction*>, bool>
+  absl::flat_hash_map<std::pair<HloInstruction*, HloInstruction*>, bool>
       can_fuse_on_all_paths_result_cache;
   for (HloInstruction* consumer : post_order) {
     for (HloInstruction* producer : consumer->operands()) {
@@ -430,7 +430,7 @@ class ReversePostOrderFusionQueue : public FusionQueue {
 
  private:
   std::vector<HloInstruction*> post_order_;
-  tensorflow::gtl::FlatMap<HloInstruction*, int> post_order_index_;
+  absl::flat_hash_map<HloInstruction*, int> post_order_index_;
 };
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h
index c1ec3b18a1..da2032f6c7 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/instruction_fusion.h
@@ -1,3 +1,4 @@
+#include "absl/container/flat_hash_map.h"
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -158,8 +159,8 @@ class InstructionFusion : public HloModulePass {
   bool CanFuseOnAllPaths(
       HloInstruction* producer, HloInstruction* consumer,
       const HloInstructionSet& do_not_fuse,
-      tensorflow::gtl::FlatMap<std::pair<HloInstruction*, HloInstruction*>,
-                               bool>* result_cache);
+      absl::flat_hash_map<std::pair<HloInstruction*, HloInstruction*>, bool>*
+          result_cache);
 
   // Computes the set of nodes that we do not want to fuse into any of their
   // consumers based on a global analysis of the HLO graph.
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index e29c199c42..1591256fad 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -38,7 +39,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -228,8 +228,8 @@ class LayoutConstraints {
   // Array-shaped buffers which have not yet been constrained.
   std::set<LogicalBuffer::Id> unconstrained_buffer_ids_;
 
-  mutable tensorflow::gtl::FlatMap<const HloInstruction*,
-                                   std::unique_ptr<PointsToSet::BufferSet>>
+  mutable absl::flat_hash_map<const HloInstruction*,
+                              std::unique_ptr<PointsToSet::BufferSet>>
       buffer_sets_cache_;
 
   HloComputation* computation_;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 540bbb7c7a..3934d2e493 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -38,6 +38,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:logical_buffer",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@llvm//:core",
     ],
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
index 8d9fa99d82..88cde2d3d9 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
@@ -16,13 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_ALIAS_ANALYSIS_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_ALIAS_ANALYSIS_H_
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
 #include "llvm/IR/Module.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
@@ -77,14 +77,14 @@ class AliasAnalysis {
   // A map from a buffer slice to metadata corresponding to its alias.scope
   // metadata.  The index kParameterAliasSet is used to hold aliasing
   // information for parameters.
-  tensorflow::gtl::FlatMap<BufferAllocation::Slice, llvm::MDNode*,
-                           BufferAllocation::Slice::Hasher>
+  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*,
+                      BufferAllocation::Slice::Hasher>
       alias_scope_metadata_;
 
   // A map from a buffer slice to metadata corresponding to its noalias
   // metadata.
-  tensorflow::gtl::FlatMap<BufferAllocation::Slice, llvm::MDNode*,
-                           BufferAllocation::Slice::Hasher>
+  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*,
+                      BufferAllocation::Slice::Hasher>
       noalias_metadata_;
 };
 
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc
index b9ec31c497..95b1c20663 100644
--- a/tensorflow/compiler/xla/service/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.h b/tensorflow/compiler/xla/service/multi_output_fusion.h
index 0344626b26..9508ab2ed1 100644
--- a/tensorflow/compiler/xla/service/multi_output_fusion.h
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <queue>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
@@ -126,7 +127,7 @@ class MultiOutputFusion : public HloModulePass {
   std::vector<FusionCandidate> candidates_;
 
   // A map that maps an instruction to the index_.
-  tensorflow::gtl::FlatMap<HloInstruction*, int> candidates_index_;
+  absl::flat_hash_map<HloInstruction*, int> candidates_index_;
 
   // The reachability map of current computation.
   std::unique_ptr<HloReachabilityMap> reachability_;
diff --git a/tensorflow/compiler/xla/service/name_uniquer.h b/tensorflow/compiler/xla/service/name_uniquer.h
index 6dd89c240f..1ac60f1cf4 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.h
+++ b/tensorflow/compiler/xla/service/name_uniquer.h
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/macros.h"
 
@@ -78,7 +78,7 @@ class NameUniquer {
 
   // Map from name prefix to the generator data structure which tracks used
   // identifiers and generates new ones.
-  tensorflow::gtl::FlatMap<string, SequentialIdGenerator> generated_names_;
+  absl::flat_hash_map<string, SequentialIdGenerator> generated_names_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(NameUniquer);
 };
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.h b/tensorflow/compiler/xla/service/reduce_precision_insertion.h
index 4bb22428f3..0b4e82e8d6 100644
--- a/tensorflow/compiler/xla/service/reduce_precision_insertion.h
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.h
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index a9e8a51e09..78392d3bb2 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -36,7 +36,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/compactptrset.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
index 56145822be..067cfcc17d 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/service/while_util.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
index e8fe33e626..2590473c77 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
@@ -15,17 +15,17 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h"
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/service/tuple_util.h"
 #include "tensorflow/compiler/xla/service/while_util.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
 
+using absl::flat_hash_map;
 using absl::InlinedVector;
-using tensorflow::gtl::FlatMap;
 using tensorflow::gtl::FlatSet;
 
 // Copies `to_hoist` to the computation containing `while_instr`, hoisting its
@@ -34,7 +34,7 @@ using tensorflow::gtl::FlatSet;
 // function hoists the operands in `unhoisted_invariant_instructions` and moves
 // them into `hoisted_instructions`.
 static void CreateLoopInvariantCopy(
-    FlatMap<HloInstruction*, HloInstruction*>* hoisted_instructions,
+    flat_hash_map<HloInstruction*, HloInstruction*>* hoisted_instructions,
     FlatSet<HloInstruction*>* unhoisted_invariant_instructions,
     HloInstruction* while_instr, HloInstruction* to_hoist) {
   HloComputation* parent_of_while = while_instr->parent();
@@ -147,7 +147,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
 
   // Maps instructions in the while body to instructions hoisted outside the
   // while that compute the same value.
-  FlatMap<HloInstruction*, HloInstruction*> hoisted_instructions;
+  flat_hash_map<HloInstruction*, HloInstruction*> hoisted_instructions;
 
   // Contains instructions that can be legally hoisted, but were deemed to be
   // unprofitable to be hoisted alone by NotWorthHoistingIndividually.  When we
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index 9a74f22395..07de8492ba 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/while_loop_analysis.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 
@@ -181,7 +181,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
                                           used_tuple_indices.end());
   std::sort(new_to_old_tuple_idx.begin(), new_to_old_tuple_idx.end());
 
-  tensorflow::gtl::FlatMap<int64, int64> old_to_new_tuple_idx;
+  absl::flat_hash_map<int64, int64> old_to_new_tuple_idx;
   for (int64 new_idx = 0; new_idx < new_to_old_tuple_idx.size(); ++new_idx) {
     int64 old_idx = new_to_old_tuple_idx[new_idx];
     old_to_new_tuple_idx[old_idx] = new_idx;
@@ -405,7 +405,7 @@ static StatusOr<bool> TryPropagateConstant(HloInstruction* while_op) {
   // build a map from the tuple element index to the constant value. Limit this
   // to scalar constant values because propagating array constants can regress
   // performance by forcing us to copy constants.
-  tensorflow::gtl::FlatMap<int, const HloInstruction*> index_to_constant;
+  absl::flat_hash_map<int, const HloInstruction*> index_to_constant;
   for (int i = 0; i < root_operands.size(); i++) {
     HloInstruction* instr = root_operands[i];
     if (instr->opcode() == HloOpcode::kGetTupleElement &&
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index f474ecb18c..06b6330321 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -422,6 +422,7 @@ xla_test(
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core:test",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index db5a824de0..a6e70eb6ca 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -83,7 +83,7 @@ struct ParsedProfileOutputLine {
 
 Status ParseOneProfileOutputLine(
     const string& line, bool expect_hlo,
-    gtl::FlatMap<string, ParsedProfileOutputLine>* parsed_results,
+    absl::flat_hash_map<string, ParsedProfileOutputLine>* parsed_results,
     absl::Span<const absl::string_view> opcodes_to_ignore = {}) {
   string separator = "[^:]*:: +";
   string match_percentage = R"(\d+\.\d*% +\d+Σ)";
@@ -208,7 +208,7 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) {
   std::vector<string> profile_output_lines =
       absl::StrSplit(profile_output, '\n');
 
-  gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines;
+  absl::flat_hash_map<string, ParsedProfileOutputLine> parsed_profile_lines;
 
   TF_ASSERT_OK(ParseOneProfileOutputLine(
       profile_output_lines[1], /*expect_hlo=*/false, &parsed_profile_lines));
@@ -314,7 +314,7 @@ XLA_TEST_F(HloProfileTest, ProfileWhileComputation) {
 
   ASSERT_NE(while_body_profile_end, profile_output_lines.end());
 
-  gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines;
+  absl::flat_hash_map<string, ParsedProfileOutputLine> parsed_profile_lines;
 
   for (auto while_body_profile_i = while_body_profile_start + 1;
        while_body_profile_i != while_body_profile_end; while_body_profile_i++) {
-- 
GitLab


From ec900f15e352e4b203b1f0678f7d2ff042df57d5 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Mon, 1 Oct 2018 13:46:31 -0700
Subject: [PATCH 248/570] Minor speed improvements to defun.

- EncodeArg in C instead of python.
- Also caches parsed device specs, and device spec hashes
- Adds a common way to register python types in C.
- Fastpath canonicalize function inputs when no kwargs are passed
- Set the func name attr directly instead of creating an op to wrap it.
- Rewrite IsAttrsHelper without caching

Before:
entry {
  name: "MicroBenchmarks.benchmark_defun_matmul_2_by_2_CPU"
  iters: 30000
  wall_time: 101.803263028
  extras {
    key: "examples_per_sec"
    value {
      double_value: 9822.86785562
    }
  }
}

After:
entry {
  name: "MicroBenchmarks.benchmark_defun_matmul_2_by_2_CPU"
  iters: 30000
  wall_time: 47.2899993261
  extras {
    key: "examples_per_sec"
    value {
      double_value: 21146.1199884
    }
  }
}
PiperOrigin-RevId: 215272962
---
 tensorflow/c/eager/c_api.cc                  |   8 +
 tensorflow/c/eager/c_api.h                   |   3 +
 tensorflow/python/eager/BUILD                |   1 +
 tensorflow/python/eager/function.py          | 100 +++------
 tensorflow/python/eager/function_test.py     |  26 ++-
 tensorflow/python/eager/pywrap_tfe.h         |   4 +
 tensorflow/python/eager/pywrap_tfe_src.cc    | 223 ++++++++++++++++++-
 tensorflow/python/framework/device.py        |  12 +-
 tensorflow/python/framework/sparse_tensor.py |   2 +-
 tensorflow/python/pywrap_tfe.i               |   1 +
 tensorflow/python/util/nest.py               |   4 +-
 tensorflow/python/util/util.cc               | 223 +++++++++++--------
 tensorflow/python/util/util.h                |  34 ++-
 tensorflow/python/util/util.i                |  10 +-
 14 files changed, 462 insertions(+), 189 deletions(-)

diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 0bf3d9542b..3554ec0bf3 100755
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -578,6 +578,14 @@ void TFE_OpSetAttrFunction(TFE_Op* op, const char* attr_name,
   op->operation.MutableAttrs()->Set(attr_name, attr_value);
 }
 
+void TFE_OpSetAttrFunctionName(TFE_Op* op, const char* attr_name,
+                               const char* data, size_t length) {
+  tensorflow::AttrValue attr_value;
+  tensorflow::NameAttrList* func = attr_value.mutable_func();
+  func->set_name(data, length);
+  op->operation.MutableAttrs()->Set(attr_name, attr_value);
+}
+
 void TFE_OpSetAttrTensor(TFE_Op* op, const char* attr_name, TF_Tensor* tensor,
                          TF_Status* status) {
   tensorflow::Tensor t;
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 6323f8a053..b2454d8722 100755
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -313,6 +313,9 @@ TF_CAPI_EXPORT extern void TFE_OpSetAttrFunction(TFE_Op* op,
                                                  const char* attr_name,
                                                  const TFE_Op* value);
 
+TF_CAPI_EXPORT void TFE_OpSetAttrFunctionName(TFE_Op* op, const char* attr_name,
+                                              const char* data, size_t length);
+
 TF_CAPI_EXPORT extern void TFE_OpSetAttrTensor(TFE_Op* op,
                                                const char* attr_name,
                                                TF_Tensor* tensor,
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index d3d997e6df..d0c1a93118 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -37,6 +37,7 @@ cc_library(
         "//tensorflow/python:safe_ptr",
         "//third_party/py/numpy:headers",
         "//third_party/python_runtime:headers",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:variant",
     ],
 )
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 60a4f018cd..3b6f288fb9 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -1005,52 +1005,8 @@ def func_graph_from_py_func(name,
   return func_graph
 
 
-_TensorType = collections.namedtuple("_TensorType", ["dtype", "shape"])
-
-
-def _encode_arg(arg):
-  """A canonical representation for this argument, for use in a cache key."""
-
-  # `defun` uses dtypes and shapes instead of `Tensors` as cache keys. Dtypes
-  # are used because TensorFlow graphs are not parametric w.r.t. dtypes. Shapes
-  # are used for both performance reasons, as much TensorFlow code specializes
-  # on known shapes to produce slimmer graphs, and correctness, as some
-  # high-level APIs require shapes to be fully-known.
-  #
-  # TODO(akshayka): Add support for sparse tensors.
-  #
-  # pylint: disable=protected-access
-  if isinstance(arg, ops.Tensor):
-    return _TensorType(arg.dtype, arg._shape_tuple())
-  elif isinstance(arg, ops.IndexedSlices):
-    if arg.dense_shape is not None:
-      return tuple([
-          _TensorType(arg.values.dtype, arg.values._shape_tuple()),
-          _TensorType(arg.indices.dtype, arg.indices._shape_tuple()),
-          _TensorType(arg.dense_shape.dtype, arg.dense_shape._shape_tuple()),
-      ])
-    else:
-      return tuple([
-          _TensorType(arg.values.dtype, arg.values._shape_tuple()),
-          _TensorType(arg.indices.dtype, arg.indices._shape_tuple()),
-      ])
-  # pylint: enable=protected-access
-  elif isinstance(arg, (list, tuple)):
-    return tuple([_encode_arg(elem) for elem in arg])
-  elif isinstance(arg, dict):
-    return tuple(
-        (_encode_arg(key), _encode_arg(arg[key])) for key in sorted(arg))
-  else:
-    try:
-      # If possible, keep only a weak reference to Python objects. Weak
-      # references hash to the same value as the original object.
-      # TODO(allenl): Clean up dead functions and their cache keys if the cache
-      # gets large. Right now creating objects with a defunned method, calling
-      # the method, and losing a reference to the object in a loop will leak
-      # memory here.
-      return weakref.ref(arg)
-    except TypeError:
-      return arg
+pywrap_tensorflow.RegisterType("Tensor", ops.Tensor)
+pywrap_tensorflow.RegisterType("IndexedSlices", ops.IndexedSlices)
 
 
 def _deterministic_dict_values(dictionary):
@@ -1120,6 +1076,8 @@ class PolymorphicFunction(object):
         offset + index: default
         for index, default in enumerate(fullargspec.defaults or [])
     }
+    self._default_values = fullargspec.defaults
+    self._default_values_start_index = offset
     if input_signature is None:
       self._input_signature = None
     else:
@@ -1180,7 +1138,7 @@ class PolymorphicFunction(object):
     """Computes the cache key given inputs and execution context."""
     if self._input_signature is None:
       inputs = (args, kwargs) if kwargs else args
-      cache_key = tuple(_encode_arg(arg) for arg in inputs)
+      cache_key = pywrap_tensorflow.TFE_Py_EncodeArg(inputs)
     else:
       del args, kwargs
       cache_key = self._flat_input_signature
@@ -1203,7 +1161,7 @@ class PolymorphicFunction(object):
     colocation_stack = (() if executing_eagerly else
                         tuple(default_graph._colocation_stack.peek_objs()))  # pylint: disable=protected-access
 
-    return cache_key + (execution_context, device_functions, colocation_stack)
+    return (cache_key, execution_context, device_functions, colocation_stack)
 
   def _canonicalize_function_inputs(self, *args, **kwargs):
     """Canonicalizes `args` and `kwargs`.
@@ -1231,26 +1189,32 @@ class PolymorphicFunction(object):
     # Maps from index of arg to its corresponding value, according to `args`
     # and `kwargs`; seeded with the default values for the named args that
     # aren't in `args`.
-    arg_indices_to_values = {
-        index: default
-        for index, default in six.iteritems(self._arg_indices_to_default_values)
-        if index >= len(args)
-    }
-    consumed_args = []
-    for arg, value in six.iteritems(kwargs):
-      index = self._args_to_indices.get(arg, None)
-      if index is not None:
-        arg_indices_to_values[index] = value
-        consumed_args.append(arg)
-      elif self._input_signature is not None:
-        raise ValueError("Cannot define a TensorFlow function from a Python "
-                         "function with keyword arguments when "
-                         "input_signature is provided.")
-    for arg in consumed_args:
-      # After this loop, `kwargs` will only contain true keyword arguments, as
-      # opposed to named arguments called in a keyword-like fashion.
-      kwargs.pop(arg)
-    inputs = args + _deterministic_dict_values(arg_indices_to_values)
+    if not kwargs:
+      if self._default_values:
+        inputs = args + self._default_values[len(args) -
+                                             self._default_values_start_index:]
+      else:
+        inputs = args
+    else:
+      arg_indices_to_values = {
+          index: default for index, default in six.iteritems(
+              self._arg_indices_to_default_values) if index >= len(args)
+      }
+      consumed_args = []
+      for arg, value in six.iteritems(kwargs):
+        index = self._args_to_indices.get(arg, None)
+        if index is not None:
+          arg_indices_to_values[index] = value
+          consumed_args.append(arg)
+        elif self._input_signature is not None:
+          raise ValueError("Cannot define a TensorFlow function from a Python "
+                           "function with keyword arguments when "
+                           "input_signature is provided.")
+      for arg in consumed_args:
+        # After this loop, `kwargs` will only contain true keyword arguments, as
+        # opposed to named arguments called in a keyword-like fashion.
+        kwargs.pop(arg)
+      inputs = args + _deterministic_dict_values(arg_indices_to_values)
     flat_inputs = nest.flatten(inputs)
 
     # Check for NumPy arrays in arguments and convert them to Tensors.
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index afe3ba9893..9ce367a837 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -1237,6 +1237,24 @@ class FunctionTest(test.TestCase):
     x = constant_op.constant([1.0, 2.0])
     self.assertAllEqual([2., 4.], self.evaluate(defined(x)))
 
+  def testCacheObjectHashCollisions(self):
+
+    class Foo(object):
+
+      def __hash__(self):
+        return 42
+
+    def func(foo):
+      del foo
+      return
+
+    defined = function.defun(func)
+    defined(Foo())
+    self.assertEqual(len(defined._function_cache), 1)
+
+    defined(Foo())
+    self.assertEqual(len(defined._function_cache), 2)
+
   def testPythonFunctionWithDefaultArgs(self):
 
     def func(foo, bar=1, baz=2):
@@ -1250,20 +1268,20 @@ class FunctionTest(test.TestCase):
 
     def cache_keys():
       """Sanitizes cache keys of non-input metadata."""
-      return tuple(key[:3] for key in defined._function_cache)
+      return tuple(key[0] for key in defined._function_cache)
 
     # `True` corresponds to the fact that we're executing eagerly
-    self.assertIn((0, 1, 20), cache_keys())
+    self.assertIn(('tRRR', (0, 1, 20)), cache_keys())
 
     defined(1)  # bar=1, baz=2
-    self.assertIn((1, 1, 2), cache_keys())
+    self.assertIn(('tRRR', (1, 1, 2)), cache_keys())
 
     # This matches the previous call.
     defined(foo=1)
     self.assertEqual(len(defined._function_cache), 2)
 
     defined(1, 2, 3)
-    self.assertIn((1, 2, 3), cache_keys())
+    self.assertIn(('tRRR', (1, 2, 3)), cache_keys())
 
     # This matches the previous call.
     defined(1, bar=2, baz=3)
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index f1b4042ec9..decd635b58 100755
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -224,4 +224,8 @@ PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim);
 // The shape is represented as a Python tuple of integers.
 PyObject* TFE_Py_TensorShapeOnDevice(PyObject* tensor);
 
+// Encodes the object as a tuple that is meant to be used as part of the key
+// for the defun function cache.
+PyObject* TFE_Py_EncodeArg(PyObject*);
+
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 196e20e4d7..4b9f7f4100 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/python/eager/pywrap_tfe.h"
 
+#include "absl/strings/str_cat.h"
 #include "absl/types/variant.h"
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
@@ -567,11 +568,8 @@ bool SetOpAttrScalar(
         return false;
       }
     }
-    TFE_Op* func = TFE_NewOp(
-        ctx, string(func_name.data(), func_name.size()).c_str(), status);
-    if (TF_GetCode(status) != TF_OK) return false;
-    TFE_OpSetAttrFunction(op, key, func);
-    TFE_DeleteOp(func);
+    TF_SetStatus(status, TF_OK, "");
+    TFE_OpSetAttrFunctionName(op, key, func_name.data(), func_name.size());
   } else {
     TF_SetStatus(
         status, TF_UNIMPLEMENTED,
@@ -2748,3 +2746,218 @@ PyObject* TFE_Py_RecordGradient(PyObject* op_name, PyObject* inputs,
 
   return RecordGradient(op_name, inputs, attrs, results, name);
 }
+
+namespace {
+
+tensorflow::int64 GetPyNoneHash() {
+  tensorflow::int64 py_none_hash = PyObject_Hash(Py_None);
+  return py_none_hash;
+}
+
+struct EncodeResult {
+  string str;
+  std::vector<PyObject*> objects;
+
+  PyObject* ToPyTuple() {
+    PyObject* result = PyTuple_New(2);
+
+    PyTuple_SET_ITEM(result, 0, GetPythonObjectFromString(str.c_str()));
+
+    if (objects.empty()) {
+      Py_INCREF(Py_None);
+      PyTuple_SET_ITEM(result, 1, Py_None);
+    } else {
+      PyObject* objects_tuple = PyTuple_New(objects.size());
+
+      for (int i = 0; i < objects.size(); i++) {
+        PyTuple_SET_ITEM(objects_tuple, i, objects[i]);
+      }
+
+      PyTuple_SET_ITEM(result, 1, objects_tuple);
+    }
+
+    return result;
+  }
+};
+
+tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
+  if (EagerTensor_CheckExact(arg)) {
+    TFE_TensorHandle* t = EagerTensor_Handle(arg);
+    tensorflow::TensorShape tensor_shape;
+    TF_RETURN_IF_ERROR(t->handle->Shape(&tensor_shape));
+    absl::StrAppend(&result->str, t->handle->dtype);
+
+    for (tensorflow::int64 dim_size : tensor_shape.dim_sizes()) {
+      absl::StrAppend(&result->str, dim_size);
+    }
+
+    return tensorflow::Status::OK();
+  }
+
+  tensorflow::Safe_PyObjectPtr dtype_object(
+      PyObject_GetAttrString(arg, "dtype"));
+
+  if (dtype_object == nullptr) {
+    return tensorflow::errors::InvalidArgument(
+        "ops.Tensor object doesn't have dtype() attr.");
+  }
+
+  tensorflow::Safe_PyObjectPtr dtype_enum(
+      PyObject_GetAttrString(dtype_object.get(), "_type_enum"));
+
+  if (dtype_enum == nullptr) {
+    return tensorflow::errors::InvalidArgument(
+        "ops.Tensor's dtype object doesn't have _type_enum() attr.");
+  }
+
+  tensorflow::DataType dtype =
+      static_cast<tensorflow::DataType>(MakeInt(dtype_enum.get()));
+
+  absl::StrAppend(&result->str, dtype);
+  static char _shape_tuple[] = "_shape_tuple";
+  tensorflow::Safe_PyObjectPtr shape_tuple(
+      PyObject_CallMethod(arg, _shape_tuple, nullptr));
+
+  if (shape_tuple == nullptr) {
+    return tensorflow::errors::InvalidArgument(
+        "ops.Tensor object doesn't have _shape_tuple() method.");
+  }
+
+  if (shape_tuple.get() == Py_None) {
+    // Unknown shape, encode that directly.
+    absl::StrAppend(&result->str, GetPyNoneHash());
+    return tensorflow::Status::OK();
+  }
+
+  tensorflow::Safe_PyObjectPtr shape_seq(PySequence_Fast(
+      shape_tuple.get(), "shape_tuple didn't return a sequence"));
+
+  int len = PySequence_Fast_GET_SIZE(shape_seq.get());
+  for (int i = 0; i < len; ++i) {
+    PyObject* item = PySequence_Fast_GET_ITEM(shape_seq.get(), i);
+    if (item == Py_None) {
+      absl::StrAppend(&result->str, GetPyNoneHash());
+    } else {
+      absl::StrAppend(&result->str, MakeInt(item));
+    }
+  }
+
+  return tensorflow::Status::OK();
+}
+
+const char kTensor[] = "T";
+const char kIndexedSlices[] = "I";
+const char kList[] = "L";
+const char kTuple[] = "t";
+const char kDict[] = "D";
+const char kRaw[] = "R";
+
+tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result);
+
+// This function doesn't set the type of sequence before
+tensorflow::Status TFE_Py_EncodeSequence(PyObject* arg, const char* type,
+                                         EncodeResult* result) {
+  tensorflow::Safe_PyObjectPtr arg_seq(
+      PySequence_Fast(arg, "unable to create seq from list/tuple"));
+
+  absl::StrAppend(&result->str, type);
+  int len = PySequence_Fast_GET_SIZE(arg_seq.get());
+  for (int i = 0; i < len; ++i) {
+    PyObject* item = PySequence_Fast_GET_ITEM(arg_seq.get(), i);
+    if (item == Py_None) {
+      absl::StrAppend(&result->str, GetPyNoneHash());
+    } else {
+      TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(item, result));
+    }
+  }
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
+  if (tensorflow::swig::IsTensor(arg)) {
+    absl::StrAppend(&result->str, kTensor);
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(arg, result));
+  } else if (tensorflow::swig::IsIndexedSlices(arg)) {
+    absl::StrAppend(&result->str, kIndexedSlices);
+    tensorflow::Safe_PyObjectPtr values(PyObject_GetAttrString(arg, "values"));
+    if (values == nullptr) {
+      PyErr_Clear();
+      return tensorflow::errors::InvalidArgument(
+          "IndexedSlices does not have a values attr");
+    }
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(values.get(), result));
+
+    tensorflow::Safe_PyObjectPtr indices(
+        PyObject_GetAttrString(arg, "indices"));
+    if (indices == nullptr) {
+      PyErr_Clear();
+      return tensorflow::errors::InvalidArgument(
+          "IndexedSlices does not have a indices attr");
+    }
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(indices.get(), result));
+
+    tensorflow::Safe_PyObjectPtr dense_shape(
+        PyObject_GetAttrString(arg, "dense_shape"));
+    if (dense_shape == nullptr) {
+      PyErr_Clear();
+      return tensorflow::errors::InvalidArgument(
+          "IndexedSlices does not have a dense_shape attr");
+    }
+    if (dense_shape.get() != Py_None) {
+      TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(dense_shape.get(), result));
+    }
+  } else if (PyList_Check(arg)) {
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(arg, kList, result));
+  } else if (PyTuple_Check(arg)) {
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(arg, kTuple, result));
+  } else if (PyDict_Check(arg)) {
+    tensorflow::Safe_PyObjectPtr keys(PyDict_Keys(arg));
+    if (PyList_Sort(keys.get()) == -1) {
+      return tensorflow::errors::Internal("Unable to sort keys");
+    }
+
+    absl::StrAppend(&result->str, kDict);
+    int len = PyList_Size(keys.get());
+
+    for (int i = 0; i < len; i++) {
+      PyObject* key = PyList_GetItem(keys.get(), i);
+      TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(key, result));
+      PyObject* value = PyDict_GetItem(arg, key);
+      TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(value, result));
+    }
+  } else {
+    PyObject* object = PyWeakref_NewRef(arg, nullptr);
+
+    if (object == nullptr) {
+      PyErr_Clear();
+
+      object = arg;
+      Py_INCREF(object);
+    }
+
+    absl::StrAppend(&result->str, kRaw);
+    result->objects.push_back(object);
+  }
+
+  return tensorflow::Status::OK();
+}
+
+}  // namespace
+
+// `defun` uses dtypes and shapes instead of `Tensors` as cache keys. Dtypes
+// are used because TensorFlow graphs are not parametric w.r.t. dtypes. Shapes
+// are used for both performance reasons, as much TensorFlow code specializes
+// on known shapes to produce slimmer graphs, and correctness, as some
+// high-level APIs require shapes to be fully-known.
+//
+// TODO(nareshmodi): Add support for sparse tensors.
+PyObject* TFE_Py_EncodeArg(PyObject* arg) {
+  EncodeResult result;
+  const auto status = TFE_Py_EncodeArgHelper(arg, &result);
+  if (MaybeRaiseExceptionFromStatus(status, nullptr)) {
+    return nullptr;
+  }
+
+  return result.ToPyTuple();
+}
diff --git a/tensorflow/python/framework/device.py b/tensorflow/python/framework/device.py
index 06c653097a..7f6e0a75a5 100644
--- a/tensorflow/python/framework/device.py
+++ b/tensorflow/python/framework/device.py
@@ -87,6 +87,7 @@ class DeviceSpec(object):
     else:
       self.device_type = device_type
     self.device_index = device_index
+    self._hash = hash(self.to_string())
 
   def _clear(self):
     self._job = None
@@ -234,7 +235,7 @@ class DeviceSpec(object):
     return self.to_string() == other.to_string()
 
   def __hash__(self):
-    return hash(self.to_string())
+    return self._hash
 
 
 def check_valid(spec):
@@ -266,6 +267,7 @@ def canonical_name(device):
 # possible to compare the device function stacks belonging to different
 # graphs in a meaningful way.
 _cached_device_functions = {}
+_cached_device_specs = {}
 _cache_lock = threading.Lock()
 
 
@@ -297,7 +299,13 @@ def merge_device(spec):
   """
   with _cache_lock:
     if not isinstance(spec, DeviceSpec):
-      spec = DeviceSpec.from_string(spec or "")
+      cached_device_spec = _cached_device_specs.get(spec, None)
+      if cached_device_spec is None:
+        device_spec = DeviceSpec.from_string(spec or "")
+        _cached_device_specs[spec] = device_spec
+        spec = device_spec
+      else:
+        spec = cached_device_spec
     cached_function = _cached_device_functions.get(spec, None)
     if cached_function is not None:
       return cached_function
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 41ef2e11d1..440e3a0968 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -245,7 +245,7 @@ class SparseTensor(_TensorLike):
 SparseTensorValue = collections.namedtuple(
     "SparseTensorValue", ["indices", "values", "dense_shape"])
 tf_export("SparseTensorValue")(SparseTensorValue)
-pywrap_tensorflow.RegisterSparseTensorValueClass(SparseTensorValue)
+pywrap_tensorflow.RegisterType("SparseTensorValue", SparseTensorValue)
 
 
 @tf_export("convert_to_tensor_or_sparse_tensor")
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index c411a58b70..61e0abbfcb 100755
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -67,6 +67,7 @@ limitations under the License.
 %rename("%s") TFE_ContextStartStep;
 %rename("%s") TFE_ContextEndStep;
 %rename("%s") TFE_Py_RegisterVSpace;
+%rename("%s") TFE_Py_EncodeArg;
 
 %{
 #include "tensorflow/python/eager/pywrap_tfe.h"
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index 758cba7487..d67dbde304 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -819,5 +819,5 @@ def flatten_with_joined_string_paths(structure, separator="/"):
   return list(zip(flat_string_paths, flatten(structure)))
 
 
-_pywrap_tensorflow.RegisterSequenceClass(_collections.Sequence)
-_pywrap_tensorflow.RegisterMappingClass(_collections.Mapping)
+_pywrap_tensorflow.RegisterType("Mapping", _collections.Mapping)
+_pywrap_tensorflow.RegisterType("Sequence", _collections.Sequence)
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index 38b8491c66..7b3e618e84 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -29,14 +29,51 @@ limitations under the License.
 namespace tensorflow {
 namespace swig {
 
-namespace {
+std::unordered_map<string, PyObject*>* PythonTypesMap() {
+  static auto* m = new std::unordered_map<string, PyObject*>();
+  return m;
+}
+
+PyObject* GetRegisteredType(const string& key) {
+  auto* m = PythonTypesMap();
+  auto it = m->find(key);
+  if (it == m->end()) return nullptr;
+  return it->second;
+}
+
+PyObject* RegisterType(PyObject* type_name, PyObject* type) {
+  if (!PyType_Check(type)) {
+    PyErr_SetString(PyExc_TypeError,
+                    tensorflow::strings::StrCat("Expecting a type, got ",
+                                                Py_TYPE(type)->tp_name)
+                        .c_str());
+    return nullptr;
+  }
 
-// Type object for collections.Sequence. This is set by RegisterSequenceClass.
-PyObject* CollectionsSequenceType = nullptr;
-// Type object for collections.Mapping, set by RegisterMappingClass.
-PyObject* CollectionsMappingType = nullptr;
-PyTypeObject* SparseTensorValueType = nullptr;
+  string key;
+  if (PyBytes_Check(type_name)) {
+    key = PyBytes_AsString(type_name);
+  }
+#if PY_MAJOR_VERSION >= 3
+  if (PyUnicode_Check(type_name)) {
+    key = PyUnicode_AsUTF8(type_name);
+  }
+#endif
 
+  if (PythonTypesMap()->find(key) != PythonTypesMap()->end()) {
+    PyErr_SetString(PyExc_TypeError, tensorflow::strings::StrCat(
+                                         "Type already registered for ", key)
+                                         .c_str());
+    return nullptr;
+  }
+
+  Py_INCREF(type);
+  PythonTypesMap()->emplace(key, type);
+
+  Py_RETURN_NONE;
+}
+
+namespace {
 const int kMaxItemsInCache = 1024;
 
 bool WarnedThatSetIsNotSequence = false;
@@ -177,46 +214,82 @@ class CachedTypeCheck {
 // Returns -1 if an error occurred.
 int IsMappingHelper(PyObject* o) {
   static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) {
-    return PyObject_IsInstance(to_check, CollectionsMappingType);
+    PyObject* collections_mapping_type = GetRegisteredType("Mapping");
+    if (TF_PREDICT_FALSE(collections_mapping_type == nullptr)) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      tensorflow::strings::StrCat(
+                          "collections.Mapping type has not been set. "
+                          "Please register the type with the identifier "
+                          "\"Mapping\" using RegisterType.")
+                          .c_str());
+      return -1;
+    }
+    return PyObject_IsInstance(to_check, collections_mapping_type);
   });
   if (PyDict_Check(o)) return true;
-  if (TF_PREDICT_FALSE(CollectionsMappingType == nullptr)) {
-    PyErr_SetString(
-        PyExc_RuntimeError,
-        tensorflow::strings::StrCat(
-            "collections.Mapping type has not been set. "
-            "Please call RegisterMappingClass before using this module")
-            .c_str());
-    return -1;
-  }
   return check_cache->CachedLookup(o);
 }
 
 // Returns 1 if `o` is an instance of attrs-decorated class.
 // Returns 0 otherwise.
 int IsAttrsHelper(PyObject* o) {
-  Safe_PyObjectPtr cls(PyObject_GetAttrString(o, "__class__"));
-  if (cls) {
-    return PyObject_HasAttrString(cls.get(), "__attrs_attrs__");
-  } else {
+  static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) {
+    Safe_PyObjectPtr cls(PyObject_GetAttrString(to_check, "__class__"));
+    if (cls) {
+      return PyObject_HasAttrString(cls.get(), "__attrs_attrs__");
+    }
+
     // PyObject_GetAttrString returns null on error
     PyErr_Clear();
     return 0;
-  }
+  });
+  return check_cache->CachedLookup(o);
 }
 
-// Returns 1 if `o` is considered a sequence for the purposes of Flatten().
+// Returns 1 if `o` is an object of type IndexedSlices.
 // Returns 0 otherwise.
 // Returns -1 if an error occurred.
-int IsSequenceHelper(PyObject* o) {
+int IsIndexedSlicesHelper(PyObject* o) {
   static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) {
-    int is_instance = PyObject_IsInstance(to_check, CollectionsSequenceType);
-
-    // Don't cache a failed is_instance check.
-    if (is_instance == -1) return -1;
+    PyObject* indexed_slices_type = GetRegisteredType("IndexedSlices");
+    if (TF_PREDICT_FALSE(indexed_slices_type == nullptr)) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      tensorflow::strings::StrCat(
+                          "IndexedSlices type has not been set. "
+                          "Please register the type with the identifier "
+                          "\"IndexedSlices\" using RegisterType.")
+                          .c_str());
+      return -1;
+    }
+    return PyObject_IsInstance(to_check, indexed_slices_type);
+  });
+  return check_cache->CachedLookup(o);
+}
 
-    return static_cast<int>(is_instance != 0 && !IsString(to_check));
+// Returns 1 if `o` is a Tensor.
+// Returns 0 otherwise.
+// Returns -1 if an error occurred.
+int IsTensorHelper(PyObject* o) {
+  static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) {
+    PyObject* tensor_type = GetRegisteredType("Tensor");
+    if (TF_PREDICT_FALSE(tensor_type == nullptr)) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      tensorflow::strings::StrCat(
+                          "Tensor type has not been set. "
+                          "Please register the type with the identifier "
+                          "\"Tensor\" using RegisterType.")
+                          .c_str());
+      return -1;
+    }
+    return PyObject_IsInstance(to_check, tensor_type);
   });
+  return check_cache->CachedLookup(o);
+}
+
+// Returns 1 if `o` is considered a sequence for the purposes of Flatten().
+// Returns 0 otherwise.
+// Returns -1 if an error occurred.
+int IsSequenceHelper(PyObject* o) {
   // We treat dicts and other mappings as special cases of sequences.
   if (IsMappingHelper(o)) return true;
   if (IsAttrsHelper(o)) return true;
@@ -226,15 +299,24 @@ int IsSequenceHelper(PyObject* o) {
                     "so consider avoiding using them.";
     WarnedThatSetIsNotSequence = true;
   }
-  if (TF_PREDICT_FALSE(CollectionsSequenceType == nullptr)) {
-    PyErr_SetString(
-        PyExc_RuntimeError,
-        tensorflow::strings::StrCat(
-            "collections.Sequence type has not been set. "
-            "Please call RegisterSequenceClass before using this module")
-            .c_str());
-    return -1;
-  }
+  static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) {
+    PyObject* collections_sequence_type = GetRegisteredType("Sequence");
+    if (TF_PREDICT_FALSE(collections_sequence_type == nullptr)) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      tensorflow::strings::StrCat(
+                          "collections.Sequence type has not been set. "
+                          "Please register the type with the identifier "
+                          "\"Sequence\" using RegisterType.")
+                          .c_str());
+      return -1;
+    }
+    int is_instance = PyObject_IsInstance(to_check, collections_sequence_type);
+
+    // Don't cache a failed is_instance check.
+    if (is_instance == -1) return -1;
+
+    return static_cast<int>(is_instance != 0 && !IsString(to_check));
+  });
   return check_cache->CachedLookup(o);
 }
 
@@ -401,11 +483,13 @@ class AttrsValueIterator : public ValueIterator {
 };
 
 bool IsSparseTensorValueType(PyObject* o) {
-  if (TF_PREDICT_FALSE(SparseTensorValueType == nullptr)) {
+  PyObject* sparse_tensor_value_type = GetRegisteredType("SparseTensorValue");
+  if (TF_PREDICT_FALSE(sparse_tensor_value_type == nullptr)) {
     return false;
   }
 
-  return PyObject_TypeCheck(o, SparseTensorValueType) == 1;
+  return PyObject_TypeCheck(
+             o, reinterpret_cast<PyTypeObject*>(sparse_tensor_value_type)) == 1;
 }
 
 int IsSequenceForDataHelper(PyObject* o) {
@@ -647,49 +731,11 @@ bool AssertSameStructureHelper(
 
 }  // namespace
 
-void RegisterSequenceClass(PyObject* sequence_class) {
-  if (!PyType_Check(sequence_class)) {
-    PyErr_SetString(
-        PyExc_TypeError,
-        tensorflow::strings::StrCat(
-            "Expecting a class definition for `collections.Sequence`. Got ",
-            Py_TYPE(sequence_class)->tp_name)
-            .c_str());
-    return;
-  }
-  CollectionsSequenceType = sequence_class;
-}
-
-void RegisterMappingClass(PyObject* mapping_class) {
-  if (!PyType_Check(mapping_class)) {
-    PyErr_SetString(
-        PyExc_TypeError,
-        tensorflow::strings::StrCat(
-            "Expecting a class definition for `collections.Mapping`. Got ",
-            Py_TYPE(mapping_class)->tp_name)
-            .c_str());
-    return;
-  }
-  CollectionsMappingType = mapping_class;
-}
-
-void RegisterSparseTensorValueClass(PyObject* sparse_tensor_value_class) {
-  if (!PyType_Check(sparse_tensor_value_class)) {
-    PyErr_SetString(
-        PyExc_TypeError,
-        tensorflow::strings::StrCat(
-            "Expecting a class definition for `SparseTensorValue`. Got ",
-            Py_TYPE(sparse_tensor_value_class)->tp_name)
-            .c_str());
-    return;
-  }
-  SparseTensorValueType =
-      reinterpret_cast<PyTypeObject*>(sparse_tensor_value_class);
-}
-
 bool IsSequence(PyObject* o) { return IsSequenceHelper(o) == 1; }
 bool IsMapping(PyObject* o) { return IsMappingHelper(o) == 1; }
 bool IsAttrs(PyObject* o) { return IsAttrsHelper(o) == 1; }
+bool IsTensor(PyObject* o) { return IsTensorHelper(o) == 1; }
+bool IsIndexedSlices(PyObject* o) { return IsIndexedSlicesHelper(o) == 1; }
 
 PyObject* Flatten(PyObject* nested) {
   PyObject* list = PyList_New(0);
@@ -737,13 +783,15 @@ PyObject* IsNamedtuple(PyObject* o, bool strict) {
     }
   }
 
-  if (TF_PREDICT_FALSE(CollectionsSequenceType == nullptr)) {
-    PyErr_SetString(
-        PyExc_RuntimeError,
-        tensorflow::strings::StrCat(
-            "collections.Sequence type has not been set. "
-            "Please call RegisterSequenceClass before using this module")
-            .c_str());
+  PyObject* collections_sequence_type = GetRegisteredType("Sequence");
+
+  if (TF_PREDICT_FALSE(collections_sequence_type == nullptr)) {
+    PyErr_SetString(PyExc_RuntimeError,
+                    tensorflow::strings::StrCat(
+                        "collections.Sequence type has not been set. "
+                        "Please register the type with the identifier "
+                        "\"Sequence\" using RegisterType.")
+                        .c_str());
     return nullptr;
   }
 
@@ -755,7 +803,8 @@ PyObject* IsNamedtuple(PyObject* o, bool strict) {
   }
 
   Safe_PyObjectPtr fields = make_safe(PyObject_GetAttrString(o, "_fields"));
-  int is_instance = PyObject_IsInstance(fields.get(), CollectionsSequenceType);
+  int is_instance =
+      PyObject_IsInstance(fields.get(), collections_sequence_type);
   if (is_instance == 0) {
     Py_RETURN_FALSE;
   } else if (is_instance == -1) {
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index 01f85ea1dc..f37cd527d8 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -65,6 +65,24 @@ bool IsMapping(PyObject* o);
 //   True if the object is an instance of an attr.s decorated class.
 bool IsAttrs(PyObject* o);
 
+// Returns a true if its input is an ops.Tensor.
+//
+// Args:
+//   seq: the input to be checked.
+//
+// Returns:
+//   True if the object is a tensor.
+bool IsTensor(PyObject* o);
+
+// Returns a true if its input is an ops.IndexesSlices.
+//
+// Args:
+//   seq: the input to be checked.
+//
+// Returns:
+//   True if the object is an ops.IndexedSlices.
+bool IsIndexedSlices(PyObject* o);
+
 // Implements the same interface as tensorflow.util.nest._same_namedtuples
 // Returns Py_True iff the two namedtuples have the same name and fields.
 // Raises RuntimeError if `o1` or `o2` don't look like namedtuples (don't have
@@ -130,18 +148,6 @@ PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
 //   TypeError: The nest is or contains a dict with non-sortable keys.
 PyObject* Flatten(PyObject* nested);
 
-// RegisterSequenceClass is used to pass PyTypeObject for collections.Sequence
-// (which is defined in python) into the C++ world.
-// Alternative approach could be to import the collections modules and retrieve
-// the type from the module. This approach also requires some trigger from
-// Python so that we know that Python interpreter had been initialzied.
-void RegisterSequenceClass(PyObject* sequence_class);
-// Like RegisterSequenceClass, but for collections.Mapping.
-void RegisterMappingClass(PyObject* mapping_class);
-// Similar to the above functions, except for the
-// sparse_tensor.SparseTensorValue class.
-void RegisterSparseTensorValueClass(PyObject* sparse_tensor_value_class);
-
 // The tensorflow.python.data package has its own nest utility that follows very
 // slightly different semantics for its functions than the tensorflow.python
 // nest utility. Returns a true if its input is a collections.Sequence (except
@@ -167,6 +173,10 @@ PyObject* FlattenForData(PyObject* nested);
 PyObject* AssertSameStructureForData(PyObject* o1, PyObject* o2,
                                      bool check_types);
 
+// RegisterType is used to pass PyTypeObject (which is defined in python) for an
+// arbitrary identifier `type_name` into C++.
+PyObject* RegisterType(PyObject* type_name, PyObject* type);
+
 }  // namespace swig
 }  // namespace tensorflow
 
diff --git a/tensorflow/python/util/util.i b/tensorflow/python/util/util.i
index 32a6e684fa..3c0ec87fa4 100644
--- a/tensorflow/python/util/util.i
+++ b/tensorflow/python/util/util.i
@@ -28,14 +28,8 @@ limitations under the License.
 // for functions in this module because they use python methods that need GIL.
 // TODO(iga): Find a way not to leak such definitions across files.
 
-%unignore tensorflow::swig::RegisterSequenceClass;
-%noexception tensorflow::swig::RegisterSequenceClass;
-
-%unignore tensorflow::swig::RegisterMappingClass;
-%noexception tensorflow::swig::RegisterMappingClass;
-
-%unignore tensorflow::swig::RegisterSparseTensorValueClass;
-%noexception tensorflow::swig::RegisterSparseTensorValueClass;
+%unignore tensorflow::swig::RegisterType;
+%noexception tensorflow::swig::RegisterType;
 
 %feature("docstring") tensorflow::swig::IsSequence
 """Returns a true if its input is a collections.Sequence (except strings).
-- 
GitLab


From 3aa8b781b342c36302bd500737ab4ce9b2b87a45 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Mon, 1 Oct 2018 14:07:17 -0700
Subject: [PATCH 249/570] Disable async remote tests

PiperOrigin-RevId: 215276816
---
 tensorflow/contrib/eager/python/remote_test.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/eager/python/remote_test.py b/tensorflow/contrib/eager/python/remote_test.py
index ba6fe9701d..7aa4b598b8 100644
--- a/tensorflow/contrib/eager/python/remote_test.py
+++ b/tensorflow/contrib/eager/python/remote_test.py
@@ -47,8 +47,9 @@ def run_sync_and_async(f):
 
   @functools.wraps(f)
   def decorator(self, *args, **kwargs):
-    with context.execution_mode(context.ASYNC):
-      f(self, *args, **kwargs)
+    # TODO(b/117110239): Re-enable.
+    # with context.execution_mode(context.ASYNC):
+    #   f(self, *args, **kwargs)
 
     with context.execution_mode(context.SYNC):
       f(self, *args, **kwargs)
-- 
GitLab


From d7edbeb8dcc81a9cabc922ae46f549fe6b498eb9 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Mon, 1 Oct 2018 14:09:23 -0700
Subject: [PATCH 250/570] Update keras_applications to 1.0.6 and
 keras_preprocessing to 1.0.5. This removes the transitive keras and scipy
 dependencies in TensorFlow.

PiperOrigin-RevId: 215277190
---
 tensorflow/tools/ci_build/Dockerfile.cmake                | 4 ++--
 tensorflow/tools/ci_build/install/install_pip_packages.sh | 8 ++++----
 .../ci_build/install/install_python3.5_pip_packages.sh    | 4 ++--
 .../ci_build/install/install_python3.6_pip_packages.sh    | 4 ++--
 tensorflow/tools/docker/Dockerfile                        | 4 ++--
 tensorflow/tools/docker/Dockerfile.devel                  | 4 ++--
 tensorflow/tools/docker/Dockerfile.devel-gpu              | 4 ++--
 tensorflow/tools/docker/Dockerfile.devel-mkl              | 4 ++--
 tensorflow/tools/docker/Dockerfile.devel-mkl-horovod      | 4 ++--
 tensorflow/tools/docker/Dockerfile.gpu                    | 4 ++--
 tensorflow/tools/docker/Dockerfile.mkl                    | 4 ++--
 tensorflow/tools/docker/Dockerfile.mkl-horovod            | 4 ++--
 tensorflow/tools/pip_package/setup.py                     | 4 ++--
 13 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/tensorflow/tools/ci_build/Dockerfile.cmake b/tensorflow/tools/ci_build/Dockerfile.cmake
index b7450c83de..ef0024fdb4 100644
--- a/tensorflow/tools/ci_build/Dockerfile.cmake
+++ b/tensorflow/tools/ci_build/Dockerfile.cmake
@@ -28,8 +28,8 @@ RUN pip install --upgrade astor
 RUN pip install --upgrade gast
 RUN pip install --upgrade numpy
 RUN pip install --upgrade termcolor
-RUN pip install keras_applications==1.0.5
-RUN pip install keras_preprocessing==1.0.3
+RUN pip install --upgrade keras_applications
+RUN pip install --upgrade keras_preprocessing
 
 # Install golang
 RUN apt-get install -t xenial-backports -y golang-1.9
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 4ced96f90b..b90f3f3b97 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -115,10 +115,10 @@ pip2 install --upgrade setuptools==39.1.0
 pip3 install --upgrade setuptools==39.1.0
 
 # Keras
-pip2 install keras_applications==1.0.5 --no-deps
-pip3 install keras_applications==1.0.5 --no-deps
-pip2 install keras_preprocessing==1.0.3 --no-deps
-pip3 install keras_preprocessing==1.0.3 --no-deps
+pip2 install keras_applications==1.0.6 --no-deps
+pip3 install keras_applications==1.0.6 --no-deps
+pip2 install keras_preprocessing==1.0.5 --no-deps
+pip3 install keras_preprocessing==1.0.5 --no-deps
 pip2 install --upgrade h5py==2.8.0
 pip3 install --upgrade h5py==2.8.0
 
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 37e6b51f66..61d4fe3fe8 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -85,8 +85,8 @@ pip3.5 install --upgrade termcolor
 pip3.5 install --upgrade setuptools==39.1.0
 
 # Keras
-pip3.5 install keras_applications==1.0.5
-pip3.5 install keras_preprocessing==1.0.3
+pip3.5 install keras_applications==1.0.6
+pip3.5 install keras_preprocessing==1.0.5
 pip3.5 install --upgrade h5py==2.8.0
 
 # Install last working version of setuptools.
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index 7520ff74cb..8949af8a88 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -102,7 +102,7 @@ pip3 install --upgrade setuptools==39.1.0
 pip3 install --upgrade h5py==2.8.0
 
 # Keras
-pip3 install keras_applications==1.0.5
-pip3 install keras_preprocessing==1.0.3
+pip3 install keras_applications==1.0.6
+pip3 install keras_preprocessing==1.0.5
 
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh)
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index b5a6c05193..205128ad58 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -29,8 +29,8 @@ RUN pip --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications==1.0.5 \
-        keras_preprocessing==1.0.3 \
+        keras_applications \
+        keras_preprocessing \
         matplotlib \
         numpy \
         pandas \
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index c741e8ad0c..6f8e91fccf 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -33,8 +33,8 @@ RUN pip --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications==1.0.5 \
-        keras_preprocessing==1.0.3 \
+        keras_applications \
+        keras_preprocessing \
         matplotlib \
         mock \
         numpy \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index f544725af4..69a117fda6 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -55,8 +55,8 @@ RUN pip --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications==1.0.5 \
-        keras_preprocessing==1.0.3 \
+        keras_applications \
+        keras_preprocessing \
         matplotlib \
         mock \
         numpy \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl b/tensorflow/tools/docker/Dockerfile.devel-mkl
index db7c701289..e433e9ebb2 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl
@@ -52,8 +52,8 @@ RUN ${PIP} --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications==1.0.5 \
-        keras_preprocessing==1.0.3 \
+        keras_applications \
+        keras_preprocessing \
         matplotlib \
         mock \
         numpy \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
index 987b582d10..48f2400569 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
@@ -45,8 +45,8 @@ RUN ${PIP} --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications==1.0.5 \
-        keras_preprocessing==1.0.3 \
+        keras_applications \
+        keras_preprocessing \
         matplotlib \
         mock \
         numpy \
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 781bf9e851..7dc92a888b 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -42,8 +42,8 @@ RUN pip --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications==1.0.5 \
-        keras_preprocessing==1.0.3 \
+        keras_applications \
+        keras_preprocessing \
         matplotlib \
         numpy \
         pandas \
diff --git a/tensorflow/tools/docker/Dockerfile.mkl b/tensorflow/tools/docker/Dockerfile.mkl
index 641c9e3b16..ac41cffe4b 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl
+++ b/tensorflow/tools/docker/Dockerfile.mkl
@@ -38,8 +38,8 @@ RUN ${PIP} --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications==1.0.5 \
-        keras_preprocessing==1.0.3 \
+        keras_applications \
+        keras_preprocessing \
         matplotlib \
         numpy \
         pandas \
diff --git a/tensorflow/tools/docker/Dockerfile.mkl-horovod b/tensorflow/tools/docker/Dockerfile.mkl-horovod
index 2b11679f54..4daf4fefff 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.mkl-horovod
@@ -38,8 +38,8 @@ RUN ${PIP} --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications==1.0.5 \
-        keras_preprocessing==1.0.3 \
+        keras_applications \
+        keras_preprocessing \
         matplotlib \
         numpy \
         pandas \
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 88c9c20d36..d864a7a039 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -51,8 +51,8 @@ REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
-    'keras_applications >= 1.0.5',
-    'keras_preprocessing >= 1.0.3',
+    'keras_applications >= 1.0.6',
+    'keras_preprocessing >= 1.0.5',
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.6.1',
-- 
GitLab


From 094e1953b7df0bbb9bd4d0e3329b3b4611edf984 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 1 Oct 2018 14:14:32 -0700
Subject: [PATCH 251/570] Fix benchmark regression.

PiperOrigin-RevId: 215278033
---
 tensorflow/python/ops/conv2d_benchmark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/conv2d_benchmark.py b/tensorflow/python/ops/conv2d_benchmark.py
index 28111c2730..f40488afbe 100644
--- a/tensorflow/python/ops/conv2d_benchmark.py
+++ b/tensorflow/python/ops/conv2d_benchmark.py
@@ -63,9 +63,9 @@ def build_graph(device, dtype, data_format, input_shape, filter_shape, strides,
     An array of tensors to run()
   """
   with ops.device("/%s:0" % device):
-    inp = variables.Variable(
+    inp = variables.VariableV1(
         random_ops.truncated_normal(input_shape, dtype=dtype))
-    filt = variables.Variable(
+    filt = variables.VariableV1(
         random_ops.truncated_normal(filter_shape, dtype=dtype))
 
     outputs = []
-- 
GitLab


From 5e3c2255b7f90146a895cd20267de699fbb15c27 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Mon, 1 Oct 2018 14:38:57 -0700
Subject: [PATCH 252/570] internal change

PiperOrigin-RevId: 215282721
---
 tensorflow/docs_src/BUILD                     |   14 -
 tensorflow/docs_src/__init__.py               |    0
 .../performance/xla/operation_semantics.md    | 2426 +++++++++++++++++
 tensorflow/tools/docs/BUILD                   |    1 -
 tensorflow/tools/docs/build_docs_test.py      |    6 +-
 5 files changed, 2430 insertions(+), 17 deletions(-)
 delete mode 100644 tensorflow/docs_src/BUILD
 delete mode 100644 tensorflow/docs_src/__init__.py
 create mode 100644 tensorflow/docs_src/performance/xla/operation_semantics.md

diff --git a/tensorflow/docs_src/BUILD b/tensorflow/docs_src/BUILD
deleted file mode 100644
index 34bf7b6a11..0000000000
--- a/tensorflow/docs_src/BUILD
+++ /dev/null
@@ -1,14 +0,0 @@
-# Files used to generate TensorFlow docs.
-
-licenses(["notice"])  # Apache 2.0
-
-package(
-    default_visibility = ["//tensorflow:internal"],
-)
-
-exports_files(["LICENSE"])
-
-filegroup(
-    name = "docs_src",
-    data = glob(["**/*.md"]),
-)
diff --git a/tensorflow/docs_src/__init__.py b/tensorflow/docs_src/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
new file mode 100644
index 0000000000..96d269bec4
--- /dev/null
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -0,0 +1,2426 @@
+# Operation Semantics
+
+The following describes the semantics of operations defined in the
+[`XlaBuilder`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
+interface. Typically, these operations map one-to-one to operations defined in
+the RPC interface in
+[`xla_data.proto`](https://www.tensorflow.org/code/tensorflow/compiler/xla/xla_data.proto).
+
+A note on nomenclature: the generalized data type XLA deals with is an
+N-dimensional array holding elements of some uniform type (such as 32-bit
+float). Throughout the documentation, *array* is used to denote an
+arbitrary-dimensional array. For convenience, special cases have more specific
+and familiar names; for example a *vector* is a 1-dimensional array and a
+*matrix* is a 2-dimensional array.
+
+## AllToAll
+
+See also
+[`XlaBuilder::AllToAll`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Alltoall is a collective operation that sends data from all cores to all cores.
+It has two phases:
+
+1.  the scatter phase. On each core, the operand is split into `split_count`
+    number of blocks along the `split_dimensions`, and the blocks are scattered
+    to all cores, e.g., the ith block is send to the ith core.
+2.  the gather phase. Each core concatenates the received blocks along the
+    `concat_dimension`.
+
+The participating cores can be configured by:
+
+-   `replica_groups`: each ReplicaGroup contains a list of replica id. If empty,
+    all replicas belong to one group in the order of 0 - (n-1). Alltoall will be
+    applied within subgroups in the specified order. For example, replica
+    groups = {{1,2,3},{4,5,0}} means, an Alltoall will be applied within replica
+    1, 2, 3, and in the gather phase, the received blocks will be concatenated
+    in the order of 1, 2, 3; another Alltoall will be applied within replica 4,
+    5, 0, and the concatenation order is 4, 5, 0.
+
+Prerequisites:
+
+-   The dimension size of the operand on the split_dimension is divisible by
+    split_count.
+-   The operand's shape is not tuple.
+
+<b> `AllToAll(operand, split_dimension, concat_dimension, split_count,
+replica_groups)` </b>
+
+
+| Arguments          | Type                  | Semantics                       |
+| ------------------ | --------------------- | ------------------------------- |
+| `operand`          | `XlaOp`               | n dimensional input array       |
+| `split_dimension`  | `int64`               | A value in the interval `[0,    |
+:                    :                       : n)` that names the dimension    :
+:                    :                       : along which the operand is      :
+:                    :                       : split                           :
+| `concat_dimension` | `int64`               | a value in the interval `[0,    |
+:                    :                       : n)` that names the dimension    :
+:                    :                       : along which the split blocks    :
+:                    :                       : are concatenated                :
+| `split_count`      | `int64`               | the number of cores that        |
+:                    :                       : participate this operation. If  :
+:                    :                       : `replica_groups` is empty, this :
+:                    :                       : should be the number of         :
+:                    :                       : replicas; otherwise, this       :
+:                    :                       : should be equal to the number   :
+:                    :                       : of replicas in each group.      :
+| `replica_groups`   | `ReplicaGroup` vector | each group contains a list of   |
+:                    :                       : replica id.                     :
+
+Below shows an example of Alltoall.
+
+```
+XlaBuilder b("alltoall");
+auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x");
+AllToAll(x, /*split_dimension=*/1, /*concat_dimension=*/0, /*split_count=*/4);
+```
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="../../images/xla/ops_alltoall.png">
+</div>
+
+In this example, there are 4 cores participating the Alltoall. On each core, the
+operand is split into 4 parts along dimension 0, so each part has shape
+f32[4,4]. The 4 parts are scattered to all cores. Then each core concatenates
+the received parts along dimension 1, in the order or core 0-4. So the output on
+each core has shape f32[16,4].
+
+## BatchNormGrad
+
+See also
+[`XlaBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
+and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
+for a detailed description of the algorithm.
+
+Calculates gradients of batch norm.
+
+<b> `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` </b>
+
+| Arguments       | Type                    | Semantics                        |
+| --------------- | ----------------------- | -------------------------------- |
+| `operand`       | `XlaOp`                 | n dimensional array to be        |
+:                 :                         : normalized (x)                   :
+| `scale`         | `XlaOp`                 | 1 dimensional array              |
+:                 :                         : (\\(\gamma\\))                   :
+| `mean`          | `XlaOp`                 | 1 dimensional array (\\(\mu\\))  |
+| `variance`      | `XlaOp`                 | 1 dimensional array              |
+:                 :                         : (\\(\sigma^2\\))                 :
+| `grad_output`   | `XlaOp`                 | Gradients passed to              |
+:                 :                         : `BatchNormTraining`              :
+:                 :                         : (\\( \nabla y\\))                :
+| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
+| `feature_index` | `int64`                 | Index to feature dimension in    |
+:                 :                         : `operand`                        :
+
+For each feature in the feature dimension (`feature_index` is the index for the
+feature dimension in `operand`), the operation calculates the gradients with
+respect to `operand`, `offset` and `scale` across all the other dimensions. The
+`feature_index` must be a valid index for the feature dimension in `operand`.
+
+The three gradients are defined by the following formulas (assuming a
+4-dimensional tensor as `operand` and with feature dimension index \\(l\\),
+batch size `m` and spatial sizes `w` and `h`):
+
+\\[ \begin{split} c_l&=
+\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h
+\left( \nabla y_{ijkl} \frac{x_{ijkl} - \mu_l}{\sigma^2_l+\epsilon} \right)
+\\\\
+\nabla x_{ijkl} &= \frac{\gamma_{l}}{\sqrt{\sigma^2_{l}+\epsilon}}
+\left( \nabla y_{ijkl} - \mathrm{mean}(\nabla y) - c_l (x_{ijkl} - \mu_{l})
+\right)
+\\\\
+\nabla \gamma_l &= \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \left( \nabla y_{ijkl}
+\frac{x_{ijkl} - \mu_l}{\sqrt{\sigma^2_{l}+\epsilon}} \right)
+\\\\\
+\nabla \beta_l &= \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \nabla y_{ijkl}
+\end{split} \\]
+
+The inputs `mean` and `variance` represent moments value
+across batch and spatial dimensions.
+
+The output type is a tuple of three handles:
+
+| Outputs        | Type                    | Semantics                         |
+| -------------  | ----------------------- | --------------------------------- |
+| `grad_operand` | `XlaOp`                 | gradient with respect to input    |
+:                :                         : `operand` (\\( \nabla x\\))       :
+| `grad_scale`   | `XlaOp`                 | gradient with respect to input    |
+:                :                         : `scale` (\\( \nabla \gamma\\))    :
+| `grad_offset`  | `XlaOp`                 | gradient with respect to input    |
+:                :                         : `offset`(\\( \nabla \beta\\))     :
+
+## BatchNormInference
+
+See also
+[`XlaBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
+and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
+for a detailed description of the algorithm.
+
+Normalizes an array across batch and spatial dimensions.
+
+<b> `BatchNormInference(operand, scale, offset, mean, variance, epsilon, feature_index)` </b>
+
+Arguments       | Type    | Semantics
+--------------- | ------- | ---------------------------------------
+`operand`       | `XlaOp` | n dimensional array to be normalized
+`scale`         | `XlaOp` | 1 dimensional array
+`offset`        | `XlaOp` | 1 dimensional array
+`mean`          | `XlaOp` | 1 dimensional array
+`variance`      | `XlaOp` | 1 dimensional array
+`epsilon`       | `float` | Epsilon value
+`feature_index` | `int64` | Index to feature dimension in `operand`
+
+For each feature in the feature dimension (`feature_index` is the index for the
+feature dimension in `operand`), the operation calculates the mean and variance
+across all the other dimensions and uses the mean and variance to normalize each
+element in `operand`. The `feature_index` must be a valid index for the feature
+dimension in `operand`.
+
+`BatchNormInference`  is equivalent to calling `BatchNormTraining` without
+computing `mean` and `variance` for each batch. It uses the input `mean` and
+`variance` instead as estimated values. The purpose of this op is to reduce
+latency in inference, hence the name `BatchNormInference`.
+
+The output is an n-dimensional, normalized array with the same shape as input
+`operand`.
+
+## BatchNormTraining
+
+See also
+[`XlaBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
+and [`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
+for a detailed description of the algorithm.
+
+Normalizes an array across batch and spatial dimensions.
+
+<b> `BatchNormTraining(operand, scale, offset, epsilon, feature_index)` </b>
+
+Arguments       | Type    | Semantics
+--------------- | ------- | ----------------------------------------
+`operand`       | `XlaOp` | n dimensional array to be normalized (x)
+`scale`         | `XlaOp` | 1 dimensional array (\\(\gamma\\))
+`offset`        | `XlaOp` | 1 dimensional array (\\(\beta\\))
+`epsilon`       | `float` | Epsilon value (\\(\epsilon\\))
+`feature_index` | `int64` | Index to feature dimension in `operand`
+
+For each feature in the feature dimension (`feature_index` is the index for the
+feature dimension in `operand`), the operation calculates the mean and variance
+across all the other dimensions and uses the mean and variance to normalize each
+element in `operand`. The `feature_index` must be a valid index for the feature
+dimension in `operand`.
+
+The algorithm goes as follows for each batch in `operand` \\(x\\) that
+contains `m` elements with `w` and `h` as the size of spatial dimensions
+(assuming `operand` is an 4 dimensional array):
+
+- Calculates batch mean \\(\mu_l\\) for each feature `l` in feature dimension:
+\\(\mu_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h x_{ijkl}\\)
+
+- Calculates batch variance \\(\sigma^2_l\\):
+\\(\sigma^2_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h (x_{ijkl} - \mu_l)^2\\)
+
+- Normalizes, scales and shifts:
+\\(y_{ijkl}=\frac{\gamma_l(x_{ijkl}-\mu_l)}{\sqrt[2]{\sigma^2_l+\epsilon}}+\beta_l\\)
+
+The epsilon value, usually a small number, is added to avoid divide-by-zero errors.
+
+The output type is a tuple of three `XlaOp`s:
+
+| Outputs      | Type                    | Semantics                            |
+| ------------ | ----------------------- | -------------------------------------|
+| `output`     | `XlaOp`                 | n dimensional array with the same    |
+:              :                         : shape as input `operand` (y)         :
+| `batch_mean` | `XlaOp`                 | 1 dimensional array (\\(\mu\\))      |
+| `batch_var`  | `XlaOp`                 | 1 dimensional array (\\(\sigma^2\\)) |
+
+The `batch_mean` and `batch_var` are moments calculated across the batch and
+spatial dimensions using the formulas above.
+
+## BitcastConvertType
+
+See also
+[`XlaBuilder::BitcastConvertType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Similar to a `tf.bitcast` in TensorFlow, performs an element-wise bitcast
+operation from a data shape to a target shape. The dimensions must match, and
+the conversion is an element-wise one; e.g. `s32` elements become `f32` elements
+via bitcast routine. Bitcast is implemented as a low-level cast, so machines
+with different floating-point representations will give different results.
+
+<b> `BitcastConvertType(operand, new_element_type)` </b>
+
+Arguments          | Type            | Semantics
+------------------ | --------------- | ---------------------------
+`operand`          | `XlaOp`         | array of type T with dims D
+`new_element_type` | `PrimitiveType` | type U
+
+The dimensions of the operand and the target shape must match. The bit-width of
+the source and destination element types must be equal. The source
+and destination element types must not be tuples.
+
+## Broadcast
+
+See also
+[`XlaBuilder::Broadcast`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Adds dimensions to an array by duplicating the data in the array.
+
+<b> `Broadcast(operand, broadcast_sizes)` </b>
+
+Arguments         | Type                | Semantics
+----------------- | ------------------- | -------------------------------
+`operand`         | `XlaOp`             | The array to duplicate
+`broadcast_sizes` | `ArraySlice<int64>` | The sizes of the new dimensions
+
+The new dimensions are inserted on the left, i.e. if `broadcast_sizes` has
+values `{a0, ..., aN}` and the operand shape has dimensions `{b0, ..., bM}` then
+the shape of the output has dimensions `{a0, ..., aN, b0, ..., bM}`.
+
+The new dimensions index into copies of the operand, i.e.
+
+```
+output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
+```
+
+For example, if `operand` is a scalar `f32` with value `2.0f`, and
+`broadcast_sizes` is `{2, 3}`, then the result will be an array with shape
+`f32[2, 3]` and all the values in the result will be `2.0f`.
+
+## Call
+
+See also
+[`XlaBuilder::Call`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Invokes a computation with the given arguments.
+
+<b> `Call(computation, args...)` </b>
+
+| Arguments     | Type                   | Semantics                           |
+| ------------- | ---------------------- | ----------------------------------- |
+| `computation` | `XlaComputation`       | computation of type `T_0, T_1, ..., |
+:               :                        : T_N -> S` with N parameters of      :
+:               :                        : arbitrary type                      :
+| `args`        | sequence of N `XlaOp`s | N arguments of arbitrary type       |
+
+The arity and types of the `args` must match the parameters of the
+`computation`. It is allowed to have no `args`.
+
+## Clamp
+
+See also
+[`XlaBuilder::Clamp`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Clamps an operand to within the range between a minimum and maximum value.
+
+<b> `Clamp(min, operand, max)` </b>
+
+Arguments | Type    | Semantics
+--------- | ------- | ---------------
+`min`     | `XlaOp` | array of type T
+`operand` | `XlaOp` | array of type T
+`max`     | `XlaOp` | array of type T
+
+Given an operand and minimum and maximum values, returns the operand if it is in
+the range between the minimum and maximum, else returns the minimum value if the
+operand is below this range or the maximum value if the operand is above this
+range.  That is, `clamp(a, x, b) =  min(max(a, x), b)`.
+
+All three arrays must be the same shape. Alternatively, as a restricted form of
+[broadcasting](broadcasting.md), `min` and/or `max` can be a scalar of type `T`.
+
+Example with scalar `min` and `max`:
+
+```
+let operand: s32[3] = {-1, 5, 9};
+let min: s32 = 0;
+let max: s32 = 6;
+==>
+Clamp(min, operand, max) = s32[3]{0, 5, 6};
+```
+
+## Collapse
+
+See also
+[`XlaBuilder::Collapse`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
+and the `tf.reshape` operation.
+
+Collapses dimensions of an array into one dimension.
+
+<b> `Collapse(operand, dimensions)` </b>
+
+Arguments    | Type           | Semantics
+------------ | -------------- | -----------------------------------------------
+`operand`    | `XlaOp`        | array of type T
+`dimensions` | `int64` vector | in-order, consecutive subset of T's dimensions.
+
+Collapse replaces the given subset of the operand's dimensions by a single
+dimension. The input arguments are an arbitrary array of type T and a
+compile-time-constant vector of dimension indices. The dimension indices must be
+an in-order (low to high dimension numbers), consecutive subset of T's
+dimensions. Thus, {0, 1, 2}, {0, 1}, or {1, 2} are all valid dimension sets, but
+{1, 0} or {0, 2} are not. They are replaced by a single new dimension, in the
+same position in the dimension sequence as those they replace, with the new
+dimension size equal to the product of original dimension sizes. The lowest
+dimension number in `dimensions` is the slowest varying dimension (most major)
+in the loop nest which collapses these dimension, and the highest dimension
+number is fastest varying (most minor). See the `tf.reshape` operator
+if more general collapse ordering is needed.
+
+For example, let v be an array of 24 elements:
+
+```
+let v = f32[4x2x3] {{{10, 11, 12},  {15, 16, 17}},
+                    {{20, 21, 22},  {25, 26, 27}},
+                    {{30, 31, 32},  {35, 36, 37}},
+                    {{40, 41, 42},  {45, 46, 47}}};
+
+// Collapse to a single dimension, leaving one dimension.
+let v012 = Collapse(v, {0,1,2});
+then v012 == f32[24] {10, 11, 12, 15, 16, 17,
+                      20, 21, 22, 25, 26, 27,
+                      30, 31, 32, 35, 36, 37,
+                      40, 41, 42, 45, 46, 47};
+
+// Collapse the two lower dimensions, leaving two dimensions.
+let v01 = Collapse(v, {0,1});
+then v01 == f32[4x6] {{10, 11, 12, 15, 16, 17},
+                      {20, 21, 22, 25, 26, 27},
+                      {30, 31, 32, 35, 36, 37},
+                      {40, 41, 42, 45, 46, 47}};
+
+// Collapse the two higher dimensions, leaving two dimensions.
+let v12 = Collapse(v, {1,2});
+then v12 == f32[8x3] {{10, 11, 12},
+                      {15, 16, 17},
+                      {20, 21, 22},
+                      {25, 26, 27},
+                      {30, 31, 32},
+                      {35, 36, 37},
+                      {40, 41, 42},
+                      {45, 46, 47}};
+
+```
+
+## Concatenate
+
+See also
+[`XlaBuilder::ConcatInDim`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Concatenate composes an array from multiple array operands. The array is of the
+same rank as each of the input array operands (which must be of the same rank as
+each other) and contains the arguments in the order that they were specified.
+
+<b> `Concatenate(operands..., dimension)` </b>
+
+| Arguments   | Type                  | Semantics                              |
+| ----------- | --------------------- | -------------------------------------- |
+| `operands`  | sequence of N `XlaOp` | N arrays of type T with dimensions     |
+:             :                       : [L0, L1, ...]. Requires N >= 1.        :
+| `dimension` | `int64`               | A value in the interval `[0, N)` that  |
+:             :                       : names the dimension to be concatenated :
+:             :                       : between the `operands`.                :
+
+With the exception of `dimension` all dimensions must be the same. This is
+because XLA does not support "ragged" arrays. Also note that rank-0 values
+cannot be concatenated (as it's impossible to name the dimension along which the
+concatenation occurs).
+
+1-dimensional example:
+
+```
+Concat({{2, 3}, {4, 5}, {6, 7}}, 0)
+>>> {2, 3, 4, 5, 6, 7}
+```
+
+2-dimensional example:
+
+```
+let a = {
+  {1, 2},
+  {3, 4},
+  {5, 6},
+};
+let b = {
+  {7, 8},
+};
+Concat({a, b}, 0)
+>>> {
+  {1, 2},
+  {3, 4},
+  {5, 6},
+  {7, 8},
+}
+```
+
+Diagram:
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="https://www.tensorflow.org/images/ops_concatenate.png">
+</div>
+
+## Conditional
+
+See also
+[`XlaBuilder::Conditional`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+<b> `Conditional(pred, true_operand, true_computation, false_operand,
+false_computation)` </b>
+
+Arguments           | Type             | Semantics
+------------------- | ---------------- | ---------------------------------
+`pred`              | `XlaOp`          | Scalar of type `PRED`
+`true_operand`      | `XlaOp`          | Argument of type `T_0`
+`true_computation`  | `XlaComputation` | XlaComputation of type `T_0 -> S`
+`false_operand`     | `XlaOp`          | Argument of type `T_1`
+`false_computation` | `XlaComputation` | XlaComputation of type `T_1 -> S`
+
+Executes `true_computation` if `pred` is `true`, `false_computation` if `pred`
+is `false`, and returns the result.
+
+The `true_computation` must take in a single argument of type `T_0` and will be
+invoked with `true_operand` which must be of the same type. The
+`false_computation` must take in a single argument of type `T_1` and will be
+invoked with `false_operand` which must be of the same type. The type of the
+returned value of `true_computation` and `false_computation` must be the same.
+
+Note that only one of `true_computation` and `false_computation` will be
+executed depending on the value of `pred`.
+
+## Conv (convolution)
+
+See also
+[`XlaBuilder::Conv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+As ConvWithGeneralPadding, but the padding is specified in a short-hand way as
+either SAME or VALID. SAME padding pads the input (`lhs`) with zeroes so that
+the output has the same shape as the input when not taking striding into
+account. VALID padding simply means no padding.
+
+## ConvWithGeneralPadding (convolution)
+
+See also
+[`XlaBuilder::ConvWithGeneralPadding`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Computes a convolution of the kind used in neural networks. Here, a convolution
+can be thought of as a n-dimensional window moving across a n-dimensional base
+area and a computation is performed for each possible position of the window.
+
+| Arguments             | Type                 | Semantics                     |
+| --------------------- | -------------------- | ----------------------------- |
+| `lhs`                 | `XlaOp`              | rank n+2 array of inputs      |
+| `rhs`                 | `XlaOp`              | rank n+2 array of kernel      |
+:                       :                      : weights                       :
+| `window_strides`      | `ArraySlice<int64>`  | n-d array of kernel strides   |
+| `padding`             | `ArraySlice<         | n-d array of (low, high)      |
+:                       : pair<int64, int64>>` : padding                       :
+| `lhs_dilation`        | `ArraySlice<int64>`  | n-d lhs dilation factor array |
+| `rhs_dilation`        | `ArraySlice<int64>`  | n-d rhs dilation factor array |
+| `feature_group_count` | int64                | the number of feature groups  |
+
+Let n be the number of spatial dimensions. The `lhs` argument is a rank n+2
+array describing the base area. This is called the input, even though of course
+the rhs is also an input. In a neural network, these are the input activations.
+The n+2 dimensions are, in this order:
+
+*   `batch`: Each coordinate in this dimension represents an independent input
+    for which convolution is carried out.
+*   `z/depth/features`: Each (y,x) position in the base area has a vector
+    associated to it, which goes into this dimension.
+*   `spatial_dims`: Describes the `n` spatial dimensions that define the base
+    area that the window moves across.
+
+The `rhs` argument is a rank n+2 array describing the convolutional
+filter/kernel/window. The dimensions are, in this order:
+
+*   `output-z`: The `z` dimension of the output.
+*   `input-z`: The size of this dimension times `feature_group_count` should
+    equal the size of the `z` dimension in lhs.
+*   `spatial_dims`: Describes the `n` spatial dimensions that define the n-d
+    window that moves across the base area.
+
+The `window_strides` argument specifies the stride of the convolutional window
+in the spatial dimensions. For example, if the stride in the first spatial
+dimension is 3, then the window can only be placed at coordinates where the
+first spatial index is divisible by 3.
+
+The `padding` argument specifies the amount of zero padding to be applied to the
+base area. The amount of padding can be negative -- the absolute value of
+negative padding indicates the number of elements to remove from the specified
+dimension before doing the convolution. `padding[0]` specifies the padding for
+dimension `y` and `padding[1]` specifies the padding for dimension `x`. Each
+pair has the low padding as the first element and the high padding as the second
+element. The low padding is applied in the direction of lower indices while the
+high padding is applied in the direction of higher indices. For example, if
+`padding[1]` is `(2,3)` then there will be a padding by 2 zeroes on the left and
+by 3 zeroes on the right in the second spatial dimension. Using padding is
+equivalent to inserting those same zero values into the input (`lhs`) before
+doing the convolution.
+
+The `lhs_dilation` and `rhs_dilation` arguments specify the dilation factor to
+be applied to the lhs and rhs, respectively, in each spatial dimension. If the
+dilation factor in a spatial dimension is d, then d-1 holes are implicitly
+placed between each of the entries in that dimension, increasing the size of the
+array. The holes are filled with a no-op value, which for convolution means
+zeroes.
+
+Dilation of the rhs is also called atrous convolution. For more details, see
+`tf.nn.atrous_conv2d`. Dilation of the lhs is also called transposed
+convolution. For more details, see `tf.nn.conv2d_transpose`.
+
+The `feature_group_count` argument (default value 1) can be used for grouped
+convolutions. `feature_group_count` needs to be a divisor of both the input and
+the output feature dimension. If `feature_group_count` is greater than 1, it
+means that conceptually the input and output feature dimension and the `rhs`
+output feature dimension are split evenly into `feature_group_count` many
+groups, each group consisting of a consecutive subsequence of features. The
+input feature dimension of `rhs` needs to be equal to the `lhs` input feature
+dimension divided by `feature_group_count` (so it already has the size of a
+group of input features). The i-th groups are used together to compute
+`feature_group_count` many separate convolutions. The results of these
+convolutions are concatenated together in the output feature dimension.
+
+For depthwise convolution the `feature_group_count` argument would be set to the
+input feature dimension, and the filter would be reshaped from
+`[filter_height, filter_width, in_channels, channel_multiplier]` to
+`[filter_height, filter_width, 1, in_channels * channel_multiplier]`. For more
+details, see `tf.nn.depthwise_conv2d`.
+
+The output shape has these dimensions, in this order:
+
+*   `batch`: Same size as `batch` on the input (`lhs`).
+*   `z`: Same size as `output-z` on the kernel (`rhs`).
+*   `spatial_dims`: One value for each valid placement of the convolutional
+    window.
+
+The valid placements of the convolutional window are determined by the strides
+and the size of the base area after padding.
+
+To describe what a convolution does, consider a 2d convolution, and pick some
+fixed `batch`, `z`, `y`, `x` coordinates in the output. Then `(y,x)` is a
+position of a corner of the window within the base area (e.g. the upper left
+corner, depending on how you interpret the spatial dimensions). We now have a 2d
+window, taken from the base area, where each 2d point is associated to a 1d
+vector, so we get a 3d box. From the convolutional kernel, since we fixed the
+output coordinate `z`, we also have a 3d box. The two boxes have the same
+dimensions, so we can take the sum of the element-wise products between the two
+boxes (similar to a dot product). That is the output value.
+
+Note that if `output-z` is e.g., 5, then each position of the window produces 5
+values in the output into the `z` dimension of the output. These values differ
+in what part of the convolutional kernel is used - there is a separate 3d box of
+values used for each `output-z` coordinate. So you could think of it as 5
+separate convolutions with a different filter for each of them.
+
+Here is pseudo-code for a 2d convolution with padding and striding:
+
+```
+for (b, oz, oy, ox) {  // output coordinates
+  value = 0;
+  for (iz, ky, kx) {  // kernel coordinates and input z
+    iy = oy*stride_y + ky - pad_low_y;
+    ix = ox*stride_x + kx - pad_low_x;
+    if ((iy, ix) inside the base area considered without padding) {
+      value += input(b, iz, iy, ix) * kernel(oz, iz, ky, kx);
+    }
+  }
+  output(b, oz, oy, ox) = value;
+}
+```
+
+## ConvertElementType
+
+See also
+[`XlaBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Similar to an element-wise `static_cast` in C++, performs an element-wise
+conversion operation from a data shape to a target shape. The dimensions must
+match, and the conversion is an element-wise one; e.g. `s32` elements become
+`f32` elements via an `s32`-to-`f32` conversion routine.
+
+<b> `ConvertElementType(operand, new_element_type)` </b>
+
+Arguments          | Type            | Semantics
+------------------ | --------------- | ---------------------------
+`operand`          | `XlaOp`         | array of type T with dims D
+`new_element_type` | `PrimitiveType` | type U
+
+The dimensions of the operand and the target shape must match. The source and
+destination element types must not be tuples.
+
+A conversion such as `T=s32` to `U=f32` will perform a normalizing int-to-float
+conversion routine such as round-to-nearest-even.
+
+> Note: The precise float-to-int and visa-versa conversions are currently
+> unspecified, but may become additional arguments to the convert operation in
+> the future.  Not all possible conversions have been implemented for all
+>targets.
+
+```
+let a: s32[3] = {0, 1, 2};
+let b: f32[3] = convert(a, f32);
+then b == f32[3]{0.0, 1.0, 2.0}
+```
+
+## CrossReplicaSum
+
+See also
+[`XlaBuilder::CrossReplicaSum`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Computes a sum across replicas.
+
+<b> `CrossReplicaSum(operand)` </b>
+
+Arguments | Type    | Semantics
+--------- | ------- | -----------------------------
+`operand` | `XlaOp` | Array to sum across replicas.
+| `replica_group_ids`    | `int64` vector | Group ID for each replica.      |
+
+The output shape is the same as the input shape. For example, if there are two
+replicas and the operand has the value `(1.0, 2.5)` and `(3.0, 5.25)`
+respectively on the two replicas, then the output value from this op will be
+`(4.0, 7.75)` on both replicas.
+
+`replica_group_ids` identifies the group ID of each replica. The group ID must
+either be empty (all replicas belong to a single group), or contain the same
+number of elements as the number of replicas. For example, if
+`replica_group_ids` = {0, 1, 2, 3, 0, 1, 2, 3} has eight replicas, there are
+four subgroups of replica IDs: {0, 4}, {1, 5}, {2, 6}, and {3, 7}. The size of
+each subgroup *must* be identical, so, for example, using:
+`replica_group_ids` = {0, 1, 2, 0} for four replicas is invalid.
+
+Computing the result of CrossReplicaSum requires having one input from each
+replica, so if one replica executes a CrossReplicaSum node more times than
+another, then the former replica will wait forever. Since the replicas are all
+running the same program, there are not a lot of ways for that to happen, but it
+is possible when a while loop's condition depends on data from infeed and the
+data that is infed causes the while loop to iterate more times on one replica
+than another.
+
+## CustomCall
+
+See also
+[`XlaBuilder::CustomCall`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Call a user-provided function within a computation.
+
+<b> `CustomCall(target_name, args..., shape)` </b>
+
+| Arguments     | Type                   | Semantics                         |
+| ------------- | ---------------------- | --------------------------------- |
+| `target_name` | `string`               | Name of the function. A call      |
+:               :                        : instruction will be emitted which :
+:               :                        : targets this symbol name.         :
+| `args`        | sequence of N `XlaOp`s | N arguments of arbitrary type,    |
+:               :                        : which will be passed to the       :
+:               :                        : function.                         :
+| `shape`       | `Shape`                | Output shape of the function      |
+
+The function signature is the same, regardless of the arity or type of args:
+
+```
+extern "C" void target_name(void* out, void** in);
+```
+
+For example, if CustomCall is used as follows:
+
+```
+let x = f32[2] {1,2};
+let y = f32[2x3] {{10, 20, 30}, {40, 50, 60}};
+
+CustomCall("myfunc", {x, y}, f32[3x3])
+```
+
+Here is an example of an implementation of `myfunc`:
+
+```
+extern "C" void myfunc(void* out, void** in) {
+  float (&x)[2] = *static_cast<float(*)[2]>(in[0]);
+  float (&y)[2][3] = *static_cast<float(*)[2][3]>(in[1]);
+  EXPECT_EQ(1, x[0]);
+  EXPECT_EQ(2, x[1]);
+  EXPECT_EQ(10, y[0][0]);
+  EXPECT_EQ(20, y[0][1]);
+  EXPECT_EQ(30, y[0][2]);
+  EXPECT_EQ(40, y[1][0]);
+  EXPECT_EQ(50, y[1][1]);
+  EXPECT_EQ(60, y[1][2]);
+  float (&z)[3][3] = *static_cast<float(*)[3][3]>(out);
+  z[0][0] = x[1] + y[1][0];
+  // ...
+}
+```
+
+The user-provided function must not have side-effects and its execution must be
+idempotent.
+
+> Note: The opaque nature of the user-provided function restricts optimization
+> opportunities for the compiler. Try to express your computation in terms of
+> native XLA ops whenever possible; only use CustomCall as a last resort.
+
+## Dot
+
+See also
+[`XlaBuilder::Dot`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+<b> `Dot(lhs, rhs)` </b>
+
+Arguments | Type    | Semantics
+--------- | ------- | ---------------
+`lhs`     | `XlaOp` | array of type T
+`rhs`     | `XlaOp` | array of type T
+
+The exact semantics of this operation depend on the ranks of the operands:
+
+| Input                   | Output                | Semantics               |
+| ----------------------- | --------------------- | ----------------------- |
+| vector [n] `dot` vector | scalar                | vector dot product      |
+: [n]                     :                       :                         :
+| matrix [m x k] `dot`    | vector [m]            | matrix-vector           |
+: vector [k]              :                       : multiplication          :
+| matrix [m x k] `dot`    | matrix [m x n]        | matrix-matrix           |
+: matrix [k x n]          :                       : multiplication          :
+
+The operation performs sum of products over the last dimension of `lhs` and the
+one-before-last dimension of `rhs`. These are the "contracted" dimensions. The
+contracted dimensions of `lhs` and `rhs` must be of the same size. In practice,
+it can be used to perform dot products between vectors, vector/matrix
+multiplications or matrix/matrix multiplications.
+
+## DotGeneral
+
+See also
+[`XlaBuilder::DotGeneral`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+<b> `DotGeneral(lhs, rhs, dimension_numbers)` </b>
+
+Arguments           | Type                  | Semantics
+------------------- | --------------------- | ---------------
+`lhs`               | `XlaOp`               | array of type T
+`rhs`               | `XlaOp`               | array of type T
+`dimension_numbers` | `DotDimensionNumbers` | array of type T
+
+As Dot, but allows contracting and batch dimension numbers to be specified for
+both the 'lhs' and 'rhs'.
+
+| DotDimensionNumbers Fields | Type                    | Semantics
+| --------- | ----------------------- | ---------------
+| 'lhs_contracting_dimensions' | repeated int64 | 'lhs' contracting dimension numbers |
+| 'rhs_contracting_dimensions' | repeated int64 | 'rhs' contracting dimension numbers |
+| 'lhs_batch_dimensions' | repeated int64 | 'lhs' batch dimension numbers |
+| 'rhs_batch_dimensions' | repeated int64 | 'rhs' batch dimension numbers |
+
+DotGeneral performs the sum of products over contracting dimensions specified
+in 'dimension_numbers'.
+
+Associated contracting dimension numbers from the 'lhs' and 'rhs' do not need
+to be the same, but must be listed in the same order in both
+'lhs/rhs_contracting_dimensions' arrays and have the same dimension sizes.
+There must be exactly one contracting dimension on both 'lhs' and 'rhs'.
+
+Example with contracting dimension numbers:
+
+```
+lhs = { {1.0, 2.0, 3.0},
+        {4.0, 5.0, 6.0} }
+
+rhs = { {1.0, 1.0, 1.0},
+        {2.0, 2.0, 2.0} }
+
+DotDimensionNumbers dnums;
+dnums.add_lhs_contracting_dimensions(1);
+dnums.add_rhs_contracting_dimensions(1);
+
+DotGeneral(lhs, rhs, dnums) -> { {6.0, 12.0},
+                                 {15.0, 30.0} }
+```
+
+Associated batch dimension numbers from the 'lhs' and 'rhs' must have the same
+dimension number, must be listed in the same order in both arrays, must
+have the same dimension sizes, and must be ordered before contracting and
+non-contracting/non-batch dimension numbers.
+
+Example with batch dimension numbers (batch size 2, 2x2 matrices):
+
+```
+lhs = { { {1.0, 2.0},
+          {3.0, 4.0} },
+        { {5.0, 6.0},
+          {7.0, 8.0} } }
+
+rhs = { { {1.0, 0.0},
+          {0.0, 1.0} },
+        { {1.0, 0.0},
+          {0.0, 1.0} } }
+
+DotDimensionNumbers dnums;
+dnums.add_lhs_contracting_dimensions(2);
+dnums.add_rhs_contracting_dimensions(1);
+dnums.add_lhs_batch_dimensions(0);
+dnums.add_rhs_batch_dimensions(0);
+
+DotGeneral(lhs, rhs, dnums) -> { { {1.0, 2.0},
+                                   {3.0, 4.0} },
+                                 { {5.0, 6.0},
+                                   {7.0, 8.0} } }
+```
+
+| Input                               | Output            | Semantics        |
+| ----------------------------------- | ----------------- | ---------------- |
+| [b0, m, k] `dot` [b0, k, n]         | [b0, m, n]        |  batch matmul    |
+| [b0, b1, m, k] `dot` [b0, b1, k, n] | [b0, b1, m, n]    |  batch matmul    |
+
+It follows that the resulting dimension number starts with the batch dimension,
+then the 'lhs' non-contracting/non-batch dimension, and finally the 'rhs'
+non-contracting/non-batch dimension.
+
+## DynamicSlice
+
+See also
+[`XlaBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+DynamicSlice extracts a sub-array from the input array at dynamic
+`start_indices`. The size of the slice in each dimension is passed in
+`size_indices`, which specify the end point of exclusive slice intervals in each
+dimension: [start, start + size). The shape of `start_indices` must be rank ==
+1, with dimension size equal to the rank of `operand`.
+
+<b> `DynamicSlice(operand, start_indices, size_indices)` </b>
+
+| Arguments       | Type                | Semantics                           |
+| --------------- | ------------------- | ----------------------------------- |
+| `operand`       | `XlaOp`             | N dimensional array of type T       |
+| `start_indices` | `XlaOp`             | Rank 1 array of N integers          |
+:                 :                     : containing the starting indices of  :
+:                 :                     : the slice for each dimension. Value :
+:                 :                     : must be greater than or equal to    :
+:                 :                     : zero.                               :
+| `size_indices`  | `ArraySlice<int64>` | List of N integers containing the   |
+:                 :                     : slice size for each dimension. Each :
+:                 :                     : value must be strictly greater than :
+:                 :                     : zero, and start + size must be less :
+:                 :                     : than or equal to the size of the    :
+:                 :                     : dimension to avoid wrapping modulo  :
+:                 :                     : dimension size.                     :
+
+The effective slice indices are computed by applying the following
+transformation for each index `i` in `[1, N)` before performing the slice:
+
+```
+start_indices[i] = clamp(start_indices[i], 0, operand.dimension_size[i] - size_indices[i])
+```
+
+This ensures that the extracted slice is always in-bounds with respect to the
+operand array. If the slice is in-bounds before the transformation is applied,
+the transformation has no effect.
+
+1-dimensional example:
+
+```
+let a = {0.0, 1.0, 2.0, 3.0, 4.0}
+let s = {2}
+
+DynamicSlice(a, s, {2}) produces:
+  {2.0, 3.0}
+```
+
+2-dimensional example:
+
+```
+let b =
+ { {0.0,  1.0,  2.0},
+   {3.0,  4.0,  5.0},
+   {6.0,  7.0,  8.0},
+   {9.0, 10.0, 11.0} }
+let s = {2, 1}
+
+DynamicSlice(b, s, {2, 2}) produces:
+  { { 7.0,  8.0},
+    {10.0, 11.0} }
+```
+## DynamicUpdateSlice
+
+See also
+[`XlaBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+DynamicUpdateSlice generates a result which is the value of the input array
+`operand`, with a slice `update` overwritten at `start_indices`.
+The shape of `update` determines the shape of the sub-array of the result which
+is updated.
+The shape of `start_indices` must be rank == 1, with dimension size equal to
+the rank of `operand`.
+
+<b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
+
+| Arguments       | Type    | Semantics                                        |
+| --------------- | ------- | ------------------------------------------------ |
+| `operand`       | `XlaOp` | N dimensional array of type T                    |
+| `update`        | `XlaOp` | N dimensional array of type T containing the     |
+:                 :         : slice update. Each dimension of update shape     :
+:                 :         : must be strictly greater than zero, and start +  :
+:                 :         : update must be less than or equal to the operand :
+:                 :         : size for each dimension to avoid generating      :
+:                 :         : out-of-bounds update indices.                    :
+| `start_indices` | `XlaOp` | Rank 1 array of N integers containing the        |
+:                 :         : starting indices of the slice for each           :
+:                 :         : dimension. Value must be greater than or equal   :
+:                 :         : to zero.                                         :
+
+The effective slice indices are computed by applying the following
+transformation for each index `i` in `[1, N)` before performing the slice:
+
+```
+start_indices[i] = clamp(start_indices[i], 0, operand.dimension_size[i] - update.dimension_size[i])
+```
+
+This ensures that the updated slice is always in-bounds with respect to the
+operand array. If the slice is in-bounds before the transformation is applied,
+the transformation has no effect.
+
+1-dimensional example:
+
+```
+let a = {0.0, 1.0, 2.0, 3.0, 4.0}
+let u = {5.0, 6.0}
+let s = {2}
+
+DynamicUpdateSlice(a, u, s) produces:
+  {0.0, 1.0, 5.0, 6.0, 4.0}
+```
+
+2-dimensional example:
+
+```
+let b =
+ { {0.0,  1.0,  2.0},
+   {3.0,  4.0,  5.0},
+   {6.0,  7.0,  8.0},
+   {9.0, 10.0, 11.0} }
+let u =
+ { {12.0,  13.0},
+   {14.0,  15.0},
+   {16.0,  17.0} }
+
+let s = {1, 1}
+
+DynamicUpdateSlice(b, u, s) produces:
+ { {0.0,  1.0,  2.0},
+   {3.0, 12.0, 13.0},
+   {6.0, 14.0, 15.0},
+   {9.0, 16.0, 17.0} }
+```
+
+## Element-wise binary arithmetic operations
+
+See also
+[`XlaBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+A set of element-wise binary arithmetic operations is supported.
+
+<b> `Op(lhs, rhs)` </b>
+
+Where `Op` is one of `Add` (addition), `Sub` (subtraction), `Mul`
+(multiplication), `Div` (division), `Rem` (remainder), `Max` (maximum), `Min`
+(minimum), `LogicalAnd` (logical AND), or `LogicalOr` (logical OR).
+
+Arguments | Type    | Semantics
+--------- | ------- | ----------------------------------------
+`lhs`     | `XlaOp` | left-hand-side operand: array of type T
+`rhs`     | `XlaOp` | right-hand-side operand: array of type T
+
+The arguments' shapes have to be either similar or compatible. See the
+[broadcasting](../../performance/xla/broadcasting.md) documentation about what it means for shapes to
+be compatible. The result of an operation has a shape which is the result of
+broadcasting the two input arrays. In this variant, operations between arrays of
+different ranks are *not* supported, unless one of the operands is a scalar.
+
+When `Op` is `Rem`, the sign of the result is taken from the dividend, and the
+absolute value of the result is always less than the divisor's absolute value.
+
+Integer division overflow (signed/unsigned division/remainder by zero or signed
+divison/remainder of `INT_SMIN` with `-1`) produces an implementation defined
+value.
+
+An alternative variant with different-rank broadcasting support exists for these
+operations:
+
+<b> `Op(lhs, rhs, broadcast_dimensions)` </b>
+
+Where `Op` is the same as above. This variant of the operation should be used
+for arithmetic operations between arrays of different ranks (such as adding a
+matrix to a vector).
+
+The additional `broadcast_dimensions` operand is a slice of integers used to
+expand the rank of the lower-rank operand up to the rank of the higher-rank
+operand. `broadcast_dimensions` maps the dimensions of the lower-rank shape to
+the dimensions of the higher-rank shape. The unmapped dimensions of the expanded
+shape are filled with dimensions of size one. Degenerate-dimension broadcasting
+then broadcasts the shapes along these degenerate dimensions to equalize the
+shapes of both operands. The semantics are described in detail on the
+[broadcasting page](../../performance/xla/broadcasting.md).
+
+## Element-wise comparison operations
+
+See also
+[`XlaBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+A set of standard element-wise binary comparison operations is supported. Note
+that standard IEEE 754 floating-point comparison semantics apply when comparing
+floating-point types.
+
+<b> `Op(lhs, rhs)` </b>
+
+Where `Op` is one of `Eq` (equal-to), `Ne` (not equal-to), `Ge`
+(greater-or-equal-than), `Gt` (greater-than), `Le` (less-or-equal-than), `Lt`
+(less-than).
+
+Arguments | Type    | Semantics
+--------- | ------- | ----------------------------------------
+`lhs`     | `XlaOp` | left-hand-side operand: array of type T
+`rhs`     | `XlaOp` | right-hand-side operand: array of type T
+
+The arguments' shapes have to be either similar or compatible. See the
+[broadcasting](../../performance/xla/broadcasting.md) documentation about what it means for shapes to
+be compatible. The result of an operation has a shape which is the result of
+broadcasting the two input arrays with the element type `PRED`. In this variant,
+operations between arrays of different ranks are *not* supported, unless one of
+the operands is a scalar.
+
+An alternative variant with different-rank broadcasting support exists for these
+operations:
+
+<b> `Op(lhs, rhs, broadcast_dimensions)` </b>
+
+Where `Op` is the same as above. This variant of the operation should be used
+for comparison operations between arrays of different ranks (such as adding a
+matrix to a vector).
+
+The additional `broadcast_dimensions` operand is a slice of integers specifying
+the dimensions to use for broadcasting the operands. The semantics are described
+in detail on the [broadcasting page](../../performance/xla/broadcasting.md).
+
+## Element-wise unary functions
+
+XlaBuilder supports these element-wise unary functions:
+
+<b>`Abs(operand)`</b> Element-wise abs `x -> |x|`.
+
+<b>`Ceil(operand)`</b> Element-wise ceil `x -> ⌈x⌉`.
+
+<b>`Cos(operand)`</b> Element-wise cosine `x -> cos(x)`.
+
+<b>`Exp(operand)`</b> Element-wise natural exponential `x -> e^x`.
+
+<b>`Floor(operand)`</b> Element-wise floor `x -> ⌊x⌋`.
+
+<b>`IsFinite(operand)`</b> Tests whether each element of `operand` is finite,
+i.e., is not positive or negative infinity, and is not `NaN`. Returns an array
+of `PRED` values with the same shape as the input, where each element is `true`
+if and only if the corresponding input element is finite.
+
+<b>`Log(operand)`</b> Element-wise natural logarithm `x -> ln(x)`.
+
+<b>`LogicalNot(operand)`</b> Element-wise logical not `x -> !(x)`.
+
+<b>`Neg(operand)`</b> Element-wise negation `x -> -x`.
+
+<b>`Sign(operand)`</b> Element-wise sign operation `x -> sgn(x)` where
+
+$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ 0 & x = 0\\ 1 & x > 0 \end{cases}$$
+
+using the comparison operator of the element type of `operand`.
+
+<b>`Tanh(operand)`</b> Element-wise hyperbolic tangent `x -> tanh(x)`.
+
+
+Arguments | Type    | Semantics
+--------- | ------- | ---------------------------
+`operand` | `XlaOp` | The operand to the function
+
+The function is applied to each element in the `operand` array, resulting in an
+array with the same shape. It is allowed for `operand` to be a scalar (rank 0).
+
+## Gather
+
+The XLA gather operation stitches together several slices (each slice at a
+potentially different runtime offset) of an input array.
+
+### General Semantics
+
+See also
+[`XlaBuilder::Gather`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+For a more intuitive description, see the "Informal Description" section below.
+
+<b> `gather(operand, start_indices, offset_dims, collapsed_slice_dims, slice_sizes, start_index_map)` </b>
+
+|Arguments         | Type                    | Semantics                       |
+|----------------- | ----------------------- | --------------------------------|
+|`operand`         | `XlaOp`                 | The array we’re gathering       |
+:                  :                         : from.                           :
+|`start_indices`   | `XlaOp`                 | Array containing the starting  |
+:                  :                         : indices of the slices we gather.:
+|`index_vector_dim` | `int64`                | The dimension in                |
+:                  :                         : `start_indices` that "contains" :
+:                  :                         : the starting indices.  See      :
+:                  :                         : below for a detailed            :
+:                  :                         : description.                    :
+|`offset_dims`     | `ArraySlice<int64>`     | The set of dimensions in  the   :
+:                  :                         : output shape that offset into a :
+:                  :                         : array sliced from operand.     :
+|`slice_sizes`     | `ArraySlice<int64>`      | `slice_sizes[i]` is the bounds |
+:                  :                          : for the slice on dimension `i`.:
+|`collapsed_slice_dims` | `ArraySlice<int64>` | The set of dimensions in each  :
+|                  :                          | slice that are collapsed away. :
+|                  :                          | These dimensions must have size:
+|                  :                          | 1.                             |
+|`start_index_map` | `ArraySlice<int64>`      | A map that describes how to map|
+:                  :                          : indices in `start_indices` to  :
+:                  :                          : to legal indices into operand. :
+
+For convenience, we label dimensions in the output array not in `offset_dims`
+as `batch_dims`.
+
+The output is an array of rank `batch_dims.size` + `operand.rank` -
+`collapsed_slice_dims`.size.
+
+If `index_vector_dim` is equal to `start_indices.rank` we implicitly consider
+`start_indices` to have a trailing `1` dimension (i.e. if `start_indices` was of
+shape `[6,7]` and `index_vector_dim` is `2` then we implicitly consider the
+shape of `start_indices` to be `[6,7,1]`).
+
+The bounds for the output array along dimension `i` is computed as follows:
+
+  1. If `i` is present in `batch_dims` (i.e. is equal to `batch_dims[k]` for
+     some `k`) then we pick the corresponding dimension bounds out of
+     `start_indices.shape`, skipping `index_vector_dim` (i.e. pick
+     `start_indices.shape.dims`[`k`] if `k` < `index_vector_dim` and
+     `start_indices.shape.dims`[`k`+`1`] otherwise).
+
+  2. If `i` is present in `offset_dims` (i.e. equal to `offset_dims`[`k`] for
+     some `k`) then we pick the corresponding bound out of `slice_sizes` after
+     accounting for `collapsed_slice_dims` (i.e. we pick
+     `adjusted_slice_sizes`[`k`] where `adjusted_slice_sizes` is `slice_sizes`
+     with the bounds at indices `collapsed_slice_dims` removed).
+
+Formally, the operand index `In` corresponding to an output index `Out` is
+computed as follows:
+
+  1. Let `G` = { `Out`[`k`] for `k` in `batch_dims` }.  Use `G` to slice out
+     vector `S` such that `S`[`i`] = `start_indices`[Combine(`G`, `i`)] where
+     Combine(A, b) inserts b at position `index_vector_dim` into A.  Note that
+     this is well defined even if `G` is empty -- if `G` is empty then `S` =
+     `start_indices`.
+
+  2. Create a starting index, `S`<sub>`in`</sub>, into `operand` using `S` by
+     scattering `S` using `start_index_map`.  More precisely:
+       1. `S`<sub>`in`</sub>[`start_index_map`[`k`]] = `S`[`k`] if `k` <
+          `start_index_map.size`.
+       2. `S`<sub>`in`</sub>[`_`] = `0` otherwise.
+
+  3. Create an index `O`<sub>`in`</sub> into `operand` by scattering the indices
+     at the offset dimensions in `Out` according to the `collapsed_slice_dims`
+     set.  More precisely:
+       1. `O`<sub>`in`</sub>[`expand_offset_dims`(`k`)] =
+          `Out`[`offset_dims`[`k`]] if `k` < `offset_dims.size`
+          (`expand_offset_dims` is defined below).
+       2. `O`<sub>`in`</sub>[`_`] = `0` otherwise.
+  4. `In` is `O`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
+     addition.
+
+`expand_offset_dims` is the monotonic function with domain [`0`, `offset.size`)
+and range [`0`, `operand.rank`) \ `collapsed_slice_dims`.  So if, e.g.,
+`offset.size` is `4`, `operand.rank` is `6` and `collapsed_slice_dims` is {`0`,
+`2`} then `expand_offset_dims` is {`0`→`1`, `1`→`3`, `2`→`4`, `3`→`5`}.
+
+### Informal Description and Examples
+
+Informally, every index `Out` in the output array corresponds to an element `E`
+in the operand array, computed as follows:
+
+  - We use the batch dimensions in `Out` to look up a starting index from
+    `start_indices`.
+
+  - We use `start_index_map` to map the starting index (which may have size less
+    than operand.rank) to a "full" starting index into operand.
+
+  - We dynamic-slice out a slice with size `slice_sizes` using the full starting
+    index.
+
+  - We reshape the slice by collapsing the `collapsed_slice_dims` dimensions.
+    Since all collapsed slice dimensions have to have bound 1 this reshape is
+    always legal.
+
+  - We use the offset dimensions in `Out` to index into this slice to get the
+    input element, `E`, corresponding to output index `Out`.
+
+`index_vector_dim` is set to `start_indices.rank` - `1` in all of the
+examples that follow.  More interesting values for `index_vector_dim` does not
+change the operation fundamentally, but makes the visual representation more
+cumbersome.
+
+To get an intuition on how all of the above fits together, let's look at an
+example that gathers 5 slices of shape `[8,6]` from a `[16,11]` array.  The
+position of a slice into the `[16,11]` array can be represented as an index
+vector of shape `S64[2]`, so the set of 5 positions can be represented as a
+`S64[5,2]` array.
+
+The behavior of the gather operation can then be depicted as an index
+transformation that takes [`G`,`O`<sub>`0`</sub>,`O`<sub>`1`</sub>], an index in
+the output shape, and maps it to an element in the input array in the following
+way:
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="../../images/ops_xla_gather_0.svg">
+</div>
+
+We first select an (`X`,`Y`) vector from the gather indices array using `G`.
+The element in the output array at index
+[`G`,`O`<sub>`0`</sub>,`O`<sub>`1`</sub>] is then the element in the input
+array at index [`X`+`O`<sub>`0`</sub>,`Y`+`O`<sub>`1`</sub>].
+
+`slice_sizes` is `[8,6]`, which decides the range of W<sub>`0`</sub> and
+W<sub>`1`</sub>, and this in turn decides the bounds of the slice.
+
+This gather operation acts as a batch dynamic slice with `G` as the batch
+dimension.
+
+The gather indices may be multidimensional.  For instance, a more general
+version of the example above using a "gather indices" array of shape `[4,5,2]`
+would translate indices like this:
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="../../images/ops_xla_gather_1.svg">
+</div>
+
+Again, this acts as a batch dynamic slice `G`<sub>`0`</sub> and
+`G`<sub>`1`</sub> as the batch dimensions.  The slice size is still `[8,6]`.
+
+The gather operation in XLA generalizes the informal semantics outlined above in
+the following ways:
+
+ 1. We can configure which dimensions in the output shape are the offset
+    dimensions (dimensions containing `O`<sub>`0`</sub>, `O`<sub>`1`</sub> in
+    the last example).  The output batch dimensions (dimensions containing
+    `G`<sub>`0`</sub>, `G`<sub>`1`</sub> in the last example) are defined to be
+    the output dimensions that are not offset dimensions.
+
+ 2. The number of output offset dimensions explicitly present in the output
+    shape may be smaller than the input rank.  These "missing" dimensions, which
+    are listed explicitly as `collapsed_slice_dims`, must have a slice size of
+    `1`.  Since they have a slice size of `1` the only valid index for them is
+    `0` and eliding them does not introduce ambiguity.
+
+ 3. The slice extracted from the "Gather Indices" array ((`X`, `Y`) in the last
+    example) may have fewer elements than the input array rank, and an explicit
+    mapping dictates how the index should be expanded to have the same rank as
+    the input.
+
+As a final example, we use (2) and (3) to implement `tf.gather_nd`:
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="../../images/ops_xla_gather_2.svg">
+</div>
+
+`G`<sub>`0`</sub> and `G`<sub>`1`</sub> are used to slice out a starting index
+from the gather indices array as usual, except the starting index has only one
+element, `X`.  Similarly, there is only one output offset index with the value
+`O`<sub>`0`</sub>.  However, before being used as indices into the input array,
+these are expanded in accordance to "Gather Index Mapping" (`start_index_map` in
+the formal description) and "Offset Mapping" (`expand_offset_dims` in the formal
+description) into [`0`,`O`<sub>`0`</sub>] and [`X`,`0`] respectively, adding up
+to [`X`,`O`<sub>`0`</sub>].  In other words, the output index
+[`G`<sub>`0`</sub>,`G`<sub>`1`</sub>,`O`<sub>`0`</sub>] maps to the input index
+[`GatherIndices`[`G`<sub>`0`</sub>,`G`<sub>`1`</sub>,`0`],`X`] which gives us
+the semantics for `tf.gather_nd`.
+
+`slice_sizes` for this case is `[1,11]`.  Intuitively this means that every
+index `X` in the gather indices array picks an entire row and the result is the
+concatenation of all these rows.
+
+## GetTupleElement
+
+See also
+[`XlaBuilder::GetTupleElement`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Indexes into a tuple with a compile-time-constant value.
+
+The value must be a compile-time-constant so that shape inference can determine
+the type of the resulting value.
+
+This is analogous to `std::get<int N>(t)` in C++. Conceptually:
+
+```
+let v: f32[10] = f32[10]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+let s: s32 = 5;
+let t: (f32[10], s32) = tuple(v, s);
+let element_1: s32 = gettupleelement(t, 1);  // Inferred shape matches s32.
+```
+
+See also `tf.tuple`.
+
+## Infeed
+
+See also
+[`XlaBuilder::Infeed`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+<b> `Infeed(shape)` </b>
+
+| Argument | Type    | Semantics                                             |
+| -------- | ------- | ----------------------------------------------------- |
+| `shape`  | `Shape` | Shape of the data read from the Infeed interface. The |
+:          :         : layout field of the shape must be set to match the    :
+:          :         : layout of the data sent to the device; otherwise its  :
+:          :         : behavior is undefined.                                :
+
+Reads a single data item from the implicit Infeed streaming interface of the
+device, interpreting the data as the given shape and its layout, and returns a
+`XlaOp` of the data. Multiple Infeed operations are allowed in a
+computation, but there must be a total order among the Infeed operations. For
+example, two Infeeds in the code below have a total order since there is a
+dependency between the while loops.
+
+```
+result1 = while (condition, init = init_value) {
+  Infeed(shape)
+}
+
+result2 = while (condition, init = result1) {
+  Infeed(shape)
+}
+```
+
+Nested tuple shapes are not supported. For an empty tuple shape, the Infeed
+operation is effectively a no-op and proceeds without reading any data from the
+Infeed of the device.
+
+> Note: We plan to allow multiple Infeed operations without a total order, in
+> which case the compiler will provide information about how the Infeed
+> operations are serialized in the compiled program.
+
+## Iota
+
+<b> `Iota()` </b>
+
+Builds a constant literal on device rather than a potentially large host
+transfer.  Creates a rank 1 tensor of values starting at zero and incrementing
+by one.
+
+Arguments          | Type            | Semantics
+------------------ | --------------- | ---------------------------
+`type`             | `PrimitiveType` | type U
+`size`             | `int64`         | The number of elements in the tensor.
+
+## Map
+
+See also
+[`XlaBuilder::Map`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+<b> `Map(operands..., computation)` </b>
+
+| Arguments         | Type                   | Semantics                      |
+| ----------------- | ---------------------- | ------------------------------ |
+| `operands`        | sequence of N `XlaOp`s | N arrays of types T_0..T_{N-1} |
+| `computation`     | `XlaComputation`       | computation of type `T_0, T_1, |
+:                   :                        : ..., T_{N + M -1} -> S` with N :
+:                   :                        : parameters of type T and M of  :
+:                   :                        : arbitrary type                 :
+| `dimensions`      | `int64` array          | array of map dimensions        |
+
+Applies a scalar function over the given `operands` arrays, producing an array
+of the same dimensions where each element is the result of the mapped function
+applied to the corresponding elements in the input arrays.
+
+The mapped function is an arbitrary computation with the restriction that it has
+N inputs of scalar type `T` and a single output with type `S`. The output has
+the same dimensions as the operands except that the element type T is replaced
+with S.
+
+For example: `Map(op1, op2, op3, computation, par1)` maps `elem_out <-
+computation(elem1, elem2, elem3, par1)` at each (multi-dimensional) index in the
+input arrays to produce the output array.
+
+## Pad
+
+See also
+[`XlaBuilder::Pad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+<b> `Pad(operand, padding_value, padding_config)` </b>
+
+| Arguments        | Type            | Semantics                               |
+| ---------------- | --------------- | --------------------------------------- |
+| `operand`        | `XlaOp`         | array of type `T`                       |
+| `padding_value`  | `XlaOp`         | scalar of type `T` to fill in the added |
+:                  :                 : padding                                 :
+| `padding_config` | `PaddingConfig` | padding amount on both edges (low,      |
+:                  :                 : high) and between the elements of each  :
+:                  :                 : dimension                               :
+
+Expands the given `operand` array by padding around the array as well as between
+the elements of the array with the given `padding_value`. `padding_config`
+specifies the amount of edge padding and the interior padding for each
+dimension.
+
+`PaddingConfig` is a repeated field of `PaddingConfigDimension`, which contains
+three fields for each dimension: `edge_padding_low`, `edge_padding_high`, and
+`interior_padding`. `edge_padding_low` and `edge_padding_high` specify the
+amount of padding added at the low-end (next to index 0) and the high-end (next
+to the highest index) of each dimension respectively. The amount of edge padding
+can be negative -- the absolute value of negative padding indicates the number
+of elements to remove from the specified dimension. `interior_padding` specifies
+the amount of padding added between any two elements in each dimension. Interior
+padding occurs logically before edge padding, so in the case of negative edge
+padding elements are removed from the interior-padded operand. This operation is
+a no-op if the edge padding pairs are all (0, 0) and the interior padding values
+are all 0. The figure below shows examples of different `edge_padding` and
+`interior_padding` values for a two-dimensional array.
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="https://www.tensorflow.org/images/ops_pad.png">
+</div>
+
+## Recv
+
+See also
+[`XlaBuilder::Recv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+<b> `Recv(shape, channel_handle)` </b>
+
+| Arguments        | Type            | Semantics                            |
+| ---------------- | --------------- | ------------------------------------ |
+| `shape`          | `Shape`         | shape of the data to receive         |
+| `channel_handle` | `ChannelHandle` | unique identifier for each send/recv pair |
+
+Receives data of the given shape from a `Send` instruction in another
+computation that shares the same channel handle. Returns a
+XlaOp for the received data.
+
+The client API of `Recv` operation represents synchronous communication.
+However, the instruction is internally decomposed into 2 HLO instructions
+(`Recv` and `RecvDone`) to enable asynchronous data transfers. See also
+[`HloInstruction::CreateRecv` and `HloInstruction::CreateRecvDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h).
+
+<b>`Recv(const Shape& shape, int64 channel_id)`</b>
+
+Allocates resources required to receive data from a `Send` instruction with the
+same channel_id. Returns a context for the allocated resources, which is used
+by a following `RecvDone` instruction to wait for the completion of the data
+transfer. The context is a tuple of {receive buffer (shape), request identifier
+(U32)} and it can only be used by a `RecvDone` instruction.
+
+<b> `RecvDone(HloInstruction context)` </b>
+
+Given a context created by a `Recv` instruction, waits for the data transfer to
+complete and returns the received data.
+
+## Reduce
+
+See also
+[`XlaBuilder::Reduce`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Applies a reduction function to one or more arrays in parallel.
+
+<b> `Reduce(operands..., init_values..., computation, dimensions)` </b>
+
+Arguments     | Type                  | Semantics
+------------- | --------------------- | ---------------------------------------
+`operands`    | Sequence of N `XlaOp` | N arrays of types `T_0, ..., T_N`.
+`init_values` | Sequence of N `XlaOp` | N scalars of types `T_0, ..., T_N`.
+`computation` | `XlaComputation`      | computation of type
+              :                       : `T_0, ..., T_N, T_0, ..., T_N -> Collate(T_0, ..., T_N)`
+`dimensions`  | `int64` array         | unordered array of dimensions to reduce
+
+Where:
+* N is required to be greater or equal to 1.
+* All input arrays must have the same dimensions.
+* If `N = 1`, `Collate(T)` is `T`.
+* If `N > 1`, `Collate(T_0, ..., T_N)` is a tuple of `N` elements of type `T`.
+
+The output of the op is `Collate(Q_0, ..., Q_N)` where `Q_i` is an array of type
+`T_i`, the dimensions of which are described below.
+
+This operation reduces one or more dimensions of each input array into scalars.
+The rank of each returned array is `rank(operand) - len(dimensions)`.
+`init_value` is the initial value used for every reduction and may be inserted
+anywhere during computation by the back-end. In most cases, `init_value` is an
+identity of the reduction function (for example, 0 for addition). The applied
+`computation` is always passed the `init_value` on the left-hand side.
+
+The evaluation order of the reduction function is arbitrary and may be
+non-deterministic. Therefore, the reduction function should not be overly
+sensitive to reassociation.
+
+Some reduction functions like addition are not strictly associative for floats.
+However, if the range of the data is limited, floating-point addition is close
+enough to being associative for most practical uses. It is possible to conceive
+of some completely non-associative reductions, however, and these will produce
+incorrect or unpredictable results in XLA reductions.
+
+As an example, when reducing across one dimension in a single 1D array with
+values [10, 11, 12, 13], with reduction function `f` (this is `computation`)
+then that could be computed as
+
+`f(10, f(11, f(12, f(init_value, 13)))`
+
+but there are also many other possibilities, e.g.
+
+`f(init_value, f(f(10, f(init_value, 11)), f(f(init_value, 12), f(init_value, 13))))`
+
+The following is a rough pseudo-code example of how reduction could be
+implemented, using summation as the reduction computation with an initial value
+of 0.
+
+```python
+result_shape <- remove all dims in dimensions from operand_shape
+
+# Iterate over all elements in result_shape. The number of r's here is equal
+# to the rank of the result
+for r0 in range(result_shape[0]), r1 in range(result_shape[1]), ...:
+  # Initialize this result element
+  result[r0, r1...] <- 0
+
+  # Iterate over all the reduction dimensions
+  for d0 in range(dimensions[0]), d1 in range(dimensions[1]), ...:
+    # Increment the result element with the value of the operand's element.
+    # The index of the operand's element is constructed from all ri's and di's
+    # in the right order (by construction ri's and di's together index over the
+    # whole operand shape).
+    result[r0, r1...] += operand[ri... di]
+```
+
+Here's an example of reducing a 2D array (matrix). The shape has rank 2,
+dimension 0 of size 2 and dimension 1 of size 3:
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="https://www.tensorflow.org/images/ops_2d_matrix.png">
+</div>
+
+Results of reducing dimensions 0 or 1 with an "add" function:
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_from_2d_matrix.png">
+</div>
+
+Note that both reduction results are 1D arrays. The diagram shows one as column
+and another as row just for visual convenience.
+
+For a more complex example, here is a 3D array. Its rank is 3, dimension 0 of
+size 4, dimension 1 of size 2 and dimension 2 of size 3. For simplicity, the
+values 1 to 6 are replicated across dimension 0.
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_from_3d_matrix.png">
+</div>
+
+Similarly to the 2D example, we can reduce just one dimension. If we reduce
+dimension 0, for example, we get a rank-2 array where all values across
+dimension 0 were folded into a scalar:
+
+```text
+|  4   8  12 |
+| 16  20  24 |
+```
+
+If we reduce dimension 2, we also get a rank-2 array where all values across
+dimension 2 were folded into a scalar:
+
+```text
+| 6  15 |
+| 6  15 |
+| 6  15 |
+| 6  15 |
+```
+
+Note that the relative order between the remaining dimensions in the input is
+preserved in the output, but some dimensions may get assigned new numbers (since
+the rank changes).
+
+We can also reduce multiple dimensions. Add-reducing dimensions 0 and 1 produces
+the 1D array `| 20 28 36 |`.
+
+Reducing the 3D array over all its dimensions produces the scalar `84`.
+
+When `N > 1`, reduce function application is slightly more complex, as it is
+applied simultaneously to all inputs. For example, consider the following
+reduction function, which can be used to compute the max and the argmax of a
+a 1-D tensor in parallel:
+
+```
+f: (Float, Int, Float, Int) -> Float, Int
+f(max, argmax, value, index):
+  if value >= argmax:
+    return (value, index)
+  else:
+    return (max, argmax)
+```
+
+For 1-D Input arrays `V = Float[N], K = Int[N]`, and init values
+`I_V = Float, I_K =  Int`, the result `f_(N-1)` of reducing across the only
+input dimension is equivalent to the following recursive application:
+```
+f_0 = f(I_V, I_K, V_0, K_0)
+f_1 = f(f_0.first, f_0.second, V_1, K_1)
+...
+f_(N-1) = f(f_(N-2).first, f_(N-2).second, V_(N-1), K_(N-1))
+```
+
+Applying this reduction to an array of values, and an array of sequential
+indices (i.e. iota), will co-iterate over the arrays, and return a tuple
+containing the maximal value and the matching index.
+
+## ReducePrecision
+
+See also
+[`XlaBuilder::ReducePrecision`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Models the effect of converting floating-point values to a lower-precision
+format (such as IEEE-FP16) and back to the original format.  The number of
+exponent and mantissa bits in the lower-precision format can be specified
+arbitrarily, although all bit sizes may not be supported on all hardware
+implementations.
+
+<b> `ReducePrecision(operand, mantissa_bits, exponent_bits)` </b>
+
+Arguments       | Type    | Semantics
+--------------- | ------- | -------------------------------------------------
+`operand`       | `XlaOp` | array of floating-point type `T`.
+`exponent_bits` | `int32` | number of exponent bits in lower-precision format
+`mantissa_bits` | `int32` | number of mantissa bits in lower-precision format
+
+The result is an array of type `T`.  The input values are rounded to the nearest
+value representable with the given number of mantissa bits (using "ties to even"
+semantics), and any values that exceed the range specified by the number of
+exponent bits are clamped to positive or negative infinity.  `NaN` values are
+retained, although they may be converted to canonical `NaN` values.
+
+The lower-precision format must have at least one exponent bit (in order to
+distinguish a zero value from an infinity, since both have a zero mantissa), and
+must have a non-negative number of mantissa bits.  The number of exponent or
+mantissa bits may exceed the corresponding value for type `T`; the corresponding
+portion of the conversion is then simply a no-op.
+
+## ReduceWindow
+
+See also
+[`XlaBuilder::ReduceWindow`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Applies a reduction function to all elements in each window of the input
+multi-dimensional array, producing an output multi-dimensional array with the
+same number of elements as the number of valid positions of the window. A
+pooling layer can be expressed as a `ReduceWindow`. Similar to
+[`Reduce`](#reduce), the applied `computation` is always passed the `init_value`
+on the left-hand side.
+
+<b> `ReduceWindow(operand, init_value, computation, window_dimensions,
+window_strides, padding)` </b>
+
+| Arguments           | Type                | Semantics                        |
+| ------------------- | ------------------- | -------------------------------- |
+| `operand`           | `XlaOp`             | N dimensional array containing   |
+:                     :                     : elements of type T. This is the  :
+:                     :                     : base area on which the window is :
+:                     :                     : placed.                          :
+| `init_value`        | `XlaOp`             | Starting value for the           |
+:                     :                     : reduction. See [Reduce](#reduce) :
+:                     :                     : for details.                     :
+| `computation`       | `XlaComputation`    | Reduction function of type `T, T |
+:                     :                     : -> T`, to apply to all elements  :
+:                     :                     : in each window                   :
+| `window_dimensions` | `ArraySlice<int64>` | array of integers for window     |
+:                     :                     : dimension values                 :
+| `window_strides`    | `ArraySlice<int64>` | array of integers for window     |
+:                     :                     : stride values                    :
+| `padding`           | `Padding`           | padding type for window          |
+:                     :                     : (Padding\:\:kSame or             :
+:                     :                     : Padding\:\:kValid)               :
+
+Below code and figure shows an example of using `ReduceWindow`. Input is a
+matrix of size [4x6] and both window_dimensions and window_stride_dimensions are
+[2x3].
+
+```
+// Create a computation for the reduction (maximum).
+XlaComputation max;
+{
+  XlaBuilder builder(client_, "max");
+  auto y = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "y");
+  auto x = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "x");
+  builder.Max(y, x);
+  max = builder.Build().ConsumeValueOrDie();
+}
+
+// Create a ReduceWindow computation with the max reduction computation.
+XlaBuilder builder(client_, "reduce_window_2x3");
+auto shape = ShapeUtil::MakeShape(F32, {4, 6});
+auto input = builder.Parameter(0, shape, "input");
+builder.ReduceWindow(
+    input, *max,
+    /*init_val=*/builder.ConstantLiteral(LiteralUtil::MinValue(F32)),
+    /*window_dimensions=*/{2, 3},
+    /*window_stride_dimensions=*/{2, 3},
+    Padding::kValid);
+```
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_window.png">
+</div>
+
+Stride of 1 in a dimension specifies that the position of a window in the
+dimension is 1 element away from its adjacent window. In order to specify that
+no windows overlap with each other, window_stride_dimensions should be equal to
+window_dimensions. The figure below illustrates the use of two different stride
+values. Padding is applied to each dimension of the input and the calculations
+are the same as though the input came in with the dimensions it has after
+padding.
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:75%" src="https://www.tensorflow.org/images/ops_reduce_window_stride.png">
+</div>
+
+The evaluation order of the reduction function is arbitrary and may be
+non-deterministic. Therefore, the reduction function should not be overly
+sensitive to reassociation. See the discussion about associativity in the
+context of [`Reduce`](#reduce) for more details.
+
+## Reshape
+
+See also
+[`XlaBuilder::Reshape`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
+and the [`Collapse`](#collapse) operation.
+
+Reshapes the dimensions of an array into a new configuration.
+
+<b> `Reshape(operand, new_sizes)` </b>
+<b> `Reshape(operand, dimensions, new_sizes)` </b>
+
+Arguments    | Type           | Semantics
+------------ | -------------- | ---------------------------------------
+`operand`    | `XlaOp`        | array of type T
+`dimensions` | `int64` vector | order in which dimensions are collapsed
+`new_sizes`  | `int64` vector | vector of sizes of new dimensions
+
+Conceptually, reshape first flattens an array into a one-dimensional vector of
+data values, and then refines this vector into a new shape. The input arguments
+are an arbitrary array of type T, a compile-time-constant vector of dimension
+indices, and a compile-time-constant vector of dimension sizes for the result.
+The values in the `dimension` vector, if given, must be a permutation of all of
+T's dimensions; the default if not given is `{0, ..., rank - 1}`. The order of
+the dimensions in `dimensions` is from slowest-varying dimension (most major) to
+fastest-varying dimension (most minor) in the loop nest which collapses the
+input array into a single dimension. The `new_sizes` vector determines the size
+of the output array. The value at index 0 in `new_sizes` is the size of
+dimension 0, the value at index 1 is the size of dimension 1, and so on. The
+product of the `new_size` dimensions must equal the product of the operand's
+dimension sizes. When refining the collapsed array into the multidimensional
+array defined by `new_sizes`, the dimensions in `new_sizes` are ordered from
+slowest varying (most major) and to fastest varying (most minor).
+
+For example, let v be an array of 24 elements:
+
+```
+let v = f32[4x2x3] {{{10, 11, 12}, {15, 16, 17}},
+                    {{20, 21, 22}, {25, 26, 27}},
+                    {{30, 31, 32}, {35, 36, 37}},
+                    {{40, 41, 42}, {45, 46, 47}}};
+
+In-order collapse:
+let v012_24 = Reshape(v, {0,1,2}, {24});
+then v012_24 == f32[24] {10, 11, 12, 15, 16, 17, 20, 21, 22, 25, 26, 27,
+                         30, 31, 32, 35, 36, 37, 40, 41, 42, 45, 46, 47};
+
+let v012_83 = Reshape(v, {0,1,2}, {8,3});
+then v012_83 == f32[8x3] {{10, 11, 12}, {15, 16, 17},
+                          {20, 21, 22}, {25, 26, 27},
+                          {30, 31, 32}, {35, 36, 37},
+                          {40, 41, 42}, {45, 46, 47}};
+
+Out-of-order collapse:
+let v021_24 = Reshape(v, {1,2,0}, {24});
+then v012_24 == f32[24]  {10, 20, 30, 40, 11, 21, 31, 41, 12, 22, 32, 42,
+                          15, 25, 35, 45, 16, 26, 36, 46, 17, 27, 37, 47};
+
+let v021_83 = Reshape(v, {1,2,0}, {8,3});
+then v021_83 == f32[8x3] {{10, 20, 30}, {40, 11, 21},
+                          {31, 41, 12}, {22, 32, 42},
+                          {15, 25, 35}, {45, 16, 26},
+                          {36, 46, 17}, {27, 37, 47}};
+
+
+let v021_262 = Reshape(v, {1,2,0}, {2,6,2});
+then v021_262 == f32[2x6x2] {{{10, 20}, {30, 40},
+                              {11, 21}, {31, 41},
+                              {12, 22}, {32, 42}},
+                             {{15, 25}, {35, 45},
+                              {16, 26}, {36, 46},
+                              {17, 27}, {37, 47}}};
+```
+
+As a special case, reshape can transform a single-element array to a scalar and
+vice versa. For example,
+
+```
+Reshape(f32[1x1] {{5}}, {0,1}, {}) == 5;
+Reshape(5, {}, {1,1}) == f32[1x1] {{5}};
+```
+
+## Rev (reverse)
+
+See also
+[`XlaBuilder::Rev`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+<b>`Rev(operand, dimensions)`</b>
+
+Arguments    | Type                | Semantics
+------------ | ------------------- | ---------------------
+`operand`    | `XlaOp`             | array of type T
+`dimensions` | `ArraySlice<int64>` | dimensions to reverse
+
+Reverses the order of elements in the `operand` array along the specified
+`dimensions`, generating an output array of the same shape. Each element of the
+operand array at a multidimensional index is stored into the output array at a
+transformed index. The multidimensional index is transformed by reversing the
+index in each dimension to be reversed (i.e., if a dimension of size N is one of
+the reversing dimensions, its index i is transformed into N - 1 - i).
+
+One use for the `Rev` operation is to reverse the convolution weight array along
+the two window dimensions during the gradient computation in neural networks.
+
+## RngNormal
+
+See also
+[`XlaBuilder::RngNormal`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Constructs an output of a given shape with random numbers generated following
+the $$N(\mu, \sigma)$$ normal distribution. The parameters $$\mu$$ and
+$$\sigma$$, and output shape have to have a floating point elemental type. The
+parameters furthermore have to be scalar valued.
+
+<b>`RngNormal(mu, sigma, shape)`</b>
+
+| Arguments | Type    | Semantics                                           |
+| --------- | ------- | --------------------------------------------------- |
+| `mu`      | `XlaOp` | Scalar of type T specifying mean of generated       |
+:           :         : numbers                                   :
+| `sigma`   | `XlaOp` | Scalar of type T specifying standard deviation of   |
+:           :         : generated numbers                                   :
+| `shape`   | `Shape` | Output shape of type T                              |
+
+## RngUniform
+
+See also
+[`XlaBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Constructs an output of a given shape with random numbers generated following
+the uniform distribution over the interval $$[a,b)$$. The parameters and output
+element type have to be a boolean type, an integral type or a floating point
+types, and the types have to be consistent. The CPU and GPU backends currently
+only support F64, F32, F16, BF16, S64, U64, S32 and U32. Furthermore, the
+parameters need to be scalar valued. If $$b <= a$$ the result is
+implementation-defined.
+
+<b>`RngUniform(a, b, shape)`</b>
+
+| Arguments | Type                    | Semantics                         |
+| --------- | ----------------------- | --------------------------------- |
+| `a`       | `XlaOp`                 | Scalar of type T specifying lower |
+:           :                         : limit of interval                 :
+| `b`       | `XlaOp`                 | Scalar of type T specifying upper |
+:           :                         : limit of interval                 :
+| `shape`   | `Shape`                 | Output shape of type T            |
+
+## Scatter
+
+The XLA scatter operation generates a result which is the value of the input
+tensor `operand`, with several slices (at indices specified by
+`scatter_indices`) updated with the values in `updates` using
+`update_computation`.
+
+See also
+[`XlaBuilder::Scatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+<b> `scatter(operand, scatter_indices, updates, update_computation, index_vector_dim, update_window_dims, inserted_window_dims, scatter_dims_to_operand_dims)` </b>
+
+|Arguments         | Type                   | Semantics                        |
+|------------------|------------------------|----------------------------------|
+|`operand`         | `XlaOp`                | Tensor to be scattered into.     |
+|`scatter_indices` | `XlaOp`                | Tensor containing the starting   |
+:                  :                        : indices of the slices that must  :
+:                  :                        : be scattered to.                 :
+|`updates`         | `XlaOp`                | Tensor containing the values that|
+:                  :                        : must be used for scattering.     :
+|`update_computation`| `XlaComputation`     | Computation to be used for       |
+:                  :                        : combining the existing values in :
+:                  :                        : the input tensor and the updates :
+:                  :                        : during scatter. This computation :
+:                  :                        : should be of type `T, T -> T`.   :
+|`index_vector_dim`| `int64`                | The dimension in                 |
+:                  :                        : `scatter_indices` that contains  :
+:                  :                        : the starting indices.            :
+|`update_window_dims`| `ArraySlice<int64>`  | The set of dimensions in         |
+:                  :                        : `updates` shape that are _window :
+:                  :                        : dimensions_.                     :
+|`inserted_window_dims`| `ArraySlice<int64>`| The set of _window dimensions_   |
+:                  :                        : that must be inserted into       :
+:                  :                        : `updates` shape.                 :
+|`scatter_dims_to_operand_dims`| `ArraySlice<int64>`  | A dimensions map from  |
+:                  :                        : the scatter indices to the       :
+:                  :                        : operand index space. This array  :
+:                  :                        : is interpreted as mapping `i` to :
+:                  :                        : `scatter_dims_to_operand_dims[i]`:
+:                  :                        : . It has to be one-to-one and    :
+:                  :                        : total.                           :
+
+If `index_vector_dim` is equal to `scatter_indices.rank` we implicitly consider
+`scatter_indices` to have a trailing `1` dimension.
+
+We define `update_scatter_dims` of type `ArraySlice<int64>` as the set of
+dimensions in `updates` shape that are not in `update_window_dims`, in ascending
+order.
+
+The arguments of scatter should follow these constraints:
+
+  - `updates` tensor must be of rank `update_window_dims.size +
+  scatter_indices.rank - 1`.
+
+  - Bounds of dimension `i` in `updates` must conform to the following:
+      - If `i` is present in `update_window_dims` (i.e. equal to
+        `update_window_dims`[`k`] for some `k`), then the bound of dimension
+        `i` in `updates` must not exceed the corresponding bound of `operand`
+        after accounting for the `inserted_window_dims` (i.e.
+        `adjusted_window_bounds`[`k`], where `adjusted_window_bounds` contains
+        the bounds of `operand` with the bounds at indices
+        `inserted_window_dims` removed).
+      - If `i` is present in `update_scatter_dims` (i.e. equal to
+        `update_scatter_dims`[`k`] for some `k`), then the bound of dimension
+        `i` in `updates` must be equal to the corresponding bound of
+        `scatter_indices`, skipping `index_vector_dim` (i.e.
+        `scatter_indices.shape.dims`[`k`], if `k` < `index_vector_dim` and
+        `scatter_indices.shape.dims`[`k+1`] otherwise).
+
+  - `update_window_dims` must be in ascending order, not have any repeating
+    dimension numbers, and be in the range `[0, updates.rank)`.
+
+  - `inserted_window_dims` must be in ascending order, not have any
+    repeating dimension numbers, and be in the range `[0, operand.rank)`.
+
+  - `scatter_dims_to_operand_dims.size` must be equal to
+    `scatter_indices`[`index_vector_dim`], and its values must be in the range
+    `[0, operand.rank)`.
+
+For a given index `U` in the `updates` tensor, the corresponding index `I` in
+the `operand` tensor into which this update has to be applied is computed as
+follows:
+
+  1. Let `G` = { `U`[`k`] for `k` in `update_scatter_dims` }. Use `G` to look up
+     an index vector `S` in the `scatter_indices` tensor such that `S`[`i`] =
+     `scatter_indices`[Combine(`G`, `i`)] where Combine(A, b) inserts b at
+     positions `index_vector_dim` into A.
+  2. Create an index `S`<sub>`in`</sub> into `operand` using `S` by scattering
+     `S` using the `scatter_dims_to_operand_dims` map. More formally:
+       1. `S`<sub>`in`</sub>[`scatter_dims_to_operand_dims`[`k`]] = `S`[`k`] if
+          `k` < `scatter_dims_to_operand_dims.size`.
+       2. `S`<sub>`in`</sub>[`_`] = `0` otherwise.
+  3. Create an index `W`<sub>`in`</sub> into `operand` by scattering the indices
+     at `update_window_dims` in `U` according to `inserted_window_dims`.
+     More formally:
+       1. `W`<sub>`in`</sub>[`window_dims_to_operand_dims`(`k`)] = `U`[`k`] if
+          `k` < `update_window_dims.size`, where `window_dims_to_operand_dims`
+          is the monotonic function with domain [`0`, `update_window_dims.size`)
+          and range [`0`, `operand.rank`) \\ `inserted_window_dims`. (For
+          example, if `update_window_dims.size` is `4`, `operand.rank` is `6`,
+          and `inserted_window_dims` is {`0`, `2`} then
+          `window_dims_to_operand_dims` is {`0`→`1`, `1`→`3`, `2`→`4`,
+          `3`→`5`}).
+       2. `W`<sub>`in`</sub>[`_`] = `0` otherwise.
+  4. `I` is `W`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
+     addition.
+
+In summary, the scatter operation can be defined as follows.
+
+   - Initialize `output` with `operand`, i.e. for all indices `O` in the
+     `operand` tensor:\
+       `output`[`O`] = `operand`[`O`]
+   - For every index `U` in the `updates` tensor and the corresponding index `O`
+     in the `operand` tensor:\
+       `output`[`O`] = `update_computation`(`output`[`O`], `updates`[`U`])
+
+The order in which updates are applied is non-deterministic. So, when multiple
+indices in `updates` refer to the same index in `operand`, the corresponding
+value in `output` will be non-deterministic.
+
+Note that the first parameter that is passed into the `update_computation` will
+always be the current value from the `output` tensor and the second parameter
+will always be the value from the `updates` tensor. This is important
+specifically for cases when the `update_computation` is _not commutative_.
+
+Informally, the scatter op can be viewed as an _inverse_ of the gather op, i.e.
+the scatter op updates the elements in the input that are extracted by the
+corresponding gather op.
+
+For a detailed informal description and examples, refer to the
+"Informal Description" section under `Gather`.
+
+## Select
+
+See also
+[`XlaBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Constructs an output array from elements of two input arrays, based on the
+values of a predicate array.
+
+<b> `Select(pred, on_true, on_false)` </b>
+
+Arguments  | Type    | Semantics
+---------- | ------- | ------------------
+`pred`     | `XlaOp` | array of type PRED
+`on_true`  | `XlaOp` | array of type T
+`on_false` | `XlaOp` | array of type T
+
+The arrays `on_true` and `on_false` must have the same shape. This is also the
+shape of the output array. The array `pred` must have the same dimensionality as
+`on_true` and `on_false`, with the `PRED` element type.
+
+For each element `P` of `pred`, the corresponding element of the output array is
+taken from `on_true` if the value of `P` is `true`, and from `on_false` if the
+value of `P` is `false`. As a restricted form of [broadcasting]
+(broadcasting.md), `pred` can be a scalar of type `PRED`. In this case, the
+output array is taken wholly from `on_true` if `pred` is `true`, and from
+`on_false` if `pred` is `false`.
+
+Example with non-scalar `pred`:
+
+```
+let pred: PRED[4] = {true, false, false, true};
+let v1: s32[4] = {1, 2, 3, 4};
+let v2: s32[4] = {100, 200, 300, 400};
+==>
+Select(pred, v1, v2) = s32[4]{1, 200, 300, 4};
+```
+
+Example with scalar `pred`:
+
+```
+let pred: PRED = true;
+let v1: s32[4] = {1, 2, 3, 4};
+let v2: s32[4] = {100, 200, 300, 400};
+==>
+Select(pred, v1, v2) = s32[4]{1, 2, 3, 4};
+```
+
+Selections between tuples are supported. Tuples are considered to be scalar
+types for this purpose. If `on_true` and `on_false` are tuples (which must have
+the same shape!) then `pred` has to be a scalar of type `PRED`.
+
+## SelectAndScatter
+
+See also
+[`XlaBuilder::SelectAndScatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+This operation can be considered as a composite operation that first computes
+`ReduceWindow` on the `operand` array to select an element from each window, and
+then scatters the `source` array to the indices of the selected elements to
+construct an output array with the same shape as the operand array. The binary
+`select` function is used to select an element from each window by applying it
+across each window, and it is called with the property that the first
+parameter's index vector is lexicographically less than the second parameter's
+index vector. The `select` function returns `true` if the first parameter is
+selected and returns `false` if the second parameter is selected, and the
+function must hold transitivity (i.e., if `select(a, b)` and `select(b, c)` are
+`true`, then `select(a, c)` is also `true`) so that the selected element does
+not depend on the order of the elements traversed for a given window.
+
+The function `scatter` is applied at each selected index in the output array. It
+takes two scalar parameters:
+
+1.  Current value at the selected index in the output array
+2.  The scatter value from `source` that applies to the selected index
+
+It combines the two parameters and returns a scalar value that's used to update
+the value at the selected index in the output array. Initially, all indices of
+the output array are set to `init_value`.
+
+The output array has the same shape as the `operand` array and the `source`
+array must have the same shape as the result of applying a `ReduceWindow`
+operation on the `operand` array. `SelectAndScatter` can be used to
+backpropagate the gradient values for a pooling layer in a neural network.
+
+<b>`SelectAndScatter(operand, select, window_dimensions, window_strides,
+padding, source, init_value, scatter)`</b>
+
+| Arguments           | Type                | Semantics                        |
+| ------------------- | ------------------- | -------------------------------- |
+| `operand`           | `XlaOp`             | array of type T over which the   |
+:                     :                     : windows slide                    :
+| `select`            | `XlaComputation`    | binary computation of type `T, T |
+:                     :                     : -> PRED`, to apply to all        :
+:                     :                     : elements in each window; returns :
+:                     :                     : `true` if the first parameter is :
+:                     :                     : selected and returns `false` if  :
+:                     :                     : the second parameter is selected :
+| `window_dimensions` | `ArraySlice<int64>` | array of integers for window     |
+:                     :                     : dimension values                 :
+| `window_strides`    | `ArraySlice<int64>` | array of integers for window     |
+:                     :                     : stride values                    :
+| `padding`           | `Padding`           | padding type for window          |
+:                     :                     : (Padding\:\:kSame or             :
+:                     :                     : Padding\:\:kValid)               :
+| `source`            | `XlaOp`             | array of type T with the values  |
+:                     :                     : to scatter                       :
+| `init_value`        | `XlaOp`             | scalar value of type T for the   |
+:                     :                     : initial value of the output      :
+:                     :                     : array                            :
+| `scatter`           | `XlaComputation`    | binary computation of type `T, T |
+:                     :                     : -> T`, to apply each scatter     :
+:                     :                     : source element with its          :
+:                     :                     : destination element              :
+
+The figure below shows examples of using `SelectAndScatter`, with the `select`
+function computing the maximal value among its parameters. Note that when the
+windows overlap, as in the figure (2) below, an index of the `operand` array may
+be selected multiple times by different windows. In the figure, the element of
+value 9 is selected by both of the top windows (blue and red) and the binary
+addition `scatter` function produces the output element of value 8 (2 + 6).
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%"
+    src="https://www.tensorflow.org/images/ops_scatter_to_selected_window_element.png">
+</div>
+
+The evaluation order of the `scatter` function is arbitrary and may be
+non-deterministic. Therefore, the `scatter` function should not be overly
+sensitive to reassociation. See the discussion about associativity in the
+context of [`Reduce`](#reduce) for more details.
+
+## Send
+
+See also
+[`XlaBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+<b> `Send(operand, channel_handle)` </b>
+
+Arguments        | Type            | Semantics
+---------------- | --------------- | -----------------------------------------
+`operand`        | `XlaOp`         | data to send (array of type T)
+`channel_handle` | `ChannelHandle` | unique identifier for each send/recv pair
+
+Sends the given operand data to a `Recv` instruction in another computation
+that shares the same channel handle. Does not return any data.
+
+Similar to the `Recv` operation, the client API of `Send` operation represents
+synchronous communication, and is internally decomposed into 2 HLO instructions
+(`Send` and `SendDone`) to enable asynchronous data transfers. See also
+[`HloInstruction::CreateSend` and `HloInstruction::CreateSendDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h).
+
+<b>`Send(HloInstruction operand, int64 channel_id)`</b>
+
+Initiates an asynchronous transfer of the operand to the resources allocated by
+the `Recv` instruction with the same channel id. Returns a context, which is
+used by a following `SendDone` instruction to wait for the completion of the
+data transfer. The context is a tuple of {operand (shape), request identifier
+(U32)} and it can only be used by a `SendDone` instruction.
+
+<b> `SendDone(HloInstruction context)` </b>
+
+Given a context created by a `Send` instruction, waits for the data transfer to
+complete.  The instruction does not return any data.
+
+<b> Scheduling of channel instructions </b>
+
+The execution order of the 4 instructions for each channel (`Recv`, `RecvDone`,
+`Send`, `SendDone`) is as below.
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:70%" src="../../images/send_recv_order.png">
+</div>
+
+* `Recv` happens before `Send`
+* `Send` happens before `RecvDone`
+* `Recv` happens before `RecvDone`
+* `Send` happens before `SendDone`
+
+When the backend compilers generate a linear schedule for each computation that
+communicates via channel instructions, there must not be cycles across the
+computations. For example, below schedules lead to deadlocks.
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="../../images/send_recv_schedule.png">
+</div>
+
+## Slice
+
+See also
+[`XlaBuilder::Slice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Slicing extracts a sub-array from the input array. The sub-array is of the same
+rank as the input and contains the values inside a bounding box within the input
+array where the dimensions and indices of the bounding box are given as
+arguments to the slice operation.
+
+<b> `Slice(operand, start_indices, limit_indices)` </b>
+
+| Arguments       | Type                | Semantics                            |
+| --------------- | ------------------- | ------------------------------------ |
+| `operand`       | `XlaOp`             | N dimensional array of type T        |
+| `start_indices` | `ArraySlice<int64>` | List of N integers containing the    |
+:                 :                     : starting indices of the slice for    :
+:                 :                     : each dimension. Values must be       :
+:                 :                     : greater than or equal to zero.       :
+| `limit_indices` | `ArraySlice<int64>` | List of N integers containing the    |
+:                 :                     : ending indices (exclusive) for the   :
+:                 :                     : slice for each dimension. Each value :
+:                 :                     : must be greater than or equal to the :
+:                 :                     : respective `start_indices` value for :
+:                 :                     : the dimension and less than or equal :
+:                 :                     : to the size of the dimension.        :
+
+1-dimensional example:
+
+```
+let a = {0.0, 1.0, 2.0, 3.0, 4.0}
+Slice(a, {2}, {4}) produces:
+  {2.0, 3.0}
+```
+
+2-dimensional example:
+
+```
+let b =
+ { {0.0,  1.0,  2.0},
+   {3.0,  4.0,  5.0},
+   {6.0,  7.0,  8.0},
+   {9.0, 10.0, 11.0} }
+
+Slice(b, {2, 1}, {4, 3}) produces:
+  { { 7.0,  8.0},
+    {10.0, 11.0} }
+```
+
+## Sort
+
+See also
+[`XlaBuilder::Sort`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+There are two versions of the Sort instruction: a single-operand and a
+two-operand version.
+
+<b>`Sort(operand)`</b>
+
+Arguments   | Type    | Semantics
+----------- | ------- | --------------------
+`operand`   | `XlaOp` | The operand to sort.
+`dimension` | `int64` | The dimension along which to sort.
+
+Sorts the elements in the operand in ascending order along the provided
+dimension. For example, for a rank-2 (matrix) operand, a `dimension` value of 0
+will sort each column independently, and a `dimension` value of 1 will sort each
+row independently. If the operand's elements have floating point type, and the
+operand contains NaN elements, the order of elements in the output is
+implementation-defined.
+
+<b>`Sort(key, value)`</b>
+
+Sorts both the key and the value operands. The keys are sorted as in the
+single-operand version. The values are sorted according to the order of their
+corresponding keys. For example, if the inputs are `keys = [3, 1]` and
+`values = [42, 50]`, then the output of the sort is the tuple 
+`{[1, 3], [50, 42]}`.
+
+The sort is not guaranteed to be stable, that is, if the keys array contains
+duplicates, the order of their corresponding values may not be preserved.
+
+Arguments   | Type    | Semantics
+----------- | ------- | -------------------
+`keys`      | `XlaOp` | The sort keys.
+`values`    | `XlaOp` | The values to sort.
+`dimension` | `int64` | The dimension along which to sort.
+
+The `keys` and `values` must have the same dimensions, but may have different
+element types.
+
+## Transpose
+
+See also the `tf.reshape` operation.
+
+<b>`Transpose(operand)`</b>
+
+Arguments     | Type                | Semantics
+------------- | ------------------- | ------------------------------
+`operand`     | `XlaOp`             | The operand to transpose.
+`permutation` | `ArraySlice<int64>` | How to permute the dimensions.
+
+
+Permutes the operand dimensions with the given permutation, so
+`∀ i . 0 ≤ i < rank ⇒ input_dimensions[permutation[i]] = output_dimensions[i]`.
+
+This is the same as Reshape(operand, permutation,
+                            Permute(permutation, operand.shape.dimensions)).
+
+## Tuple
+
+See also
+[`XlaBuilder::Tuple`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+A tuple containing a variable number of data handles, each of which has its own
+shape.
+
+This is analogous to `std::tuple` in C++. Conceptually:
+
+```
+let v: f32[10] = f32[10]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+let s: s32 = 5;
+let t: (f32[10], s32) = tuple(v, s);
+```
+
+Tuples can be deconstructed (accessed) via the [`GetTupleElement`]
+(#gettupleelement) operation.
+
+## While
+
+See also
+[`XlaBuilder::While`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+<b> `While(condition, body, init)` </b>
+
+| Arguments   | Type             | Semantics                                |
+| ----------- | ---------------- | ---------------------------------------- |
+| `condition` | `XlaComputation` | XlaComputation of type `T -> PRED` which |
+:             :                  : defines the termination condition of the :
+:             :                  : loop.                                    :
+| `body`      | `XlaComputation` | XlaComputation of type `T -> T` which    |
+:             :                  : defines the body of the loop.            :
+| `init`      | `T`              | Initial value for the parameter of       |
+:             :                  : `condition` and `body`.                  :
+
+Sequentially executes the `body` until the `condition` fails. This is similar to
+a typical while loop in many other languages except for the differences and
+restrictions listed below.
+
+*   A `While` node returns a value of type `T`, which is the result from the
+    last execution of the `body`.
+*   The shape of the type `T` is statically determined and must be the same
+    across all iterations.
+
+The T parameters of the computations are initialized with the `init` value in
+the first iteration and are automatically updated to the new result from `body`
+in each subsequent iteration.
+
+One main use case of the `While` node is to implement the repeated execution of
+training in neural networks. Simplified pseudocode is shown below with a graph
+that represents the computation. The code can be found in
+[`while_test.cc`](https://www.tensorflow.org/code/tensorflow/compiler/xla/tests/while_test.cc).
+The type `T` in this example is a `Tuple` consisting of an `int32` for the
+iteration count and a `vector[10]` for the accumulator. For 1000 iterations, the
+loop keeps adding a constant vector to the accumulator.
+
+```
+// Pseudocode for the computation.
+init = {0, zero_vector[10]} // Tuple of int32 and float[10].
+result = init;
+while (result(0) < 1000) {
+  iteration = result(0) + 1;
+  new_vector = result(1) + constant_vector[10];
+  result = {iteration, new_vector};
+}
+```
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="https://www.tensorflow.org/images/ops_while.png">
+</div>
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index 2a858b4fd6..1a53f24177 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -127,7 +127,6 @@ py_test(
     name = "build_docs_test",
     size = "small",
     srcs = ["build_docs_test.py"],
-    data = ["//tensorflow/docs_src"],
     srcs_version = "PY2AND3",
     tags = [
         # No reason to run sanitizers or fastbuild for this test.
diff --git a/tensorflow/tools/docs/build_docs_test.py b/tensorflow/tools/docs/build_docs_test.py
index 0cbf8b478f..4d3bedda2d 100644
--- a/tensorflow/tools/docs/build_docs_test.py
+++ b/tensorflow/tools/docs/build_docs_test.py
@@ -30,9 +30,11 @@ from tensorflow.tools.docs import generate_lib
 
 class Flags(object):
   resource_root = resource_loader.get_root_dir_with_all_resources()
-  src_dir = os.path.join(resource_root, 'tensorflow/docs_src')
+  src_dir = os.path.join(googletest.GetTempDir(), 'input')
+  os.mkdir(src_dir)
   base_dir = os.path.join(resource_root, 'tensorflow/')
-  output_dir = googletest.GetTempDir()
+  output_dir = os.path.join(googletest.GetTempDir(), 'output')
+  os.mkdir(output_dir)
 
 
 class BuildDocsTest(googletest.TestCase):
-- 
GitLab


From df7221d84988e5f7c1cc2775d8f5f44ffdd5918b Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 1 Oct 2018 14:39:31 -0700
Subject: [PATCH 253/570] Drop external control dependencies in tfe.defun.

They shouldn't help given the automatic control dependencies, and are tricky
to capture in the general case.

PiperOrigin-RevId: 215282837
---
 tensorflow/python/eager/function.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 3b6f288fb9..f261d92d64 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -269,6 +269,15 @@ class FuncGraph(ops.Graph):
   def variables(self, var_list):
     self._weak_variables = [weakref.ref(v) for v in var_list]
 
+  def control_dependencies(self, control_inputs):
+    # Drop control dependencies to outside of the graph. TODO(b/117109273)
+    # unclear how to capture an op, not a tensor.
+    if not control_inputs:
+      return super(FuncGraph, self).control_dependencies(control_inputs)
+    return super(FuncGraph, self).control_dependencies(
+        [c for c in control_inputs
+         if getattr(c, "graph", None) is self])
+
   def create_op(
       self,
       op_type,
-- 
GitLab


From 9084e999b3caf65833f9651c72bc09eb3094eba5 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Mon, 1 Oct 2018 15:08:25 -0700
Subject: [PATCH 254/570] Don't run initialize ops if it's empty. Fixes a bug
 when using the profiler.

PiperOrigin-RevId: 215287936
---
 tensorflow/python/training/session_manager.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index 5e4749f306..cd313c2ce0 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -184,9 +184,11 @@ class SessionManager(object):
     self._target = master
     sess = session.Session(self._target, graph=self._graph, config=config)
     # TODO(jhseu): Delete once tpu.initialize_system() goes away.
-    sess.run(
+    initialize_ops = (
         distribution_strategy_context.get_distribution_strategy().initialize()
     )
+    if initialize_ops:
+      sess.run(initialize_ops)
 
     if checkpoint_dir and checkpoint_filename_with_path:
       raise ValueError("Can not provide both checkpoint_dir and "
-- 
GitLab


From c7237e6070dbf4acd1ade5a40dc676418cbd889b Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Mon, 1 Oct 2018 15:10:19 -0700
Subject: [PATCH 255/570] Don't generate backward function and delete when its
 not necessary

PiperOrigin-RevId: 215288224
---
 tensorflow/c/eager/tape.h                 | 7 +++----
 tensorflow/python/eager/pywrap_tfe_src.cc | 3 +--
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index 41b5b8ff36..5ba55a203f 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -130,7 +130,7 @@ class GradientTape {
       const string& op_type, std::vector<TapeTensor>& output_tensors,
       gtl::ArraySlice<int64> input_tensor_id,
       gtl::ArraySlice<tensorflow::DataType> input_dtypes,
-      BackwardFunction* backward_function,
+      const std::function<BackwardFunction*()>& backward_function_getter,
       const std::function<void(BackwardFunction*)>& backward_function_deleter);
 
   void DeleteTrace(int64 tensor_id);
@@ -206,10 +206,9 @@ void GradientTape<Gradient, BackwardFunction, TapeTensor>::RecordOperation(
     const string& op_type, std::vector<TapeTensor>& output_tensors,
     gtl::ArraySlice<int64> input_tensor_id,
     gtl::ArraySlice<tensorflow::DataType> input_dtypes,
-    BackwardFunction* backward_function,
+    const std::function<BackwardFunction*()>& backward_function_getter,
     const std::function<void(BackwardFunction*)>& backward_function_deleter) {
   if (!ShouldRecord(input_tensor_id, input_dtypes)) {
-    backward_function_deleter(backward_function);
     return;
   }
   std::vector<int64> ids;
@@ -229,7 +228,7 @@ void GradientTape<Gradient, BackwardFunction, TapeTensor>::RecordOperation(
     tensors.push_back(o);
   }
   op_tape_[op_id] = OpTapeEntry<BackwardFunction, TapeTensor>{
-      op_type, std::move(tensors), ids, backward_function,
+      op_type, std::move(tensors), std::move(ids), backward_function_getter(),
       backward_function_deleter};
 }
 
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 4b9f7f4100..ae1e12f9c3 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1567,9 +1567,8 @@ void TapeSetRecordOperation(
   }
 
   for (TFE_Py_Tape* tape : SafeTapeSet()) {
-    auto* function = backward_function_getter();
     tape->tape->RecordOperation(op_type_str, output_info, input_ids,
-                                input_dtypes, function,
+                                input_dtypes, backward_function_getter,
                                 backward_function_killer);
   }
 }
-- 
GitLab


From cca204f12a5838f0ffdd4a80c27d451cf61d3636 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 15:25:27 -0700
Subject: [PATCH 256/570] Added option (off by default) to enable a
 higher-performance variant of the Adam optimizer's variable update formula.

PiperOrigin-RevId: 215290881
---
 tensorflow/contrib/tpu/proto/optimization_parameters.proto | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
index a43f45554f..8529b48c15 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
@@ -62,7 +62,10 @@ message FtrlParameters {
 // (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/AdamOptimizer). If
 // use_non_lazy_adam is enabled, use_gradient_accumulation is also required in
 // order to get correct results; a warning will be printed otherwise (which may
-// change to an error in the future).
+// change to an error in the future). If use_max_with_epsilon is set, the Adam
+// variable update formula will be changed from m / (sqrt(v) + epsilon) to
+// m / max(sqrt(v), abs(epsilon)); this option improves the performance of TPU
+// training and is not expected to harm model quality.
 message AdamParameters {
   float beta1 = 3;
   float beta2 = 4;
@@ -70,6 +73,7 @@ message AdamParameters {
   float initial_m = 6;
   float initial_v = 7;
   bool use_non_lazy_adam = 8;
+  bool use_max_with_epsilon = 9;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer
-- 
GitLab


From 52574f95279d8cd5ec22cfc24668b9586e41367a Mon Sep 17 00:00:00 2001
From: Ayush Dubey <ayushd@google.com>
Date: Mon, 1 Oct 2018 15:26:59 -0700
Subject: [PATCH 257/570] Remove jemalloc build files and dead configuration
 options.

PiperOrigin-RevId: 215291195
---
 configure.py                                  |   2 -
 tensorflow/BUILD                              |  39 --
 tensorflow/contrib/cmake/CMakeLists.txt       |  11 -
 .../contrib/cmake/external/jemalloc.cmake     |  50 ---
 .../core/platform/default/build_config.bzl    |  20 +-
 tensorflow/core/platform/posix/port.cc        |  36 +-
 tensorflow/core/platform/windows/port.cc      |  51 +--
 tensorflow/tools/lib_package/BUILD            |  16 -
 tensorflow/tools/pip_package/BUILD            |   8 -
 tensorflow/workspace.bzl                      |  12 -
 third_party/jemalloc.BUILD                    | 356 ------------------
 third_party/systemlibs/jemalloc.BUILD         |  30 --
 third_party/systemlibs/syslibs_configure.bzl  |   1 -
 tools/bazel.rc                                |   1 -
 14 files changed, 11 insertions(+), 622 deletions(-)
 delete mode 100644 tensorflow/contrib/cmake/external/jemalloc.cmake
 delete mode 100644 third_party/jemalloc.BUILD
 delete mode 100644 third_party/systemlibs/jemalloc.BUILD

diff --git a/configure.py b/configure.py
index 57d9574d1f..0efa11aa41 100644
--- a/configure.py
+++ b/configure.py
@@ -1493,7 +1493,6 @@ def main():
   setup_python(environ_cp)
 
   if is_windows():
-    environ_cp['TF_NEED_JEMALLOC'] = '0'
     environ_cp['TF_NEED_OPENCL_SYCL'] = '0'
     environ_cp['TF_NEED_COMPUTECPP'] = '0'
     environ_cp['TF_NEED_OPENCL'] = '0'
@@ -1507,7 +1506,6 @@ def main():
     environ_cp['TF_SET_ANDROID_WORKSPACE'] = '0'
 
   if is_macos():
-    environ_cp['TF_NEED_JEMALLOC'] = '0'
     environ_cp['TF_NEED_TENSORRT'] = '0'
     environ_cp['TF_ENABLE_XLA'] = '0'
 
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 4876b51a6f..9b62a50452 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -203,21 +203,6 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
-# TODO(jhseu): Enable on other platforms other than Linux.
-config_setting(
-    name = "with_jemalloc_linux_x86_64",
-    define_values = {"with_jemalloc": "true"},
-    values = {"cpu": "k8"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_jemalloc_linux_ppc64le",
-    define_values = {"with_jemalloc": "true"},
-    values = {"cpu": "ppc"},
-    visibility = ["//visibility:public"],
-)
-
 config_setting(
     name = "with_default_optimizations",
     define_values = {"with_default_optimizations": "true"},
@@ -265,30 +250,6 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
-config_setting(
-    name = "with_jemalloc_linux_x86_64_dynamic",
-    define_values = {
-        "with_jemalloc": "true",
-        "framework_shared_object": "true",
-    },
-    values = {
-        "cpu": "k8",
-    },
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_jemalloc_linux_ppc64le_dynamic",
-    define_values = {
-        "with_jemalloc": "true",
-        "framework_shared_object": "true",
-    },
-    values = {
-        "cpu": "ppc",
-    },
-    visibility = ["//visibility:public"],
-)
-
 config_setting(
     name = "using_cuda_clang",
     define_values = {
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index c6d6f04168..f675c135f4 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -30,7 +30,6 @@ endif()
 
 option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
 option(tensorflow_ENABLE_HDFS_SUPPORT "Enable HDFS support" OFF)
-option(tensorflow_ENABLE_JEMALLOC_SUPPORT "Enable jemalloc support" OFF)
 option(tensorflow_BUILD_CC_EXAMPLE "Build the C++ tutorial example" ON)
 option(tensorflow_BUILD_PYTHON_BINDINGS "Build the Python bindings" ON)
 option(tensorflow_BUILD_ALL_KERNELS "Build all OpKernels" ON)
@@ -218,10 +217,6 @@ if (tensorflow_WIN_CPU_SIMD_OPTIONS)
   endif()
 endif()
 
-if (tensorflow_ENABLE_JEMALLOC_SUPPORT)
-  add_definitions(-DTENSORFLOW_USE_JEMALLOC -DJEMALLOC_EXPORT=)
-endif()
-
 # External dependencies
 include(zlib)
 include(gif)
@@ -329,12 +324,6 @@ if(tensorflow_ENABLE_GRPC_SUPPORT)
     list(APPEND tensorflow_EXTERNAL_DEPENDENCIES boringssl)
   endif()
 endif()
-if(tensorflow_ENABLE_JEMALLOC_SUPPORT)
-  include(jemalloc)
-  list(APPEND tensorflow_EXTERNAL_LIBRARIES ${jemalloc_STATIC_LIBRARIES})
-  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES jemalloc)
-  include_directories(${jemalloc_INCLUDE_DIRS})
-endif()
 if(tensorflow_ENABLE_SNAPPY_SUPPORT)
   include(snappy)
   list(APPEND tensorflow_EXTERNAL_LIBRARIES ${snappy_STATIC_LIBRARIES})
diff --git a/tensorflow/contrib/cmake/external/jemalloc.cmake b/tensorflow/contrib/cmake/external/jemalloc.cmake
deleted file mode 100644
index afadcc007d..0000000000
--- a/tensorflow/contrib/cmake/external/jemalloc.cmake
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-include (ExternalProject)
-
-set(jemalloc_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc/include)
-set(jemalloc_URL https://mirror.bazel.build/github.com/jemalloc/jemalloc-cmake/archive/jemalloc-cmake.4.3.1.tar.gz)
-set(jemalloc_HASH SHA256=f9be9a05fe906deb5c1c8ca818071a7d2e27d66fd87f5ba9a7bf3750bcedeaf0)
-set(jemalloc_BUILD ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc)
-
-if (WIN32)
-    set(jemalloc_INCLUDE_DIRS
-        ${jemalloc_INCLUDE_DIRS} 
-        ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc/include/msvc_compat
-    )
-    if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
-        set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.lib)
-    else()
-        set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/jemalloc.lib)
-    endif()
-else()
-    set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.a)
-endif()
-
-ExternalProject_Add(jemalloc
-    PREFIX jemalloc
-    URL ${jemalloc_URL}
-    URL_HASH ${jemalloc_HASH}
-    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
-    BUILD_IN_SOURCE 1
-    BUILD_BYPRODUCTS ${jemalloc_STATIC_LIBRARIES}
-    BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target jemalloc
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping install step."
-    CMAKE_CACHE_ARGS
-        -DCMAKE_BUILD_TYPE:STRING=Release
-        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-        -Dwith-jemalloc-prefix:STRING=jemalloc_
-        -Dwithout-export:BOOL=ON
-)
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 3b14757945..d884c1aa7c 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -615,11 +615,7 @@ def tf_kernel_tests_linkstatic():
 
 def tf_additional_lib_defines():
     """Additional defines needed to build TF libraries."""
-    return select({
-        "//tensorflow:with_jemalloc_linux_x86_64": ["TENSORFLOW_USE_JEMALLOC"],
-        "//tensorflow:with_jemalloc_linux_ppc64le": ["TENSORFLOW_USE_JEMALLOC"],
-        "//conditions:default": [],
-    })
+    return []
 
 def tf_additional_lib_deps():
     """Additional dependencies needed to build TF libraries."""
@@ -631,13 +627,7 @@ def tf_additional_lib_deps():
     ] + if_static(
         ["@nsync//:nsync_cpp"],
         ["@nsync//:nsync_headers"],
-    ) + select({
-        "//tensorflow:with_jemalloc_linux_x86_64_dynamic": ["@jemalloc//:jemalloc_headers"],
-        "//tensorflow:with_jemalloc_linux_ppc64le_dynamic": ["@jemalloc//:jemalloc_headers"],
-        "//tensorflow:with_jemalloc_linux_x86_64": ["@jemalloc//:jemalloc_impl"],
-        "//tensorflow:with_jemalloc_linux_ppc64le": ["@jemalloc//:jemalloc_impl"],
-        "//conditions:default": [],
-    })
+    )
 
 def tf_additional_core_deps():
     return select({
@@ -725,11 +715,7 @@ def tf_additional_binary_deps():
             "//tensorflow/stream_executor:cuda_platform",
             "//tensorflow/core/platform/default/build_config:cuda",
         ],
-    ) + select({
-        "//tensorflow:with_jemalloc_linux_x86_64": ["@jemalloc//:jemalloc_impl"],
-        "//tensorflow:with_jemalloc_linux_ppc64le": ["@jemalloc//:jemalloc_impl"],
-        "//conditions:default": [],
-    }) + [
+    ) + [
         # TODO(allenl): Split these out into their own shared objects (they are
         # here because they are shared between contrib/ op shared objects and
         # core).
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index b46b9927cd..acdd7798ea 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -13,10 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifdef TENSORFLOW_USE_JEMALLOC
-#include "jemalloc/jemalloc.h"
-#endif
-
 #include "absl/base/internal/sysinfo.h"
 
 #include "tensorflow/core/platform/cpu_info.h"
@@ -101,11 +97,7 @@ void* AlignedMalloc(size_t size, int minimum_alignment) {
   // memory aligned to at least the size of a pointer.
   const int required_alignment = sizeof(void*);
   if (minimum_alignment < required_alignment) return Malloc(size);
-#ifdef TENSORFLOW_USE_JEMALLOC
-  int err = jemalloc_posix_memalign(&ptr, minimum_alignment, size);
-#else
   int err = posix_memalign(&ptr, minimum_alignment, size);
-#endif
   if (err != 0) {
     return nullptr;
   } else {
@@ -116,29 +108,11 @@ void* AlignedMalloc(size_t size, int minimum_alignment) {
 
 void AlignedFree(void* aligned_memory) { Free(aligned_memory); }
 
-void* Malloc(size_t size) {
-#ifdef TENSORFLOW_USE_JEMALLOC
-  return jemalloc_malloc(size);
-#else
-  return malloc(size);
-#endif
-}
+void* Malloc(size_t size) { return malloc(size); }
 
-void* Realloc(void* ptr, size_t size) {
-#ifdef TENSORFLOW_USE_JEMALLOC
-  return jemalloc_realloc(ptr, size);
-#else
-  return realloc(ptr, size);
-#endif
-}
+void* Realloc(void* ptr, size_t size) { return realloc(ptr, size); }
 
-void Free(void* ptr) {
-#ifdef TENSORFLOW_USE_JEMALLOC
-  jemalloc_free(ptr);
-#else
-  free(ptr);
-#endif
-}
+void Free(void* ptr) { free(ptr); }
 
 void* NUMAMalloc(int node, size_t size, int minimum_alignment) {
   return AlignedMalloc(size, minimum_alignment);
@@ -146,9 +120,7 @@ void* NUMAMalloc(int node, size_t size, int minimum_alignment) {
 
 void NUMAFree(void* ptr, size_t size) { Free(ptr); }
 
-int NUMAGetMemAffinity(const void* addr) {
-  return kNUMANoAffinity;
-}
+int NUMAGetMemAffinity(const void* addr) { return kNUMANoAffinity; }
 
 void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
   // No-op.
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index 5375f56372..911ea1902f 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -13,10 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifdef TENSORFLOW_USE_JEMALLOC
-#include "jemalloc/jemalloc.h"
-#endif
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -70,55 +66,16 @@ void NUMASetThreadNodeAffinity(int node) {}
 int NUMAGetThreadNodeAffinity() { return kNUMANoAffinity; }
 
 void* AlignedMalloc(size_t size, int minimum_alignment) {
-#ifdef TENSORFLOW_USE_JEMALLOC
-  void* ptr = NULL;
-  // posix_memalign requires that the requested alignment be at least
-  // sizeof(void*). In this case, fall back on malloc which should return
-  // memory aligned to at least the size of a pointer.
-  const int required_alignment = sizeof(void*);
-  if (minimum_alignment < required_alignment) return Malloc(size);
-  int err = jemalloc_posix_memalign(&ptr, minimum_alignment, size);
-  if (err != 0) {
-    return NULL;
-  } else {
-    return ptr;
-  }
-#else
   return _aligned_malloc(size, minimum_alignment);
-#endif
 }
 
-void AlignedFree(void* aligned_memory) {
-#ifdef TENSORFLOW_USE_JEMALLOC
-  jemalloc_free(aligned_memory);
-#else
-  _aligned_free(aligned_memory);
-#endif
-}
+void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); }
 
-void* Malloc(size_t size) {
-#ifdef TENSORFLOW_USE_JEMALLOC
-  return jemalloc_malloc(size);
-#else
-  return malloc(size);
-#endif
-}
+void* Malloc(size_t size) { return malloc(size); }
 
-void* Realloc(void* ptr, size_t size) {
-#ifdef TENSORFLOW_USE_JEMALLOC
-  return jemalloc_realloc(ptr, size);
-#else
-  return realloc(ptr, size);
-#endif
-}
+void* Realloc(void* ptr, size_t size) { return realloc(ptr, size); }
 
-void Free(void* ptr) {
-#ifdef TENSORFLOW_USE_JEMALLOC
-  return jemalloc_free(ptr);
-#else
-  return free(ptr);
-#endif
-}
+void Free(void* ptr) { return free(ptr); }
 
 void* NUMAMalloc(int node, size_t size, int minimum_alignment) {
   return AlignedMalloc(size, minimum_alignment);
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index b9f4902639..85514b8629 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -137,14 +137,6 @@ genrule(
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
     ] + select({
-        "//tensorflow:with_jemalloc_linux_x86_64": [
-            "@jemalloc//:COPYING",
-        ],
-        "//tensorflow:with_jemalloc_linux_ppc64le": [
-            "@jemalloc//:COPYING",
-        ],
-        "//conditions:default": [],
-    }) + select({
         "//tensorflow/core/kernels:xsmm": [
             "@libxsmm_archive//:LICENSE.md",
         ],
@@ -202,14 +194,6 @@ genrule(
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
     ] + select({
-        "//tensorflow:with_jemalloc_linux_x86_64": [
-            "@jemalloc//:COPYING",
-        ],
-        "//tensorflow:with_jemalloc_linux_ppc64le": [
-            "@jemalloc//:COPYING",
-        ],
-        "//conditions:default": [],
-    }) + select({
         "//tensorflow/core/kernels:xsmm": [
             "@libxsmm_archive//:LICENSE.md",
         ],
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index c621812535..3a1c4a45d4 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -167,14 +167,6 @@ filegroup(
         "@zlib_archive//:zlib.h",
         "@org_python_pypi_backports_weakref//:LICENSE",
     ] + select({
-        "//tensorflow:with_jemalloc_linux_x86_64": [
-            "@jemalloc//:COPYING",
-        ],
-        "//tensorflow:with_jemalloc_linux_ppc64le": [
-            "@jemalloc//:COPYING",
-        ],
-        "//conditions:default": [],
-    }) + select({
         "//tensorflow/core/kernels:xsmm": [
             "@libxsmm_archive//:LICENSE.md",
         ],
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 9b4b698874..bcc89ef729 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -642,18 +642,6 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         testonly_ = True,
     )
 
-    tf_http_archive(
-        name = "jemalloc",
-        build_file = clean_dep("//third_party:jemalloc.BUILD"),
-        sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
-        strip_prefix = "jemalloc-4.4.0",
-        system_build_file = clean_dep("//third_party/systemlibs:jemalloc.BUILD"),
-        urls = [
-            "https://mirror.bazel.build/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
-            "https://github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
-        ],
-    )
-
     java_import_external(
         name = "com_google_testing_compile",
         jar_sha256 = "edc180fdcd9f740240da1a7a45673f46f59c5578d8cd3fbc912161f74b5aebb8",
diff --git a/third_party/jemalloc.BUILD b/third_party/jemalloc.BUILD
deleted file mode 100644
index 1b0829b8fe..0000000000
--- a/third_party/jemalloc.BUILD
+++ /dev/null
@@ -1,356 +0,0 @@
-# Description:
-# jemalloc - a general-purpose scalable concurrent malloc implementation
-
-licenses(["notice"])  # BSD
-
-exports_files(["COPYING"])
-
-load("@org_tensorflow//third_party:common.bzl", "template_rule")
-
-cc_library(
-    name = "jemalloc_headers",
-    hdrs = ["include/jemalloc/jemalloc.h"],
-    includes = ["include"],
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "jemalloc_impl",
-    srcs = [
-        "src/arena.c",
-        "src/atomic.c",
-        "src/base.c",
-        "src/bitmap.c",
-        "src/chunk.c",
-        "src/chunk_dss.c",
-        "src/chunk_mmap.c",
-        "src/ckh.c",
-        "src/ctl.c",
-        "src/extent.c",
-        "src/hash.c",
-        "src/huge.c",
-        "src/jemalloc.c",
-        "src/mb.c",
-        "src/mutex.c",
-        "src/nstime.c",
-        "src/pages.c",
-        "src/prng.c",
-        "src/prof.c",
-        "src/quarantine.c",
-        "src/rtree.c",
-        "src/spin.c",
-        "src/stats.c",
-        "src/tcache.c",
-        "src/tsd.c",
-        "src/util.c",
-        "src/witness.c",
-    ],
-    hdrs = [
-        "include/jemalloc/internal/arena.h",
-        "include/jemalloc/internal/assert.h",
-        "include/jemalloc/internal/atomic.h",
-        "include/jemalloc/internal/base.h",
-        "include/jemalloc/internal/bitmap.h",
-        "include/jemalloc/internal/chunk.h",
-        "include/jemalloc/internal/chunk_dss.h",
-        "include/jemalloc/internal/chunk_mmap.h",
-        "include/jemalloc/internal/ckh.h",
-        "include/jemalloc/internal/ctl.h",
-        "include/jemalloc/internal/extent.h",
-        "include/jemalloc/internal/hash.h",
-        "include/jemalloc/internal/huge.h",
-        "include/jemalloc/internal/jemalloc_internal.h",
-        "include/jemalloc/internal/jemalloc_internal_decls.h",
-        "include/jemalloc/internal/jemalloc_internal_defs.h",
-        "include/jemalloc/internal/jemalloc_internal_macros.h",
-        "include/jemalloc/internal/mb.h",
-        "include/jemalloc/internal/mutex.h",
-        "include/jemalloc/internal/nstime.h",
-        "include/jemalloc/internal/pages.h",
-        "include/jemalloc/internal/ph.h",
-        "include/jemalloc/internal/private_namespace.h",
-        "include/jemalloc/internal/prng.h",
-        "include/jemalloc/internal/prof.h",
-        "include/jemalloc/internal/ql.h",
-        "include/jemalloc/internal/qr.h",
-        "include/jemalloc/internal/quarantine.h",
-        "include/jemalloc/internal/rb.h",
-        "include/jemalloc/internal/rtree.h",
-        "include/jemalloc/internal/size_classes.h",
-        "include/jemalloc/internal/smoothstep.h",
-        "include/jemalloc/internal/spin.h",
-        "include/jemalloc/internal/stats.h",
-        "include/jemalloc/internal/tcache.h",
-        "include/jemalloc/internal/ticker.h",
-        "include/jemalloc/internal/tsd.h",
-        "include/jemalloc/internal/util.h",
-        "include/jemalloc/internal/valgrind.h",
-        "include/jemalloc/internal/witness.h",
-    ],
-    # Same flags that jemalloc uses to build.
-    copts = [
-        "-O3",
-        "-funroll-loops",
-        "-D_GNU_SOURCE",
-        "-D_REENTRANT",
-    ],
-    includes = ["include"],
-    # pthread_atfork() is called for PPC.
-    linkopts = select({
-        "@org_tensorflow//tensorflow:linux_ppc64le": [
-            "-lpthread",
-        ],
-        "@org_tensorflow//tensorflow:linux_x86_64": [
-            "-lpthread",
-        ],
-        "//conditions:default": [
-        ],
-    }),
-    visibility = ["//visibility:public"],
-    deps = [":jemalloc_headers"],
-)
-
-sh_binary(
-    name = "jemalloc_sh",
-    srcs = ["include/jemalloc/jemalloc.sh"],
-)
-
-genrule(
-    name = "jemalloc_h",
-    srcs = [
-        ":jemalloc_defs_h",
-        ":jemalloc_macros_h",
-        ":jemalloc_mangle_h",
-        ":jemalloc_protos_h",
-        ":jemalloc_rename_h",
-        ":jemalloc_typedefs_h",
-    ],
-    outs = ["include/jemalloc/jemalloc.h"],
-    cmd = "$(location :jemalloc_sh) $$(dirname $(location :jemalloc_defs_h))/../../ >$@",
-    tools = [":jemalloc_sh"],
-)
-
-# Add to this list if you want to export more symbols from jemalloc.
-genrule(
-    name = "public_symbols_txt",
-    outs = ["include/jemalloc/internal/public_symbols.txt"],
-    cmd = "\n".join([
-        "cat <<'EOF' > $@",
-        "free:jemalloc_free",
-        "malloc:jemalloc_malloc",
-        "posix_memalign:jemalloc_posix_memalign",
-        "realloc:jemalloc_realloc",
-        "EOF",
-    ]),
-)
-
-sh_binary(
-    name = "jemalloc_mangle_sh",
-    srcs = ["include/jemalloc/jemalloc_mangle.sh"],
-)
-
-genrule(
-    name = "jemalloc_mangle_h",
-    srcs = [":public_symbols_txt"],
-    outs = ["include/jemalloc/jemalloc_mangle.h"],
-    cmd = "$(location :jemalloc_mangle_sh) $(location :public_symbols_txt) je_ >$@",
-    tools = [":jemalloc_mangle_sh"],
-)
-
-sh_binary(
-    name = "jemalloc_rename_sh",
-    srcs = ["include/jemalloc/jemalloc_rename.sh"],
-)
-
-genrule(
-    name = "jemalloc_rename_h",
-    srcs = [":public_symbols_txt"],
-    outs = ["include/jemalloc/jemalloc_rename.h"],
-    cmd = "$(location :jemalloc_rename_sh) $(location :public_symbols_txt) >$@",
-    tools = [":jemalloc_rename_sh"],
-)
-
-sh_binary(
-    name = "private_namespace_sh",
-    srcs = ["include/jemalloc/internal/private_namespace.sh"],
-)
-
-genrule(
-    name = "private_namespace_h",
-    srcs = ["include/jemalloc/internal/private_symbols.txt"],
-    outs = ["include/jemalloc/internal/private_namespace.h"],
-    cmd = "$(location :private_namespace_sh) $(location include/jemalloc/internal/private_symbols.txt) >$@",
-    tools = [":private_namespace_sh"],
-)
-
-sh_binary(
-    name = "public_namespace_sh",
-    srcs = ["include/jemalloc/internal/public_namespace.sh"],
-)
-
-genrule(
-    name = "public_namespace_h",
-    srcs = [":public_symbols_txt"],
-    outs = ["include/jemalloc/internal/public_namespace.h"],
-    cmd = "$(location :public_namespace_sh) $(location :public_symbols_txt) >$@",
-    tools = [":public_namespace_sh"],
-)
-
-sh_binary(
-    name = "size_classes_sh",
-    srcs = ["include/jemalloc/internal/size_classes.sh"],
-)
-
-# Size classes for Linux x86_64 and ppc64le. Update if adding builds for other
-# architectures. See size_classes.sh for details on the arguments.
-# For default case, kept the arguments same as that of  x86_64 for now.
-genrule(
-    name = "size_classes_h",
-    outs = ["include/jemalloc/internal/size_classes.h"],
-    cmd = select({
-        "@org_tensorflow//tensorflow:linux_ppc64le": "$(location :size_classes_sh) \"3 4\" 3 16 2 >$@",
-        "@org_tensorflow//tensorflow:linux_x86_64": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
-        "//conditions:default": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
-    }),
-    tools = [":size_classes_sh"],
-)
-
-template_rule(
-    name = "jemalloc_internal_h",
-    src = "include/jemalloc/internal/jemalloc_internal.h.in",
-    out = "include/jemalloc/internal/jemalloc_internal.h",
-    substitutions = {
-        "@private_namespace@": "je_",
-        "@install_suffix@": "",
-    },
-)
-
-template_rule(
-    name = "jemalloc_internal_defs_h",
-    src = "include/jemalloc/internal/jemalloc_internal_defs.h.in",
-    out = "include/jemalloc/internal/jemalloc_internal_defs.h",
-    substitutions = {
-        "#undef JEMALLOC_PREFIX": "#define JEMALLOC_PREFIX \"jemalloc_\"",
-        "#undef JEMALLOC_CPREFIX": "#define JEMALLOC_CPREFIX \"JEMALLOC_\"",
-        "#undef JEMALLOC_PRIVATE_NAMESPACE": "#define JEMALLOC_PRIVATE_NAMESPACE je_",
-        "#undef CPU_SPINWAIT": "\n".join([
-            "#if defined(__powerpc64__) || defined(__powerpc__)",
-            "#define CPU_SPINWAIT __asm__ volatile(\"or 27,27,27\")",
-            "#else",
-            "#define CPU_SPINWAIT __asm__ volatile(\"pause\")",
-            "#endif",
-        ]),
-        "#undef JEMALLOC_HAVE_BUILTIN_CLZ": "#define JEMALLOC_HAVE_BUILTIN_CLZ",
-        "#undef JEMALLOC_USE_SYSCALL": "#define JEMALLOC_USE_SYSCALL",
-        "#undef JEMALLOC_HAVE_SECURE_GETENV": "#define JEMALLOC_HAVE_SECURE_GETENV",
-        "#undef JEMALLOC_HAVE_PTHREAD_ATFORK": "#define JEMALLOC_HAVE_PTHREAD_ATFORK",
-        "#undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE": "#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1",
-        # Newline required because of substitution conflicts.
-        "#undef JEMALLOC_HAVE_CLOCK_MONOTONIC\n": "#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1\n",
-        "#undef JEMALLOC_THREADED_INIT": "#define JEMALLOC_THREADED_INIT",
-        "#undef JEMALLOC_TLS_MODEL": "#define JEMALLOC_TLS_MODEL __attribute__((tls_model(\"initial-exec\")))",
-        "#undef JEMALLOC_CC_SILENCE": "#define JEMALLOC_CC_SILENCE",
-        "#undef JEMALLOC_STATS": "#define JEMALLOC_STATS",
-        "#undef JEMALLOC_TCACHE": "#define JEMALLOC_TCACHE",
-        "#undef JEMALLOC_DSS": "#define JEMALLOC_DSS",
-        "#undef JEMALLOC_FILL": "#define JEMALLOC_FILL",
-        "#undef LG_TINY_MIN": "#define LG_TINY_MIN 3",
-        "#undef LG_PAGE": "\n".join([
-            "#if defined(__powerpc64__) || defined(__powerpc__)",
-            "#define LG_PAGE 16",
-            "#else",
-            "#define LG_PAGE 12",
-            "#endif",
-        ]),
-        "#undef JEMALLOC_MAPS_COALESCE": "#define JEMALLOC_MAPS_COALESCE",
-        "#undef JEMALLOC_TLS": "#define JEMALLOC_TLS",
-        "#undef JEMALLOC_INTERNAL_UNREACHABLE": "#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable",
-        "#undef JEMALLOC_INTERNAL_FFSLL": "#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll",
-        # Newline required because of substitution conflicts.
-        "#undef JEMALLOC_INTERNAL_FFSL\n": "#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl\n",
-        "#undef JEMALLOC_INTERNAL_FFS\n": "#define JEMALLOC_INTERNAL_FFS __builtin_ffs\n",
-        "#undef JEMALLOC_CACHE_OBLIVIOUS": "#define JEMALLOC_CACHE_OBLIVIOUS",
-        "#undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY": "#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY",
-        "#undef JEMALLOC_HAVE_MADVISE": "#define JEMALLOC_HAVE_MADVISE",
-        "#undef JEMALLOC_PURGE_MADVISE_DONTNEED": "#define JEMALLOC_PURGE_MADVISE_DONTNEED",
-        "#undef JEMALLOC_THP": "#define JEMALLOC_THP",
-        "#undef JEMALLOC_HAS_ALLOCA_H": "#define JEMALLOC_HAS_ALLOCA_H 1",
-        # Newline required because of substitution conflicts.
-        "#undef LG_SIZEOF_INT\n": "#define LG_SIZEOF_INT 2\n",
-        "#undef LG_SIZEOF_LONG\n": "#define LG_SIZEOF_LONG 3\n",
-        "#undef LG_SIZEOF_LONG_LONG": "#define LG_SIZEOF_LONG_LONG 3",
-        "#undef LG_SIZEOF_INTMAX_T": "#define LG_SIZEOF_INTMAX_T 3",
-        "#undef JEMALLOC_GLIBC_MALLOC_HOOK": "#define JEMALLOC_GLIBC_MALLOC_HOOK",
-        "#undef JEMALLOC_GLIBC_MEMALIGN_HOOK": "#define JEMALLOC_GLIBC_MEMALIGN_HOOK",
-        "#undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP": "#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP",
-        "#undef JEMALLOC_CONFIG_MALLOC_CONF": "#define JEMALLOC_CONFIG_MALLOC_CONF \"\"",
-    },
-)
-
-template_rule(
-    name = "jemalloc_defs_h",
-    src = "include/jemalloc/jemalloc_defs.h.in",
-    out = "include/jemalloc/jemalloc_defs.h",
-    substitutions = {
-        "#undef JEMALLOC_HAVE_ATTR": "#define JEMALLOC_HAVE_ATTR",
-        "#undef JEMALLOC_HAVE_ATTR_ALLOC_SIZE": "#define JEMALLOC_HAVE_ATTR_ALLOC_SIZE",
-        "#undef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF": "#define JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF",
-        "#undef JEMALLOC_HAVE_ATTR_FORMAT_PRINTF": "#define JEMALLOC_HAVE_ATTR_FORMAT_PRINTF",
-        "#undef JEMALLOC_OVERRIDE_MEMALIGN": "#define JEMALLOC_OVERRIDE_MEMALIGN",
-        "#undef JEMALLOC_OVERRIDE_VALLOC": "#define JEMALLOC_OVERRIDE_VALLOC",
-        "#undef JEMALLOC_USABLE_SIZE_CONST": "#define JEMALLOC_USABLE_SIZE_CONST",
-        "#undef JEMALLOC_USE_CXX_THROW": "#define JEMALLOC_USE_CXX_THROW",
-        "#undef LG_SIZEOF_PTR": "#define LG_SIZEOF_PTR 3",
-    },
-)
-
-template_rule(
-    name = "jemalloc_macros_h",
-    src = "include/jemalloc/jemalloc_macros.h.in",
-    out = "include/jemalloc/jemalloc_macros.h",
-    substitutions = {
-        "@jemalloc_version@": "0.0.0",
-        "@jemalloc_version_major@": "0",
-        "@jemalloc_version_minor@": "0",
-        "@jemalloc_version_bugfix@": "0",
-        "@jemalloc_version_nrev@": "0",
-        "@jemalloc_version_gid@": "0000000000000000000000000000000000000000",
-    },
-)
-
-template_rule(
-    name = "jemalloc_protos_h",
-    src = "include/jemalloc/jemalloc_protos.h.in",
-    out = "include/jemalloc/jemalloc_protos.h",
-    substitutions = {
-        "@aligned_alloc": "aligned_alloc",
-        "@calloc": "calloc",
-        "@cbopaque": "cbopaque",
-        "@dallocx": "dallocx",
-        "@free": "free",
-        "@je": "je",
-        "@mallctl": "mallctl",
-        "@mallctlnametomib": "mallctlnametomib",
-        "@mallctlbymib": "mallctlbymib",
-        "@malloc_stats_print": "malloc_stats_print",
-        "@malloc_usable_size": "malloc_usable_size",
-        "@malloc": "malloc",
-        "@mallocx": "mallocx",
-        "@memalign": "memalign",
-        "@nallocx": "nallocx",
-        "@posix_memalign": "posix_memalign",
-        "@rallocx": "rallocx",
-        "@realloc": "realloc",
-        "@sallocx": "sallocx",
-        "@sdallocx": "sdallocx",
-        "@valloc": "valloc",
-        "@xallocx": "xallocx",
-    },
-)
-
-template_rule(
-    name = "jemalloc_typedefs_h",
-    src = "include/jemalloc/jemalloc_typedefs.h.in",
-    out = "include/jemalloc/jemalloc_typedefs.h",
-    substitutions = {},
-)
diff --git a/third_party/systemlibs/jemalloc.BUILD b/third_party/systemlibs/jemalloc.BUILD
deleted file mode 100644
index 6a48d582ba..0000000000
--- a/third_party/systemlibs/jemalloc.BUILD
+++ /dev/null
@@ -1,30 +0,0 @@
-licenses(["notice"])  # BSD
-
-filegroup(
-    name = "COPYING",
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "jemalloc_headers",
-    defines = [
-        "jemalloc_posix_memalign=posix_memalign",
-        "jemalloc_malloc=malloc",
-        "jemalloc_realloc=realloc",
-        "jemalloc_free=free",
-    ],
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "jemalloc_impl",
-    linkopts = ["-ljemalloc"],
-    defines = [
-        "jemalloc_posix_memalign=posix_memalign",
-        "jemalloc_malloc=malloc",
-        "jemalloc_realloc=realloc",
-        "jemalloc_free=free",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [":jemalloc_headers"],
-)
diff --git a/third_party/systemlibs/syslibs_configure.bzl b/third_party/systemlibs/syslibs_configure.bzl
index 8b0ab39eaf..b03d3380d7 100644
--- a/third_party/systemlibs/syslibs_configure.bzl
+++ b/third_party/systemlibs/syslibs_configure.bzl
@@ -23,7 +23,6 @@ VALID_LIBS = [
     "gast_archive",
     "gif_archive",
     "grpc",
-    "jemalloc",
     "jpeg",
     "jsoncpp_git",
     "lmdb",
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 0cd148ed87..3734fab715 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -73,7 +73,6 @@ build --define=grpc_no_ares=true
 build --spawn_strategy=standalone
 build --genrule_strategy=standalone
 build -c opt
-build --define=with_jemalloc=false
 
 # Other build flags.
 build --define=grpc_no_ares=true
-- 
GitLab


From 55d96e8ea93407da156c156702a38fd8b5d06b2a Mon Sep 17 00:00:00 2001
From: Jared Duke <jdduke@google.com>
Date: Mon, 1 Oct 2018 15:34:08 -0700
Subject: [PATCH 258/570] Fix Android builds when using
 --define=with_tflite_flex

PiperOrigin-RevId: 215292521
---
 tensorflow/contrib/lite/delegates/flex/BUILD | 6 +++---
 tensorflow/core/common_runtime/eager/BUILD   | 7 ++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/lite/delegates/flex/BUILD b/tensorflow/contrib/lite/delegates/flex/BUILD
index bf5d91899c..9dd38958e5 100644
--- a/tensorflow/contrib/lite/delegates/flex/BUILD
+++ b/tensorflow/contrib/lite/delegates/flex/BUILD
@@ -20,7 +20,7 @@ cc_library(
         "//tensorflow/contrib/lite:kernel_api",
     ] + select({
         "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib_lite_no_runtime",
+            "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
             "//tensorflow/core:framework",
@@ -60,7 +60,7 @@ cc_library(
         "//tensorflow/contrib/lite:util",
     ] + select({
         "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib_lite_no_runtime",
+            "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
             "//tensorflow/core:lib",
@@ -178,7 +178,7 @@ cc_library(
         "//tensorflow/contrib/lite:kernel_api",
     ] + select({
         "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib_lite_no_runtime",
+            "//tensorflow/core:android_tensorflow_lib",
         ],
         "//conditions:default": [
             "//tensorflow/core:lib",
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index be5f3bae3a..7b74c67c85 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -147,10 +147,11 @@ tf_cuda_library(
         "kernel_and_device.h",
     ],
     visibility = ["//tensorflow:internal"],
-    deps = select({
+    deps = [
+        "@farmhash_archive//:farmhash",
+    ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib_lite",
-            "//util/hash:farmhash_fingerprint",
         ],
         "//conditions:default": [
             "//tensorflow/core:core_cpu_lib",
@@ -219,13 +220,13 @@ tf_cuda_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":kernel_and_device",
+        "@farmhash_archive//:farmhash",
         # Only the TF_AttrType enum is required, so pull in just the C headers.
         # TODO(b/113535673): Break this dependency and avoid the C header completely.
         "//tensorflow/c:c_api_headers",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib_lite",
-            "//util/hash:farmhash_fingerprint",
         ],
         "//conditions:default": [
             "//tensorflow/core:core_cpu",
-- 
GitLab


From dc4ac1b84c9c74655f04254779516f9968a5c385 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Mon, 1 Oct 2018 15:41:29 -0700
Subject: [PATCH 259/570] Clean up the build_xla_ops to use the generated C++
 TF op wrappers.

This cleanup will make the future CL implementing lazy compilation simpler.

Includes some supporting changes:

 - Teach NewInternalScope to create a scope that doesn't do shape inference.  We
   need this because we don't have a ShapeRefiner that has been run over the
   entire graph available in the build_xla_ops pass.

 - Add a WithAssignedDevice modifier to tensorflow::Scope.

 - Make cc_op_gen write out an Operation field for nodes which may not
   necessarily have any outputs.  We already did this in most cases, but we
   weren't doing it for nodes that have possibly-empty list outputs.

 - Minor change renaming ops/xla_jit_op.cc to ops/xla_jit_ops.cc, now that we
   have more than one XLA JIT op.

PiperOrigin-RevId: 215293817
---
 tensorflow/cc/framework/cc_op_gen.cc          |  10 +-
 tensorflow/cc/framework/scope.cc              |  33 +++-
 tensorflow/cc/framework/scope.h               |   4 +
 tensorflow/cc/framework/scope_internal.h      |   5 +
 tensorflow/compiler/jit/BUILD                 |   4 +
 tensorflow/compiler/jit/build_xla_ops_pass.cc | 180 ++++++++----------
 .../compiler/jit/build_xla_ops_pass_test.cc   |  32 +++-
 .../encapsulate_xla_computations_pass_test.cc |   2 +-
 tensorflow/compiler/tf2xla/cc/BUILD           |   7 +-
 tensorflow/core/graph/node_builder.cc         |   7 +
 tensorflow/core/graph/node_builder.h          |   4 +
 11 files changed, 174 insertions(+), 114 deletions(-)

diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index a32d1b1eb5..39593370d1 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -853,11 +853,7 @@ void OpInfo::WriteClassDecl(WritableFile* h) const {
     }
   }
 
-  strings::StrAppend(&class_decl, "\n");
-
-  if (output_types.empty()) {
-    strings::StrAppend(&class_decl, "  Operation operation;\n");
-  }
+  strings::StrAppend(&class_decl, "\n  Operation operation;\n");
   for (int i = 0; i < output_types.size(); ++i) {
     strings::StrAppend(&class_decl, "  ", output_types[i], " ", output_names[i],
                        ";\n");
@@ -878,9 +874,11 @@ void OpInfo::GetOutput(string* out) const {
   string return_on_error =
       strings::StrCat("if (!", scope_str, ".ok()) return;");
 
+  strings::StrAppend(out, "  this->operation = Operation(ret);\n");
+
   // No outputs.
   if (graph_op_def.output_arg_size() == 0) {
-    strings::StrAppend(out, "  this->operation = Operation(ret);\n  return;\n");
+    strings::StrAppend(out, "  return;\n");
     return;
   }
   if (graph_op_def.output_arg_size() == 1) {
diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc
index 7f6ac4cae7..6abc9e268e 100644
--- a/tensorflow/cc/framework/scope.cc
+++ b/tensorflow/cc/framework/scope.cc
@@ -62,7 +62,7 @@ Scope::Impl::Impl(const std::shared_ptr<Graph>& graph,
       refiner_(refiner),
       scope_used_(nullptr),
       colocation_constraints_(),
-      disable_shape_inference_(false) {}
+      disable_shape_inference_(refiner_ == nullptr) {}
 
 Scope Scope::NewRootScope() {
   Graph* graph = new Graph(OpRegistry::Global());
@@ -94,6 +94,7 @@ Scope::Impl::Impl(const Scope& other, Tags::ScopeName, const string& name,
       exit_on_error_(other.impl()->exit_on_error_),
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
+      assigned_device_(other.impl()->assigned_device_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -110,6 +111,7 @@ Scope::Impl::Impl(const Scope& other, Tags::OpName, const string& name,
       exit_on_error_(other.impl()->exit_on_error_),
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
+      assigned_device_(other.impl()->assigned_device_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -132,6 +134,7 @@ Scope::Impl::Impl(const Scope& other, Tags::ControlDeps,
       exit_on_error_(other.impl()->exit_on_error_),
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
+      assigned_device_(other.impl()->assigned_device_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -163,6 +166,7 @@ Scope::Impl::Impl(const Scope& other, Tags::SingleUseScope,
       exit_on_error_(other.impl()->exit_on_error_),
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
+      assigned_device_(other.impl()->assigned_device_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -178,6 +182,7 @@ Scope::Impl::Impl(const Scope& other, Tags::ExitOnError)
       exit_on_error_(true),
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
+      assigned_device_(other.impl()->assigned_device_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -194,6 +199,7 @@ Scope::Impl::Impl(const Scope& other, Tags::KernelLabel,
       exit_on_error_(other.impl()->exit_on_error_),
       kernel_label_(kernel_label),
       device_(other.impl()->device_),
+      assigned_device_(other.impl()->assigned_device_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -210,12 +216,30 @@ Scope::Impl::Impl(const Scope& other, Tags::Colocate,
       exit_on_error_(other.impl()->exit_on_error_),
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
+      assigned_device_(other.impl()->assigned_device_),
       colocation_constraints_(
           clear_colocations
               ? std::unordered_set<string>()
               : other.impl()->GetColocationConstraints(colocate_with_op)),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
+Scope::Impl::Impl(const Scope& other, Tags::AssignedDevice,
+                  const string& assigned_device)
+    : graph_(other.impl()->graph_),
+      status_(other.impl()->status_),
+      name_map_(other.impl()->name_map_),
+      refiner_(other.impl()->refiner_),
+      scope_used_(other.impl()->scope_used_),
+      control_deps_(other.impl()->control_deps_),
+      name_(other.impl()->name_),
+      op_name_(other.impl()->op_name_),
+      exit_on_error_(other.impl()->exit_on_error_),
+      kernel_label_(other.impl()->kernel_label_),
+      device_(other.impl()->device_),
+      assigned_device_(assigned_device),
+      colocation_constraints_(other.impl()->colocation_constraints_),
+      disable_shape_inference_(other.impl()->disable_shape_inference_) {}
+
 std::unordered_set<string> Scope::Impl::GetColocationConstraints(
     const Operation& colocate_with_op) const {
   std::unordered_set<string> current_constraints(colocation_constraints_);
@@ -299,6 +323,9 @@ void Scope::UpdateBuilder(NodeBuilder* builder) const {
   if (!impl()->device_.empty()) {
     builder->Device(impl()->device_);
   }
+  if (!impl()->assigned_device_.empty()) {
+    builder->AssignedDevice(impl()->assigned_device_);
+  }
 }
 
 string Scope::Impl::GetUniqueName(const string& prefix,
@@ -394,6 +421,10 @@ Scope Scope::WithDevice(const string& device) const {
   return Scope(new Impl(*this, Impl::Tags::Device(), device));
 }
 
+Scope Scope::WithAssignedDevice(const string& assigned_device) const {
+  return Scope(new Impl(*this, Impl::Tags::AssignedDevice(), assigned_device));
+}
+
 Scope Scope::ColocateWith(const Operation& op) const {
   return Scope(new Impl(*this, Impl::Tags::Colocate(), op,
                         /* clear_colocations */ false));
diff --git a/tensorflow/cc/framework/scope.h b/tensorflow/cc/framework/scope.h
index 30c32bd44b..e307d8989b 100644
--- a/tensorflow/cc/framework/scope.h
+++ b/tensorflow/cc/framework/scope.h
@@ -133,6 +133,10 @@ class Scope {
   /// the device field set to 'device'.
   Scope WithDevice(const string& device) const;
 
+  /// Returns a new scope.  All ops created within the returned scope will have
+  /// their assigned device set to `assigned_device`.
+  Scope WithAssignedDevice(const string& assigned_device) const;
+
   /// Return a new scope. All ops created within the returned scope will be
   /// co-located on the device where op is placed.
   /// NOTE: This function is intended to be use internal libraries only for
diff --git a/tensorflow/cc/framework/scope_internal.h b/tensorflow/cc/framework/scope_internal.h
index 58adaef2e9..514e02e841 100644
--- a/tensorflow/cc/framework/scope_internal.h
+++ b/tensorflow/cc/framework/scope_internal.h
@@ -26,6 +26,8 @@ class ShapeRefiner;
 // graph, status, name_map, and refiner.
 // This is intended to enable the C API (which are used by other language
 // bindings) to create a Scope and access C++ functionality (i.e. gradients).
+//
+// Shape inference is disabled if `refiner` is nullptr.
 Scope NewInternalScope(Graph* graph, Status* status, ShapeRefiner* refiner);
 
 class Scope::Impl {
@@ -58,6 +60,7 @@ class Scope::Impl {
     enum class ExitOnError;
     enum class KernelLabel;
     enum class Colocate;
+    enum class AssignedDevice;
   };
 
   Impl(Graph* graph, Status* status, NameMap* name_map, ShapeRefiner* refiner,
@@ -74,6 +77,7 @@ class Scope::Impl {
   Impl(const Scope& other, Tags::KernelLabel, const string& kernel_label);
   Impl(const Scope& other, Tags::Colocate, const Operation& colocate_with_op,
        bool clear_colocations);
+  Impl(const Scope& other, Tags::AssignedDevice, const string& assigned_device);
 
   std::unordered_set<string> GetColocationConstraints(
       const Operation& colocate_with_op) const;
@@ -107,6 +111,7 @@ class Scope::Impl {
   const bool exit_on_error_ = false;
   const string kernel_label_ = "";
   const string device_ = "";
+  const string assigned_device_ = "";
   const std::unordered_set<string> colocation_constraints_;
 
   // If true, Scope::DoShapeInference() always returns Status:OK().
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 29b60d1dbe..f20270931f 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -385,12 +385,16 @@ cc_library(
         ":shape_inference_helpers",
         ":union_find",
         ":xla_cluster_util",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope_internal",
         "//tensorflow/compiler/jit/graphcycles",
         "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:resource_operation_table",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/tf2xla/cc:xla_jit_ops",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:core_cpu",
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc
index 9e3fd93cda..5974696b77 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc
@@ -14,8 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/jit/build_xla_ops_pass.h"
+#include "absl/algorithm/container.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope_internal.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
+#include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_ops.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -31,132 +35,108 @@ limitations under the License.
 #include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
-
-static Status BuildXlaCompileNode(
-    const string& nodename, const string& function_name,
-    const AttrValueMap& function_attr, const string& device_name,
-    const DataTypeVector& constant_dtypes, int num_resources,
-    const DataTypeVector& arg_dtypes, Graph* graph, Node** node) {
-  NodeDef def;
-  def.set_name(graph->NewName(nodename));
-  def.set_op("_XlaCompile");
-  def.set_device(device_name);
-  AddNodeAttr("Tconstants", constant_dtypes, &def);
-  AddNodeAttr("Targs", arg_dtypes, &def);
-  AddNodeAttr("Nresources", num_resources, &def);
-  NameAttrList function;
-  function.set_name(function_name);
-  *function.mutable_attr() = function_attr;
-  AddNodeAttr("function", function, &def);
-
-  Status status;
-  *node = graph->AddNode(def, &status);
-  return status;
+namespace {
+void MoveOutgoingEdges(Graph* g, Node* old_node, Node* new_node) {
+  std::vector<const Edge*> out_edges(old_node->out_edges().begin(),
+                                     old_node->out_edges().end());
+  for (const Edge* edge : out_edges) {
+    // TODO(sanjoy): This does not update NodeDef inputs.  To be able to update
+    // NodeDef inputs we first need to fix encapsulate_subgraphs_pass to fix up
+    // the NodeDef inputs to the function call nodes.
+    g->AddEdge(new_node, edge->src_output(), edge->dst(), edge->dst_input());
+    g->RemoveEdge(edge);
+  }
 }
 
-static Status BuildXlaRunNode(const string& nodename, const string& device_name,
-                              const DataTypeVector& arg_dtypes,
-                              const DataTypeVector& result_dtypes, Graph* graph,
-                              Node** node) {
-  NodeDef def;
-  def.set_name(graph->NewName(nodename));
-  def.set_op("_XlaRun");
-  def.set_device(device_name);
-  AddNodeAttr("Targs", arg_dtypes, &def);
-  AddNodeAttr("Tresults", result_dtypes, &def);
+struct XlaClusterInfo {
+  std::vector<Output> constant_inputs;
+  std::vector<Output> non_constant_inputs;
+  std::vector<Output> resource_inputs;
+  NameAttrList function;
+};
 
-  Status status;
-  *node = graph->AddNode(def, &status);
-  return status;
+Output IncomingEdgeAsOutput(const Edge* e) {
+  return Output(e->src(), e->src_output());
 }
 
-static Status GetXlaAttrs(Node* node, int* num_constant_args,
-                          int* num_resource_args, DataTypeVector* const_dtypes,
-                          DataTypeVector* arg_dtypes) {
+Status GetXlaClusterInfo(Node* n, XlaClusterInfo* result) {
+  int num_constant_inputs, num_resource_inputs;
   TF_RETURN_IF_ERROR(
-      GetNodeAttr(node->attrs(), kXlaNumConstantArgsAttr, num_constant_args));
+      GetNodeAttr(n->attrs(), kXlaNumConstantArgsAttr, &num_constant_inputs));
   TF_RETURN_IF_ERROR(
-      GetNodeAttr(node->attrs(), kXlaNumResourceArgsAttr, num_resource_args));
+      GetNodeAttr(n->attrs(), kXlaNumResourceArgsAttr, &num_resource_inputs));
 
-  if (*num_constant_args < 0 || *num_resource_args < 0 ||
-      *num_constant_args + *num_resource_args > node->num_inputs()) {
+  if (num_constant_inputs < 0 || num_resource_inputs < 0 ||
+      num_constant_inputs + num_resource_inputs > n->num_inputs()) {
     return errors::InvalidArgument(
         "Invalid number of constant/resource arguments to XLA kernel.");
   }
 
-  const int num_nonconst_args =
-      node->num_inputs() - *num_constant_args - *num_resource_args;
-
-  const DataTypeVector& input_types = node->input_types();
-  std::copy(input_types.begin(), input_types.begin() + *num_constant_args,
-            std::back_inserter(*const_dtypes));
-  std::copy(input_types.begin() + *num_constant_args,
-            input_types.begin() + *num_constant_args + num_nonconst_args,
-            std::back_inserter(*arg_dtypes));
-  return Status::OK();
-}
-
-static void CopyIncomingEdges(Graph* g, Node* old_node, Node* new_node,
-                              int prefix_to_ignore) {
-  for (const Edge* edge : old_node->in_edges()) {
-    if (edge->IsControlEdge()) {
-      g->AddControlEdge(edge->src(), new_node);
-    } else if (edge->dst_input() >= prefix_to_ignore) {
-      g->AddEdge(edge->src(), edge->src_output(), new_node,
-                 edge->dst_input() - prefix_to_ignore);
-    }
-  }
-}
+  int num_non_constant_inputs =
+      n->num_inputs() - num_constant_inputs - num_resource_inputs;
 
-static void MoveOutgoingEdges(Graph* g, Node* old_node, Node* new_node) {
-  std::vector<const Edge*> out_edges(old_node->out_edges().begin(),
-                                     old_node->out_edges().end());
-  for (const Edge* edge : out_edges) {
-    // TODO(sanjoy): This does not update NodeDef inputs.
-    g->AddEdge(new_node, edge->src_output(), edge->dst(), edge->dst_input());
-    g->RemoveEdge(edge);
-  }
-}
+  std::vector<const Edge*> input_edges_vector;
+  TF_RETURN_IF_ERROR(n->input_edges(&input_edges_vector));
+  absl::Span<const Edge*> input_edges(input_edges_vector);
 
-static Status ReplaceNodeWithXlaCompileAndRun(Graph* g, Node* n) {
-  int num_constant_args, num_resource_args;
-  DataTypeVector const_dtypes;
-  DataTypeVector arg_dtypes;
+  absl::c_transform(input_edges.subspan(0, num_constant_inputs),
+                    std::back_inserter(result->constant_inputs),
+                    IncomingEdgeAsOutput);
 
-  TF_RETURN_IF_ERROR(GetXlaAttrs(n, &num_constant_args, &num_resource_args,
-                                 &const_dtypes, &arg_dtypes));
+  absl::c_transform(
+      input_edges.subspan(num_constant_inputs, num_non_constant_inputs),
+      std::back_inserter(result->non_constant_inputs), IncomingEdgeAsOutput);
 
-  Node *compile_node, *run_node;
+  absl::c_transform(
+      input_edges.subspan(num_constant_inputs + num_non_constant_inputs,
+                          num_resource_inputs),
+      std::back_inserter(result->resource_inputs), IncomingEdgeAsOutput);
 
-  TF_RETURN_IF_ERROR(BuildXlaCompileNode(
-      n->name(), n->type_string(), n->def().attr(), n->requested_device(),
-      const_dtypes, num_resource_args, arg_dtypes, g, &compile_node));
+  result->function.set_name(n->type_string());
+  *result->function.mutable_attr() = n->def().attr();
+  return Status::OK();
+}
 
-  DataTypeVector arg_dtypes_with_resources = arg_dtypes;
-  for (int i = 0; i < num_resource_args; i++) {
-    arg_dtypes_with_resources.push_back(DT_RESOURCE);
+Status CopyIncomingControlEdges(Graph* g, Node* from, Node* to) {
+  for (const Edge* e : from->in_edges()) {
+    if (e->IsControlEdge()) {
+      g->AddControlEdge(e->src(), to);
+    }
   }
 
-  TF_RETURN_IF_ERROR(BuildXlaRunNode(n->name(), n->requested_device(),
-                                     arg_dtypes_with_resources,
-                                     n->output_types(), g, &run_node));
-
-  compile_node->set_assigned_device_name(n->assigned_device_name());
-  run_node->set_assigned_device_name(n->assigned_device_name());
+  return Status::OK();
+}
 
-  CopyIncomingEdges(g, /*old_node=*/n, /*new_node=*/compile_node,
-                    /*prefix_to_ignore=*/0);
-  CopyIncomingEdges(g, /*old_node=*/n, /*new_node=*/run_node,
-                    /*prefix_to_ignore=*/num_constant_args);
+Status ReplaceNodeWithXlaCompileAndXlaRun(Graph* g, Node* n) {
+  Status status;
+  Scope root = NewInternalScope(g, &status, /*refiner=*/nullptr)
+                   .NewSubScope(n->name())
+                   .WithDevice(n->requested_device())
+                   .WithAssignedDevice(n->assigned_device_name());
+
+  XlaClusterInfo cluster_info;
+  TF_RETURN_IF_ERROR(GetXlaClusterInfo(n, &cluster_info));
+
+  ops::_XlaCompile xla_compile(root.WithOpName("xla_compile"),
+                               /*constants=*/cluster_info.constant_inputs,
+                               /*args=*/cluster_info.non_constant_inputs,
+                               /*resources=*/cluster_info.resource_inputs,
+                               cluster_info.function);
+  TF_RETURN_IF_ERROR(
+      CopyIncomingControlEdges(g, /*from=*/n, /*to=*/xla_compile.key.node()));
 
-  // The compilation_key output.
-  g->AddEdge(compile_node, 0, run_node, n->num_inputs() - num_constant_args);
+  std::vector<Output> xla_run_args = cluster_info.non_constant_inputs;
+  absl::c_copy(cluster_info.resource_inputs, std::back_inserter(xla_run_args));
+  ops::_XlaRun xla_run(root.WithOpName("xla_run"), xla_run_args,
+                       xla_compile.key, n->output_types());
 
-  MoveOutgoingEdges(g, /*old_node=*/n, /*new_node=*/run_node);
+  MoveOutgoingEdges(g, /*old_node=*/n,
+                    /*new_node=*/xla_run.operation.node());
   g->RemoveNode(n);
 
   return Status::OK();
 }
+}  // namespace
 
 Status BuildXlaOpsPass::Run(const GraphOptimizationPassOptions& options) {
   Graph* graph = options.graph->get();
@@ -170,7 +150,7 @@ Status BuildXlaOpsPass::Run(const GraphOptimizationPassOptions& options) {
     // Only compile nodes that are marked for compilation by the
     // compilation-marking pass (via 'attr_name').
     if (IsXlaCompiledKernel(*n)) {
-      TF_RETURN_IF_ERROR(ReplaceNodeWithXlaCompileAndRun(graph, n));
+      TF_RETURN_IF_ERROR(ReplaceNodeWithXlaCompileAndXlaRun(graph, n));
     }
   }
 
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
index b7cb4506b9..9d56db7b6b 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
@@ -56,18 +56,26 @@ Status BuildXlaOps(const Scope& s, std::unique_ptr<Graph>* result) {
 }
 
 Status MakeXlaCompiledKernel(Graph* graph, const string& callee_name,
-                             const string& node_name, Node** result) {
+                             const string& node_name, int num_constant_args,
+                             int num_resource_args, Node** result) {
   NodeDef call_node;
   call_node.set_name(node_name);
   call_node.set_op(callee_name);
   AddNodeAttr(kXlaCompiledKernelAttr, true, &call_node);
-  AddNodeAttr(kXlaNumConstantArgsAttr, 0, &call_node);
-  AddNodeAttr(kXlaNumResourceArgsAttr, 0, &call_node);
+  AddNodeAttr(kXlaNumConstantArgsAttr, num_constant_args, &call_node);
+  AddNodeAttr(kXlaNumResourceArgsAttr, num_resource_args, &call_node);
   Status s;
   *result = graph->AddNode(call_node, &s);
   return s;
 }
 
+Status MakeXlaCompiledKernel(Graph* graph, const string& callee_name,
+                             const string& node_name, Node** result) {
+  return MakeXlaCompiledKernel(graph, callee_name, node_name,
+                               /*num_constant_args=*/0, /*num_resource_args=*/0,
+                               result);
+}
+
 Node* MakeWrite(const Scope& scope, const string& id) {
   Output var_handle =
       ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({}));
@@ -108,5 +116,23 @@ TEST(BuildXlaOps, ControlDepsPreserved) {
   EXPECT_THAT(write_op_new, NodeWith(CtrlDeps(NodeWith(Op("_XlaRun")))));
 }
 
+TEST(BuildXlaOps, CleanFailureOnBogusAttr) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  FunctionDefLibrary flib_def =
+      CreateFunctionDefLibWithConstFunction("cluster_0");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+  Node* call;
+  TF_ASSERT_OK(
+      MakeXlaCompiledKernel(root.graph(), "cluster_0", "C", 100, 100, &call));
+  Node* write_op = MakeWrite(root, "write");
+  root.graph()->AddControlEdge(call, write_op);
+
+  std::unique_ptr<Graph> graph;
+  Status failure_status = BuildXlaOps(root, &graph);
+  ASSERT_FALSE(failure_status.ok());
+  EXPECT_EQ(failure_status.code(), error::INVALID_ARGUMENT);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc
index 479038ac8e..22531a4ace 100644
--- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
-#include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_op.h"
+#include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_ops.h"
 #include "tensorflow/compiler/tf2xla/test_util.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/graph/graph_constructor.h"
diff --git a/tensorflow/compiler/tf2xla/cc/BUILD b/tensorflow/compiler/tf2xla/cc/BUILD
index ea8d1b3d14..adcdb6c8f7 100644
--- a/tensorflow/compiler/tf2xla/cc/BUILD
+++ b/tensorflow/compiler/tf2xla/cc/BUILD
@@ -30,14 +30,15 @@ cc_library(
 
 tf_gen_op_wrapper_cc(
     name = "xla_jit_op_gen",
-    out_ops_file = "ops/xla_jit_op",
+    include_internal_ops = 1,
+    out_ops_file = "ops/xla_jit_ops",
     deps = ["//tensorflow/compiler/jit/ops:xla_ops"],
 )
 
 cc_library(
     name = "xla_jit_ops",
-    srcs = ["ops/xla_jit_op.cc"],
-    hdrs = ["ops/xla_jit_op.h"],
+    srcs = ["ops/xla_jit_ops.cc"],
+    hdrs = ["ops/xla_jit_ops.h"],
     deps = [
         "//tensorflow/cc:const_op",
         "//tensorflow/cc:ops",
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index a446e0d136..d92874909f 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -99,6 +99,11 @@ NodeBuilder& NodeBuilder::Device(StringPiece device_spec) {
   return *this;
 }
 
+NodeBuilder& NodeBuilder::AssignedDevice(StringPiece device) {
+  assigned_device_ = string(device);
+  return *this;
+}
+
 Status NodeBuilder::Finalize(Graph* graph, Node** created_node) const {
   // In case of error, set *created_node to nullptr.
   if (created_node != nullptr) *created_node = nullptr;
@@ -115,6 +120,8 @@ Status NodeBuilder::Finalize(Graph* graph, Node** created_node) const {
   Node* node = graph->AddNode(node_def, &status);
   if (!status.ok()) return status;
 
+  node->set_assigned_device_name(assigned_device_);
+
   for (size_t i = 0; i < inputs_.size(); ++i) {
     if (inputs_[i].node != nullptr) {  // Skip back edges.
       graph->AddEdge(inputs_[i].node, inputs_[i].index, node, i);
diff --git a/tensorflow/core/graph/node_builder.h b/tensorflow/core/graph/node_builder.h
index 4727ee7b56..d576985a23 100644
--- a/tensorflow/core/graph/node_builder.h
+++ b/tensorflow/core/graph/node_builder.h
@@ -100,6 +100,9 @@ class NodeBuilder {
   // "assigned device" in the Node).
   NodeBuilder& Device(StringPiece device_spec);
 
+  // Sets the device name in the "assigned device" field in tensorflow::Node.
+  NodeBuilder& AssignedDevice(StringPiece device);
+
   // Set the value of an attr.  attr_name must match the name of one of
   // attrs defined by the Op, and value must have the corresponding type
   // (see SetAttrValue() in ../framework/attr_value_util.h for legal
@@ -141,6 +144,7 @@ class NodeBuilder {
   std::vector<NodeOut> inputs_;
   std::vector<Node*> control_inputs_;
   std::vector<string> errors_;
+  string assigned_device_;
 };
 
 // IMPLEMENTATION -------------------------------------------------------------
-- 
GitLab


From 28a5ce4cf8702a6605e13a99c861ec6f2cd75929 Mon Sep 17 00:00:00 2001
From: Tayo Oguntebi <tayo@google.com>
Date: Mon, 1 Oct 2018 15:47:52 -0700
Subject: [PATCH 260/570]   Improve error message in transpose shape inference.

PiperOrigin-RevId: 215294817
---
 tensorflow/compiler/xla/service/shape_inference.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 7194b2cafd..6ccea9d2b5 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -2380,7 +2380,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
       !std::is_permutation(dimensions.begin(), dimensions.end(),
                            indices.begin())) {
     return InvalidArgument(
-        "Transpose dimensions not a permutation of the operand dimensions.");
+        "Transpose dimensions [%s] are not a permutation of the operand "
+        "dimensions (operand shape is %s).",
+        StrJoin(dimensions, ","), ShapeUtil::HumanString(operand));
   }
 
   // Permute(dimensions,input) computes output[dimensions[i]]=input[i]. However,
-- 
GitLab


From 6509437545f8fc973b39489c285811ea8cc8b15a Mon Sep 17 00:00:00 2001
From: Zhenyu Tan <tanzheny@google.com>
Date: Mon, 1 Oct 2018 15:52:16 -0700
Subject: [PATCH 261/570] If keras_model_path is google storage url, provide
 util to download model remotely.

PiperOrigin-RevId: 215295504
---
 tensorflow/python/estimator/keras.py      | 48 ++++++++++++++++++++---
 tensorflow/python/estimator/keras_test.py |  6 ---
 2 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py
index 7546771ed3..5d5ed81fbb 100644
--- a/tensorflow/python/estimator/keras.py
+++ b/tensorflow/python/estimator/keras.py
@@ -368,6 +368,44 @@ def _save_first_checkpoint(keras_model, custom_objects, config):
   return latest_path
 
 
+def _get_file_from_google_storage(keras_model_path, model_dir):
+  """Get file from google storage and download to local file.
+
+  Args:
+    keras_model_path: a google storage path for compiled keras model.
+    model_dir: the directory from estimator config.
+
+  Returns:
+    The path where keras model is saved.
+
+  Raises:
+    ValueError: if storage object name does not end with .h5.
+  """
+  try:
+    from google.cloud import storage  # pylint:disable=g-import-not-at-top
+  except ImportError:
+    raise TypeError('Could not save model to Google cloud storage; please '
+                    'install `google-cloud-storage` via '
+                    '`pip install google-cloud-storage`.')
+  storage_client = storage.Client()
+  path, blob_name = os.path.split(keras_model_path)
+  _, bucket_name = os.path.split(path)
+  keras_model_dir = os.path.join(model_dir, 'keras')
+  if not gfile.Exists(keras_model_dir):
+    gfile.MakeDirs(keras_model_dir)
+  file_name = os.path.join(keras_model_dir, 'keras_model.h5')
+  try:
+    blob = storage_client.get_bucket(bucket_name).blob(blob_name)
+    blob.download_to_filename(file_name)
+  except:
+    raise ValueError('Failed to download keras model, please check '
+                     'environment variable GOOGLE_APPLICATION_CREDENTIALS '
+                     'and model path storage.googleapis.com/{bucket}/{object}.')
+  logging.info('Saving model to {}'.format(file_name))
+  del storage_client
+  return file_name
+
+
 def model_to_estimator(keras_model=None,
                        keras_model_path=None,
                        custom_objects=None,
@@ -407,12 +445,13 @@ def model_to_estimator(keras_model=None,
         'Please specity either `keras_model` or `keras_model_path`, '
         'but not both.')
 
+  config = estimator_lib.maybe_overwrite_model_dir_and_session_config(
+      config, model_dir)
   if not keras_model:
     if keras_model_path.startswith(
         'gs://') or 'storage.googleapis.com' in keras_model_path:
-      raise ValueError(
-          '%s is not a local path. Please copy the model locally first.' %
-          keras_model_path)
+      keras_model_path = _get_file_from_google_storage(keras_model_path,
+                                                       config.model_dir)
     logging.info('Loading models from %s', keras_model_path)
     keras_model = models.load_model(keras_model_path)
   else:
@@ -425,9 +464,6 @@ def model_to_estimator(keras_model=None,
         'Please compile the model with `model.compile()` '
         'before calling `model_to_estimator()`.')
 
-  config = estimator_lib.maybe_overwrite_model_dir_and_session_config(config,
-                                                                      model_dir)
-
   keras_model_fn = _create_keras_model_fn(keras_model, custom_objects)
   if _any_weight_initialized(keras_model):
     # Warn if config passed to estimator tries to update GPUOptions. If a
diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py
index 288f9b8906..4e285fa25a 100644
--- a/tensorflow/python/estimator/keras_test.py
+++ b/tensorflow/python/estimator/keras_test.py
@@ -581,12 +581,6 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, 'compiled'):
         keras_lib.model_to_estimator(keras_model=keras_model)
 
-    with self.cached_session():
-      keras_model = simple_sequential_model()
-      with self.assertRaisesRegexp(ValueError, 'not a local path'):
-        keras_lib.model_to_estimator(
-            keras_model_path='gs://bucket/object')
-
   def test_invalid_ionames_error(self):
     (x_train, y_train), (_, _) = testing_utils.get_test_data(
         train_samples=_TRAIN_SIZE,
-- 
GitLab


From 8559bc2c4c7616c5da8b4f7a3e1405c549a6068d Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Mon, 1 Oct 2018 15:58:21 -0700
Subject: [PATCH 262/570] Add email comment explicitly authorizing
 distributions/special_math.py be released under Apache 2.0.

PiperOrigin-RevId: 215296386
---
 .../python/ops/distributions/special_math.py  | 61 ++++++++++++++++++-
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/distributions/special_math.py b/tensorflow/python/ops/distributions/special_math.py
index 31b7a36fd3..ccc667cae3 100644
--- a/tensorflow/python/ops/distributions/special_math.py
+++ b/tensorflow/python/ops/distributions/special_math.py
@@ -12,6 +12,62 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+
+# Functions "ndtr" and "ndtri" are derived from calculations made in:
+# https://root.cern.ch/doc/v608/SpecFuncCephesInv_8cxx_source.html
+# In the following email exchange, the author gives his consent to redistribute
+# derived works under an Apache 2.0 license.
+#
+# From: Stephen Moshier <steve@moshier.net>
+# Date: Sat, Jun 9, 2018 at 2:36 PM
+# Subject: Re: Licensing cephes under Apache (BSD-like) license.
+# To: rif <rif@google.com>
+#
+#
+#
+# Hello Rif,
+#
+# Yes, Google may distribute Cephes files under the Apache 2 license.
+#
+# If clarification is needed, I do not favor BSD over other free licenses.
+# I would agree that Apache 2 seems to cover the concern you mentioned
+# about sublicensees.
+#
+# Best wishes for good luck with your projects!
+# Steve Moshier
+#
+#
+#
+# On Thu, 31 May 2018, rif wrote:
+#
+# > Hello Steve.
+# > My name is Rif. I work on machine learning software at Google.
+# >
+# > Your cephes software continues to be incredibly useful and widely used. I
+# > was wondering whether it would be permissible for us to use the Cephes code
+# > under the Apache 2.0 license, which is extremely similar in permissions to
+# > the BSD license (Wikipedia comparisons). This would be quite helpful to us
+# > in terms of avoiding multiple licenses on software.
+# >
+# > I'm sorry to bother you with this (I can imagine you're sick of hearing
+# > about this by now), but I want to be absolutely clear we're on the level and
+# > not misusing your important software. In former conversation with Eugene
+# > Brevdo (ebrevdo@google.com), you wrote "If your licensing is similar to BSD,
+# > the formal way that has been handled is simply to add a statement to the
+# > effect that you are incorporating the Cephes software by permission of the
+# > author." I wanted to confirm that (a) we could use the Apache license, (b)
+# > that we don't need to (and probably you don't want to) keep getting
+# > contacted about individual uses, because your intent is generally to allow
+# > this software to be reused under "BSD-like" license, and (c) you're OK
+# > letting incorporators decide whether a license is sufficiently BSD-like?
+# >
+# > Best,
+# >
+# > rif
+# >
+# >
+# >
+
 """Special Math Ops."""
 
 from __future__ import absolute_import
@@ -135,7 +191,7 @@ def _ndtri(p):
 
   # Constants used in piece-wise rational approximations. Taken from the cephes
   # library:
-  # https://github.com/scipy/scipy/blob/master/scipy/special/cephes/ndtri.c
+  # https://root.cern.ch/doc/v608/SpecFuncCephesInv_8cxx_source.html
   p0 = list(reversed([-5.99633501014107895267E1,
                       9.80010754185999661536E1,
                       -5.66762857469070293439E1,
@@ -305,7 +361,8 @@ def log_ndtr(x, series_order=3, name="log_ndtr"):
     else:
       raise TypeError("x.dtype=%s is not supported." % x.dtype)
 
-    # The basic idea here was ported from py/scipy/special/cephes/ndtr.c.
+    # The basic idea here was ported from:
+    #   https://root.cern.ch/doc/v608/SpecFuncCephesInv_8cxx_source.html
     # We copy the main idea, with a few changes
     # * For x >> 1, and X ~ Normal(0, 1),
     #     Log[P[X < x]] = Log[1 - P[X < -x]] approx -P[X < -x],
-- 
GitLab


From 55f561e6740d61b3665594babce4be72ad955bc6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 16:07:09 -0700
Subject: [PATCH 263/570] Small tweaks to comments and documentation strings.

PiperOrigin-RevId: 215297961
---
 tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
index 5c27d59f82..ef2f8dd36d 100644
--- a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
+++ b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
@@ -46,7 +46,7 @@ namespace tensorflow {
 // 5. TPUEmbeddingActivations, when used with appropriate Python libraries,
 //    enables the automatic differentiation of models that use embeddings.
 // 6. TPUEmbeddingSendGradients takes a list of Tensors (of the same shapes
-//    as those returned by TPUEmbeddingReceivActivations) containing gradients
+//    as those returned by TPUEmbeddingReceiveActivations) containing gradients
 //    to use in updating the embedding tables.
 // 7. Before saving a checkpoint, use the TPUEmbeddingRetrieve Op to update
 //    the Graph's embedding table Variables from the updated tables in the
@@ -147,7 +147,7 @@ parameters that are loaded from a checkpoint before a training loop is
 executed.
 %s
 table_name: Name of this table; must match a name in the
-  EmbeddingLayerConfiguration proto (overrides table_id).
+  TPUEmbeddingConfiguration proto (overrides table_id).
 num_shards: Number of shards into which the embedding tables are divided.
 shard_id: Identifier of shard for this operation.
 table_id: Index of this table in the EmbeddingLayerConfiguration proto
@@ -283,7 +283,7 @@ the correct embedding table configuration. For example, this op is
 used to retrieve updated parameters before saving a checkpoint.
 %s
 table_name: Name of this table; must match a name in the
-  EmbeddingLayerConfiguration proto (overrides table_id).
+  TPUEmbeddingConfiguration proto (overrides table_id).
 num_shards: Number of shards into which the embedding tables are divided.
 shard_id: Identifier of shard for this operation.
 table_id: Index of this table in the EmbeddingLayerConfiguration proto
-- 
GitLab


From 24333d8e55bdd995089e93122750340bf8d1ddba Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 1 Oct 2018 16:09:45 -0700
Subject: [PATCH 264/570] [TF/XLA] Optimize
 `Encapsulator::GetFunctionNameAttr()`.

The previous version was hitting a very slow path in `GetNodeAttr()`, which is expensive when the named attr is not found. This change inlines the logic of finding the two relevant attrs inside `GetFunctionNameAttr()` and avoids constructing a status object with a serialized `NodeDef` when the attr can't be found.

PiperOrigin-RevId: 215298411
---
 .../jit/encapsulate_subgraphs_pass.cc         | 43 ++++++++++---------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 15faf31077..d165341f21 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -1363,28 +1363,31 @@ void Encapsulator::Subgraph::GetOutsideCompilationSubgraphNames(
 
 Status Encapsulator::GetFunctionNameAttr(
     Node const* node, string* attr, string* outside_compilation_attr) const {
-  Status s = GetNodeAttr(node->attrs(), group_attribute_, attr);
-  if (s.code() == error::Code::NOT_FOUND) {
-    // Return empty attr if there's no group_attribute.
-    attr->clear();
-  } else {
-    TF_RETURN_IF_ERROR(s);
-  }
-  bool has_group_attr = s.ok();
-  s = GetNodeAttr(node->attrs(), outside_compilation_attribute_,
-                  outside_compilation_attr);
-  if (s.code() == error::Code::NOT_FOUND) {
-    // Return empty attr if there's no outside_compilation attribute.
-    outside_compilation_attr->clear();
-  } else {
-    TF_RETURN_IF_ERROR(s);
-    if (!has_group_attr) {
-      return errors::InvalidArgument(
-          "Node ", node->name(), " has ", outside_compilation_attribute_,
-          " attribute but no ", group_attribute_, " attribute.");
+  AttrSlice attrs = node->attrs();
+  attr->clear();
+  outside_compilation_attr->clear();
+  bool found_group_attribute = false;
+  bool found_outside_compilation_attribute = false;
+  for (const auto& node_attr : attrs) {
+    if (node_attr.first == group_attribute_) {
+      TF_RETURN_IF_ERROR(AttrValueHasType(node_attr.second, "string"));
+      *attr = node_attr.second.s();
+      found_group_attribute = true;
+    } else if (node_attr.first == outside_compilation_attribute_) {
+      TF_RETURN_IF_ERROR(AttrValueHasType(node_attr.second, "string"));
+      *outside_compilation_attr = node_attr.second.s();
+      found_outside_compilation_attribute = true;
     }
+    if (found_group_attribute && found_outside_compilation_attribute) break;
+  }
+
+  if (found_outside_compilation_attribute && !found_group_attribute) {
+    return errors::InvalidArgument(
+        "Node ", node->name(), " has ", outside_compilation_attribute_,
+        " attribute but no ", group_attribute_, " attribute.");
+  } else {
+    return Status::OK();
   }
-  return Status::OK();
 }
 
 bool IsInSubgraph(const string& func_id, const string& outside_compilation_id) {
-- 
GitLab


From 49bbfec04b729960999ef054e3acab719631b101 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 16:16:43 -0700
Subject: [PATCH 265/570] Override implementation of log survival for
 Exponential distribution to better handle small values.

PiperOrigin-RevId: 215299532
---
 .../distributions/exponential_test.py            | 16 ++++++++++++++++
 .../python/ops/distributions/exponential.py      |  3 +++
 2 files changed, 19 insertions(+)

diff --git a/tensorflow/python/kernel_tests/distributions/exponential_test.py b/tensorflow/python/kernel_tests/distributions/exponential_test.py
index 27d1291912..367f8bb0f1 100644
--- a/tensorflow/python/kernel_tests/distributions/exponential_test.py
+++ b/tensorflow/python/kernel_tests/distributions/exponential_test.py
@@ -81,6 +81,22 @@ class ExponentialTest(test.TestCase):
     expected_cdf = stats.expon.cdf(x, scale=1 / lam_v)
     self.assertAllClose(self.evaluate(cdf), expected_cdf)
 
+  def testExponentialLogSurvival(self):
+    batch_size = 7
+    lam = constant_op.constant([2.0] * batch_size)
+    lam_v = 2.0
+    x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0, 10.0], dtype=np.float32)
+
+    exponential = exponential_lib.Exponential(rate=lam)
+
+    log_survival = exponential.log_survival_function(x)
+    self.assertEqual(log_survival.get_shape(), (7,))
+
+    if not stats:
+      return
+    expected_log_survival = stats.expon.logsf(x, scale=1 / lam_v)
+    self.assertAllClose(self.evaluate(log_survival), expected_log_survival)
+
   def testExponentialMean(self):
     lam_v = np.array([1.0, 4.0, 2.5])
     exponential = exponential_lib.Exponential(rate=lam_v)
diff --git a/tensorflow/python/ops/distributions/exponential.py b/tensorflow/python/ops/distributions/exponential.py
index 4325a14449..02129b5e2a 100644
--- a/tensorflow/python/ops/distributions/exponential.py
+++ b/tensorflow/python/ops/distributions/exponential.py
@@ -114,6 +114,9 @@ class Exponential(gamma.Gamma):
   def rate(self):
     return self._rate
 
+  def _log_survival_function(self, value):
+    return self._log_prob(value) - math_ops.log(self._rate)
+
   def _sample_n(self, n, seed=None):
     shape = array_ops.concat([[n], array_ops.shape(self._rate)], 0)
     # Uniform variates must be sampled from the open-interval `(0, 1)` rather
-- 
GitLab


From bb1f9e1a57c8bc18325b3c86298be96e6647a0a3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 16:31:13 -0700
Subject: [PATCH 266/570] Change semantics of DistributionStrategy.update() to
 make sure the output depends on the updates across all mirrors. Before this
 change, update() would return a Mirrored value that where each component was
 an update to a single mirror. This caused a problem since for reading
 purposes other DistributionStrategy methods would consider it okay to read
 any single component, and so if you for example did something like
 session.run(strategy.update(...)) it would only perform the update on one
 replica. The fix is to have the output be a Mirrored value that is actually
 the identity operation returning the output on that device, but that has a
 control dependency making sure that the update actually happens on all the
 replicas. This fix was already present in MirroredVariable._assign_func, this
 CL moves the fix into update() and generalizes it to multiple return values.

To disable this new grouping behavior, you may now pass
"grouped=False" to update(). For example, some callers (like Optimizer)
are performing a lot of updates and they prefer to group all of them
together at once for performance reasons.  In this case, we still want
to make sure the caller executes the update on all replicas, so we
return an unwrapped value instead of a Mirrored value. This has the
happy side effect of removing a bunch of unwrap calls in client code,
since unwrapping was the only safe way to use the Mirrored value we
used to return.

PiperOrigin-RevId: 215301909
---
 .../collective_all_reduce_strategy_test.py    |  3 +-
 .../distribute/python/mirrored_strategy.py    | 12 +++--
 .../python/mirrored_strategy_multigpu_test.py |  2 +-
 .../distribute/python/one_device_strategy.py  | 17 +++++--
 .../python/parameter_server_strategy.py       | 22 ++++++--
 .../python/parameter_server_strategy_test.py  |  3 +-
 .../distribute/python/strategy_test_lib.py    |  6 ++-
 .../contrib/distribute/python/tpu_strategy.py | 36 ++++++++-----
 .../contrib/distribute/python/values.py       | 36 ++++++++-----
 .../contrib/optimizer_v2/optimizer_v2.py      | 32 +++++-------
 tensorflow/python/training/distribute.py      | 51 +++++++++++--------
 .../training/distribution_strategy_context.py |  2 +
 tensorflow/python/training/optimizer.py       | 10 ++--
 13 files changed, 144 insertions(+), 88 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index 33ffbf6abe..6796a23d46 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -128,7 +128,8 @@ class CollectiveAllReduceStrategyTestBase(
             # TODO(yuefengz): support non-Mirrored variable as destinations.
             g = d.reduce(
                 variable_scope.VariableAggregation.SUM, g, destinations=v)
-            with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
+            with ops.control_dependencies(
+                d.update(v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
         return before_list, after_list
 
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 4d7516063c..6bd380a22d 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -627,9 +627,11 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
     return self._get_cross_tower_ops().batch_reduce(aggregation,
                                                     value_destination_pairs)
 
-  def _update(self, var, fn, *args, **kwargs):
+  def _update(self, var, options, fn, *args, **kwargs):
     # TODO(josh11b): In eager mode, use one thread per device.
     assert isinstance(var, values.DistributedVariable)
+    should_group = options.pop("grouped")
+    assert not options  # Validate that we are processing all of the options.
     updates = {}
     for d, v in var._index.items():  # pylint: disable=protected-access
       name = "update_%d" % self._device_index.get(d)
@@ -638,10 +640,12 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
         updates[d] = fn(v,
                         *values.select_device_mirrored(d, args),
                         **values.select_device_mirrored(d, kwargs))
-    return values.regroup(updates, values.Mirrored)
+    return values.update_regroup(self, updates, should_group)
 
-  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
     assert isinstance(colocate_with, list)
+    should_group = options.pop("grouped")
+    assert not options  # Validate that we are processing all of the options.
     # TODO(josh11b): In eager mode, use one thread per device.
     updates = {}
     for d in colocate_with:
@@ -649,7 +653,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
       with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
         updates[d] = fn(*values.select_device_mirrored(d, args),
                         **values.select_device_mirrored(d, kwargs))
-    return values.regroup(updates, values.Mirrored)
+    return values.update_regroup(self, updates, should_group)
 
   def read_var(self, tower_local_var):
     """Read the aggregate value of a tower-local variable."""
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index f51e543624..eeac528329 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -826,7 +826,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
       with dist.scope():
         ret_v_sum = dist.call_for_each_tower(model_fn, run_concurrently=False)
-        update_ops = dist.unwrap(dist.update(ret_v_sum, update, 5.0))
+        update_ops = dist.update(ret_v_sum, update, 5.0, grouped=False)
 
         # Initialize variables.
         self.evaluate(variables.global_variables_initializer())
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 23b220f64b..f525919048 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -141,14 +141,21 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
       else:
         assert False
 
-  def _update(self, var, fn, *args, **kwargs):
-    with ops.device(self._device), distribute_lib.UpdateContext(self._device):
-      return fn(var, *args, **kwargs)
+  def _update(self, var, options, fn, *args, **kwargs):
+    # The implementations of _update() and _update_non_slot() are identical
+    # except _update() passes `var` as the first argument to `fn()`.
+    return self._update_non_slot(var, options, fn, var, *args, **kwargs)
 
-  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
     del colocate_with
+    should_group = options.pop("grouped")
+    assert not options  # Validate that we are processing all of the options.
     with ops.device(self._device), distribute_lib.UpdateContext(self._device):
-      return fn(*args, **kwargs)
+      result = fn(*args, **kwargs)
+      if should_group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
 
   def read_var(self, tower_local_var):
     """Read the aggregate value of a tower-local variable."""
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index 1125d027f6..6ddd91507b 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -343,21 +343,33 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
 
     return nest.map_structure(_select_fn, structured)
 
-  def _update(self, var, fn, *args, **kwargs):
+  def _update(self, var, options, fn, *args, **kwargs):
     if isinstance(var, values.AggregatingVariable):
       var = var.get()
     if not isinstance(var, resource_variable_ops.ResourceVariable):
       raise ValueError(
           "You can not update `var` %r. It must be a Variable." % var)
+    should_group = options.pop("grouped")
+    assert not options  # Validate that we are processing all of the options.
     with ops.colocate_with(var), distribute_lib.UpdateContext(var.device):
-      return fn(var, *self._select_single_value(args),
-                **self._select_single_value(kwargs))
+      result = fn(var, *self._select_single_value(args),
+                  **self._select_single_value(kwargs))
+      if should_group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
 
   # TODO(yuefengz): does it need to call _select_single_value?
-  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
+    should_group = options.pop("grouped")
+    assert not options  # Validate that we are processing all of the options.
     with ops.device(
         colocate_with.device), distribute_lib.UpdateContext(colocate_with):
-      return fn(*args, **kwargs)
+      result = fn(*args, **kwargs)
+      if should_group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
 
   def _unwrap(self, val):
     if isinstance(val, values.DistributedValues):
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index 12789e0bc9..353d11a583 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -395,7 +395,8 @@ class ParameterServerStrategyTestBase(
             # TODO(yuefengz): support non-Mirrored variable as destinations.
             g = d.reduce(
                 variable_scope.VariableAggregation.SUM, g, destinations=v)
-            with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
+            with ops.control_dependencies(
+                d.update(v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
         return before_list, after_list
 
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
index 5d498fb629..fd280f5754 100644
--- a/tensorflow/contrib/distribute/python/strategy_test_lib.py
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -115,7 +115,8 @@ class DistributionTestBase(test.TestCase):
           with ops.control_dependencies([fetched]):
             g = d.reduce(
                 variable_scope.VariableAggregation.SUM, g, destinations=v)
-            with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
+            with ops.control_dependencies(d.update(
+                v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
         return before_list, after_list
 
@@ -169,7 +170,8 @@ class DistributionTestBase(test.TestCase):
           with ops.control_dependencies([fetched]):
             g = d.reduce(
                 variable_scope.VariableAggregation.SUM, g, destinations=v)
-            with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
+            with ops.control_dependencies(d.update(
+                v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
         return before_list, after_list
 
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index 1b555482d3..c3c7df3cd8 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -297,6 +297,7 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
       # For outputs that have already been aggregated, take the first value
       # from the list as each value should be the same. Else return the full
       # list of values.
+      # TODO(josh11b): If aggregation is NONE, we should return a PerDevice value.
       if aggregation is not variables_lib.VariableAggregation.NONE:
         # TODO(priyag): Should this return the element or a list with 1 element
         last_step_tensor_outputs_dict[name] = output[0]
@@ -398,11 +399,16 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
       return output * (1. / len(value))
     return output
 
-  def _update(self, var, fn, *args, **kwargs):
-    # TODO(jhseu): Consider supporting grouped==False.
+  def _update(self, var, options, fn, *args, **kwargs):
     assert isinstance(var, values.TPUMirroredVariable)
+    should_group = options.pop("grouped")
+    assert not options  # Validate that we are processing all of the options.
+
     if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
-      return fn(var, *args, **kwargs)
+      if should_group:
+        return fn(var, *args, **kwargs)
+      else:
+        return [fn(var, *args, **kwargs)]
 
     # Otherwise, we revert to MirroredStrategy behavior and update each variable
     # directly.
@@ -414,23 +420,25 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
         updates[d] = fn(v,
                         *values.select_device_mirrored(d, args),
                         **values.select_device_mirrored(d, kwargs))
+    return values.update_regroup(self, updates, should_group)
 
-    # Make a single control dependency to keep the variables mirrored. If one
-    # assignment is fetched, then run all assignments.
-    sorted_keys = sorted(updates.keys())
-    update_tuple = control_flow_ops.tuple([updates[d] for d in sorted_keys])
-    for i, d in enumerate(sorted_keys):
-      updates[d] = update_tuple[i]
-    return values.regroup(updates, values.Mirrored)
+  # TODO(josh11b): Need to implement _update_non_slot()!
 
   def read_var(self, var):
     assert isinstance(var, values.TPUMirroredVariable)
     return var.read_value()
 
-  def _unwrap(self, value):
-    if isinstance(value, list):
-      return value
-    return [value]
+  def _unwrap(self, val):
+    if isinstance(val, values.DistributedValues):
+      # Return in a deterministic order.
+      return [val.get(device=d) for d in sorted(val.devices)]
+    elif isinstance(val, list):
+      # TODO(josh11b): We need to remove this case; per device values should
+      # be represented using a PerDevice wrapper instead of a list with
+      # one entry per device.
+      return val
+    return [val]
+
 
   @property
   def num_towers(self):
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index c18faeb67d..18ceba42c2 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -366,18 +366,7 @@ class MirroredVariable(DistributedVariable, Mirrored,
       # We are calling assign on the mirrored variable in cross tower context,
       # use update to update the variable.
       strategy = distribution_strategy_context.get_distribution_strategy()
-      updates = strategy.update(self, f, *args, **kwargs)
-      grouped = strategy.group(updates)
-      if isinstance(updates, DistributedValues) and updates.is_tensor_like:
-        # Make sure we run all updates. Without this, something like
-        # session.run(mirrored_var.assign*(...)) may only update one tower.
-        index = {}
-        for d in updates.devices:
-          with ops.device(d), ops.control_dependencies([grouped]):
-            index[d] = array_ops.identity(updates.get(d))
-        return Mirrored(index)
-      else:
-        return grouped
+      return strategy.update(self, f, *args, **kwargs)
     else:
       _assert_tower_context()
       # We are calling an assign function on the mirrored variable in tower
@@ -1049,6 +1038,29 @@ def select_device_mirrored(device, structured):
   return nest.map_structure(_get_mirrored, structured)
 
 
+def update_regroup(strategy, updates, should_group):
+  """Regroup for an update, with dependencies to ensure all updates execute."""
+  regrouped = regroup(updates, Mirrored)
+  if not should_group:
+    return nest.map_structure(strategy.unwrap, regrouped)
+  grouped_flat = []
+  for u in nest.flatten(regrouped):
+    if isinstance(u, DistributedValues):
+      g = strategy.group(u)
+      if u.is_tensor_like:
+        # Make sure we run all updates. Without this, something like
+        # session.run(strategy.update(...)) may only update one tower.
+        index = {}
+        for d in u.devices:
+          with ops.device(d), ops.control_dependencies([g]):
+            index[d] = array_ops.identity(u.get(d))
+        g = Mirrored(index)
+    else:
+      g = u
+    grouped_flat.append(g)
+  return nest.pack_sequence_as(regrouped, grouped_flat)
+
+
 class PerDeviceDataIterator(object):
   """An iterator (like `tf.data.Iterator`) into a `PerDeviceDataset`."""
 
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 6af59dcfbf..53e27c08c4 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -30,7 +30,6 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import distribute as distribute_lib
@@ -965,8 +964,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       # Use the processors to update the variables.
       update_ops = []
       for grad, var in grads_and_vars:
-        update_ops.extend(distribution.unwrap(distribution.update(
-            var, update, grad)))
+        update_ops.extend(distribution.update(var, update, grad, grouped=False))
 
       # Give the child class a chance to do something after applying
       # gradients
@@ -978,26 +976,24 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
       update_ops = control_flow_ops.group(update_ops)
       with ops.control_dependencies([update_ops]):
-        finish_updates = distribution.update_non_slot(non_slot_devices, finish)
-      if finish_updates is None:
-        finish_updates = update_ops
+        finish_updates = distribution.update_non_slot(
+            non_slot_devices, finish, grouped=False)
+      # We said grouped=False, which means finish_updates is always a list.
+      # It will be [None] when finish() returns None.
+      if finish_updates == [None]:
+        finish_updates = [update_ops]
 
       # Update `global_step` (if any).
       if global_step is None:
         apply_updates = distribution.group(finish_updates, name=name)
       else:
-        with ops.control_dependencies(distribution.unwrap(finish_updates)):
-
-          def update_global_step(global_step):
-            if isinstance(global_step, resource_variable_ops.ResourceVariable):
-              return global_step.assign_add(
-                  ops.convert_to_tensor(1, dtype=global_step.dtype),
-                  read_value=False)
-            else:
-              return state_ops.assign_add(global_step, 1)
-
-          apply_updates = distribution.group(
-              distribution.update(global_step, update_global_step), name=name)
+        with ops.control_dependencies(finish_updates):
+
+          def update_global_step(global_step, name):
+            return global_step.assign_add(1, read_value=False, name=name)
+
+          apply_updates = distribution.update(
+              global_step, update_global_step, name)
 
       # Add the training op to the TRAIN_OP graph collection in graph mode.
       if not eager_execution:
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 419a9ec12b..a92a1bdee7 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -26,7 +26,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import tf_logging
@@ -807,15 +806,22 @@ class DistributionStrategy(object):
       var: Variable, possibly mirrored to multiple devices, to operate on.
       fn: Function to call. Should take the variable as the first argument.
       *args: Additional positional arguments to pass to `fn()`.
-      **kwargs: Keyword arguments to pass to `fn()`.
+      **kwargs: Keyword arguments to pass to `fn()`. If "grouped=False" is
+        specified, the return value will be unwrapped.
 
     Returns:
-      Merged return value of `fn` across all towers.
+      By default, the merged return value of `fn` across all towers.  The merged
+      result has dependencies to make sure that if it is evaluated at all, the
+      side effects (updates) will happen on every tower. If instead
+      "grouped=False" is specified, this function will return a nest of lists
+      where each list has an element per tower, and the caller is responsible
+      for ensuring all elements are executed.
     """
     _require_cross_tower_context(self)
-    return self._update(var, fn, *args, **kwargs)
+    options = {"grouped": kwargs.pop("grouped", True)}
+    return self._update(var, options, fn, *args, **kwargs)
 
-  def _update(self, var, fn, *args, **kwargs):
+  def _update(self, var, options, fn, *args, **kwargs):
     raise NotImplementedError("must be implemented in descendants")
 
   def update_non_slot(self, colocate_with, fn, *args, **kwargs):
@@ -825,15 +831,18 @@ class DistributionStrategy(object):
       colocate_with: The return value of `non_slot_devices()`.
       fn: Function to execute.
       *args: Positional arguments to pass to `fn()`.
-      **kwargs: Keyword arguments to pass to `fn()`.
+      **kwargs: Keyword arguments to pass to `fn()`. If "grouped=False" is
+        specified, the return value will be unwrapped and the caller is
+        responsible for ensuring all elements are executed.
 
     Returns:
       Return value of `fn`, possibly merged across devices.
     """
     _require_cross_tower_context(self)
-    return self._update_non_slot(colocate_with, fn, *args, **kwargs)
+    options = {"grouped": kwargs.pop("grouped", True)}
+    return self._update_non_slot(colocate_with, options, fn, *args, **kwargs)
 
-  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
     raise NotImplementedError("must be implemented in descendants")
 
   def unwrap(self, value):
@@ -1134,17 +1143,22 @@ class _DefaultDistributionStrategy(DistributionStrategy):
     del aggregation, destinations
     return value
 
-  def _update(self, var, fn, *args, **kwargs):
-    # TODO(josh11b): Figure out what we should be passing to UpdateContext()
-    # once that value is used for something.
-    with ops.colocate_with(var), UpdateContext(var):
-      return fn(var, *args, **kwargs)
+  def _update(self, var, options, fn, *args, **kwargs):
+    # The implementations of _update() and _update_non_slot() are identical
+    # except _update() passes `var` as the first argument to `fn()`.
+    return self._update_non_slot(var, options, fn, var, *args, **kwargs)
 
-  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
+    should_group = options.pop("grouped")
+    assert not options  # Validate that we are processing all of the options.
     # TODO(josh11b): Figure out what we should be passing to UpdateContext()
     # once that value is used for something.
     with ops.colocate_with(colocate_with), UpdateContext(colocate_with):
-      return fn(*args, **kwargs)
+      result = fn(*args, **kwargs)
+      if should_group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
 
   def read_var(self, tower_local_var):
     return array_ops.identity(tower_local_var)
@@ -1193,13 +1207,10 @@ class _DefaultDistributionStrategy(DistributionStrategy):
 def increment_var(v, amount=1):
   """`v += amount`, distributed-aware version."""
   def update(vu):
-    if isinstance(vu, resource_variable_ops.ResourceVariable):
-      return vu.assign_add(amount, read_value=False)
-    else:
-      return state_ops.assign_add(vu, amount)
+    return vu.assign_add(amount, read_value=False)
 
   def merge_fn(dist, vm):
-    return dist.group(dist.update(vm, update))
+    return dist.update(vm, update)
 
   tower_context = distribution_strategy_context.get_tower_context()
   return tower_context.merge_call(merge_fn, v)
diff --git a/tensorflow/python/training/distribution_strategy_context.py b/tensorflow/python/training/distribution_strategy_context.py
index 998b5c35ce..ce580a406f 100644
--- a/tensorflow/python/training/distribution_strategy_context.py
+++ b/tensorflow/python/training/distribution_strategy_context.py
@@ -89,6 +89,7 @@ def get_tower_context():
   """Returns the current TowerContext or None if in a cross-tower context.
 
   Note that execution:
+
   1. starts in the default (single-tower) tower context (this function
      will return the default TowerContext object);
   2. switches to cross-tower context (in which case this will return
@@ -121,6 +122,7 @@ def get_cross_tower_context():
   """Returns the current DistributionStrategy if in a cross-tower context.
 
   Note that execution:
+
   1. starts in the default (single-tower) tower context;
   2. switches to cross-tower context when entering a
      `with DistributionStrategy.scope():` block;
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 30b0ed20c8..47034919e1 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -692,7 +692,7 @@ class Optimizer(
       update_ops = [
           op
           for grad, var in grads_and_vars
-          for op in distribution.unwrap(distribution.update(var, update, grad))
+          for op in distribution.update(var, update, grad, grouped=False)
       ]
 
       def finish(self, update_ops):
@@ -700,13 +700,13 @@ class Optimizer(
 
       non_slot_devices = distribution.non_slot_devices(var_list)
       finish_updates = distribution.update_non_slot(
-          non_slot_devices, finish, self, update_ops)
+          non_slot_devices, finish, self, update_ops, grouped=False)
       if global_step is None:
         apply_updates = distribution.group(finish_updates, name=name)
       else:
-        with ops.control_dependencies(distribution.unwrap(finish_updates)):
-          apply_updates = distribution.group(distribution.update(
-              global_step, state_ops.assign_add, 1, name=name))
+        with ops.control_dependencies(finish_updates):
+          apply_updates = distribution.update(
+              global_step, state_ops.assign_add, 1, name=name)
 
       if not context.executing_eagerly():
         if isinstance(apply_updates, ops.Tensor):
-- 
GitLab


From b72265dc002e712fc3d0f33434f13c7a36a484b2 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 1 Oct 2018 16:45:11 -0700
Subject: [PATCH 267/570] [tf.data] Deprecate `tf.contrib.data` and introduce
 `tf.data.experimental` to replace it.

This change prepares `tf.data` for TensorFlow 2.0, where `tf.contrib` will no longer exist. It retains the pre-existing endpoints in `tf.contrib.data` with deprecation warnings.

Note there are some exceptions to the move:

* Deprecated symbols in `tf.contrib.data` have not been moved to `tf.data.experimental`, because replacements already exist.
* `tf.contrib.data.LMDBDataset` has not been moved, because we plan to move it to a SIG-maintained repository.
* `tf.contrib.data.assert_element_shape()` has not yet been moved, because it depends on functionality in `tf.contrib`, and it will move in a later change.
* `tf.contrib.data.AUTOTUNE` has not yet been moved, because we have not yet determined how to `tf_export()` a Python integer.
* The stats-related API endpoints have not yet appeared in a released version of TensorFlow, so these are moved to `tf.data.experimental` without retaining an endpoint in `tf.contrib.data`.

In addition, this change includes some build rule and ApiDef refactoring:
* Some of the "//third_party/tensorflow/python:training" dependencies had to be split in order to avoid a circular dependency.
* The `tf.contrib.stateless` ops now have a private core library for the generated wrappers (and accordingly are hidden in their ApiDef) so that `tf.data.experimental.sample_from_datasets()` can depend on them.

PiperOrigin-RevId: 215304249
---
 tensorflow/contrib/bigtable/README.md         |   4 +-
 .../bigtable/python/ops/bigtable_api.py       |   4 +-
 tensorflow/contrib/cmake/python_modules.txt   |   1 -
 tensorflow/contrib/data/README.md             |  18 +-
 tensorflow/contrib/data/__init__.py           |  11 +-
 .../contrib/data/python/kernel_tests/BUILD    | 560 +----------
 .../kernel_tests/assert_element_shape_test.py | 226 +++++
 .../kernel_tests/reduce_dataset_test.py       |  62 ++
 .../kernel_tests/window_dataset_op_test.py    | 527 ----------
 tensorflow/contrib/data/python/ops/BUILD      | 170 +---
 .../contrib/data/python/ops/batching.py       | 549 +----------
 tensorflow/contrib/data/python/ops/counter.py |  13 +-
 .../contrib/data/python/ops/enumerate_ops.py  |  15 +-
 .../contrib/data/python/ops/error_ops.py      |  37 +-
 .../data/python/ops/get_single_element.py     |  29 +-
 .../contrib/data/python/ops/grouping.py       | 441 +--------
 .../contrib/data/python/ops/interleave_ops.py | 149 +--
 .../contrib/data/python/ops/iterator_ops.py   | 167 +---
 .../contrib/data/python/ops/parsing_ops.py    | 107 +--
 .../data/python/ops/prefetching_ops.py        | 486 +---------
 .../contrib/data/python/ops/random_ops.py     |  34 +-
 tensorflow/contrib/data/python/ops/readers.py | 674 +------------
 .../contrib/data/python/ops/resampling.py     | 260 +----
 .../contrib/data/python/ops/scan_ops.py       | 137 +--
 .../contrib/data/python/ops/shuffle_ops.py    |  56 +-
 .../contrib/data/python/ops/threadpool.py     |  88 +-
 tensorflow/contrib/data/python/ops/unique.py  |  43 +-
 tensorflow/contrib/data/python/ops/writers.py |  40 +-
 .../distribute/python/prefetching_ops_v2.py   |   2 +-
 tensorflow/contrib/eager/python/datasets.py   |   4 +-
 .../contrib/eager/python/datasets_test.py     |   6 +-
 .../python/examples/revnet/imagenet_input.py  |  12 +-
 .../estimator/python/estimator/rnn_test.py    |   2 +-
 tensorflow/contrib/lookup/lookup_ops_test.py  |   2 +-
 tensorflow/contrib/stateless/BUILD            |   8 +-
 tensorflow/contrib/stateless/__init__.py      |   5 +-
 tensorflow/contrib/tpu/python/tpu/datasets.py |   4 +-
 tensorflow/contrib/tpu/tpu_estimator.md       |   2 +-
 tensorflow/contrib/training/BUILD             |   2 +-
 .../training/tensor_queue_dataset_test.py     |   2 +-
 .../api_def_StatelessMultinomial.pbtxt        |   4 +
 .../api_def_StatelessRandomNormal.pbtxt       |   4 +
 .../api_def_StatelessRandomUniform.pbtxt      |   4 +
 .../api_def_StatelessTruncatedNormal.pbtxt    |   4 +
 .../examples/get_started/regression/test.py   |   2 +-
 tensorflow/python/BUILD                       |  34 +
 tensorflow/python/data/BUILD                  |   1 +
 tensorflow/python/data/__init__.py            |   1 +
 tensorflow/python/data/experimental/BUILD     |  16 +
 .../python/data/experimental/__init__.py      | 109 +++
 .../data/experimental/kernel_tests/BUILD      | 569 +++++++++++
 .../kernel_tests/batch_dataset_op_test.py     | 317 +-----
 .../kernel_tests/bucketing_test.py            |   2 +-
 .../kernel_tests/csv_dataset_op_test.py       |   4 +-
 .../dataset_constructor_op_test.py            |   2 +-
 .../dataset_serialization_test_base.py        |   2 +-
 .../directed_interleave_dataset_test.py       |   4 +-
 .../kernel_tests/filter_dataset_op_test.py    |   2 +-
 .../kernel_tests/get_single_element_test.py   |  30 +-
 .../kernel_tests/indexed_dataset_ops_test.py  |   2 +-
 .../interleave_dataset_op_test.py             |   2 +-
 .../kernel_tests/iterator_ops_test.py         |   2 +-
 .../kernel_tests/map_dataset_op_test.py       |   6 +-
 .../kernel_tests/map_defun_op_test.py         |   2 +-
 .../kernel_tests/optimization/BUILD           |  30 +-
 .../assert_next_dataset_op_test.py            |   2 +-
 .../optimization/hoist_random_uniform_test.py |   2 +-
 .../optimization/latency_all_edges_test.py    |   6 +-
 .../map_and_filter_fusion_test.py             |   2 +-
 .../optimization/map_parallelization_test.py  |   2 +-
 .../optimization/map_vectorization_test.py    |   2 +-
 .../optimization/model_dataset_op_test.py     |   4 +-
 .../optimization/noop_elimination_test.py     |   2 +-
 .../optimization/optimize_dataset_op_test.py  |   2 +-
 .../kernel_tests/parsing_ops_test.py          |   3 +-
 .../kernel_tests/prefetching_ops_test.py      |   2 +-
 .../kernel_tests/range_dataset_op_test.py     |   4 +-
 .../kernel_tests/reader_dataset_ops_test.py   |   4 +-
 .../reader_dataset_ops_test_base.py           |   2 +-
 .../kernel_tests/resample_test.py             |   2 +-
 .../kernel_tests/scan_dataset_op_test.py      |   2 +-
 .../kernel_tests/serialization/BUILD          |  46 +-
 .../batch_dataset_serialization_test.py       |   4 +-
 .../cache_dataset_serialization_test.py       |   2 +-
 .../concatenate_dataset_serialization_test.py |   2 +-
 .../csv_dataset_serialization_test.py         |   4 +-
 .../dataset_constructor_serialization_test.py |   2 +-
 .../dataset_serialization_test_base.py        | 692 ++++++++++++++
 .../filter_dataset_serialization_test.py      |   2 +-
 ...ength_record_dataset_serialization_test.py |   4 +-
 .../flat_map_dataset_serialization_test.py    |   2 +-
 .../group_by_reducer_serialization_test.py    |   4 +-
 .../group_by_window_serialization_test.py     |   4 +-
 .../ignore_errors_serialization_test.py       |   4 +-
 .../interleave_dataset_serialization_test.py  |   2 +-
 ...ap_and_batch_dataset_serialization_test.py |   4 +-
 .../map_dataset_serialization_test.py         |   2 +-
 .../optimize_dataset_serialization_test.py    |   4 +-
 ...padded_batch_dataset_serialization_test.py |   2 +-
 ...l_interleave_dataset_serialization_test.py |   4 +-
 ...parallel_map_dataset_serialization_test.py |   4 +-
 ...arse_example_dataset_serialization_test.py |   4 +-
 .../prefetch_dataset_serialization_test.py    |   2 +-
 .../range_dataset_serialization_test.py       |   2 +-
 ...sample_from_datasets_serialization_test.py |   4 +-
 .../scan_dataset_serialization_test.py        |   4 +-
 .../sequence_dataset_serialization_test.py    |   2 +-
 .../serialization_integration_test.py         |   2 +-
 ...e_and_repeat_dataset_serialization_test.py |   4 +-
 .../shuffle_dataset_serialization_test.py     |   4 +-
 .../sql_dataset_serialization_test.py         |   6 +-
 .../stats_dataset_serialization_test.py       |   4 +-
 .../textline_dataset_serialization_test.py    |   4 +-
 .../tf_record_dataset_serialization_test.py   |   4 +-
 .../unbatch_dataset_serialization_test.py     |   4 +-
 .../unique_dataset_serialization_test.py      |   4 +-
 .../zip_dataset_serialization_test.py         |   2 +-
 .../serialization_integration_test.py         |  85 ++
 .../kernel_tests/shuffle_dataset_op_test.py   |   2 +-
 .../kernel_tests/sql_dataset_op_test.py       |   2 +-
 .../kernel_tests/sql_dataset_op_test_base.py  |   2 +-
 .../kernel_tests/stats_dataset_ops_test.py    |   4 +-
 .../kernel_tests/stats_dataset_test_base.py   |   0
 .../threadpool_dataset_ops_test.py            |   4 +-
 .../kernel_tests/unique_dataset_op_test.py    |   2 +-
 .../kernel_tests/writer_ops_test.py           |   2 +-
 tensorflow/python/data/experimental/ops/BUILD | 377 ++++++++
 .../python/data/experimental/ops/batching.py  | 669 +++++++++++++
 .../python/data/experimental/ops/counter.py   |  55 ++
 .../data/experimental/ops/enumerate_ops.py    |  60 ++
 .../python/data/experimental/ops/error_ops.py |  78 ++
 .../experimental/ops/get_single_element.py    |  72 ++
 .../python/data/experimental/ops/grouping.py  | 551 +++++++++++
 .../experimental}/ops/indexed_dataset_ops.py  |   0
 .../data/experimental/ops/interleave_ops.py   | 262 +++++
 .../data/experimental/ops/iterator_ops.py     | 268 ++++++
 .../data/experimental}/ops/map_defun.py       |   0
 .../data/experimental}/ops/optimization.py    |   0
 .../data/experimental/ops/parsing_ops.py      | 152 +++
 .../data/experimental/ops/prefetching_ops.py  | 531 ++++++++++
 .../data/experimental/ops/random_ops.py       |  54 ++
 .../python/data/experimental/ops/readers.py   | 904 ++++++++++++++++++
 .../data/experimental/ops/resampling.py       | 296 ++++++
 .../python/data/experimental/ops/scan_ops.py  | 177 ++++
 .../data/experimental/ops/shuffle_ops.py      | 102 ++
 .../data/experimental}/ops/stats_ops.py       |  14 +-
 .../data/experimental/ops/threadpool.py       | 104 ++
 .../python/data/experimental/ops/unique.py    |  79 ++
 .../python/data/experimental/ops/writers.py   |  60 ++
 tensorflow/python/data/ops/dataset_ops.py     |   4 +-
 tensorflow/python/data/ops/optional_ops.py    |   4 +-
 tensorflow/python/data/ops/readers.py         |   4 +-
 .../debug/examples/debug_tflearn_iris.py      |  14 +-
 .../tools/api/generator/api_init_files.bzl    |   1 +
 .../tools/api/generator/api_init_files_v1.bzl |   1 +
 ...ntal.-checkpoint-input-pipeline-hook.pbtxt |  30 +
 ...erimental.-csv-dataset.__metaclass__.pbtxt |  14 +
 ...rflow.data.experimental.-csv-dataset.pbtxt | 127 +++
 ...nsorflow.data.experimental.-optional.pbtxt |  28 +
 ...mental.-random-dataset.__metaclass__.pbtxt |  14 +
 ...ow.data.experimental.-random-dataset.pbtxt | 127 +++
 ...ensorflow.data.experimental.-reducer.pbtxt |  21 +
 ...erimental.-sql-dataset.__metaclass__.pbtxt |  14 +
 ...rflow.data.experimental.-sql-dataset.pbtxt | 127 +++
 ....data.experimental.-stats-aggregator.pbtxt |  13 +
 ...data.experimental.-t-f-record-writer.pbtxt |  13 +
 .../v1/tensorflow.data.experimental.pbtxt     | 139 +++
 .../tools/api/golden/v1/tensorflow.data.pbtxt |   4 +
 ...ntal.-checkpoint-input-pipeline-hook.pbtxt |  30 +
 ...erimental.-csv-dataset.__metaclass__.pbtxt |  14 +
 ...rflow.data.experimental.-csv-dataset.pbtxt | 127 +++
 ...nsorflow.data.experimental.-optional.pbtxt |  28 +
 ...mental.-random-dataset.__metaclass__.pbtxt |  14 +
 ...ow.data.experimental.-random-dataset.pbtxt | 127 +++
 ...ensorflow.data.experimental.-reducer.pbtxt |  21 +
 ...erimental.-sql-dataset.__metaclass__.pbtxt |  14 +
 ...rflow.data.experimental.-sql-dataset.pbtxt | 127 +++
 ....data.experimental.-stats-aggregator.pbtxt |  13 +
 ...data.experimental.-t-f-record-writer.pbtxt |  13 +
 .../v2/tensorflow.data.experimental.pbtxt     | 139 +++
 .../tools/api/golden/v2/tensorflow.data.pbtxt |   4 +
 tensorflow/tools/pip_package/BUILD            |   4 +-
 182 files changed, 8389 insertions(+), 4960 deletions(-)
 create mode 100644 tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
 create mode 100644 tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py
 delete mode 100644 tensorflow/contrib/data/python/kernel_tests/window_dataset_op_test.py
 create mode 100644 tensorflow/core/api_def/python_api/api_def_StatelessMultinomial.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_StatelessRandomNormal.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_StatelessRandomUniform.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_StatelessTruncatedNormal.pbtxt
 create mode 100644 tensorflow/python/data/experimental/BUILD
 create mode 100644 tensorflow/python/data/experimental/__init__.py
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/BUILD
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/batch_dataset_op_test.py (67%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/bucketing_test.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/csv_dataset_op_test.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/dataset_constructor_op_test.py (97%)
 rename tensorflow/{contrib/data/python/kernel_tests/serialization => python/data/experimental/kernel_tests}/dataset_serialization_test_base.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/directed_interleave_dataset_test.py (97%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/filter_dataset_op_test.py (97%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/get_single_element_test.py (76%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/indexed_dataset_ops_test.py (97%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/interleave_dataset_op_test.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/iterator_ops_test.py (98%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/map_dataset_op_test.py (98%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/map_defun_op_test.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/optimization/BUILD (81%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/optimization/assert_next_dataset_op_test.py (97%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/optimization/hoist_random_uniform_test.py (98%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/optimization/latency_all_edges_test.py (91%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/optimization/map_and_filter_fusion_test.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/optimization/map_parallelization_test.py (97%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/optimization/map_vectorization_test.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/optimization/model_dataset_op_test.py (98%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/optimization/noop_elimination_test.py (97%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/optimization/optimize_dataset_op_test.py (98%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/parsing_ops_test.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/prefetching_ops_test.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/range_dataset_op_test.py (95%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/reader_dataset_ops_test.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/reader_dataset_ops_test_base.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/resample_test.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/scan_dataset_op_test.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/BUILD (90%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/batch_dataset_serialization_test.py (94%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/cache_dataset_serialization_test.py (98%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/concatenate_dataset_serialization_test.py (94%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/csv_dataset_serialization_test.py (93%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/dataset_constructor_serialization_test.py (97%)
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/filter_dataset_serialization_test.py (95%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/fixed_length_record_dataset_serialization_test.py (89%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/flat_map_dataset_serialization_test.py (97%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/group_by_reducer_serialization_test.py (93%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/group_by_window_serialization_test.py (93%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/ignore_errors_serialization_test.py (90%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/interleave_dataset_serialization_test.py (96%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py (94%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/map_dataset_serialization_test.py (97%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/optimize_dataset_serialization_test.py (89%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/padded_batch_dataset_serialization_test.py (95%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/parallel_interleave_dataset_serialization_test.py (95%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/parallel_map_dataset_serialization_test.py (96%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/parse_example_dataset_serialization_test.py (90%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/prefetch_dataset_serialization_test.py (93%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/range_dataset_serialization_test.py (97%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/sample_from_datasets_serialization_test.py (90%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/scan_dataset_serialization_test.py (89%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/sequence_dataset_serialization_test.py (97%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/serialization_integration_test.py (97%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/shuffle_and_repeat_dataset_serialization_test.py (89%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/shuffle_dataset_serialization_test.py (96%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/sql_dataset_serialization_test.py (88%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/stats_dataset_serialization_test.py (96%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/textline_dataset_serialization_test.py (90%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/tf_record_dataset_serialization_test.py (95%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/unbatch_dataset_serialization_test.py (91%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/unique_dataset_serialization_test.py (89%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/serialization/zip_dataset_serialization_test.py (94%)
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/serialization_integration_test.py
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/shuffle_dataset_op_test.py (98%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/sql_dataset_op_test.py (99%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/sql_dataset_op_test_base.py (98%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/stats_dataset_ops_test.py (98%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/stats_dataset_test_base.py (100%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/threadpool_dataset_ops_test.py (96%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/unique_dataset_op_test.py (98%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/kernel_tests/writer_ops_test.py (98%)
 create mode 100644 tensorflow/python/data/experimental/ops/BUILD
 create mode 100644 tensorflow/python/data/experimental/ops/batching.py
 create mode 100644 tensorflow/python/data/experimental/ops/counter.py
 create mode 100644 tensorflow/python/data/experimental/ops/enumerate_ops.py
 create mode 100644 tensorflow/python/data/experimental/ops/error_ops.py
 create mode 100644 tensorflow/python/data/experimental/ops/get_single_element.py
 create mode 100644 tensorflow/python/data/experimental/ops/grouping.py
 rename tensorflow/{contrib/data/python => python/data/experimental}/ops/indexed_dataset_ops.py (100%)
 create mode 100644 tensorflow/python/data/experimental/ops/interleave_ops.py
 create mode 100644 tensorflow/python/data/experimental/ops/iterator_ops.py
 rename tensorflow/{contrib/data/python => python/data/experimental}/ops/map_defun.py (100%)
 rename tensorflow/{contrib/data/python => python/data/experimental}/ops/optimization.py (100%)
 create mode 100644 tensorflow/python/data/experimental/ops/parsing_ops.py
 create mode 100644 tensorflow/python/data/experimental/ops/prefetching_ops.py
 create mode 100644 tensorflow/python/data/experimental/ops/random_ops.py
 create mode 100644 tensorflow/python/data/experimental/ops/readers.py
 create mode 100644 tensorflow/python/data/experimental/ops/resampling.py
 create mode 100644 tensorflow/python/data/experimental/ops/scan_ops.py
 create mode 100644 tensorflow/python/data/experimental/ops/shuffle_ops.py
 rename tensorflow/{contrib/data/python => python/data/experimental}/ops/stats_ops.py (92%)
 create mode 100644 tensorflow/python/data/experimental/ops/threadpool.py
 create mode 100644 tensorflow/python/data/experimental/ops/unique.py
 create mode 100644 tensorflow/python/data/experimental/ops/writers.py
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-checkpoint-input-pipeline-hook.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-reducer.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-t-f-record-writer.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-checkpoint-input-pipeline-hook.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-reducer.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-t-f-record-writer.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt

diff --git a/tensorflow/contrib/bigtable/README.md b/tensorflow/contrib/bigtable/README.md
index f33eaf7e3d..2c44abed5e 100644
--- a/tensorflow/contrib/bigtable/README.md
+++ b/tensorflow/contrib/bigtable/README.md
@@ -203,7 +203,7 @@ def interleave_fn(index):
   start = tf.string_join(['training_data_', start_idx_str])
   end = tf.string_join(['training_data_', end_idx_str])
   return table.scan_range(start_idx, end_idx, columns=columns)
-ds = ds.apply(tf.contrib.data.parallel_interleave(
+ds = ds.apply(tf.data.experimental.parallel_interleave(
     interleave_fn, cycle_length=NUM_PARALLEL_READS, prefetch_input_elements=1))
 ```
 
@@ -249,7 +249,7 @@ def make_row_key_dataset():
    - ...
    - fake-data-23498103
   """
-  counter_dataset = tf.contrib.data.Counter()
+  counter_dataset = tf.data.experimental.Counter()
   width = 8
   row_key_prefix = 'fake-data-'
   ds = counter_dataset.map(lambda index: tf.as_string(index,
diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
index cf56822ff4..7c87b0daeb 100644
--- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
+++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
@@ -31,8 +31,8 @@ from six import iteritems
 from six import string_types
 
 from tensorflow.contrib.bigtable.ops import gen_bigtable_ops
-from tensorflow.contrib.data.python.ops import interleave_ops
 from tensorflow.contrib.util import loader
+from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
@@ -228,7 +228,7 @@ class BigtableTable(object):
     """Retrieves a sampling of row keys from the Bigtable table.
 
     This dataset is most often used in conjunction with
-    `tf.contrib.data.parallel_interleave` to construct a set of ranges for
+    `tf.data.experimental.parallel_interleave` to construct a set of ranges for
     scanning in parallel.
 
     Returns:
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 9b80eb559f..6e72670142 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -134,7 +134,6 @@ tensorflow/contrib/cudnn_rnn/python/ops
 tensorflow/contrib/data
 tensorflow/contrib/data/python
 tensorflow/contrib/data/python/kernel_tests
-tensorflow/contrib/data/python/kernel_tests/serialization
 tensorflow/contrib/data/python/ops
 tensorflow/contrib/decision_trees
 tensorflow/contrib/decision_trees/proto
diff --git a/tensorflow/contrib/data/README.md b/tensorflow/contrib/data/README.md
index 848782e8d8..90be7a66ca 100644
--- a/tensorflow/contrib/data/README.md
+++ b/tensorflow/contrib/data/README.md
@@ -1,10 +1,12 @@
 `tf.contrib.data` API
 =====================
 
-NOTE: The `tf.contrib.data` module has been deprecated. Use `tf.data` instead.
-We are continuing to support existing code using the `tf.contrib.data` APIs in
-the current version of TensorFlow, but will eventually remove support. The
-`tf.data` APIs are subject to backwards compatibility guarantees.
+NOTE: The `tf.contrib.data` module has been deprecated. Use `tf.data` instead,
+or `tf.data.experimental` for the experimental transformations previously hosted
+in this module. We are continuing to support existing code using the
+`tf.contrib.data` APIs in the current version of TensorFlow, but will eventually
+remove support. The non-experimental `tf.data` APIs are subject to backwards
+compatibility guarantees.
 
 Porting your code to `tf.data`
 ------------------------------
@@ -25,13 +27,13 @@ instead apply them using `Dataset.apply()` transformation. The full list of
 changes is as follows:
 
 * `dataset.dense_to_sparse_batch(...)` is now
-  `dataset.apply(tf.contrib.data.dense_to_sparse_batch(...)`.
+  `dataset.apply(tf.data.experimental.dense_to_sparse_batch(...)`.
 * `dataset.enumerate(...)` is now
-  `dataset.apply(tf.contrib.data.enumerate_dataset(...))`.
+  `dataset.apply(tf.data.experimental.enumerate_dataset(...))`.
 * `dataset.group_by_window(...)` is now
-  `dataset.apply(tf.contrib.data.group_by_window(...))`.
+  `dataset.apply(tf.data.experimental.group_by_window(...))`.
 * `dataset.ignore_errors()` is now
-  `dataset.apply(tf.contrib.data.ignore_errors())`.
+  `dataset.apply(tf.data.experimental.ignore_errors())`.
 * `dataset.unbatch()` is now `dataset.apply(tf.contrib.data.unbatch())`.
 
 The `Dataset.make_dataset_resource()` and `Iterator.dispose_op()` methods have
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 3cb51279c3..c3d3e981fa 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -96,10 +96,6 @@ from tensorflow.contrib.data.python.ops.interleave_ops import sample_from_datase
 from tensorflow.contrib.data.python.ops.interleave_ops import sloppy_interleave
 from tensorflow.contrib.data.python.ops.iterator_ops import CheckpointInputPipelineHook
 from tensorflow.contrib.data.python.ops.iterator_ops import make_saveable_from_iterator
-
-# Optimization constant that can be used to enable auto-tuning.
-from tensorflow.contrib.data.python.ops.optimization import AUTOTUNE
-
 from tensorflow.contrib.data.python.ops.parsing_ops import parse_example_dataset
 from tensorflow.contrib.data.python.ops.prefetching_ops import copy_to_device
 from tensorflow.contrib.data.python.ops.prefetching_ops import prefetch_to_device
@@ -114,11 +110,12 @@ from tensorflow.contrib.data.python.ops.resampling import rejection_resample
 from tensorflow.contrib.data.python.ops.scan_ops import scan
 from tensorflow.contrib.data.python.ops.shuffle_ops import shuffle_and_repeat
 from tensorflow.contrib.data.python.ops.sliding import sliding_window_batch
-from tensorflow.contrib.data.python.ops.stats_ops import latency_stats
-from tensorflow.contrib.data.python.ops.stats_ops import set_stats_aggregator
-from tensorflow.contrib.data.python.ops.stats_ops import StatsAggregator
 from tensorflow.contrib.data.python.ops.unique import unique
 from tensorflow.contrib.data.python.ops.writers import TFRecordWriter
+
+# Optimization constant that can be used to enable auto-tuning.
+from tensorflow.python.data.experimental.ops.optimization import AUTOTUNE
+
 from tensorflow.python.data.ops.iterator_ops import get_next_as_optional
 from tensorflow.python.data.ops.optional_ops import Optional
 # pylint: enable=unused-import
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 33784afa3f..42f538b4ba 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -8,51 +8,17 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_test(
-    name = "batch_dataset_op_test",
-    size = "medium",
-    srcs = ["batch_dataset_op_test.py"],
+    name = "assert_element_shape_test",
+    srcs = ["assert_element_shape_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",  # (b/79552534)
-        "no_pip",
-    ],
     deps = [
         "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python:script_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
-)
-
-py_test(
-    name = "bucketing_test",
-    size = "medium",
-    srcs = ["bucketing_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/data/python/ops:grouping",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/kernel_tests:test_base",
@@ -61,147 +27,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "csv_dataset_op_test",
-    size = "medium",
-    srcs = ["csv_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:error_ops",
-        "//tensorflow/contrib/data/python/ops:readers",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:session",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "dataset_constructor_op_test",
-    size = "medium",
-    srcs = ["dataset_constructor_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "nomac",  # b/62040583
-    ],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:batching",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-    ],
-)
-
-py_test(
-    name = "directed_interleave_dataset_test",
-    size = "medium",
-    srcs = ["directed_interleave_dataset_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/data/python/ops:interleave_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:random_seed",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "get_single_element_test",
-    size = "small",
-    srcs = ["get_single_element_test.py"],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:get_single_element",
-        "//tensorflow/contrib/data/python/ops:grouping",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "@absl_py//absl/testing:parameterized",
-    ],
-)
-
-py_test(
-    name = "indexed_dataset_ops_test",
-    srcs = ["indexed_dataset_ops_test.py"],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:indexed_dataset_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "interleave_dataset_op_test",
-    size = "medium",
-    srcs = ["interleave_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "notap",
-    ],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:interleave_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "iterator_ops_test",
-    size = "small",
-    srcs = ["iterator_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:iterator_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/estimator:estimator_py",
-    ],
-)
-
 py_test(
     name = "lmdb_dataset_op_test",
     size = "medium",
@@ -229,252 +54,18 @@ py_test(
 )
 
 py_test(
-    name = "map_dataset_op_test",
-    size = "medium",
-    srcs = ["map_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "noasan",  # times out
-        "optonly",
-    ],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:batching",
-        "//tensorflow/contrib/data/python/ops:error_ops",
-        "//tensorflow/contrib/data/python/ops:optimization",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "filter_dataset_op_test",
-    size = "medium",
-    srcs = ["filter_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/data/python/ops:optimization",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "map_defun_op_test",
+    name = "reduce_dataset_test",
     size = "small",
-    srcs = ["map_defun_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    srcs = ["reduce_dataset_test.py"],
     deps = [
-        "//tensorflow/contrib/data/python/ops:map_defun",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:check_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python/data/kernel_tests:test_base",
-    ],
-)
-
-py_test(
-    name = "parsing_ops_test",
-    size = "small",
-    srcs = ["parsing_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/data/python/ops:parsing_ops",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//third_party/py/numpy",
-    ],
-)
-
-cuda_py_test(
-    name = "prefetching_ops_test",
-    size = "small",
-    srcs = ["prefetching_ops_test.py"],
-    additional_deps = [
-        "//tensorflow/contrib/data/python/ops:prefetching_ops",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python/compat:compat",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-    tags = ["no_windows_gpu"],
-)
-
-py_test(
-    name = "range_dataset_op_test",
-    size = "small",
-    srcs = ["range_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/data/python/ops:counter",
-        "//tensorflow/contrib/data/python/ops:enumerate_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-py_library(
-    name = "reader_dataset_ops_test_base",
-    testonly = 1,
-    srcs = [
-        "reader_dataset_ops_test_base.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = [
-        "//tensorflow/contrib/data/python/kernel_tests:__pkg__",
-        "//tensorflow/contrib/data/python/kernel_tests/serialization:__pkg__",
-    ],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:readers",
-        "//tensorflow/core:protos_all_py",
+        "//tensorflow/contrib/data/python/ops:get_single_element",
+        "//tensorflow/contrib/data/python/ops:grouping",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/ops:readers",
-    ],
-)
-
-py_test(
-    name = "reader_dataset_ops_test",
-    size = "medium",
-    srcs = ["reader_dataset_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        ":reader_dataset_ops_test_base",
-        "//tensorflow/contrib/data/python/ops:readers",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/python/data/util:nest",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "resample_test",
-    size = "medium",
-    srcs = ["resample_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = [
-        "noasan",
-        "optonly",
-    ],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:resampling",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:util",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "scan_dataset_op_test",
-    size = "small",
-    srcs = ["scan_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:scan_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "shuffle_dataset_op_test",
-    size = "medium",
-    srcs = ["shuffle_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "optonly",
-    ],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:shuffle_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -496,142 +87,3 @@ py_test(
         "@absl_py//absl/testing:parameterized",
     ],
 )
-
-py_library(
-    name = "sql_dataset_op_test_base",
-    srcs = ["sql_dataset_op_test_base.py"],
-    srcs_version = "PY2AND3",
-    visibility = [
-        "//tensorflow/contrib/data/python/kernel_tests:__pkg__",
-        "//tensorflow/contrib/data/python/kernel_tests/serialization:__pkg__",
-    ],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:readers",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "@org_sqlite//:python",
-    ],
-)
-
-py_test(
-    name = "sql_dataset_op_test",
-    size = "small",
-    srcs = ["sql_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        ":sql_dataset_op_test_base",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-    ],
-)
-
-py_test(
-    name = "stats_dataset_ops_test",
-    size = "medium",
-    srcs = ["stats_dataset_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        ":reader_dataset_ops_test_base",
-        ":stats_dataset_test_base",
-        "//tensorflow/contrib/data/python/ops:stats_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_library(
-    name = "stats_dataset_test_base",
-    srcs = ["stats_dataset_test_base.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python/data/kernel_tests:test_base",
-    ],
-)
-
-py_test(
-    name = "threadpool_dataset_ops_test",
-    size = "small",
-    srcs = ["threadpool_dataset_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:threadpool",
-        "//tensorflow/contrib/data/python/ops:unique",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
-)
-
-py_test(
-    name = "unique_dataset_op_test",
-    size = "small",
-    srcs = ["unique_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:unique",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-py_test(
-    name = "window_dataset_op_test",
-    size = "medium",
-    srcs = ["window_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:batching",
-        "//tensorflow/contrib/data/python/ops:grouping",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
-)
-
-py_test(
-    name = "writer_ops_test",
-    size = "small",
-    srcs = ["writer_ops_test.py"],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:writers",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:readers",
-    ],
-)
diff --git a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
new file mode 100644
index 0000000000..0456463a19
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
@@ -0,0 +1,226 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import batching
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import script_ops
+from tensorflow.python.platform import test
+
+
+class AssertElementShapeTest(test_base.DatasetTestBase):
+
+  def test_assert_element_shape(self):
+
+    def create_dataset(_):
+      return (array_ops.ones(2, dtype=dtypes.float32),
+              array_ops.zeros((3, 4), dtype=dtypes.int32))
+
+    dataset = dataset_ops.Dataset.range(5).map(create_dataset)
+    expected_shapes = (tensor_shape.TensorShape(2),
+                       tensor_shape.TensorShape((3, 4)))
+    self.assertEqual(expected_shapes, dataset.output_shapes)
+
+    result = dataset.apply(batching.assert_element_shape(expected_shapes))
+    self.assertEqual(expected_shapes, result.output_shapes)
+
+    iterator = result.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for _ in range(5):
+        sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def test_assert_wrong_element_shape(self):
+
+    def create_dataset(_):
+      return (array_ops.ones(2, dtype=dtypes.float32),
+              array_ops.zeros((3, 4), dtype=dtypes.int32))
+
+    dataset = dataset_ops.Dataset.range(3).map(create_dataset)
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((3, 10)))
+    with self.assertRaises(ValueError):
+      dataset.apply(batching.assert_element_shape(wrong_shapes))
+
+  def test_assert_element_shape_on_unknown_shape_dataset(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(
+          lambda _: (  # pylint: disable=g-long-lambda
+              np.ones(2, dtype=np.float32),
+              np.zeros((3, 4), dtype=np.int32)),
+          [x],
+          [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    expected_shapes = (tensor_shape.TensorShape(2),
+                       tensor_shape.TensorShape((3, 4)))
+    result = dataset.apply(batching.assert_element_shape(expected_shapes))
+    self.assertEqual(expected_shapes, result.output_shapes)
+
+    iterator = result.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for _ in range(5):
+        sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def test_assert_wrong_element_shape_on_unknown_shape_dataset(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(
+          lambda _: (  # pylint: disable=g-long-lambda
+              np.ones(2, dtype=np.float32),
+              np.zeros((3, 4), dtype=np.int32)),
+          [x],
+          [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((3, 10)))
+    iterator = (
+        dataset.apply(batching.assert_element_shape(wrong_shapes))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next)
+
+  def test_assert_partial_element_shape(self):
+
+    def create_dataset(_):
+      return (array_ops.ones(2, dtype=dtypes.float32),
+              array_ops.zeros((3, 4), dtype=dtypes.int32))
+
+    dataset = dataset_ops.Dataset.range(5).map(create_dataset)
+    partial_expected_shape = (
+        tensor_shape.TensorShape(None),  # Unknown shape
+        tensor_shape.TensorShape((None, 4)))  # Partial shape
+    result = dataset.apply(
+        batching.assert_element_shape(partial_expected_shape))
+    # Partial shapes are merged with actual shapes:
+    actual_shapes = (tensor_shape.TensorShape(2),
+                     tensor_shape.TensorShape((3, 4)))
+    self.assertEqual(actual_shapes, result.output_shapes)
+
+    iterator = result.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for _ in range(5):
+        sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def test_assert_wrong_partial_element_shape(self):
+
+    def create_dataset(_):
+      return (array_ops.ones(2, dtype=dtypes.float32),
+              array_ops.zeros((3, 4), dtype=dtypes.int32))
+
+    dataset = dataset_ops.Dataset.range(3).map(create_dataset)
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((None, 10)))
+    with self.assertRaises(ValueError):
+      dataset.apply(batching.assert_element_shape(wrong_shapes))
+
+  def test_assert_partial_element_shape_on_unknown_shape_dataset(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(
+          lambda _: (  # pylint: disable=g-long-lambda
+              np.ones(2, dtype=np.float32),
+              np.zeros((3, 4), dtype=np.int32)),
+          [x],
+          [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    expected_shapes = (tensor_shape.TensorShape(2),
+                       tensor_shape.TensorShape((None, 4)))
+    result = dataset.apply(batching.assert_element_shape(expected_shapes))
+    self.assertEqual(expected_shapes, result.output_shapes)
+
+    iterator = result.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for _ in range(5):
+        sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def test_assert_wrong_partial_element_shape_on_unknown_shape_dataset(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(
+          lambda _: (  # pylint: disable=g-long-lambda
+              np.ones(2, dtype=np.float32),
+              np.zeros((3, 4), dtype=np.int32)),
+          [x],
+          [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((None, 10)))
+    iterator = (
+        dataset.apply(batching.assert_element_shape(wrong_shapes))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py b/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py
new file mode 100644
index 0000000000..e7281d5318
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py
@@ -0,0 +1,62 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import get_single_element
+from tensorflow.contrib.data.python.ops import grouping
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ("SumZero", 0),
+      ("SumOne", 1),
+      ("SumFive", 5),
+      ("SumTen", 10),
+  )
+  def testReduceDataset(self, stop):
+    def init_fn(_):
+      return np.int64(0)
+
+    def reduce_fn(state, value):
+      return state + value
+
+    def finalize_fn(state):
+      return state
+
+    sum_reducer = grouping.Reducer(init_fn, reduce_fn, finalize_fn)
+
+    stop_t = array_ops.placeholder(dtypes.int64, shape=[])
+    dataset = dataset_ops.Dataset.range(stop_t)
+    element = get_single_element.reduce_dataset(dataset, sum_reducer)
+
+    with self.cached_session() as sess:
+      value = sess.run(element, feed_dict={stop_t: stop})
+      self.assertEqual(stop * (stop - 1) / 2, value)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/window_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/window_dataset_op_test.py
deleted file mode 100644
index 79134c7bc6..0000000000
--- a/tensorflow/contrib/data/python/kernel_tests/window_dataset_op_test.py
+++ /dev/null
@@ -1,527 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import grouping
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.platform import test
-
-
-class WindowDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  def _structuredDataset(self, structure, shape, dtype):
-    if structure is None:
-      return dataset_ops.Dataset.from_tensors(
-          array_ops.zeros(shape, dtype=dtype))
-    else:
-      return dataset_ops.Dataset.zip(
-          tuple([
-              self._structuredDataset(substructure, shape, dtype)
-              for substructure in structure
-          ]))
-
-  def _structuredElement(self, structure, shape, dtype):
-    if structure is None:
-      return array_ops.zeros(shape, dtype=dtype)
-    else:
-      return tuple([
-          self._structuredElement(substructure, shape, dtype)
-          for substructure in structure
-      ])
-
-  def _assertEqual(self, xs, ys):
-    self.assertEqual(type(xs), type(ys))
-    if isinstance(xs, tuple) and isinstance(ys, tuple):
-      self.assertEqual(len(xs), len(ys))
-      for x, y in zip(xs, ys):
-        self._assertEqual(x, y)
-    elif isinstance(xs, np.ndarray) and isinstance(ys, np.ndarray):
-      self.assertAllEqual(xs, ys)
-    else:
-      self.assertEqual(xs, ys)
-
-  @parameterized.named_parameters(
-      ("1", None, np.int32([]), dtypes.bool),
-      ("2", None, np.int32([]), dtypes.int32),
-      ("3", None, np.int32([]), dtypes.float32),
-      ("4", None, np.int32([]), dtypes.string),
-      ("5", None, np.int32([2]), dtypes.int32),
-      ("6", None, np.int32([2, 2]), dtypes.int32),
-      ("7", (None, None, None), np.int32([]), dtypes.int32),
-      ("8", (None, (None, None)), np.int32([]), dtypes.int32),
-  )
-  def testWindowDatasetFlatMap(self, structure, shape, dtype):
-    """Tests windowing by chaining it with flat map.
-
-    Args:
-      structure: the input structure
-      shape: the input shape
-      dtype: the input data type
-    """
-
-    def fn(*args):
-      if len(args) == 1 and not isinstance(args[0], tuple):
-        return args[0]
-      return dataset_ops.Dataset.zip(
-          tuple([fn(*arg) if isinstance(arg, tuple) else arg for arg in args]))
-
-    dataset = self._structuredDataset(structure, shape, dtype).repeat(5).apply(
-        grouping.window_dataset(5)).flat_map(fn)
-    get_next = dataset.make_one_shot_iterator().get_next()
-    with self.cached_session() as sess:
-      expected = sess.run(self._structuredElement(structure, shape, dtype))
-      for _ in range(5):
-        actual = sess.run(get_next)
-        self._assertEqual(expected, actual)
-
-  @parameterized.named_parameters(
-      ("1", None, np.int32([]), dtypes.bool),
-      ("2", None, np.int32([]), dtypes.int32),
-      ("3", None, np.int32([]), dtypes.float32),
-      ("4", None, np.int32([]), dtypes.string),
-      ("5", None, np.int32([2]), dtypes.int32),
-      ("6", None, np.int32([2, 2]), dtypes.int32),
-      ("7", (None, None, None), np.int32([]), dtypes.int32),
-      ("8", (None, (None, None)), np.int32([]), dtypes.int32),
-  )
-  def testWindowDatasetBatchDense(self, structure, shape, dtype):
-    """Tests batching of dense tensor windows.
-
-    Args:
-      structure: the input structure
-      shape: the input shape
-      dtype: the input data type
-    """
-
-    def fn(*args):
-      if len(args) == 1 and not isinstance(args[0], tuple):
-        return batching.batch_window(args[0])
-
-      return tuple([
-          fn(*arg) if isinstance(arg, tuple) else batching.batch_window(arg)
-          for arg in args
-      ])
-
-    dataset = self._structuredDataset(structure, shape, dtype).repeat(5).apply(
-        grouping.window_dataset(5)).apply(grouping._map_x_dataset(fn))
-    get_next = dataset.make_one_shot_iterator().get_next()
-    with self.cached_session() as sess:
-      expected = sess.run(
-          self._structuredElement(structure, np.concatenate(
-              ([5], shape), axis=0), dtype))
-      actual = sess.run(get_next)
-      self._assertEqual(expected, actual)
-
-  @parameterized.named_parameters(
-      ("1", np.int32([])),
-      ("2", np.int32([1])),
-      ("3", np.int32([1, 2, 3])),
-  )
-  def testWindowDatasetBatchDenseDynamicShape(self, shape):
-    """Tests batching of dynamically shaped dense tensor windows.
-
-    Args:
-      shape: the input shape
-    """
-
-    shape_t = array_ops.placeholder(dtypes.int32)
-    dataset = dataset_ops.Dataset.from_tensors(
-        array_ops.zeros(shape_t)).repeat(5).apply(
-            grouping.window_dataset(5)).apply(
-                grouping._map_x_dataset(batching.batch_window))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op, {shape_t: shape})
-      expected = sess.run(
-          self._structuredElement(None, np.concatenate(([5], shape), axis=0),
-                                  dtypes.int32))
-      actual = sess.run(get_next)
-      self._assertEqual(expected, actual)
-
-  def _make_dense_to_sparse_fn(self, is_scalar):
-
-    def dense_to_sparse_scalar(tensor):
-      indices = [[]]
-      values = array_ops.expand_dims(tensor, 0)
-      shape = []
-      return sparse_tensor.SparseTensorValue(indices, values, shape)
-
-    def dense_to_sparse_non_scalar(tensor):
-      indices = array_ops.where(array_ops.ones_like(tensor, dtype=dtypes.bool))
-      values = array_ops.gather_nd(tensor, indices)
-      shape = array_ops.shape(tensor, out_type=dtypes.int64)
-      return sparse_tensor.SparseTensorValue(indices, values, shape)
-
-    if is_scalar:
-      return dense_to_sparse_scalar
-    return dense_to_sparse_non_scalar
-
-  def _structuredSparseDataset(self, structure, shape, dtype):
-    dense_to_sparse = self._make_dense_to_sparse_fn(len(shape) == 0)  # pylint: disable=g-explicit-length-test
-    if structure is None:
-      return dataset_ops.Dataset.from_tensors(
-          dense_to_sparse(array_ops.zeros(shape, dtype=dtype)))
-    else:
-      return dataset_ops.Dataset.zip(
-          tuple([
-              self._structuredSparseDataset(substructure, shape, dtype)
-              for substructure in structure
-          ]))
-
-  def _structuredSparseElement(self, structure, shape, dtype):
-    dense_to_sparse = self._make_dense_to_sparse_fn(len(shape) == 0)  # pylint: disable=g-explicit-length-test
-    if structure is None:
-      return dense_to_sparse(array_ops.zeros(shape, dtype=dtype))
-    else:
-      return tuple([
-          self._structuredSparseElement(substructure, shape, dtype)
-          for substructure in structure
-      ])
-
-  @parameterized.named_parameters(
-      ("1", None, np.int32([]), dtypes.bool),
-      ("2", None, np.int32([]), dtypes.int32),
-      ("3", None, np.int32([]), dtypes.float32),
-      ("4", None, np.int32([]), dtypes.string),
-      ("5", None, np.int32([2]), dtypes.int32),
-      ("6", None, np.int32([2, 2]), dtypes.int32),
-      ("7", (None, None, None), np.int32([]), dtypes.int32),
-      ("8", (None, (None, None)), np.int32([]), dtypes.int32),
-  )
-  def testWindowDatasetBatchSparse(self, structure, shape, dtype):
-    """Tests batching of sparse tensor windows.
-
-    Args:
-      structure: the input structure
-      shape: the input shape
-      dtype: the input data type
-    """
-
-    def fn(*args):
-      if len(args) == 1 and not isinstance(args[0], tuple):
-        return batching.batch_window(args[0])
-
-      return tuple([
-          fn(*arg) if isinstance(arg, tuple) else batching.batch_window(arg)
-          for arg in args
-      ])
-
-    dataset = self._structuredSparseDataset(
-        structure, shape, dtype).repeat(5).apply(
-            grouping.window_dataset(5)).apply(grouping._map_x_dataset(fn))
-    get_next = dataset.make_one_shot_iterator().get_next()
-    with self.cached_session() as sess:
-      expected = sess.run(
-          self._structuredSparseElement(structure,
-                                        np.concatenate(([5], shape), axis=0),
-                                        dtype))
-      actual = sess.run(get_next)
-      self._assertEqual(expected, actual)
-
-  @parameterized.named_parameters(
-      ("1", np.int32([])),
-      ("2", np.int32([1])),
-      ("3", np.int32([1, 2, 3])),
-  )
-  def testWindowDatasetBatchSparseDynamicShape(self, shape):
-    """Tests batching of dynamically shaped sparse tensor windows.
-
-    Args:
-      shape: the input shape
-    """
-
-    shape_t = array_ops.placeholder(dtypes.int32)
-    dataset = dataset_ops.Dataset.from_tensors(array_ops.zeros(shape_t)).map(
-        self._make_dense_to_sparse_fn(len(shape) == 0)).repeat(5).apply(  # pylint: disable=g-explicit-length-test
-            grouping.window_dataset(5)).apply(
-                grouping._map_x_dataset(batching.batch_window))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op, {shape_t: shape})
-      expected = sess.run(
-          self._structuredSparseElement(None,
-                                        np.concatenate(([5], shape), axis=0),
-                                        dtypes.int32))
-      actual = sess.run(get_next)
-      self._assertEqual(expected, actual)
-
-  def _structuredRaggedDataset(self, structure, shapes, dtype):
-
-    if structure is None:
-      return dataset_ops.Dataset.from_tensor_slices(shapes).map(
-          lambda shape: array_ops.zeros(shape, dtype=dtype))
-    else:
-      return dataset_ops.Dataset.zip(
-          tuple([
-              self._structuredRaggedDataset(substructure, shapes, dtype)
-              for substructure in structure
-          ]))
-
-  @parameterized.named_parameters(
-      ("1", None, np.int32([[1], [2], [3]]), dtypes.bool, [-1]),
-      ("2", None, np.int32([[1], [2], [3]]), dtypes.int32, [-1]),
-      ("3", None, np.int32([[1], [2], [3]]), dtypes.float32, [-1]),
-      ("4", None, np.int32([[1], [2], [3]]), dtypes.string, [-1]),
-      ("5", None, np.int32([[1, 3], [2, 2], [3, 1]]), dtypes.int32, [-1, -1]),
-      ("6", None, np.int32([[3, 1, 3], [1, 3, 1]]), dtypes.int32, [-1, -1, -1]),
-      ("7", (None, None, None), np.int32([[1], [2], [3]]), dtypes.int32, [-1]),
-      ("8", (None,
-             (None, None)), np.int32([[1], [2], [3]]), dtypes.int32, [-1]),
-      ("9", None, np.int32([[1], [2], [3]]), dtypes.int32, [-1]),
-      ("10", None, np.int32([[1], [2], [3]]), dtypes.int32, np.int32([10])),
-  )
-  def testWindowDatasetPaddedBatchDense(self, structure, shapes, dtype,
-                                        padded_shape):
-    """Tests padded batching of dense tensor windows.
-
-    Args:
-      structure: the input structure
-      shapes: the input shapes
-      dtype: the input data type
-      padded_shape: the shape to pad the output to
-    """
-
-    def fn(*args):
-      if len(args) == 1 and not isinstance(args[0], tuple):
-        return batching.padded_batch_window(args[0], padded_shape)
-
-      return tuple([
-          fn(*arg) if isinstance(arg, tuple) else batching.padded_batch_window(
-              arg, padded_shape) for arg in args
-      ])
-
-    dataset = self._structuredRaggedDataset(structure, shapes, dtype).apply(
-        grouping.window_dataset(len(shapes))).apply(
-            grouping._map_x_dataset(fn))
-    get_next = dataset.make_one_shot_iterator().get_next()
-    with self.cached_session() as sess:
-      expected_shape = np.maximum(np.amax(shapes, axis=0), padded_shape)
-      expected = sess.run(
-          self._structuredElement(
-              structure,
-              np.concatenate((np.int32([len(shapes)]), expected_shape)), dtype))
-      actual = sess.run(get_next)
-      self._assertEqual(expected, actual)
-
-  @parameterized.named_parameters(
-      ("1", np.int32([[1], [2], [3]]), [-1]),
-      ("2", np.int32([[1, 3], [2, 2], [3, 1]]), [-1, -1]),
-      ("3", np.int32([[3, 1, 3], [1, 3, 1]]), [-1, -1, -1]),
-  )
-  def testWindowDatasetPaddedBatchDenseDynamicShape(self, shapes, padded_shape):
-    """Tests padded batching of dynamically shaped dense tensor windows.
-
-    Args:
-      shapes: the input shapes
-      padded_shape: the shape to pad the output to
-    """
-
-    shapes_t = array_ops.placeholder(dtypes.int32)
-    dataset = dataset_ops.Dataset.from_tensor_slices(shapes_t).map(
-        lambda shape: array_ops.zeros(shape, dtype=dtypes.int32)).apply(
-            grouping.window_dataset(len(shapes))).apply(
-                grouping._map_x_dataset(
-                    lambda x: batching.padded_batch_window(x, padded_shape)))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op, {shapes_t: shapes})
-      expected_shape = np.maximum(np.amax(shapes, axis=0), padded_shape)
-      expected = sess.run(
-          self._structuredElement(
-              None, np.concatenate((np.int32([len(shapes)]), expected_shape)),
-              dtypes.int32))
-      actual = sess.run(get_next)
-      self._assertEqual(expected, actual)
-
-  @parameterized.named_parameters(
-      ("1", np.int32([[1]]), np.int32([0])),
-      ("2", np.int32([[10], [20]]), np.int32([15])),
-  )
-  def testWindowDatasetPaddedBatchDenseInvalid(self, shapes, padded_shape):
-    """Tests invalid padded batching of dense tensor windows.
-
-    Args:
-      shapes: the input shapes
-      padded_shape: the shape to pad the output to
-    """
-
-    dataset = dataset_ops.Dataset.from_tensor_slices(shapes).map(
-        lambda shape: array_ops.zeros(shape, dtype=dtypes.int32)).apply(
-            grouping.window_dataset(len(shapes))).apply(
-                grouping._map_x_dataset(
-                    lambda x: batching.padded_batch_window(x, padded_shape)))
-    get_next = dataset.make_one_shot_iterator().get_next()
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-
-  def _structuredRaggedSparseDataset(self, structure, shapes, dtype):
-
-    def map_fn(shape):
-      dense_to_sparse = self._make_dense_to_sparse_fn(False)
-      return dense_to_sparse(array_ops.zeros(shape, dtype=dtype))
-
-    if structure is None:
-      return dataset_ops.Dataset.from_tensor_slices(shapes).map(map_fn)
-    else:
-      return dataset_ops.Dataset.zip(
-          tuple([
-              self._structuredRaggedSparseDataset(substructure, shapes, dtype)
-              for substructure in structure
-          ]))
-
-  def _structuredRaggedSparseElement(self, structure, shapes, dtype,
-                                     padded_shape):
-    if structure is None:
-      dense_shape = np.maximum(np.amax(shapes, axis=0), padded_shape)
-      values = []
-      for shape in shapes:
-        dense_to_sparse = self._make_dense_to_sparse_fn(len(shape) == 0)  # pylint: disable=g-explicit-length-test
-        sparse = dense_to_sparse(array_ops.zeros(shape, dtype=dtype))
-        padded_sparse = sparse_tensor.SparseTensor(sparse.indices,
-                                                   sparse.values, dense_shape)
-        reshaped_sparse = sparse_ops.sparse_reshape(
-            padded_sparse,
-            array_ops.concat([np.array([1], dtype=np.int64), dense_shape], 0))
-        values.append(reshaped_sparse)
-      return sparse_ops.sparse_concat(0, values)
-    else:
-      return tuple([
-          self._structuredRaggedSparseElement(substructure, shapes, dtype,
-                                              padded_shape)
-          for substructure in structure
-      ])
-
-  @parameterized.named_parameters(
-      ("1", None, np.int64([[1], [2], [3]]), dtypes.bool, [-1]),
-      ("2", None, np.int64([[1], [2], [3]]), dtypes.int32, [-1]),
-      ("3", None, np.int64([[1], [2], [3]]), dtypes.float32, [-1]),
-      ("4", None, np.int64([[1], [2], [3]]), dtypes.string, [-1]),
-      ("5", None, np.int64([[1, 3], [2, 2], [3, 1]]), dtypes.int32, [-1, -1]),
-      ("6", None, np.int64([[1, 3, 1], [3, 1, 3]]), dtypes.int32, [-1, -1, -1]),
-      ("7", (None, None, None), np.int64([[1], [2], [3]]), dtypes.int32, [-1]),
-      ("8", (None,
-             (None, None)), np.int64([[1], [2], [3]]), dtypes.int32, [-1]),
-      ("9", None, np.int64([[1], [2], [3]]), dtypes.int32, [-1]),
-      ("10", None, np.int64([[1], [2], [3]]), dtypes.int32, np.int64([10])),
-  )
-  def testWindowDatasetPaddedBatchSparse(self, structure, shapes, dtype,
-                                         padded_shape):
-    """Tests padded batching of sparse tensor windows.
-
-    Args:
-      structure: the input structure
-      shapes: the input shapes
-      dtype: the input data type
-      padded_shape: the shape to pad the output to
-    """
-
-    def fn(*args):
-      if len(args) == 1 and not isinstance(args[0], tuple):
-        return batching.padded_batch_window(args[0], padded_shape)
-
-      return tuple([
-          fn(*arg) if isinstance(arg, tuple) else batching.padded_batch_window(
-              arg, padded_shape) for arg in args
-      ])
-
-    dataset = self._structuredRaggedSparseDataset(
-        structure, shapes, dtype).apply(grouping.window_dataset(
-            len(shapes))).apply(grouping._map_x_dataset(fn))
-    get_next = dataset.make_one_shot_iterator().get_next()
-    with self.cached_session() as sess:
-      expected = sess.run(
-          self._structuredRaggedSparseElement(structure, shapes, dtype,
-                                              padded_shape))
-      actual = sess.run(get_next)
-      self._assertEqual(expected, actual)
-
-  @parameterized.named_parameters(
-      ("1", np.int64([[1], [2], [3]]), [-1]),
-      ("2", np.int64([[1, 3], [2, 2], [3, 1]]), [-1, -1]),
-      ("3", np.int64([[3, 1, 3], [1, 3, 1]]), [-1, -1, -1]),
-  )
-  def testWindowDatasetPaddedBatchSparseDynamicShape(self, shapes,
-                                                     padded_shape):
-    """Tests padded batching of dynamically shaped sparse tensor windows.
-
-    Args:
-      shapes: the input shapes
-      padded_shape: the shape to pad the output to
-    """
-
-    shapes_t = array_ops.placeholder(dtypes.int32)
-    dataset = dataset_ops.Dataset.from_tensor_slices(shapes_t).map(
-        lambda shape: array_ops.zeros(shape, dtype=dtypes.int32)).map(
-            self._make_dense_to_sparse_fn(False)
-        ).apply(grouping.window_dataset(len(shapes))).apply(
-            grouping._map_x_dataset(
-                lambda x: batching.padded_batch_window(x, padded_shape)))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op, {shapes_t: shapes})
-      expected = sess.run(
-          self._structuredRaggedSparseElement(None, shapes, dtypes.int32,
-                                              padded_shape))
-      actual = sess.run(get_next)
-      self._assertEqual(expected, actual)
-
-  @parameterized.named_parameters(
-      ("1", np.int64([[1]]), [0]),
-      ("2", np.int64([[10], [20]]), [15]),
-  )
-  def testWindowDatasetPaddedBatchSparseInvalid(self, shapes, padded_shape):
-    """Tests invalid padded batching of sparse tensor windows.
-
-    Args:
-      shapes: the input shapes
-      padded_shape: the shape to pad the output to
-    """
-
-    dataset = dataset_ops.Dataset.from_tensor_slices(shapes).map(
-        lambda shape: array_ops.zeros(shape, dtype=dtypes.int32)).map(
-            self._make_dense_to_sparse_fn(False)
-        ).apply(grouping.window_dataset(len(shapes))).apply(
-            grouping._map_x_dataset(
-                lambda x: batching.padded_batch_window(x, padded_shape)))
-    get_next = dataset.make_one_shot_iterator().get_next()
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 5cd1ed542b..34dc2379d0 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -16,10 +16,7 @@ py_library(
     srcs = ["counter.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":scan_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/experimental/ops:counter",
     ],
 )
 
@@ -28,12 +25,7 @@ py_library(
     srcs = ["get_single_element.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":grouping",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
-        "//third_party/py/numpy",
+        "//tensorflow/python/data/experimental/ops:get_single_element",
     ],
 )
 
@@ -44,10 +36,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/experimental/ops:iterator_ops",
     ],
 )
 
@@ -58,15 +47,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:random_seed",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/experimental/ops:random_ops",
     ],
 )
 
@@ -79,7 +60,6 @@ py_library(
     deps = [
         ":batching",
         ":interleave_ops",
-        ":optimization",
         ":parsing_ops",
         ":shuffle_ops",
         "//tensorflow/python:constant_op",
@@ -91,6 +71,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:readers",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:readers",
         "//tensorflow/python/data/util:convert",
@@ -106,7 +87,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/experimental/ops:shuffle_ops",
     ],
 )
 
@@ -125,6 +106,7 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:convert",
         "//tensorflow/python/data/util:nest",
@@ -138,8 +120,7 @@ py_library(
     srcs = ["enumerate_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/experimental/ops:enumerate_ops",
     ],
 )
 
@@ -148,10 +129,7 @@ py_library(
     srcs = ["error_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/experimental/ops:error_ops",
     ],
 )
 
@@ -160,16 +138,7 @@ py_library(
     srcs = ["grouping.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:check_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:function",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/experimental/ops:grouping",
     ],
 )
 
@@ -178,30 +147,7 @@ py_library(
     srcs = ["interleave_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":random_ops",
-        "//tensorflow/contrib/stateless",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
-    ],
-)
-
-py_library(
-    name = "optimization",
-    srcs = ["optimization.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/experimental/ops:interleave_ops",
     ],
 )
 
@@ -210,25 +156,7 @@ py_library(
     srcs = ["parsing_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-    ],
-)
-
-py_library(
-    name = "map_defun",
-    srcs = ["map_defun.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/experimental/ops:parsing_ops",
     ],
 )
 
@@ -237,18 +165,7 @@ py_library(
     srcs = ["resampling.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":batching",
-        ":interleave_ops",
-        ":scan_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:logging_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
+        "//tensorflow/python/data/experimental/ops:resampling",
     ],
 )
 
@@ -257,12 +174,7 @@ py_library(
     srcs = ["scan_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:function",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/experimental/ops:scan_ops",
     ],
 )
 
@@ -281,32 +193,12 @@ py_library(
     ],
 )
 
-py_library(
-    name = "stats_ops",
-    srcs = ["stats_ops.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
-    ],
-)
-
 py_library(
     name = "threadpool",
     srcs = ["threadpool.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
-        "//tensorflow/python/eager:context",
+        "//tensorflow/python/data/experimental/ops:threadpool",
     ],
 )
 
@@ -317,11 +209,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/experimental/ops:unique",
     ],
 )
 
@@ -332,20 +220,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-py_library(
-    name = "indexed_dataset_ops",
-    srcs = ["indexed_dataset_ops.py"],
-    deps = [
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/experimental/ops:writers",
     ],
 )
 
@@ -353,11 +228,7 @@ py_library(
     name = "prefetching_ops",
     srcs = ["prefetching_ops.py"],
     deps = [
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/experimental/ops:prefetching_ops",
     ],
 )
 
@@ -370,17 +241,14 @@ py_library(
         ":error_ops",
         ":get_single_element",
         ":grouping",
-        ":indexed_dataset_ops",
         ":interleave_ops",
-        ":map_defun",
-        ":optimization",
         ":prefetching_ops",
+        ":random_ops",
         ":readers",
         ":resampling",
         ":scan_ops",
         ":shuffle_ops",
         ":sliding",
-        ":stats_ops",
         ":threadpool",
         ":unique",
         ":writers",
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index 7a0f221284..8c60459ca8 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -17,134 +17,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from tensorflow.contrib.data.python.ops import get_single_element
-from tensorflow.contrib.data.python.ops import grouping
 from tensorflow.contrib.framework import with_shape
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import convert
+from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import sparse_ops
 from tensorflow.python.util import deprecation
 
 
-def batch_window(dataset):
-  """Batches a window of tensors.
-
-  Args:
-    dataset: the input dataset.
-
-  Returns:
-    A `Tensor` representing the batch of the entire input dataset.
-  """
-  if isinstance(dataset.output_classes, tuple):
-    raise TypeError("Input dataset expected to have a single component")
-  if dataset.output_classes is ops.Tensor:
-    return _batch_dense_window(dataset)
-  elif dataset.output_classes is sparse_tensor.SparseTensor:
-    return _batch_sparse_window(dataset)
-  else:
-    raise TypeError("Unsupported dataset type: %s" % dataset.output_classes)
-
-
-def _batch_dense_window(dataset):
-  """Batches a window of dense tensors."""
-
-  def key_fn(_):
-    return np.int64(0)
-
-  def shape_init_fn(_):
-    return array_ops.shape(first_element)
-
-  def shape_reduce_fn(state, value):
-    check_ops.assert_equal(state, array_ops.shape(value))
-    return state
-
-  def finalize_fn(state):
-    return state
-
-  if dataset.output_shapes.is_fully_defined():
-    shape = dataset.output_shapes
-  else:
-    first_element = get_single_element.get_single_element(dataset.take(1))
-    shape_reducer = grouping.Reducer(shape_init_fn, shape_reduce_fn,
-                                     finalize_fn)
-    shape = get_single_element.get_single_element(
-        dataset.apply(grouping.group_by_reducer(key_fn, shape_reducer)))
-
-  def batch_init_fn(_):
-    batch_shape = array_ops.concat([[0], shape], 0)
-    return gen_array_ops.empty(batch_shape, dtype=dataset.output_types)
-
-  def batch_reduce_fn(state, value):
-    return array_ops.concat([state, [value]], 0)
-
-  batch_reducer = grouping.Reducer(batch_init_fn, batch_reduce_fn, finalize_fn)
-  return get_single_element.get_single_element(
-      dataset.apply(grouping.group_by_reducer(key_fn, batch_reducer)))
-
-
-def _batch_sparse_window(dataset):
-  """Batches a window of sparse tensors."""
-
-  def key_fn(_):
-    return np.int64(0)
-
-  def shape_init_fn(_):
-    return first_element.dense_shape
-
-  def shape_reduce_fn(state, value):
-    check_ops.assert_equal(state, value.dense_shape)
-    return state
-
-  def finalize_fn(state):
-    return state
-
-  if dataset.output_shapes.is_fully_defined():
-    shape = dataset.output_shapes
-  else:
-    first_element = get_single_element.get_single_element(dataset.take(1))
-    shape_reducer = grouping.Reducer(shape_init_fn, shape_reduce_fn,
-                                     finalize_fn)
-    shape = get_single_element.get_single_element(
-        dataset.apply(grouping.group_by_reducer(key_fn, shape_reducer)))
-
-  def batch_init_fn(_):
-    indices_shape = array_ops.concat([[0], [array_ops.size(shape) + 1]], 0)
-    return sparse_tensor.SparseTensor(
-        indices=gen_array_ops.empty(indices_shape, dtype=dtypes.int64),
-        values=constant_op.constant([], shape=[0], dtype=dataset.output_types),
-        dense_shape=array_ops.concat(
-            [np.array([0], dtype=np.int64),
-             math_ops.cast(shape, dtypes.int64)], 0))
-
-  def batch_reduce_fn(state, value):
-    return sparse_ops.sparse_concat(0, [state, value])
-
-  def reshape_fn(value):
-    return sparse_ops.sparse_reshape(
-        value,
-        array_ops.concat([np.array([1], dtype=np.int64), value.dense_shape], 0))
-
-  batch_reducer = grouping.Reducer(batch_init_fn, batch_reduce_fn, finalize_fn)
-  return get_single_element.get_single_element(
-      dataset.map(reshape_fn).apply(
-          grouping.group_by_reducer(key_fn, batch_reducer)))
-
-
+@deprecation.deprecated(
+    None, "Use `tf.data.experimental.dense_to_sparse_batch(...)`.")
 def dense_to_sparse_batch(batch_size, row_shape):
   """A transformation that batches ragged elements into `tf.SparseTensor`s.
 
@@ -187,201 +67,10 @@ def dense_to_sparse_batch(batch_size, row_shape):
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-
-  def _apply_fn(dataset):
-    return _DenseToSparseBatchDataset(dataset, batch_size, row_shape)
-
-  return _apply_fn
-
-
-def padded_batch_window(dataset, padded_shape, padding_value=None):
-  """Batches a window of tensors with padding.
-
-  Args:
-    dataset: the input dataset.
-    padded_shape: (Optional.) `tf.TensorShape` or `tf.int64` vector tensor-like
-      object representing the shape to which the input elements should be padded
-      prior to batching. Any unknown dimensions (e.g. `tf.Dimension(None)` in a
-      `tf.TensorShape` or `-1` in a tensor-like object) will be padded to the
-      maximum size of that dimension in each batch.
-    padding_value: (Optional.) A scalar-shaped `tf.Tensor`, representing the
-      padding value to use. Defaults are `0` for numeric types and the empty
-      string for string types. If `dataset` contains `tf.SparseTensor`, this
-      value is ignored.
-
-  Returns:
-    A `Tensor` representing the batch of the entire input dataset.
-
-  Raises:
-    ValueError: if invalid arguments are provided.
-  """
-  if not issubclass(dataset.output_classes,
-                    (ops.Tensor, sparse_tensor.SparseTensor)):
-    raise TypeError("Input dataset expected to have a single tensor component")
-  if issubclass(dataset.output_classes, (ops.Tensor)):
-    return _padded_batch_dense_window(dataset, padded_shape, padding_value)
-  elif issubclass(dataset.output_classes, (sparse_tensor.SparseTensor)):
-    if padding_value is not None:
-      raise ValueError("Padding value not allowed for sparse tensors")
-    return _padded_batch_sparse_window(dataset, padded_shape)
-  else:
-    raise TypeError("Unsupported dataset type: %s" % dataset.output_classes)
-
-
-def _padded_batch_dense_window(dataset, padded_shape, padding_value=None):
-  """Batches a window of dense tensors with padding."""
-
-  padded_shape = math_ops.cast(
-      convert.partial_shape_to_tensor(padded_shape), dtypes.int32)
-
-  def key_fn(_):
-    return np.int64(0)
-
-  def max_init_fn(_):
-    return padded_shape
-
-  def max_reduce_fn(state, value):
-    """Computes the maximum shape to pad to."""
-    condition = math_ops.reduce_all(
-        math_ops.logical_or(
-            math_ops.less_equal(array_ops.shape(value), padded_shape),
-            math_ops.equal(padded_shape, -1)))
-    assert_op = control_flow_ops.Assert(condition, [
-        "Actual shape greater than padded shape: ",
-        array_ops.shape(value), padded_shape
-    ])
-    with ops.control_dependencies([assert_op]):
-      return math_ops.maximum(state, array_ops.shape(value))
-
-  def finalize_fn(state):
-    return state
-
-  # Compute the padded shape.
-  max_reducer = grouping.Reducer(max_init_fn, max_reduce_fn, finalize_fn)
-  padded_shape = get_single_element.get_single_element(
-      dataset.apply(grouping.group_by_reducer(key_fn, max_reducer)))
-
-  if padding_value is None:
-    if dataset.output_types == dtypes.string:
-      padding_value = ""
-    elif dataset.output_types == dtypes.bool:
-      padding_value = False
-    elif dataset.output_types == dtypes.variant:
-      raise TypeError("Unable to create padding for field of type 'variant'")
-    else:
-      padding_value = 0
-
-  def batch_init_fn(_):
-    batch_shape = array_ops.concat(
-        [np.array([0], dtype=np.int32), padded_shape], 0)
-    return gen_array_ops.empty(batch_shape, dtype=dataset.output_types)
-
-  def batch_reduce_fn(state, value):
-    return array_ops.concat([state, [value]], 0)
-
-  def pad_fn(value):
-    shape = array_ops.shape(value)
-    left = array_ops.zeros_like(shape)
-    right = padded_shape - shape
-    return array_ops.pad(
-        value, array_ops.stack([left, right], 1), constant_values=padding_value)
-
-  batch_reducer = grouping.Reducer(batch_init_fn, batch_reduce_fn, finalize_fn)
-  return get_single_element.get_single_element(
-      dataset.map(pad_fn).apply(
-          grouping.group_by_reducer(key_fn, batch_reducer)))
-
-
-def _padded_batch_sparse_window(dataset, padded_shape):
-  """Batches a window of sparse tensors with padding."""
-
-  def key_fn(_):
-    return np.int64(0)
-
-  def max_init_fn(_):
-    return convert.partial_shape_to_tensor(padded_shape)
-
-  def max_reduce_fn(state, value):
-    """Computes the maximum shape to pad to."""
-    condition = math_ops.reduce_all(
-        math_ops.logical_or(
-            math_ops.less_equal(value.dense_shape, padded_shape),
-            math_ops.equal(padded_shape, -1)))
-    assert_op = control_flow_ops.Assert(condition, [
-        "Actual shape greater than padded shape: ", value.dense_shape,
-        padded_shape
-    ])
-    with ops.control_dependencies([assert_op]):
-      return math_ops.maximum(state, value.dense_shape)
-
-  def finalize_fn(state):
-    return state
-
-  # Compute the padded shape.
-  max_reducer = grouping.Reducer(max_init_fn, max_reduce_fn, finalize_fn)
-  padded_shape = get_single_element.get_single_element(
-      dataset.apply(grouping.group_by_reducer(key_fn, max_reducer)))
-
-  def batch_init_fn(_):
-    indices_shape = array_ops.concat([[0], [array_ops.size(padded_shape) + 1]],
-                                     0)
-    return sparse_tensor.SparseTensor(
-        indices=gen_array_ops.empty(indices_shape, dtype=dtypes.int64),
-        values=constant_op.constant([], shape=[0], dtype=dataset.output_types),
-        dense_shape=array_ops.concat(
-            [np.array([0], dtype=np.int64), padded_shape], 0))
-
-  def batch_reduce_fn(state, value):
-    padded_value = sparse_tensor.SparseTensor(
-        indices=value.indices, values=value.values, dense_shape=padded_shape)
-    reshaped_value = sparse_ops.sparse_reshape(
-        padded_value,
-        array_ops.concat(
-            [np.array([1], dtype=np.int64), padded_value.dense_shape], 0))
-    return sparse_ops.sparse_concat(0, [state, reshaped_value])
-
-  reducer = grouping.Reducer(batch_init_fn, batch_reduce_fn, finalize_fn)
-  return get_single_element.get_single_element(
-      dataset.apply(grouping.group_by_reducer(key_fn, reducer)))
-
-
-class _UnbatchDataset(dataset_ops.UnaryDataset):
-  """A dataset that splits the elements of its input into multiple elements."""
-
-  def __init__(self, input_dataset):
-    """See `unbatch()` for more details."""
-    super(_UnbatchDataset, self).__init__(input_dataset)
-    flat_shapes = nest.flatten(input_dataset.output_shapes)
-    if any(s.ndims == 0 for s in flat_shapes):
-      raise ValueError("Cannot unbatch an input with scalar components.")
-    known_batch_dim = tensor_shape.Dimension(None)
-    for s in flat_shapes:
-      try:
-        known_batch_dim = known_batch_dim.merge_with(s[0])
-      except ValueError:
-        raise ValueError("Cannot unbatch an input whose components have "
-                         "different batch sizes.")
-    self._input_dataset = input_dataset
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.unbatch_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return nest.map_structure(lambda s: s[1:],
-                              self._input_dataset.output_shapes)
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  return batching.dense_to_sparse_batch(batch_size, row_shape)
 
 
+@deprecation.deprecated(None, "Use `tf.data.experimental.unbatch()`.")
 def unbatch():
   """Splits elements of a dataset into multiple elements on the batch dimension.
 
@@ -403,39 +92,7 @@ def unbatch():
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-
-  def _apply_fn(dataset):
-    """Function from `Dataset` to `Dataset` that applies the transformation."""
-    if not sparse.any_sparse(dataset.output_classes):
-      return _UnbatchDataset(dataset)
-
-    # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
-    # are normalized to the rank-1 dense representation, so that the
-    # sparse-oblivious unbatching logic will slice them
-    # appropriately. This leads to a somewhat inefficient re-encoding step
-    # for all SparseTensor components.
-    # TODO(mrry): Consider optimizing this in future
-    # if it turns out to be a bottleneck.
-    def normalize(arg, *rest):
-      if rest:
-        return sparse.serialize_many_sparse_tensors((arg,) + rest)
-      else:
-        return sparse.serialize_many_sparse_tensors(arg)
-
-    normalized_dataset = dataset.map(normalize)
-
-    # NOTE(mrry): Our `map()` has lost information about the sparseness
-    # of any SparseTensor components, so re-apply the structure of the
-    # original dataset.
-    restructured_dataset = _RestructuredDataset(
-        normalized_dataset,
-        dataset.output_types,
-        dataset.output_shapes,
-        dataset.output_classes,
-        allow_unsafe_cast=True)
-    return _UnbatchDataset(restructured_dataset)
-
-  return _apply_fn
+  return batching.unbatch()
 
 
 @deprecation.deprecated(
@@ -514,135 +171,8 @@ def padded_batch_and_drop_remainder(batch_size,
   return _apply_fn
 
 
-class _DenseToSparseBatchDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that batches ragged dense elements into `tf.SparseTensor`s."""
-
-  def __init__(self, input_dataset, batch_size, row_shape):
-    """See `Dataset.dense_to_sparse_batch()` for more details."""
-    super(_DenseToSparseBatchDataset, self).__init__(input_dataset)
-    if not isinstance(input_dataset.output_types, dtypes.DType):
-      raise TypeError("DenseToSparseDataset requires an input whose elements "
-                      "have a single component, whereas the input has %r." %
-                      input_dataset.output_types)
-    self._input_dataset = input_dataset
-    self._batch_size = batch_size
-    self._row_shape = row_shape
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.dense_to_sparse_batch_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._batch_size,
-        row_shape=convert.partial_shape_to_tensor(self._row_shape),
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return sparse_tensor.SparseTensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.vector(None).concatenate(self._row_shape)
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-
-class _RestructuredDataset(dataset_ops.UnaryDataset):
-  """An internal helper for changing the structure and shape of a dataset."""
-
-  def __init__(self,
-               dataset,
-               output_types,
-               output_shapes=None,
-               output_classes=None,
-               allow_unsafe_cast=False):
-    """Creates a new dataset with the given output types and shapes.
-
-    The given `dataset` must have a structure that is convertible:
-    * `dataset.output_types` must be the same as `output_types` module nesting.
-    * Each shape in `dataset.output_shapes` must be compatible with each shape
-      in `output_shapes` (if given).
-
-    Note: This helper permits "unsafe casts" for shapes, equivalent to using
-    `tf.Tensor.set_shape()` where domain-specific knowledge is available.
-
-    Args:
-      dataset: A `Dataset` object.
-      output_types: A nested structure of `tf.DType` objects.
-      output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects.
-        If omitted, the shapes will be inherited from `dataset`.
-      output_classes: (Optional.) A nested structure of class types.
-        If omitted, the class types will be inherited from `dataset`.
-      allow_unsafe_cast: (Optional.) If `True`, the caller may switch the
-        reported output types and shapes of the restructured dataset, e.g. to
-        switch a sparse tensor represented as `tf.variant` to its user-visible
-        type and shape.
-
-    Raises:
-      ValueError: If either `output_types` or `output_shapes` is not compatible
-        with the structure of `dataset`.
-    """
-    super(_RestructuredDataset, self).__init__(dataset)
-    self._input_dataset = dataset
-
-    if not allow_unsafe_cast:
-      # Validate that the types are compatible.
-      output_types = nest.map_structure(dtypes.as_dtype, output_types)
-      flat_original_types = nest.flatten(dataset.output_types)
-      flat_new_types = nest.flatten(output_types)
-      if flat_original_types != flat_new_types:
-        raise ValueError(
-            "Dataset with output types %r cannot be restructured to have "
-            "output types %r" % (dataset.output_types, output_types))
-
-    self._output_types = output_types
-
-    if output_shapes is None:
-      # Inherit shapes from the original `dataset`.
-      self._output_shapes = nest.pack_sequence_as(output_types,
-                                                  nest.flatten(
-                                                      dataset.output_shapes))
-    else:
-      if not allow_unsafe_cast:
-        # Validate that the shapes are compatible.
-        nest.assert_same_structure(output_types, output_shapes)
-        flat_original_shapes = nest.flatten(dataset.output_shapes)
-        flat_new_shapes = nest.flatten_up_to(output_types, output_shapes)
-
-        for original_shape, new_shape in zip(flat_original_shapes,
-                                             flat_new_shapes):
-          if not original_shape.is_compatible_with(new_shape):
-            raise ValueError(
-                "Dataset with output shapes %r cannot be restructured to have "
-                "incompatible output shapes %r" % (dataset.output_shapes,
-                                                   output_shapes))
-      self._output_shapes = nest.map_structure_up_to(
-          output_types, tensor_shape.as_shape, output_shapes)
-    if output_classes is None:
-      # Inherit class types from the original `dataset`.
-      self._output_classes = nest.pack_sequence_as(output_types,
-                                                   nest.flatten(
-                                                       dataset.output_classes))
-    else:
-      self._output_classes = output_classes
-
-  def _as_variant_tensor(self):
-    return self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-
+# TODO(b/116817045): Move this to `tf.data.experimental` when the `with_shape()`
+# function is available in the core.
 def assert_element_shape(expected_shapes):
   """Assert the shape of this `Dataset`.
 
@@ -687,7 +217,8 @@ def assert_element_shape(expected_shapes):
   def _apply_fn(dataset):
     output_shapes = _merge_output_shapes(dataset.output_shapes,
                                          expected_shapes)
-    return _RestructuredDataset(
+    # pylint: disable=protected-access
+    return batching._RestructuredDataset(
         dataset.map(_check_shape),
         dataset.output_types,
         output_shapes=output_shapes,
@@ -696,49 +227,7 @@ def assert_element_shape(expected_shapes):
   return _apply_fn
 
 
-class _MapAndBatchDataset(dataset_ops.MapDataset):
-  """A `Dataset` that maps a function over a batch of elements."""
-
-  def __init__(self, input_dataset, map_func, batch_size, num_parallel_calls,
-               drop_remainder):
-    """See `Dataset.map()` for details."""
-    super(_MapAndBatchDataset, self).__init__(input_dataset, map_func)
-    self._batch_size_t = ops.convert_to_tensor(
-        batch_size, dtype=dtypes.int64, name="batch_size")
-    self._num_parallel_calls_t = ops.convert_to_tensor(
-        num_parallel_calls, dtype=dtypes.int64, name="num_parallel_calls")
-    self._drop_remainder_t = ops.convert_to_tensor(
-        drop_remainder, dtype=dtypes.bool, name="drop_remainder")
-
-    self._batch_size = batch_size
-    self._drop_remainder = drop_remainder
-
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    input_resource = self._input_dataset._as_variant_tensor()
-    return gen_dataset_ops.map_and_batch_dataset_v2(
-        input_resource,
-        self._map_func.captured_inputs,
-        f=self._map_func,
-        batch_size=self._batch_size_t,
-        num_parallel_calls=self._num_parallel_calls_t,
-        drop_remainder=self._drop_remainder_t,
-        **dataset_ops.flat_structure(self))
-    # pylint: enable=protected-access
-
-  @property
-  def output_shapes(self):
-    dim = self._batch_size if self._drop_remainder else None
-    return nest.pack_sequence_as(self._output_shapes, [
-        tensor_shape.vector(dim).concatenate(s)
-        for s in nest.flatten(self._output_shapes)
-    ])
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-
+@deprecation.deprecated(None, "Use `tf.data.experimental.map_and_batch(...)`.")
 def map_and_batch(map_func,
                   batch_size,
                   num_parallel_batches=None,
@@ -779,17 +268,5 @@ def map_and_batch(map_func,
     ValueError: If both `num_parallel_batches` and `num_parallel_calls` are
       specified.
   """
-
-  if num_parallel_batches is None and num_parallel_calls is None:
-    num_parallel_calls = batch_size
-  elif num_parallel_batches is not None and num_parallel_calls is None:
-    num_parallel_calls = batch_size * num_parallel_batches
-  elif num_parallel_batches is not None and num_parallel_calls is not None:
-    raise ValueError("The `num_parallel_batches` and `num_parallel_calls` "
-                     "arguments are mutually exclusive.")
-
-  def _apply_fn(dataset):
-    return _MapAndBatchDataset(dataset, map_func, batch_size,
-                               num_parallel_calls, drop_remainder)
-
-  return _apply_fn
+  return batching.map_and_batch(map_func, batch_size, num_parallel_batches,
+                                drop_remainder, num_parallel_calls)
diff --git a/tensorflow/contrib/data/python/ops/counter.py b/tensorflow/contrib/data/python/ops/counter.py
index 6ef65f9624..4ff5bf3e39 100644
--- a/tensorflow/contrib/data/python/ops/counter.py
+++ b/tensorflow/contrib/data/python/ops/counter.py
@@ -17,13 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import scan_ops
-
-from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.experimental.ops import counter
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
+from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(None, "Use `tf.data.experimental.Counter(...)`.")
 def Counter(start=0, step=1, dtype=dtypes.int64):
   """Creates a `Dataset` that counts from `start` in steps of size `step`.
 
@@ -46,8 +45,4 @@ def Counter(start=0, step=1, dtype=dtypes.int64):
   Returns:
     A `Dataset` of scalar `dtype` elements.
   """
-  with ops.name_scope("counter"):
-    start = ops.convert_to_tensor(start, dtype=dtype, name="start")
-    step = ops.convert_to_tensor(step, dtype=dtype, name="step")
-    return dataset_ops.Dataset.from_tensors(0).repeat(None).apply(
-        scan_ops.scan(start, lambda state, _: (state + step, state)))
+  return counter.Counter(start, step, dtype)
diff --git a/tensorflow/contrib/data/python/ops/enumerate_ops.py b/tensorflow/contrib/data/python/ops/enumerate_ops.py
index 490281e0d2..a21da4d3ec 100644
--- a/tensorflow/contrib/data/python/ops/enumerate_ops.py
+++ b/tensorflow/contrib/data/python/ops/enumerate_ops.py
@@ -17,12 +17,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
+from tensorflow.python.data.experimental.ops import enumerate_ops
+from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(None,
+                        "Use `tf.data.experimental.enumerate_dataset(...)`.")
 def enumerate_dataset(start=0):
   """A transformation that enumerate the elements of a dataset.
 
@@ -49,10 +50,4 @@ def enumerate_dataset(start=0):
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-
-  def _apply_fn(dataset):
-    max_value = np.iinfo(dtypes.int64.as_numpy_dtype).max
-    return dataset_ops.Dataset.zip((dataset_ops.Dataset.range(start, max_value),
-                                    dataset))
-
-  return _apply_fn
+  return enumerate_ops.enumerate_dataset(start)
diff --git a/tensorflow/contrib/data/python/ops/error_ops.py b/tensorflow/contrib/data/python/ops/error_ops.py
index f962e623ee..0559a2e09c 100644
--- a/tensorflow/contrib/data/python/ops/error_ops.py
+++ b/tensorflow/contrib/data/python/ops/error_ops.py
@@ -17,10 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.ops import gen_experimental_dataset_ops
+from tensorflow.python.data.experimental.ops import error_ops
+from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(None, "Use `tf.data.experimental.ignore_errors()`.")
 def ignore_errors():
   """Creates a `Dataset` from another `Dataset` and silently ignores any errors.
 
@@ -43,34 +44,4 @@ def ignore_errors():
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-
-  def _apply_fn(dataset):
-    return _IgnoreErrorsDataset(dataset)
-
-  return _apply_fn
-
-
-class _IgnoreErrorsDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that silently ignores errors when computing its input."""
-
-  def __init__(self, input_dataset):
-    """See `Dataset.ignore_errors()` for details."""
-    super(_IgnoreErrorsDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-
-  def _as_variant_tensor(self):
-    return gen_experimental_dataset_ops.experimental_ignore_errors_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  return error_ops.ignore_errors()
diff --git a/tensorflow/contrib/data/python/ops/get_single_element.py b/tensorflow/contrib/data/python/ops/get_single_element.py
index a6713b017a..58ad9eea90 100644
--- a/tensorflow/contrib/data/python/ops/get_single_element.py
+++ b/tensorflow/contrib/data/python/ops/get_single_element.py
@@ -19,13 +19,13 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import grouping
+from tensorflow.python.data.experimental.ops import get_single_element as experimental_get_single_element
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(None,
+                        "Use `tf.data.experimental.get_single_element(...)`.")
 def get_single_element(dataset):
   """Returns the single element in `dataset` as a nested structure of tensors.
 
@@ -61,18 +61,10 @@ def get_single_element(dataset):
     InvalidArgumentError (at runtime): if `dataset` does not contain exactly
       one element.
   """
-  if not isinstance(dataset, dataset_ops.Dataset):
-    raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
-
-  nested_ret = nest.pack_sequence_as(
-      dataset.output_types, gen_dataset_ops.dataset_to_single_element(
-          dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          **dataset_ops.flat_structure(dataset)))
-  return sparse.deserialize_sparse_tensors(
-      nested_ret, dataset.output_types, dataset.output_shapes,
-      dataset.output_classes)
+  return experimental_get_single_element.get_single_element(dataset)
 
 
+@deprecation.deprecated(None, "Use `tf.data.Dataset.reduce(...)`.")
 def reduce_dataset(dataset, reducer):
   """Returns the result of reducing the `dataset` using `reducer`.
 
@@ -90,11 +82,4 @@ def reduce_dataset(dataset, reducer):
   if not isinstance(dataset, dataset_ops.Dataset):
     raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
 
-  # The sentinel dataset is used in case the reduced dataset is empty.
-  sentinel_dataset = dataset_ops.Dataset.from_tensors(
-      reducer.finalize_func(reducer.init_func(np.int64(0))))
-  reduced_dataset = dataset.apply(
-      grouping.group_by_reducer(lambda x: np.int64(0), reducer))
-
-  return get_single_element(
-      reduced_dataset.concatenate(sentinel_dataset).take(1))
+  return dataset.reduce(reducer.init_func(np.int64(0)), reducer.reduce_func)
diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py
index 7cae33beb3..a99dc2f29a 100644
--- a/tensorflow/contrib/data/python/ops/grouping.py
+++ b/tensorflow/contrib/data/python/ops/grouping.py
@@ -17,20 +17,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import math_ops
+from tensorflow.python.data.experimental.ops import grouping
+from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(None,
+                        "Use `tf.data.experimental.group_by_reducer(...)`.")
 def group_by_reducer(key_func, reducer):
   """A transformation that groups elements and performs a reduction.
 
@@ -52,14 +45,11 @@ def group_by_reducer(key_func, reducer):
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-
-  def _apply_fn(dataset):
-    """Function from `Dataset` to `Dataset` that applies the transformation."""
-    return _GroupByReducerDataset(dataset, key_func, reducer)
-
-  return _apply_fn
+  return grouping.group_by_reducer(key_func, reducer)
 
 
+@deprecation.deprecated(None,
+                        "Use `tf.data.experimental.group_by_window(...)`.")
 def group_by_window(key_func,
                     reduce_func,
                     window_size=None,
@@ -98,27 +88,12 @@ def group_by_window(key_func,
     ValueError: if neither or both of {`window_size`, `window_size_func`} are
       passed.
   """
-  if (window_size is not None and window_size_func or
-      not (window_size is not None or window_size_func)):
-    raise ValueError("Must pass either window_size or window_size_func.")
-
-  if window_size is not None:
-
-    def constant_window_func(unused_key):
-      return ops.convert_to_tensor(window_size, dtype=dtypes.int64)
-
-    window_size_func = constant_window_func
-
-  assert window_size_func is not None
-
-  def _apply_fn(dataset):
-    """Function from `Dataset` to `Dataset` that applies the transformation."""
-    return _GroupByWindowDataset(dataset, key_func, reduce_func,
-                                 window_size_func)
-
-  return _apply_fn
+  return grouping.group_by_window(key_func, reduce_func, window_size,
+                                  window_size_func)
 
 
+@deprecation.deprecated(
+    None, "Use `tf.data.experimental.bucket_by_sequence_length(...)`.")
 def bucket_by_sequence_length(element_length_func,
                               bucket_boundaries,
                               bucket_batch_sizes,
@@ -163,342 +138,12 @@ def bucket_by_sequence_length(element_length_func,
   Raises:
     ValueError: if `len(bucket_batch_sizes) != len(bucket_boundaries) + 1`.
   """
-  with ops.name_scope("bucket_by_seq_length"):
-    if len(bucket_batch_sizes) != (len(bucket_boundaries) + 1):
-      raise ValueError(
-          "len(bucket_batch_sizes) must equal len(bucket_boundaries) + 1")
-
-    batch_sizes = constant_op.constant(bucket_batch_sizes, dtype=dtypes.int64)
-
-    def element_to_bucket_id(*args):
-      """Return int64 id of the length bucket for this element."""
-      seq_length = element_length_func(*args)
-
-      boundaries = list(bucket_boundaries)
-      buckets_min = [np.iinfo(np.int32).min] + boundaries
-      buckets_max = boundaries + [np.iinfo(np.int32).max]
-      conditions_c = math_ops.logical_and(
-          math_ops.less_equal(buckets_min, seq_length),
-          math_ops.less(seq_length, buckets_max))
-      bucket_id = math_ops.reduce_min(array_ops.where(conditions_c))
-
-      return bucket_id
-
-    def window_size_fn(bucket_id):
-      # The window size is set to the batch size for this bucket
-      window_size = batch_sizes[bucket_id]
-      return window_size
-
-    def make_padded_shapes(shapes, none_filler=None):
-      padded = []
-      for shape in nest.flatten(shapes):
-        shape = tensor_shape.TensorShape(shape)
-        shape = [
-            none_filler if d.value is None else d
-            for d in shape
-        ]
-        padded.append(shape)
-      return nest.pack_sequence_as(shapes, padded)
-
-    def batching_fn(bucket_id, grouped_dataset):
-      """Batch elements in dataset."""
-      batch_size = window_size_fn(bucket_id)
-      if no_padding:
-        return grouped_dataset.batch(batch_size)
-      none_filler = None
-      if pad_to_bucket_boundary:
-        err_msg = ("When pad_to_bucket_boundary=True, elements must have "
-                   "length < max(bucket_boundaries).")
-        check = check_ops.assert_less(
-            bucket_id,
-            constant_op.constant(len(bucket_batch_sizes) - 1,
-                                 dtype=dtypes.int64),
-            message=err_msg)
-        with ops.control_dependencies([check]):
-          boundaries = constant_op.constant(bucket_boundaries,
-                                            dtype=dtypes.int64)
-          bucket_boundary = boundaries[bucket_id]
-          none_filler = bucket_boundary - 1
-      shapes = make_padded_shapes(
-          padded_shapes or grouped_dataset.output_shapes,
-          none_filler=none_filler)
-      return grouped_dataset.padded_batch(batch_size, shapes, padding_values)
-
-    def _apply_fn(dataset):
-      return dataset.apply(
-          group_by_window(element_to_bucket_id, batching_fn,
-                          window_size_func=window_size_fn))
-
-    return _apply_fn
-
-
-def _map_x_dataset(map_func):
-  """A transformation that maps `map_func` across its input.
-
-  This transformation is similar to `tf.data.Dataset.map`, but in addition to
-  supporting dense and sparse tensor inputs, it also supports dataset inputs.
-
-  Args:
-    map_func: A function mapping a nested structure of tensors and/or datasets
-      (having shapes and types defined by `self.output_shapes` and
-     `self.output_types`) to another nested structure of tensors and/or
-     datasets.
-
-  Returns:
-    Dataset: A `Dataset`.
-  """
-
-  def _apply_fn(dataset):
-    """Function from `Dataset` to `Dataset` that applies the transformation."""
-    return _MapXDataset(dataset, map_func)
-
-  return _apply_fn
-
-
-# TODO(b/115382007) Remove this once canned reducers move to core.
-def window_dataset(window_size):
-  """A transformation that creates window datasets from the input dataset.
-
-  The resulting datasets will contain `window_size` elements (or
-  `N % window_size` for the last dataset if `window_size` does not divide the
-  number of input elements `N` evenly).
-
-  Args:
-    window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
-      consecutive elements of the input dataset to combine into a window.
-
-  Returns:
-    Dataset: A `Dataset`.
-  """
-
-  def _apply_fn(dataset):
-    return dataset_ops.WindowDataset(
-        dataset,
-        size=window_size,
-        shift=window_size,
-        stride=1,
-        drop_remainder=False)
-
-  return _apply_fn
-
-
-class _GroupByReducerDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that groups its input and performs a reduction."""
-
-  def __init__(self, input_dataset, key_func, reducer):
-    """See `group_by_reducer()` for details."""
-    super(_GroupByReducerDataset, self).__init__(input_dataset)
+  return grouping.bucket_by_sequence_length(
+      element_length_func, bucket_boundaries, bucket_batch_sizes, padded_shapes,
+      padding_values, pad_to_bucket_boundary, no_padding)
 
-    self._input_dataset = input_dataset
 
-    self._make_key_func(key_func, input_dataset)
-    self._make_init_func(reducer.init_func)
-    self._make_reduce_func(reducer.reduce_func, input_dataset)
-    self._make_finalize_func(reducer.finalize_func)
-
-  def _make_key_func(self, key_func, input_dataset):
-    """Make wrapping Defun for key_func."""
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        key_func, "tf.contrib.data.group_by_reducer()", input_dataset)
-    if not (
-        wrapped_func.output_types == dtypes.int64 and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
-      raise ValueError(
-          "`key_func` must return a single tf.int64 tensor. "
-          "Got type=%s and shape=%s"
-          % (wrapped_func.output_types, wrapped_func.output_shapes))
-    self._key_func = wrapped_func.function
-
-  def _make_init_func(self, init_func):
-    """Make wrapping Defun for init_func."""
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        init_func, "tf.contrib.data.group_by_reducer()",
-        input_classes=ops.Tensor, input_shapes=tensor_shape.scalar(),
-        input_types=dtypes.int64)
-    self._init_func = wrapped_func.function
-    self._state_classes = wrapped_func.output_classes
-    self._state_shapes = wrapped_func.output_shapes
-    self._state_types = wrapped_func.output_types
-
-  def _make_reduce_func(self, reduce_func, input_dataset):
-    """Make wrapping Defun for reduce_func."""
-
-    # Iteratively rerun the reduce function until reaching a fixed point on
-    # `self._state_shapes`.
-    need_to_rerun = True
-    while need_to_rerun:
-
-      wrapped_func = dataset_ops.StructuredFunctionWrapper(
-          reduce_func, "tf.contrib.data.group_by_reducer()",
-          input_classes=(self._state_classes, input_dataset.output_classes),
-          input_shapes=(self._state_shapes, input_dataset.output_shapes),
-          input_types=(self._state_types, input_dataset.output_types),
-          add_to_graph=False)
-
-      # Extract and validate class information from the returned values.
-      for new_state_class, state_class in zip(
-          nest.flatten(wrapped_func.output_classes),
-          nest.flatten(self._state_classes)):
-        if not issubclass(new_state_class, state_class):
-          raise TypeError(
-              "The element classes for the new state must match the initial "
-              "state. Expected %s; got %s." %
-              (self._state_classes, wrapped_func.output_classes))
-
-      # Extract and validate type information from the returned values.
-      for new_state_type, state_type in zip(
-          nest.flatten(wrapped_func.output_types),
-          nest.flatten(self._state_types)):
-        if new_state_type != state_type:
-          raise TypeError(
-              "The element types for the new state must match the initial "
-              "state. Expected %s; got %s." %
-              (self._state_types, wrapped_func.output_types))
-
-      # Extract shape information from the returned values.
-      flat_state_shapes = nest.flatten(self._state_shapes)
-      flat_new_state_shapes = nest.flatten(wrapped_func.output_shapes)
-      weakened_state_shapes = [
-          original.most_specific_compatible_shape(new)
-          for original, new in zip(flat_state_shapes, flat_new_state_shapes)
-      ]
-
-      need_to_rerun = False
-      for original_shape, weakened_shape in zip(flat_state_shapes,
-                                                weakened_state_shapes):
-        if original_shape.ndims is not None and (
-            weakened_shape.ndims is None or
-            original_shape.as_list() != weakened_shape.as_list()):
-          need_to_rerun = True
-          break
-
-      if need_to_rerun:
-        self._state_shapes = nest.pack_sequence_as(self._state_shapes,
-                                                   weakened_state_shapes)
-
-    self._reduce_func = wrapped_func.function
-    self._reduce_func.add_to_graph(ops.get_default_graph())
-
-  def _make_finalize_func(self, finalize_func):
-    """Make wrapping Defun for finalize_func."""
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        finalize_func, "tf.contrib.data.group_by_reducer()",
-        input_classes=self._state_classes, input_shapes=self._state_shapes,
-        input_types=self._state_types)
-    self._finalize_func = wrapped_func.function
-    self._output_classes = wrapped_func.output_classes
-    self._output_shapes = wrapped_func.output_shapes
-    self._output_types = wrapped_func.output_types
-
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.group_by_reducer_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._key_func.captured_inputs,
-        self._init_func.captured_inputs,
-        self._reduce_func.captured_inputs,
-        self._finalize_func.captured_inputs,
-        key_func=self._key_func,
-        init_func=self._init_func,
-        reduce_func=self._reduce_func,
-        finalize_func=self._finalize_func,
-        **dataset_ops.flat_structure(self))
-
-
-class _GroupByWindowDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that groups its input and performs a windowed reduction."""
-
-  def __init__(self, input_dataset, key_func, reduce_func, window_size_func):
-    """See `group_by_window()` for details."""
-    super(_GroupByWindowDataset, self).__init__(input_dataset)
-
-    self._input_dataset = input_dataset
-
-    self._make_key_func(key_func, input_dataset)
-    self._make_reduce_func(reduce_func, input_dataset)
-    self._make_window_size_func(window_size_func)
-
-  def _make_window_size_func(self, window_size_func):
-    """Make wrapping Defun for window_size_func."""
-    def window_size_func_wrapper(key):
-      return ops.convert_to_tensor(window_size_func(key), dtype=dtypes.int64)
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        window_size_func_wrapper, "tf.contrib.data.group_by_window()",
-        input_classes=ops.Tensor, input_shapes=tensor_shape.scalar(),
-        input_types=dtypes.int64)
-    if not (
-        wrapped_func.output_types == dtypes.int64 and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
-      raise ValueError(
-          "`window_size_func` must return a single tf.int64 scalar tensor.")
-    self._window_size_func = wrapped_func.function
-
-  def _make_key_func(self, key_func, input_dataset):
-    """Make wrapping Defun for key_func."""
-    def key_func_wrapper(*args):
-      return ops.convert_to_tensor(key_func(*args), dtype=dtypes.int64)
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        key_func_wrapper, "tf.contrib.data.group_by_window()", input_dataset)
-    if not (
-        wrapped_func.output_types == dtypes.int64 and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
-      raise ValueError(
-          "`key_func` must return a single tf.int64 scalar tensor.")
-    self._key_func = wrapped_func.function
-
-  def _make_reduce_func(self, reduce_func, input_dataset):
-    """Make wrapping Defun for reduce_func."""
-    nested_dataset = dataset_ops._NestedDatasetComponent(input_dataset)  # pylint: disable=protected-access
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        reduce_func, "tf.contrib.data.reduce_by_window()",
-        input_classes=(ops.Tensor, nested_dataset),
-        input_shapes=(tensor_shape.scalar(), nested_dataset),
-        input_types=(dtypes.int64, nested_dataset),
-        experimental_nested_dataset_support=True)
-    if not isinstance(
-        wrapped_func.output_classes, dataset_ops._NestedDatasetComponent):  # pylint: disable=protected-access
-      raise TypeError("`reduce_func` must return a `Dataset` object.")
-    self._output_classes = wrapped_func.output_classes.output_classes
-    self._output_types = wrapped_func.output_types.output_types
-    self._output_shapes = wrapped_func.output_shapes.output_shapes
-    self._reduce_func = wrapped_func.function
-
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.group_by_window_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._key_func.captured_inputs,
-        self._reduce_func.captured_inputs,
-        self._window_size_func.captured_inputs,
-        key_func=self._key_func,
-        reduce_func=self._reduce_func,
-        window_size_func=self._window_size_func,
-        **dataset_ops.flat_structure(self))
-
-
-class Reducer(object):
+class Reducer(grouping.Reducer):
   """A reducer is used for reducing a set of elements.
 
   A reducer is represented as a tuple of the three functions:
@@ -507,58 +152,6 @@ class Reducer(object):
     3) finalization function: state => result
   """
 
+  @deprecation.deprecated(None, "Use `tf.data.experimental.Reducer(...)`.")
   def __init__(self, init_func, reduce_func, finalize_func):
-    self._init_func = init_func
-    self._reduce_func = reduce_func
-    self._finalize_func = finalize_func
-
-  @property
-  def init_func(self):
-    return self._init_func
-
-  @property
-  def reduce_func(self):
-    return self._reduce_func
-
-  @property
-  def finalize_func(self):
-    return self._finalize_func
-
-
-class _MapXDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that maps a function over elements in its input."""
-
-  def __init__(self, input_dataset, map_func):
-    """See `map_x_dataset()` for details."""
-    super(_MapXDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        map_func,
-        "tf.contrib.data.map_x_dataset()",
-        input_dataset,
-        experimental_nested_dataset_support=True)
-    self._output_classes = wrapped_func.output_classes
-    self._output_shapes = wrapped_func.output_shapes
-    self._output_types = wrapped_func.output_types
-    self._map_func = wrapped_func.function
-
-  def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-    return gen_dataset_ops.map_dataset(
-        input_t,
-        self._map_func.captured_inputs,
-        f=self._map_func,
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+    super(Reducer, self).__init__(init_func, reduce_func, finalize_func)
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index 1ee9db1aa8..f50da4d429 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -17,20 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib import stateless
-from tensorflow.contrib.data.python.ops import random_ops
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import readers
-from tensorflow.python.data.util import nest
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_experimental_dataset_ops
-from tensorflow.python.ops import math_ops
+from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(None,
+                        "Use `tf.data.experimental.parallel_interleave(...)`.")
 def parallel_interleave(map_func,
                         cycle_length,
                         block_length=1,
@@ -80,12 +72,9 @@ def parallel_interleave(map_func,
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-  def _apply_fn(dataset):
-    return readers.ParallelInterleaveDataset(
-        dataset, map_func, cycle_length, block_length, sloppy,
-        buffer_output_elements, prefetch_input_elements)
-
-  return _apply_fn
+  return interleave_ops.parallel_interleave(
+      map_func, cycle_length, block_length, sloppy, buffer_output_elements,
+      prefetch_input_elements)
 
 
 @deprecation.deprecated(
@@ -139,63 +128,12 @@ def sloppy_interleave(map_func, cycle_length, block_length=1):
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-  def _apply_fn(dataset):
-    return readers.ParallelInterleaveDataset(
-        dataset,
-        map_func,
-        cycle_length,
-        block_length,
-        sloppy=True,
-        buffer_output_elements=None,
-        prefetch_input_elements=None)
-
-  return _apply_fn
-
-
-class _DirectedInterleaveDataset(dataset_ops.Dataset):
-  """A substitute for `Dataset.interleave()` on a fixed list of datasets."""
-
-  def __init__(self, selector_input, data_inputs):
-    self._selector_input = selector_input
-    self._data_inputs = list(data_inputs)
-
-    for data_input in data_inputs[1:]:
-      if (data_input.output_types != data_inputs[0].output_types or
-          data_input.output_classes != data_inputs[0].output_classes):
-        raise TypeError("All datasets must have the same type and class.")
-
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    return (
-        gen_experimental_dataset_ops.experimental_directed_interleave_dataset(
-            self._selector_input._as_variant_tensor(), [
-                data_input._as_variant_tensor()
-                for data_input in self._data_inputs
-            ], **dataset_ops.flat_structure(self)))
-    # pylint: enable=protected-access
-
-  def _inputs(self):
-    return [self._selector_input] + self._data_inputs
-
-  @property
-  def output_classes(self):
-    return self._data_inputs[0].output_classes
-
-  @property
-  def output_shapes(self):
-    ret = self._data_inputs[0].output_shapes
-    for data_input in self._data_inputs[1:]:
-      ret = nest.pack_sequence_as(ret, [
-          ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
-              nest.flatten(ret), nest.flatten(data_input.output_shapes))
-      ])
-    return ret
-
-  @property
-  def output_types(self):
-    return self._data_inputs[0].output_types
+  return interleave_ops.parallel_interleave(
+      map_func, cycle_length, block_length, sloppy=True)
 
 
+@deprecation.deprecated(None,
+                        "Use `tf.data.experimental.sample_from_datasets(...)`.")
 def sample_from_datasets(datasets, weights=None, seed=None):
   """Samples elements at random from the datasets in `datasets`.
 
@@ -219,64 +157,11 @@ def sample_from_datasets(datasets, weights=None, seed=None):
     ValueError: If the `weights` argument is specified and does not match the
       length of the `datasets` element.
   """
-  num_datasets = len(datasets)
-  if not isinstance(weights, dataset_ops.Dataset):
-    if weights is None:
-      # Select inputs with uniform probability.
-      logits = [[1.0] * num_datasets]
-
-    else:
-      # Use the given `weights` as the probability of choosing the respective
-      # input.
-      weights = ops.convert_to_tensor(weights, name="weights")
-      if weights.dtype not in (dtypes.float32, dtypes.float64):
-        raise TypeError("`weights` must be convertible to a tensor of "
-                        "`tf.float32` or `tf.float64` elements.")
-      if not weights.shape.is_compatible_with([num_datasets]):
-        raise ValueError(
-            "`weights` must be a vector of length `len(datasets)`.")
-
-      # The `stateless_multinomial()` op expects log-probabilities, as opposed
-      # to weights.
-      logits = array_ops.expand_dims(math_ops.log(weights, name="logits"), 0)
-
-    # NOTE(mrry): We only specialize when `weights` is not a `Dataset`. When it
-    # is a `Dataset`, it is possible that evaluating it has a side effect the
-    # user depends on.
-    if len(datasets) == 1:
-      return datasets[0]
-
-    def select_dataset_constant_logits(seed):
-      return array_ops.squeeze(
-          stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1])
-
-    selector_input = dataset_ops.MapDataset(
-        random_ops.RandomDataset(seed).batch(2),
-        select_dataset_constant_logits,
-        use_inter_op_parallelism=False)
-
-  else:
-    # Use each element of the given `weights` dataset as the probability of
-    # choosing the respective input.
-
-    # The `stateless_multinomial()` op expects log-probabilities, as opposed to
-    # weights.
-    logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits"))
-
-    def select_dataset_varying_logits(logits, seed):
-      return array_ops.squeeze(
-          stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1])
-
-    logits_and_seeds = dataset_ops.Dataset.zip(
-        (logits_ds, random_ops.RandomDataset(seed).batch(2)))
-    selector_input = dataset_ops.MapDataset(
-        logits_and_seeds,
-        select_dataset_varying_logits,
-        use_inter_op_parallelism=False)
-
-  return _DirectedInterleaveDataset(selector_input, datasets)
+  return interleave_ops.sample_from_datasets(datasets, weights, seed)
 
 
+@deprecation.deprecated(None,
+                        "Use `tf.data.experimental.choose_from_datasets(...)`.")
 def choose_from_datasets(datasets, choice_dataset):
   """Creates a dataset that deterministically chooses elements from `datasets`.
 
@@ -312,10 +197,4 @@ def choose_from_datasets(datasets, choice_dataset):
     TypeError: If the `datasets` or `choice_dataset` arguments have the wrong
       type.
   """
-  if not (choice_dataset.output_types == dtypes.int64
-          and choice_dataset.output_shapes.is_compatible_with(
-              tensor_shape.scalar())
-          and choice_dataset.output_classes == ops.Tensor):
-    raise TypeError("`choice_dataset` must be a dataset of scalar "
-                    "`tf.int64` tensors.")
-  return _DirectedInterleaveDataset(choice_dataset, datasets)
+  return interleave_ops.choose_from_datasets(datasets, choice_dataset)
diff --git a/tensorflow/contrib/data/python/ops/iterator_ops.py b/tensorflow/contrib/data/python/ops/iterator_ops.py
index 18515e21ed..48c325c86f 100644
--- a/tensorflow/contrib/data/python/ops/iterator_ops.py
+++ b/tensorflow/contrib/data/python/ops/iterator_ops.py
@@ -16,15 +16,13 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training import session_run_hook
 
+from tensorflow.python.data.experimental.ops import iterator_ops
+from tensorflow.python.util import deprecation
 
+
+@deprecation.deprecated(
+    None, "Use `tf.data.experimental.make_saveable_from_iterator(...)`.")
 def make_saveable_from_iterator(iterator):
   """Returns a SaveableObject for saving/restore iterator state using Saver.
 
@@ -60,27 +58,10 @@ def make_saveable_from_iterator(iterator):
   Note: Not all iterators support checkpointing yet. Attempting to save the
   state of an unsupported iterator will throw an error.
   """
-  return _Saveable(iterator._iterator_resource)  # pylint: disable=protected-access
-
-
-class _Saveable(saver_lib.BaseSaverBuilder.SaveableObject):
-  """SaveableObject for saving/restoring iterator state."""
+  return iterator_ops.make_saveable_from_iterator(iterator)
 
-  def __init__(self, iterator_resource):
-    serialized_iterator = gen_dataset_ops.serialize_iterator(iterator_resource)
-    specs = [
-        saver_lib.BaseSaverBuilder.SaveSpec(serialized_iterator, "",
-                                            iterator_resource.name + "-state")
-    ]
-    super(_Saveable, self).__init__(iterator_resource, specs,
-                                    iterator_resource.name)
 
-  def restore(self, restored_tensors, unused_restored_shapes):
-    with ops.colocate_with(self.op):
-      return gen_dataset_ops.deserialize_iterator(self.op, restored_tensors[0])
-
-
-class CheckpointInputPipelineHook(session_run_hook.SessionRunHook):
+class CheckpointInputPipelineHook(iterator_ops.CheckpointInputPipelineHook):
   """Checkpoints input pipeline state every N steps or seconds.
 
   This hook saves the state of the iterators in the `Graph` so that when
@@ -125,135 +106,7 @@ class CheckpointInputPipelineHook(session_run_hook.SessionRunHook):
   collector when building the eval graph.
   """
 
+  @deprecation.deprecated(
+      None, "Use `tf.data.experimental.CheckpointInputPipelineHook(...)`.")
   def __init__(self, estimator):
-    """Initializes a `CheckpointInputPipelineHook`.
-
-    Args:
-      estimator: Estimator.
-
-    Raises:
-      ValueError: One of `save_steps` or `save_secs` should be set.
-      ValueError: At most one of saver or scaffold should be set.
-    """
-    # `checkpoint_basename` is "input.ckpt" for non-distributed pipelines or
-    # of the form "input_<task_type>_<task_id>.ckpt" for distributed pipelines.
-    # Note: The default `checkpoint_basename` used by `CheckpointSaverHook` is
-    # "model.ckpt". We intentionally choose the input pipeline checkpoint prefix
-    # to be different to avoid conflicts with the model checkpoint.
-
-    # pylint: disable=protected-access
-    checkpoint_prefix = "input"
-    if estimator._config.num_worker_replicas > 1:
-      # Distributed setting.
-      suffix = "_{}_{}".format(estimator._config.task_type,
-                               estimator._config.task_id)
-      checkpoint_prefix += suffix
-    # pylint: enable=protected-access
-
-    # We use a composition paradigm instead of inheriting from
-    # `CheckpointSaverHook` because `Estimator` does an `isinstance` check
-    # to check whether a `CheckpointSaverHook` is already present in the list
-    # of hooks and if not, adds one. Inheriting from `CheckpointSaverHook`
-    # would thwart this behavior. This hook checkpoints *only the iterators*
-    # and not the graph variables.
-    self._checkpoint_saver_hook = basic_session_run_hooks.CheckpointSaverHook(
-        estimator.model_dir,
-        save_secs=estimator._config.save_checkpoints_secs,  # pylint: disable=protected-access
-        save_steps=estimator._config.save_checkpoints_steps,  # pylint: disable=protected-access
-        checkpoint_basename=checkpoint_prefix + ".ckpt")
-
-    # Name for the protocol buffer file that will contain the list of most
-    # recent checkpoints stored as a `CheckpointState` protocol buffer.
-    # This file, kept in the same directory as the checkpoint files, is
-    # automatically managed by the `Saver` to keep track of recent checkpoints.
-    # The default name used by the `Saver` for this file is "checkpoint". Here
-    # we use the name "checkpoint_<checkpoint_prefix>" so that in case the
-    # `checkpoint_dir` is the same as the model checkpoint directory, there are
-    # no conflicts during restore.
-    self._latest_filename = "checkpoint_" + checkpoint_prefix
-    self._first_run = True
-
-  def begin(self):
-    # Build a Saver that saves all iterators in the `GLOBAL_ITERATORS`
-    # collection if no `Saver` or `Scaffold` is provided.
-    # pylint: disable=protected-access
-    if (self._checkpoint_saver_hook._saver is None and
-        self._checkpoint_saver_hook._scaffold is None):
-      iterators = ops.get_collection(iterator_ops.GLOBAL_ITERATORS)
-      saveables = [_Saveable(i) for i in iterators]
-      self._checkpoint_saver_hook._saver = _CustomSaver(saveables,
-                                                        self._latest_filename)
-    # pylint: enable=protected-access
-    self._checkpoint_saver_hook.begin()
-
-  def _restore_or_save_initial_ckpt(self, session):
-    # Ideally this should be run in after_create_session but is not for the
-    # following reason:
-    # Currently there is no way of enforcing an order of running the
-    # `SessionRunHooks`. Hence it is possible that the `_DatasetInitializerHook`
-    # is run *after* this hook. That is troublesome because
-    # 1. If a checkpoint exists and this hook restores it, the initializer hook
-    #    will override it.
-    # 2. If no checkpoint exists, this hook will try to save an initialized
-    #    iterator which will result in an exception.
-    #
-    # As a temporary fix we enter the following implicit contract between this
-    # hook and the _DatasetInitializerHook.
-    # 1. The _DatasetInitializerHook initializes the iterator in the call to
-    #    after_create_session.
-    # 2. This hook saves the iterator on the first call to `before_run()`, which
-    #    is guaranteed to happen after `after_create_session()` of all hooks
-    #    have been run.
-
-    # Check if there is an existing checkpoint. If so, restore from it.
-    # pylint: disable=protected-access
-    latest_checkpoint_path = checkpoint_management.latest_checkpoint(
-        self._checkpoint_saver_hook._checkpoint_dir,
-        latest_filename=self._latest_filename)
-    if latest_checkpoint_path:
-      self._checkpoint_saver_hook._get_saver().restore(session,
-                                                       latest_checkpoint_path)
-    else:
-      # The checkpoint saved here is the state at step "global_step".
-      # Note: We do not save the GraphDef or MetaGraphDef here.
-      global_step = session.run(self._checkpoint_saver_hook._global_step_tensor)
-      self._checkpoint_saver_hook._save(session, global_step)
-      self._checkpoint_saver_hook._timer.update_last_triggered_step(global_step)
-    # pylint: enable=protected-access
-
-  def before_run(self, run_context):
-    if self._first_run:
-      self._restore_or_save_initial_ckpt(run_context.session)
-      self._first_run = False
-    return self._checkpoint_saver_hook.before_run(run_context)
-
-  def after_run(self, run_context, run_values):
-    self._checkpoint_saver_hook.after_run(run_context, run_values)
-
-  def end(self, session):
-    self._checkpoint_saver_hook.end(session)
-
-
-class _CustomSaver(saver_lib.Saver):
-  """`Saver` with a different default `latest_filename`.
-
-  This is used in the `CheckpointInputPipelineHook` to avoid conflicts with
-  the model ckpt saved by the `CheckpointSaverHook`.
-  """
-
-  def __init__(self, var_list, latest_filename):
-    super(_CustomSaver, self).__init__(var_list)
-    self._latest_filename = latest_filename
-
-  def save(self,
-           sess,
-           save_path,
-           global_step=None,
-           latest_filename=None,
-           meta_graph_suffix="meta",
-           write_meta_graph=True,
-           write_state=True,
-           strip_default_attrs=False):
-    return super(_CustomSaver, self).save(
-        sess, save_path, global_step, latest_filename or self._latest_filename,
-        meta_graph_suffix, write_meta_graph, write_state, strip_default_attrs)
+    super(CheckpointInputPipelineHook, self).__init__(estimator)
diff --git a/tensorflow/contrib/data/python/ops/parsing_ops.py b/tensorflow/contrib/data/python/ops/parsing_ops.py
index cfbba701b0..3aeee9d8e4 100644
--- a/tensorflow/contrib/data/python/ops/parsing_ops.py
+++ b/tensorflow/contrib/data/python/ops/parsing_ops.py
@@ -17,92 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import parsing_ops
+from tensorflow.python.data.experimental.ops import parsing_ops
+from tensorflow.python.util import deprecation
 
 
-class _ParseExampleDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that parses `example` dataset into a `dict` dataset."""
-
-  def __init__(self, input_dataset, features, num_parallel_calls):
-    super(_ParseExampleDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-    if not all(types == dtypes.string
-               for types in nest.flatten(input_dataset.output_types)):
-      raise TypeError("Input dataset should be a dataset of vectors of strings")
-    self._num_parallel_calls = num_parallel_calls
-    # pylint: disable=protected-access
-    self._features = parsing_ops._prepend_none_dimension(features)
-    # sparse_keys and dense_keys come back sorted here.
-    (sparse_keys, sparse_types, dense_keys, dense_types, dense_defaults,
-     dense_shapes) = parsing_ops._features_to_raw_params(
-         self._features, [
-             parsing_ops.VarLenFeature, parsing_ops.SparseFeature,
-             parsing_ops.FixedLenFeature, parsing_ops.FixedLenSequenceFeature
-         ])
-    # TODO(b/112859642): Pass sparse_index and sparse_values for SparseFeature.
-    (_, dense_defaults_vec, sparse_keys, sparse_types, dense_keys, dense_shapes,
-     dense_shape_as_shape) = parsing_ops._process_raw_parameters(
-         None, dense_defaults, sparse_keys, sparse_types, dense_keys,
-         dense_types, dense_shapes)
-    # pylint: enable=protected-access
-    self._sparse_keys = sparse_keys
-    self._sparse_types = sparse_types
-    self._dense_keys = dense_keys
-    self._dense_defaults = dense_defaults_vec
-    self._dense_shapes = dense_shapes
-    self._dense_types = dense_types
-    dense_output_shapes = [
-        self._input_dataset.output_shapes.concatenate(shape)
-        for shape in dense_shape_as_shape
-    ]
-    sparse_output_shapes = [
-        self._input_dataset.output_shapes.concatenate([None])
-        for _ in range(len(sparse_keys))
-    ]
-
-    self._output_shapes = dict(
-        zip(self._dense_keys + self._sparse_keys,
-            dense_output_shapes + sparse_output_shapes))
-    self._output_types = dict(
-        zip(self._dense_keys + self._sparse_keys,
-            self._dense_types + self._sparse_types))
-    self._output_classes = dict(
-        zip(self._dense_keys + self._sparse_keys,
-            [ops.Tensor for _ in range(len(self._dense_defaults))] +
-            [sparse_tensor.SparseTensor for _ in range(len(self._sparse_keys))
-            ]))
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.parse_example_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._num_parallel_calls,
-        self._dense_defaults,
-        self._sparse_keys,
-        self._dense_keys,
-        self._sparse_types,
-        self._dense_shapes,
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-
-# TODO(b/111553342): add arguments names and example names as well.
+@deprecation.deprecated(
+    None, "Use `tf.data.experimental.parse_example_dataset(...)`.")
 def parse_example_dataset(features, num_parallel_calls=1):
   """A transformation that parses `Example` protos into a `dict` of tensors.
 
@@ -130,21 +50,4 @@ def parse_example_dataset(features, num_parallel_calls=1):
   Raises:
     ValueError: if features argument is None.
   """
-  if features is None:
-    raise ValueError("Missing: features was %s." % features)
-
-  def _apply_fn(dataset):
-    """Function from `Dataset` to `Dataset` that applies the transformation."""
-    out_dataset = _ParseExampleDataset(dataset, features, num_parallel_calls)
-    if any([
-        isinstance(feature, parsing_ops.SparseFeature)
-        for _, feature in features.items()
-    ]):
-      # pylint: disable=protected-access
-      # pylint: disable=g-long-lambda
-      out_dataset = out_dataset.map(
-          lambda x: parsing_ops._construct_sparse_tensors_for_sparse_features(
-              features, x), num_parallel_calls=num_parallel_calls)
-    return out_dataset
-
-  return _apply_fn
+  return parsing_ops.parse_example_dataset(features, num_parallel_calls)
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 46f82e453a..adfb390cd9 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -17,321 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import warnings
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.eager import context
-from tensorflow.python.framework import device as framework_device
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
-from tensorflow.python.ops import resource_variable_ops
-
-
-def function_buffering_resource(string_arg,
-                                target_device,
-                                f,
-                                buffer_size,
-                                output_types,
-                                container="",
-                                shared_name=None,
-                                name=None):
-  """Creates a FunctionBufferingResource.
-
-  A FunctionBufferingResource fills up a buffer by calling a function `f` on
-  `target_device`. `f` should take in only a single string argument as input.
-
-  Args:
-    string_arg: The single string argument to the function.
-    target_device: The device to run `f` on.
-    f: The function to be executed.
-    buffer_size: Size of the buffer to be populated.
-    output_types: The output types generated by the function.
-    container: (Optional) string. Defaults to "".
-    shared_name: (Optional) string.
-    name: (Optional) string to name the op.
-
-  Returns:
-    Handle to a FunctionBufferingResource.
-  """
-  if shared_name is None:
-    shared_name = ""
-  return ged_ops.experimental_function_buffering_resource(
-      string_arg=string_arg,
-      target_device=target_device,
-      shared_name=shared_name,
-      f=f,
-      buffer_size=buffer_size,
-      container=container,
-      name=name,
-      output_types=output_types)
-
-
-def function_buffering_resource_get_next(function_buffer_resource,
-                                         output_types,
-                                         name=None):
-  return ged_ops.experimental_function_buffering_resource_get_next(
-      function_buffer_resource=function_buffer_resource,
-      output_types=output_types,
-      name=name)
-
-
-def function_buffering_resource_reset(function_buffer_resource, name=None):
-  return ged_ops.experimental_function_buffering_resource_reset(
-      function_buffer_resource=function_buffer_resource, name=name)
-
-
-# pylint: disable=protected-access
-class _PrefetchToDeviceIterator(object):
-  """A replacement for `tf.data.Iterator` that prefetches to another device.
-
-  Args:
-    input_dataset: The input dataset
-    one_shot: If true, we make a one shot iterator that's already initialized.
-    device: A fully specified device string where we want to prefetch to
-    buffer_size: Size of the prefetching buffer.
-    shared_name: (Optional.) If non-empty, the returned iterator will be
-        shared under the given name across multiple sessions that share the
-        same devices (e.g. when using a remote server).
-
-  Returns:
-    An Iterator type object.
-  """
-
-  def __init__(self,
-               input_dataset,
-               one_shot,
-               device,
-               buffer_size,
-               shared_name=None):
-    self._input_dataset = input_dataset
-    self._get_next_call_count = 0
-    self._one_shot = one_shot
-    if shared_name is None:
-      shared_name = ""
-
-    if self._one_shot:
-      self._input_iterator = input_dataset.make_one_shot_iterator()
-    else:
-      self._input_iterator = iterator_ops.Iterator.from_structure(
-          self._input_dataset.output_types, self._input_dataset.output_shapes,
-          shared_name, self._input_dataset.output_classes)
-    input_iterator_handle = self._input_iterator.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _prefetch_fn(handle):
-      """Prefetches one element from `input_iterator`."""
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, self._input_iterator.output_types,
-          self._input_iterator.output_shapes,
-          self._input_iterator.output_classes)
-      ret = remote_iterator.get_next()
-      return nest.flatten(sparse.serialize_sparse_tensors(ret))
-
-    iterator_device = ged_ops.experimental_iterator_get_device(
-        self._input_iterator._iterator_resource)
-
-    with ops.device(device):
-      self._buffering_resource = function_buffering_resource(
-          f=_prefetch_fn,
-          target_device=iterator_device,
-          string_arg=input_iterator_handle,
-          buffer_size=buffer_size,
-          shared_name=shared_name,
-          output_types=nest.flatten(
-              sparse.as_dense_types(self._input_dataset.output_types,
-                                    self._input_dataset.output_classes)))
-
-    if not self._one_shot:
-      reset_op = function_buffering_resource_reset(self._buffering_resource)
-      with ops.control_dependencies([reset_op]):
-        self._initializer = self._input_iterator.make_initializer(
-            self._input_dataset)
-
-  def get_next(self, name=None):
-    """See `tf.data.Iterator.get_next`."""
-    self._get_next_call_count += 1
-    if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
-      warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
-
-    flat_ret = ged_ops.experimental_function_buffering_resource_get_next(
-        self._buffering_resource,
-        output_types=nest.flatten(
-            sparse.as_dense_types(self.output_types, self.output_classes)),
-        name=name)
-
-    ret = sparse.deserialize_sparse_tensors(
-        nest.pack_sequence_as(self.output_types, flat_ret),
-        self.output_types, self.output_shapes, self.output_classes)
-
-    for tensor, shape in zip(
-        nest.flatten(ret), nest.flatten(self.output_shapes)):
-      if isinstance(tensor, ops.Tensor):
-        tensor.set_shape(shape)
-
-    return ret
-
-  @property
-  def initializer(self):
-    if self._one_shot:
-      raise NotImplementedError("Can't initialize a one_shot_iterator")
-    return self._initializer
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-
-class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
-  """A replacement for `tf.data.Iterator` that prefetches to another device.
-
-  Args:
-    input_dataset: The input dataset
-    one_shot: If true, we make a one shot iterator that's already initialized.
-    device: A fully specified device string where we want to prefetch to
-    buffer_size: Size of the prefetching buffer.
-    shared_name: (Optional.) If non-empty, the returned iterator will be
-        shared under the given name across multiple sessions that share the
-        same devices (e.g. when using a remote server).
-
-  Returns:
-    An Iterator type object.
-  """
-
-  def __init__(self,
-               input_dataset,
-               device,
-               buffer_size):
-    with ops.device("/device:CPU:0"):
-      super(_PrefetchToDeviceEagerIterator, self).__init__(input_dataset)
-      input_iterator_handle = gen_dataset_ops.iterator_to_string_handle(
-          self._resource)
-
-    self._device = device
-
-    @function.Defun(dtypes.string)
-    def _prefetch_fn(handle):
-      """Prefetches one element from `input_iterator`."""
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, self.output_types, self.output_shapes, self.output_classes)
-      ret = remote_iterator.get_next()
-      return nest.flatten(sparse.serialize_sparse_tensors(ret))
-
-    _prefetch_fn.add_to_graph(None)
-
-    with ops.device(device):
-      self._buffering_resource = function_buffering_resource(
-          f=_prefetch_fn,
-          output_types=self._flat_output_types,
-          target_device=ged_ops.experimental_iterator_get_device(
-              self._resource),
-          string_arg=input_iterator_handle,
-          buffer_size=buffer_size,
-          shared_name=iterator_ops._generate_shared_name(
-              "function_buffer_resource"))
-
-  def _next_internal(self):
-    """Returns a nested structure of `tf.Tensor`s containing the next element.
-    """
-    # This runs in sync mode as iterators use an error status to communicate
-    # that there is no more data to iterate over.
-    # TODO(b/77291417): Fix
-    with context.execution_mode(context.SYNC):
-      with ops.device(self._device):
-        ret = ged_ops.experimental_function_buffering_resource_get_next(
-            function_buffer_resource=self._buffering_resource,
-            output_types=self._flat_output_types)
-      return sparse.deserialize_sparse_tensors(
-          nest.pack_sequence_as(self._output_types, ret), self._output_types,
-          self._output_shapes, self._output_classes)
-# pylint: enable=protected-access
-
-
-class _PrefetchToDeviceDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` whose iterator prefetches elements to another device."""
-
-  def __init__(self, input_dataset, device, buffer_size):
-    super(_PrefetchToDeviceDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-    self._device = device
-    self._buffer_size = buffer_size if buffer_size is not None else 1
-
-  # The static analysis cannot tell that the eager iterator's superclass has
-  # a `next()` method.
-  # pylint: disable=non-iterator-returned
-  def __iter__(self):
-    """Creates an `Iterator` for enumerating the elements of this dataset.
-
-    The returned iterator implements the Python iterator protocol and therefore
-    can only be used in eager mode.
-
-    Returns:
-      An `Iterator` over the elements of this dataset.
-
-    Raises:
-      RuntimeError: If eager execution is enabled.
-    """
-    if context.executing_eagerly():
-      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
-                                            self._buffer_size)
-    else:
-      raise RuntimeError("dataset.__iter__() is only supported when eager "
-                         "execution is enabled.")
-  # pylint: enable=non-iterator-returned
-
-  def make_one_shot_iterator(self):
-    if context.executing_eagerly():
-      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
-                                            self._buffer_size)
-    else:
-      return _PrefetchToDeviceIterator(self._input_dataset, one_shot=True,
-                                       device=self._device,
-                                       buffer_size=self._buffer_size)
-
-  def make_initializable_iterator(self, shared_name=None):
-    return _PrefetchToDeviceIterator(
-        self._input_dataset,
-        one_shot=False,
-        device=self._device,
-        buffer_size=self._buffer_size,
-        shared_name=shared_name)
-
-  def _as_variant_tensor(self):
-    # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
-    # transformation methods is called.
-    # TODO(mrry): Investigate support for chaining further transformations after
-    # the prefetch, including GPU support.
-    raise NotImplementedError("`prefetch_to_device()` must be the last "
-                              "transformation in a dataset pipeline.")
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
+from tensorflow.python.data.experimental.ops import prefetching_ops
+from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(None,
+                        "Use `tf.data.experimental.prefetch_to_device(...)`.")
 def prefetch_to_device(device, buffer_size=None):
   """A transformation that prefetches dataset values to the given `device`.
 
@@ -347,12 +38,10 @@ def prefetch_to_device(device, buffer_size=None):
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-  def _apply_fn(dataset):
-    return _PrefetchToDeviceDataset(dataset, device, buffer_size)
-
-  return _apply_fn
+  return prefetching_ops.prefetch_to_device(device, buffer_size)
 
 
+@deprecation.deprecated(None, "Use `tf.data.experimental.copy_to_device(...)`.")
 def copy_to_device(target_device, source_device="/cpu:0"):
   """A transformation that copies dataset elements to the given `target_device`.
 
@@ -364,165 +53,4 @@ def copy_to_device(target_device, source_device="/cpu:0"):
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-
-  def _apply_fn(dataset):
-    return _CopyToDeviceDataset(
-        dataset, target_device=target_device, source_device=source_device)
-
-  return _apply_fn
-
-
-# TODO(rohanj): Use the _input_hostmem attr on the RemoteCall ops to indicate
-# all inputs to the Op are in host memory, thereby avoiding some unnecessary
-# Sends and Recvs.
-class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that copies elements to another device."""
-
-  def __init__(self, input_dataset, target_device, source_device="/cpu:0"):
-    """Constructs a _CopyToDeviceDataset.
-
-    Args:
-      input_dataset: `Dataset` to be copied
-      target_device: The name of the device to which elements would be copied.
-      source_device: Device where input_dataset would be placed.
-    """
-    super(_CopyToDeviceDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-    self._target_device = target_device
-    spec = framework_device.DeviceSpec().from_string(self._target_device)
-    self._is_gpu_target = (spec.device_type == "GPU")
-    self._source_device_string = source_device
-    self._source_device = ops.convert_to_tensor(source_device)
-
-    self._flat_output_shapes = nest.flatten(
-        sparse.as_dense_shapes(self._input_dataset.output_shapes,
-                               self._input_dataset.output_classes))
-    self._flat_output_types = nest.flatten(
-        sparse.as_dense_types(self._input_dataset.output_types,
-                              self._input_dataset.output_classes))
-
-    @function.Defun()
-    def _init_func():
-      """Creates an iterator for the input dataset.
-
-      Returns:
-        A `string` tensor that encapsulates the iterator created.
-      """
-      # pylint: disable=protected-access
-      ds_variant = self._input_dataset._as_variant_tensor()
-      resource = gen_dataset_ops.anonymous_iterator(
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
-      with ops.control_dependencies(
-          [gen_dataset_ops.make_iterator(ds_variant, resource)]):
-        return gen_dataset_ops.iterator_to_string_handle(resource)
-
-    @function.Defun()
-    def _remote_init_func():
-      return functional_ops.remote_call(
-          target=self._source_device,
-          args=_init_func.captured_inputs,
-          Tout=[dtypes.string],
-          f=_init_func)
-
-    self._init_func = _remote_init_func
-    self._init_captured_args = _remote_init_func.captured_inputs
-
-    @function.Defun(dtypes.string)
-    def _next_func(string_handle):
-      """Calls get_next for created iterator.
-
-      Args:
-        string_handle: An iterator string handle created by _init_func
-      Returns:
-        The elements generated from `input_dataset`
-      """
-      with ops.device(self._source_device_string):
-        iterator = iterator_ops.Iterator.from_string_handle(
-            string_handle, self.output_types, self.output_shapes,
-            self.output_classes)
-      ret = iterator.get_next()
-      return nest.flatten(sparse.serialize_sparse_tensors(ret))
-
-    @function.Defun(dtypes.string)
-    def _remote_next_func(string_handle):
-      return functional_ops.remote_call(
-          target=self._source_device,
-          args=[string_handle] + _next_func.captured_inputs,
-          Tout=self._flat_output_types,
-          f=_next_func)
-
-    self._next_func = _remote_next_func
-    self._next_captured_args = _remote_next_func.captured_inputs
-
-    @function.Defun(dtypes.string)
-    def _finalize_func(string_handle):
-      """Destroys the iterator resource created.
-
-      Args:
-        string_handle: An iterator string handle created by _init_func
-      Returns:
-        Tensor constant 0
-      """
-      iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
-          string_handle,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
-      with ops.control_dependencies([
-          resource_variable_ops.destroy_resource_op(
-              iterator_resource, ignore_lookup_error=True)]):
-        return array_ops.constant(0, dtypes.int64)
-
-    @function.Defun(dtypes.string)
-    def _remote_finalize_func(string_handle):
-      return functional_ops.remote_call(
-          target=self._source_device,
-          args=[string_handle] + _finalize_func.captured_inputs,
-          Tout=[dtypes.int64],
-          f=_finalize_func)
-
-    self._finalize_func = _remote_finalize_func
-    self._finalize_captured_args = _remote_finalize_func.captured_inputs
-
-    g = ops.get_default_graph()
-    _remote_init_func.add_to_graph(g)
-    _remote_next_func.add_to_graph(g)
-    _remote_finalize_func.add_to_graph(g)
-    # pylint: enable=protected-scope
-
-  # The one_shot_iterator implementation needs a 0 arg _make_dataset function
-  # that thereby captures all the inputs required to create the dataset. Since
-  # there are strings that are inputs to the GeneratorDataset which can't be
-  # placed on a GPU, this fails for the GPU case. Therefore, disabling it for
-  # GPU
-  def make_one_shot_iterator(self):
-    if self._is_gpu_target:
-      raise ValueError("Cannot create a one shot iterator when using "
-                       "`tf.contrib.data.copy_to_device()` on GPU. Please use "
-                       "`Dataset.make_initializable_iterator()` instead.")
-    else:
-      return super(_CopyToDeviceDataset, self).make_one_shot_iterator()
-
-  def _as_variant_tensor(self):
-    with ops.device(self._target_device):
-      return gen_dataset_ops.generator_dataset(
-          self._init_captured_args,
-          self._next_captured_args,
-          self._finalize_captured_args,
-          init_func=self._init_func,
-          next_func=self._next_func,
-          finalize_func=self._finalize_func,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
+  return prefetching_ops.copy_to_device(target_device, source_device)
diff --git a/tensorflow/contrib/data/python/ops/random_ops.py b/tensorflow/contrib/data/python/ops/random_ops.py
index 344a0763c8..2c95125636 100644
--- a/tensorflow/contrib/data/python/ops/random_ops.py
+++ b/tensorflow/contrib/data/python/ops/random_ops.py
@@ -17,36 +17,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import random_seed
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.data.experimental.ops import random_ops
+from tensorflow.python.util import deprecation
 
 
-class RandomDataset(dataset_ops.DatasetSource):
+class RandomDataset(random_ops.RandomDataset):
   """A `Dataset` of pseudorandom values."""
 
+  @deprecation.deprecated(
+      None, "Use `tf.data.experimental.RandomDataset(...)`.")
   def __init__(self, seed=None):
-    """A `Dataset` of pseudorandom values."""
-    super(RandomDataset, self).__init__()
-    self._seed, self._seed2 = random_seed.get_seed(seed)
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.random_dataset(
-        seed=self._seed,
-        seed2=self._seed2,
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.int64
+    super(RandomDataset, self).__init__(seed)
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 360971e200..4601376dff 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -17,295 +17,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import csv
-
-import numpy as np
-
-from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import interleave_ops
-from tensorflow.contrib.data.python.ops import optimization
-from tensorflow.contrib.data.python.ops import parsing_ops
-from tensorflow.contrib.data.python.ops import shuffle_ops
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
-from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.lib.io import file_io
-from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops
-from tensorflow.python.platform import gfile
 from tensorflow.python.util import deprecation
 
-_ACCEPTABLE_CSV_TYPES = (dtypes.float32, dtypes.float64, dtypes.int32,
-                         dtypes.int64, dtypes.string)
-
-
-def _is_valid_int32(str_val):
-  try:
-    # Checks equality to prevent int32 overflow
-    return dtypes.int32.as_numpy_dtype(str_val) == dtypes.int64.as_numpy_dtype(
-        str_val)
-  except (ValueError, OverflowError):
-    return False
-
-
-def _is_valid_int64(str_val):
-  try:
-    dtypes.int64.as_numpy_dtype(str_val)
-    return True
-  except (ValueError, OverflowError):
-    return False
-
-
-def _is_valid_float(str_val, float_dtype):
-  try:
-    return float_dtype.as_numpy_dtype(str_val) < np.inf
-  except ValueError:
-    return False
-
-
-def _infer_type(str_val, na_value, prev_type):
-  """Given a string, infers its tensor type.
-
-  Infers the type of a value by picking the least 'permissive' type possible,
-  while still allowing the previous type inference for this column to be valid.
-
-  Args:
-    str_val: String value to infer the type of.
-    na_value: Additional string to recognize as a NA/NaN CSV value.
-    prev_type: Type previously inferred based on values of this column that
-      we've seen up till now.
-  Returns:
-    Inferred dtype.
-  """
-  if str_val in ("", na_value):
-    # If the field is null, it gives no extra information about its type
-    return prev_type
-
-  type_list = [
-      dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64, dtypes.string
-  ]  # list of types to try, ordered from least permissive to most
-
-  type_functions = [
-      _is_valid_int32,
-      _is_valid_int64,
-      lambda str_val: _is_valid_float(str_val, dtypes.float32),
-      lambda str_val: _is_valid_float(str_val, dtypes.float64),
-      lambda str_val: True,
-  ]  # Corresponding list of validation functions
-
-  for i in range(len(type_list)):
-    validation_fn = type_functions[i]
-    if validation_fn(str_val) and (prev_type is None or
-                                   prev_type in type_list[:i + 1]):
-      return type_list[i]
-
-
-def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header):
-  """Generator that yields rows of CSV file(s) in order."""
-  for fn in filenames:
-    with file_io.FileIO(fn, "r") as f:
-      rdr = csv.reader(
-          f,
-          delimiter=field_delim,
-          quoting=csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE)
-      if header:
-        next(rdr)  # Skip header lines
-
-      for csv_row in rdr:
-        if len(csv_row) != num_cols:
-          raise ValueError(
-              "Problem inferring types: CSV row has different number of fields "
-              "than expected.")
-        yield csv_row
-
-
-def _infer_column_defaults(filenames, num_cols, field_delim, use_quote_delim,
-                           na_value, header, num_rows_for_inference,
-                           select_columns):
-  """Infers column types from the first N valid CSV records of files."""
-  if select_columns is None:
-    select_columns = range(num_cols)
-  inferred_types = [None] * len(select_columns)
-
-  for i, csv_row in enumerate(
-      _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header)):
-    if num_rows_for_inference is not None and i >= num_rows_for_inference:
-      break
-
-    for j, col_index in enumerate(select_columns):
-      inferred_types[j] = _infer_type(csv_row[col_index], na_value,
-                                      inferred_types[j])
-
-  # Replace None's with a default type
-  inferred_types = [t or dtypes.string for t in inferred_types]
-  # Default to 0 or '' for null values
-  return [
-      constant_op.constant([0 if t is not dtypes.string else ""], dtype=t)
-      for t in inferred_types
-  ]
-
-
-def _infer_column_names(filenames, field_delim, use_quote_delim):
-  """Infers column names from first rows of files."""
-  csv_kwargs = {
-      "delimiter": field_delim,
-      "quoting": csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE
-  }
-  with file_io.FileIO(filenames[0], "r") as f:
-    try:
-      column_names = next(csv.reader(f, **csv_kwargs))
-    except StopIteration:
-      raise ValueError(("Received StopIteration when reading the header line "
-                        "of %s.  Empty file?") % filenames[0])
-
-  for name in filenames[1:]:
-    with file_io.FileIO(name, "r") as f:
-      try:
-        if next(csv.reader(f, **csv_kwargs)) != column_names:
-          raise ValueError(
-              "Files have different column names in the header row.")
-      except StopIteration:
-        raise ValueError(("Received StopIteration when reading the header line "
-                          "of %s.  Empty file?") % filenames[0])
-  return column_names
-
-
-def _get_sorted_col_indices(select_columns, column_names):
-  """Transforms select_columns argument into sorted column indices."""
-  names_to_indices = {n: i for i, n in enumerate(column_names)}
-  num_cols = len(column_names)
-  for i, v in enumerate(select_columns):
-    if isinstance(v, int):
-      if v < 0 or v >= num_cols:
-        raise ValueError(
-            "Column index %d specified in select_columns out of valid range." %
-            v)
-      continue
-    if v not in names_to_indices:
-      raise ValueError(
-          "Value '%s' specified in select_columns not a valid column index or "
-          "name." % v)
-    select_columns[i] = names_to_indices[v]
-
-  # Sort and ensure there are no duplicates
-  result = sorted(set(select_columns))
-  if len(result) != len(select_columns):
-    raise ValueError("select_columns contains duplicate columns")
-  return result
-
-
-def _maybe_shuffle_and_repeat(
-    dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed):
-  """Optionally shuffle and repeat dataset, as requested."""
-  if num_epochs != 1 and shuffle:
-    # Use shuffle_and_repeat for perf
-    return dataset.apply(
-        shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
-                                       shuffle_seed))
-  elif shuffle:
-    return dataset.shuffle(shuffle_buffer_size, shuffle_seed)
-  elif num_epochs != 1:
-    return dataset.repeat(num_epochs)
-  return dataset
-
-
-def make_tf_record_dataset(file_pattern,
-                           batch_size,
-                           parser_fn=None,
-                           num_epochs=None,
-                           shuffle=True,
-                           shuffle_buffer_size=None,
-                           shuffle_seed=None,
-                           prefetch_buffer_size=optimization.AUTOTUNE,
-                           num_parallel_reads=None,
-                           num_parallel_parser_calls=None,
-                           drop_final_batch=False):
-  """Reads and optionally parses TFRecord files into a dataset.
-
-  Provides common functionality such as batching, optional parsing, shuffling,
-  and performant defaults.
-
-  Args:
-    file_pattern: List of files or patterns of TFRecord file paths.
-      See `tf.gfile.Glob` for pattern rules.
-    batch_size: An int representing the number of records to combine
-      in a single batch.
-    parser_fn: (Optional.) A function accepting string input to parse
-      and process the record contents. This function must map records
-      to components of a fixed shape, so they may be batched. By
-      default, uses the record contents unmodified.
-    num_epochs: (Optional.) An int specifying the number of times this
-      dataset is repeated.  If None (the default), cycles through the
-      dataset forever.
-    shuffle: (Optional.) A bool that indicates whether the input
-      should be shuffled. Defaults to `True`.
-    shuffle_buffer_size: (Optional.) Buffer size to use for
-      shuffling. A large buffer size ensures better shuffling, but
-      increases memory usage and startup time.
-    shuffle_seed: (Optional.) Randomization seed to use for shuffling.
-    prefetch_buffer_size: (Optional.) An int specifying the number of
-      feature batches to prefetch for performance improvement.
-      Defaults to auto-tune. Set to 0 to disable prefetching.
-    num_parallel_reads: (Optional.) Number of threads used to read
-      records from files. By default or if set to a value >1, the
-      results will be interleaved.
-    num_parallel_parser_calls: (Optional.) Number of parallel
-      records to parse in parallel. Defaults to an automatic selection.
-    drop_final_batch: (Optional.) Whether the last batch should be
-      dropped in case its size is smaller than `batch_size`; the
-      default behavior is not to drop the smaller batch.
-
-  Returns:
-    A dataset, where each element matches the output of `parser_fn`
-    except it will have an additional leading `batch-size` dimension,
-    or a `batch_size`-length 1-D tensor of strings if `parser_fn` is
-    unspecified.
-  """
-  files = dataset_ops.Dataset.list_files(
-      file_pattern, shuffle=shuffle, seed=shuffle_seed)
-
-  if num_parallel_reads is None:
-    # Note: We considered auto-tuning this value, but there is a concern
-    # that this affects the mixing of records from different files, which
-    # could affect training convergence/accuracy, so we are defaulting to
-    # a constant for now.
-    num_parallel_reads = 24
-  dataset = core_readers.TFRecordDataset(
-      files, num_parallel_reads=num_parallel_reads)
-
-  if shuffle_buffer_size is None:
-    # TODO(josh11b): Auto-tune this value when not specified
-    shuffle_buffer_size = 10000
-  dataset = _maybe_shuffle_and_repeat(
-      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)
-
-  # NOTE(mrry): We set `drop_final_batch=True` when `num_epochs is None` to
-  # improve the shape inference, because it makes the batch dimension static.
-  # It is safe to do this because in that case we are repeating the input
-  # indefinitely, and all batches will be full-sized.
-  drop_final_batch = drop_final_batch or num_epochs is None
-
-  if parser_fn is None:
-    dataset = dataset.batch(batch_size, drop_remainder=drop_final_batch)
-  else:
-    # TODO(josh11b): if num_parallel_parser_calls is None, use some function
-    # of num cores instead of map_and_batch's default behavior of one batch.
-    dataset = dataset.apply(batching.map_and_batch(
-        parser_fn, batch_size, num_parallel_calls=num_parallel_parser_calls,
-        drop_remainder=drop_final_batch))
-
-  if prefetch_buffer_size == 0:
-    return dataset
-  else:
-    return dataset.prefetch(buffer_size=prefetch_buffer_size)
-
 
+@deprecation.deprecated(None,
+                        "Use `tf.data.experimental.make_csv_dataset(...)`.")
 def make_csv_dataset(
     file_pattern,
     batch_size,
@@ -387,7 +112,6 @@ def make_csv_dataset(
     prefetch_buffer_size: An int specifying the number of feature
       batches to prefetch for performance improvement. Recommended value is the
       number of batches consumed per training step. Defaults to auto-tune.
-
     num_parallel_reads: Number of threads used to read CSV records from files.
       If >1, the results will be interleaved.
     sloppy: If `True`, reading performance will be improved at
@@ -411,106 +135,18 @@ def make_csv_dataset(
   Raises:
     ValueError: If any of the arguments is malformed.
   """
-  # Create dataset of all matching filenames
-  filenames = _get_file_names(file_pattern, False)
-  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
-  if shuffle:
-    dataset = dataset.shuffle(len(filenames), shuffle_seed)
-
-  # Clean arguments; figure out column names and defaults
+  return readers.make_csv_dataset(
+      file_pattern, batch_size, column_names, column_defaults, label_name,
+      select_columns, field_delim, use_quote_delim, na_value, header,
+      num_epochs, shuffle, shuffle_buffer_size, shuffle_seed,
+      prefetch_buffer_size, num_parallel_reads, sloppy, num_rows_for_inference,
+      compression_type)
 
-  if column_names is None:
-    if not header:
-      raise ValueError("Cannot infer column names without a header line.")
-    # If column names are not provided, infer from the header lines
-    column_names = _infer_column_names(filenames, field_delim, use_quote_delim)
-  if len(column_names) != len(set(column_names)):
-    raise ValueError("Cannot have duplicate column names.")
 
-  if select_columns is not None:
-    select_columns = _get_sorted_col_indices(select_columns, column_names)
-
-  if column_defaults is not None:
-    column_defaults = [
-        constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
-        for x in column_defaults
-    ]
-  else:
-    # If column defaults are not provided, infer from records at graph
-    # construction time
-    column_defaults = _infer_column_defaults(
-        filenames, len(column_names), field_delim, use_quote_delim, na_value,
-        header, num_rows_for_inference, select_columns)
-
-  if select_columns is not None and len(column_defaults) != len(select_columns):
-    raise ValueError(
-        "If specified, column_defaults and select_columns must have same "
-        "length."
-    )
-  if select_columns is not None and len(column_names) > len(select_columns):
-    # Pick the relevant subset of column names
-    column_names = [column_names[i] for i in select_columns]
-
-  if label_name is not None and label_name not in column_names:
-    raise ValueError("`label_name` provided must be one of the columns.")
-
-  def filename_to_dataset(filename):
-    return CsvDataset(
-        filename,
-        record_defaults=column_defaults,
-        field_delim=field_delim,
-        use_quote_delim=use_quote_delim,
-        na_value=na_value,
-        select_cols=select_columns,
-        header=header,
-        compression_type=compression_type,
-    )
-
-  def map_fn(*columns):
-    """Organizes columns into a features dictionary.
-
-    Args:
-      *columns: list of `Tensor`s corresponding to one csv record.
-    Returns:
-      An OrderedDict of feature names to values for that particular record. If
-      label_name is provided, extracts the label feature to be returned as the
-      second element of the tuple.
-    """
-    features = collections.OrderedDict(zip(column_names, columns))
-    if label_name is not None:
-      label = features.pop(label_name)
-      return features, label
-    return features
-
-  # Read files sequentially (if num_parallel_reads=1) or in parallel
-  dataset = dataset.apply(
-      interleave_ops.parallel_interleave(
-          filename_to_dataset, cycle_length=num_parallel_reads, sloppy=sloppy))
-
-  dataset = _maybe_shuffle_and_repeat(
-      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)
-
-  # Apply batch before map for perf, because map has high overhead relative
-  # to the size of the computation in each map.
-  # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to
-  # improve the shape inference, because it makes the batch dimension static.
-  # It is safe to do this because in that case we are repeating the input
-  # indefinitely, and all batches will be full-sized.
-  dataset = dataset.batch(batch_size=batch_size,
-                          drop_remainder=num_epochs is None)
-  dataset = dataset_ops.MapDataset(
-      dataset, map_fn, use_inter_op_parallelism=False)
-  dataset = dataset.prefetch(prefetch_buffer_size)
-
-  return dataset
-
-
-_DEFAULT_READER_BUFFER_SIZE_BYTES = 4 * 1024 * 1024  # 4 MB
-
-
-class CsvDataset(dataset_ops.DatasetSource):
+class CsvDataset(readers.CsvDataset):
   """A Dataset comprising lines from one or more CSV files."""
 
+  @deprecation.deprecated(None, "Use `tf.data.experimental.CsvDataset(...)`.")
   def __init__(self,
                filenames,
                record_defaults,
@@ -521,140 +157,13 @@ class CsvDataset(dataset_ops.DatasetSource):
                use_quote_delim=True,
                na_value="",
                select_cols=None):
-    """Creates a `CsvDataset` by reading and decoding CSV files.
-
-    The elements of this dataset correspond to records from the file(s).
-    RFC 4180 format is expected for CSV files
-    (https://tools.ietf.org/html/rfc4180)
-    Note that we allow leading and trailing spaces with int or float field.
-
-
-    For example, suppose we have a file 'my_file0.csv' with four CSV columns of
-    different data types:
-    ```
-    abcdefg,4.28E10,5.55E6,12
-    hijklmn,-5.3E14,,2
-    ```
-
-    We can construct a CsvDataset from it as follows:
-    ```python
-    dataset = tf.contrib.data.CsvDataset(
-      "my_file*.csv",
-      [tf.float32,  # Required field, use dtype or empty tensor
-       tf.constant([0.0], dtype=tf.float32),  # Optional field, default to 0.0
-       tf.int32,  # Required field, use dtype or empty tensor
-       ],
-      select_cols=[1,2,3]  # Only parse last three columns
-    )
-    ```
-
-    The expected output of its iterations is:
-    ```python
-    next_element = dataset.make_one_shot_iterator().get_next()
-    with tf.Session() as sess:
-      while True:
-        try:
-          print(sess.run(next_element))
-        except tf.errors.OutOfRangeError:
-          break
-
-    >> (4.28e10, 5.55e6, 12)
-    >> (-5.3e14, 0.0, 2)
-    ```
-
-    Args:
-      filenames: A `tf.string` tensor containing one or more filenames.
-      record_defaults: A list of default values for the CSV fields. Each item in
-        the list is either a valid CSV `DType` (float32, float64, int32, int64,
-        string), or a `Tensor` object with one of the above types. One per
-        column of CSV data, with either a scalar `Tensor` default value for the
-        column if it is optional, or `DType` or empty `Tensor` if required. If
-        both this and `select_columns` are specified, these must have the same
-        lengths, and `column_defaults` is assumed to be sorted in order of
-        increasing column index.
-      compression_type: (Optional.) A `tf.string` scalar evaluating to one of
-        `""` (no compression), `"ZLIB"`, or `"GZIP"`. Defaults to no
-        compression.
-      buffer_size: (Optional.) A `tf.int64` scalar denoting the number of bytes
-        to buffer while reading files. Defaults to 4MB.
-      header: (Optional.) A `tf.bool` scalar indicating whether the CSV file(s)
-        have header line(s) that should be skipped when parsing. Defaults to
-        `False`.
-      field_delim: (Optional.) A `tf.string` scalar containing the delimiter
-        character that separates fields in a record. Defaults to `","`.
-      use_quote_delim: (Optional.) A `tf.bool` scalar. If `False`, treats
-        double quotation marks as regular characters inside of string fields
-        (ignoring RFC 4180, Section 2, Bullet 5). Defaults to `True`.
-      na_value: (Optional.) A `tf.string` scalar indicating a value that will
-        be treated as NA/NaN.
-      select_cols: (Optional.) A sorted list of column indices to select from
-        the input data. If specified, only this subset of columns will be
-        parsed. Defaults to parsing all columns.
-    """
-    super(CsvDataset, self).__init__()
-    self._filenames = ops.convert_to_tensor(
-        filenames, dtype=dtypes.string, name="filenames")
-    self._compression_type = convert.optional_param_to_tensor(
-        "compression_type",
-        compression_type,
-        argument_default="",
-        argument_dtype=dtypes.string)
-    record_defaults = [
-        constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
-        for x in record_defaults
-    ]
-    self._record_defaults = ops.convert_n_to_tensor(
-        record_defaults, name="record_defaults")
-    self._buffer_size = convert.optional_param_to_tensor(
-        "buffer_size", buffer_size, _DEFAULT_READER_BUFFER_SIZE_BYTES)
-    self._header = ops.convert_to_tensor(
-        header, dtype=dtypes.bool, name="header")
-    self._field_delim = ops.convert_to_tensor(
-        field_delim, dtype=dtypes.string, name="field_delim")
-    self._use_quote_delim = ops.convert_to_tensor(
-        use_quote_delim, dtype=dtypes.bool, name="use_quote_delim")
-    self._na_value = ops.convert_to_tensor(
-        na_value, dtype=dtypes.string, name="na_value")
-    self._select_cols = convert.optional_param_to_tensor(
-        "select_cols",
-        select_cols,
-        argument_default=[],
-        argument_dtype=dtypes.int64,
-    )
-    self._output_shapes = tuple(
-        tensor_shape.scalar() for _ in range(len(record_defaults)))
-    self._output_types = tuple(d.dtype for d in self._record_defaults)
-    self._output_classes = tuple(
-        ops.Tensor for _ in range(len(record_defaults)))
-
-  def _as_variant_tensor(self):
-    # Constructs graph node for the dataset op.
-    return gen_experimental_dataset_ops.experimental_csv_dataset(
-        filenames=self._filenames,
-        record_defaults=self._record_defaults,
-        buffer_size=self._buffer_size,
-        header=self._header,
-        output_shapes=self._output_shapes,
-        field_delim=self._field_delim,
-        use_quote_delim=self._use_quote_delim,
-        na_value=self._na_value,
-        select_cols=self._select_cols,
-        compression_type=self._compression_type,
-    )
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_classes(self):
-    return self._output_classes
+    super(CsvDataset, self).__init__(
+        filenames, record_defaults, compression_type, buffer_size, header,
+        field_delim, use_quote_delim, na_value, select_cols)
 
 
+@deprecation.deprecated(
+    None, "Use `tf.data.experimental.make_batched_features_dataset(...)`.")
 def make_batched_features_dataset(file_pattern,
                                   batch_size,
                                   features,
@@ -759,57 +268,15 @@ def make_batched_features_dataset(file_pattern,
   Raises:
     ValueError: If `label_key` is not one of the `features` keys.
   """
-  # Create dataset of all matching filenames
-  filenames = _get_file_names(file_pattern, False)
-  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
-  if shuffle:
-    dataset = dataset.shuffle(len(filenames), shuffle_seed)
-
-  # Read `Example` records from files as tensor objects.
-  if reader_args is None:
-    reader_args = []
+  return readers.make_batched_features_dataset(
+      file_pattern, batch_size, features, reader, label_key, reader_args,
+      num_epochs, shuffle, shuffle_buffer_size, shuffle_seed,
+      prefetch_buffer_size, reader_num_threads, parser_num_threads,
+      sloppy_ordering, drop_final_batch)
 
-  # Read files sequentially (if reader_num_threads=1) or in parallel
-  dataset = dataset.apply(
-      interleave_ops.parallel_interleave(
-          lambda filename: reader(filename, *reader_args),
-          cycle_length=reader_num_threads,
-          sloppy=sloppy_ordering))
 
-  # Extract values if the `Example` tensors are stored as key-value tuples.
-  if dataset.output_types == (dtypes.string, dtypes.string):
-    dataset = dataset_ops.MapDataset(
-        dataset, lambda _, v: v, use_inter_op_parallelism=False)
-
-  # Apply dataset repeat and shuffle transformations.
-  dataset = _maybe_shuffle_and_repeat(
-      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)
-
-  # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to
-  # improve the shape inference, because it makes the batch dimension static.
-  # It is safe to do this because in that case we are repeating the input
-  # indefinitely, and all batches will be full-sized.
-  dataset = dataset.batch(
-      batch_size, drop_remainder=drop_final_batch or num_epochs is None)
-
-  # Parse `Example` tensors to a dictionary of `Feature` tensors.
-  dataset = dataset.apply(
-      parsing_ops.parse_example_dataset(
-          features, num_parallel_calls=parser_num_threads))
-
-  if label_key:
-    if label_key not in features:
-      raise ValueError(
-          "The `label_key` provided (%r) must be one of the `features` keys." %
-          label_key)
-    dataset = dataset.map(lambda x: (x, x.pop(label_key)))
-
-  dataset = dataset.prefetch(prefetch_buffer_size)
-  return dataset
-
-
-@deprecation.deprecated(None,
-                        "Use `tf.contrib.data.make_batched_features_dataset`")
+@deprecation.deprecated(
+    None, "Use `tf.data.experimental.make_batched_features_dataset(...)`")
 def read_batch_features(file_pattern,
                         batch_size,
                         features,
@@ -879,7 +346,7 @@ def read_batch_features(file_pattern,
   Returns:
     A dict from keys in features to `Tensor` or `SparseTensor` objects.
   """
-  dataset = make_batched_features_dataset(
+  dataset = readers.make_batched_features_dataset(
       file_pattern,
       batch_size,
       features,
@@ -893,96 +360,13 @@ def read_batch_features(file_pattern,
   return outputs
 
 
-def _get_file_names(file_pattern, shuffle):
-  """Parse list of file names from pattern, optionally shuffled.
-
-  Args:
-    file_pattern: File glob pattern, or list of glob patterns.
-    shuffle: Whether to shuffle the order of file names.
-
-  Returns:
-    List of file names matching `file_pattern`.
-
-  Raises:
-    ValueError: If `file_pattern` is empty, or pattern matches no files.
-  """
-  if isinstance(file_pattern, list):
-    if not file_pattern:
-      raise ValueError("File pattern is empty.")
-    file_names = []
-    for entry in file_pattern:
-      file_names.extend(gfile.Glob(entry))
-  else:
-    file_names = list(gfile.Glob(file_pattern))
-
-  if not file_names:
-    raise ValueError("No files match %s." % file_pattern)
-
-  # Sort files so it will be deterministic for unit tests.
-  if not shuffle:
-    file_names = sorted(file_names)
-  return file_names
-
-
-class SqlDataset(dataset_ops.DatasetSource):
+class SqlDataset(readers.SqlDataset):
   """A `Dataset` consisting of the results from a SQL query."""
 
+  @deprecation.deprecated(None, "Use `tf.data.experimental.SqlDataset(...)`.")
   def __init__(self, driver_name, data_source_name, query, output_types):
-    """Creates a `SqlDataset`.
-
-    `SqlDataset` allows a user to read data from the result set of a SQL query.
-    For example:
-
-    ```python
-    dataset = tf.contrib.data.SqlDataset("sqlite", "/foo/bar.sqlite3",
-                                         "SELECT name, age FROM people",
-                                         (tf.string, tf.int32))
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-    # Prints the rows of the result set of the above query.
-    while True:
-      try:
-        print(sess.run(next_element))
-      except tf.errors.OutOfRangeError:
-        break
-    ```
-
-    Args:
-      driver_name: A 0-D `tf.string` tensor containing the database type.
-        Currently, the only supported value is 'sqlite'.
-      data_source_name: A 0-D `tf.string` tensor containing a connection string
-        to connect to the database.
-      query: A 0-D `tf.string` tensor containing the SQL query to execute.
-      output_types: A tuple of `tf.DType` objects representing the types of the
-        columns returned by `query`.
-    """
-    super(SqlDataset, self).__init__()
-    self._driver_name = ops.convert_to_tensor(
-        driver_name, dtype=dtypes.string, name="driver_name")
-    self._data_source_name = ops.convert_to_tensor(
-        data_source_name, dtype=dtypes.string, name="data_source_name")
-    self._query = ops.convert_to_tensor(
-        query, dtype=dtypes.string, name="query")
-    self._output_types = output_types
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.sql_dataset(self._driver_name,
-                                       self._data_source_name, self._query,
-                                       nest.flatten(self.output_types),
-                                       nest.flatten(self.output_shapes))
-
-  @property
-  def output_classes(self):
-    return nest.map_structure(lambda _: ops.Tensor, self._output_types)
-
-  @property
-  def output_shapes(self):
-    return nest.map_structure(lambda _: tensor_shape.TensorShape([]),
-                              self._output_types)
-
-  @property
-  def output_types(self):
-    return self._output_types
+    super(SqlDataset, self).__init__(
+        driver_name, data_source_name, query, output_types)
 
 
 class LMDBDataset(dataset_ops.DatasetSource):
diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py
index 75642f143e..29d77528d9 100644
--- a/tensorflow/contrib/data/python/ops/resampling.py
+++ b/tensorflow/contrib/data/python/ops/resampling.py
@@ -17,22 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import interleave_ops
-from tensorflow.contrib.data.python.ops import scan_ops
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import logging_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
+from tensorflow.python.data.experimental.ops import resampling
+from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(None,
+                        "Use `tf.data.experimental.rejection_resample(...)`.")
 def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
   """A transformation that resamples a dataset to achieve a target distribution.
 
@@ -52,243 +42,5 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-  def _apply_fn(dataset):
-    """Function from `Dataset` to `Dataset` that applies the transformation."""
-    target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist")
-    class_values_ds = dataset.map(class_func)
-
-    # Get initial distribution.
-    if initial_dist is not None:
-      initial_dist_t = ops.convert_to_tensor(initial_dist, name="initial_dist")
-      acceptance_dist, prob_of_original = (
-          _calculate_acceptance_probs_with_mixing(initial_dist_t,
-                                                  target_dist_t))
-      initial_dist_ds = dataset_ops.Dataset.from_tensors(
-          initial_dist_t).repeat()
-      acceptance_dist_ds = dataset_ops.Dataset.from_tensors(
-          acceptance_dist).repeat()
-      prob_of_original_ds = dataset_ops.Dataset.from_tensors(
-          prob_of_original).repeat()
-    else:
-      initial_dist_ds = _estimate_initial_dist_ds(
-          target_dist_t, class_values_ds)
-      acceptance_and_original_prob_ds = initial_dist_ds.map(
-          lambda initial: _calculate_acceptance_probs_with_mixing(
-              initial, target_dist_t))
-      acceptance_dist_ds = acceptance_and_original_prob_ds.map(
-          lambda accept_prob, _: accept_prob)
-      prob_of_original_ds = acceptance_and_original_prob_ds.map(
-          lambda _, prob_original: prob_original)
-    filtered_ds = _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds,
-                             class_values_ds, seed)
-    # Prefetch filtered dataset for speed.
-    filtered_ds = filtered_ds.prefetch(3)
-
-    prob_original_static = _get_prob_original_static(
-        initial_dist_t, target_dist_t) if initial_dist is not None else None
-    if prob_original_static == 1:
-      return dataset_ops.Dataset.zip((class_values_ds, dataset))
-    elif prob_original_static == 0:
-      return filtered_ds
-    else:
-      return interleave_ops.sample_from_datasets(
-          [dataset_ops.Dataset.zip((class_values_ds, dataset)), filtered_ds],
-          weights=prob_of_original_ds.map(lambda prob: [(prob, 1.0 - prob)]),
-          seed=seed)
-
-  return _apply_fn
-
-
-def _get_prob_original_static(initial_dist_t, target_dist_t):
-  """Returns the static probability of sampling from the original.
-
-  `tensor_util.constant_value(prob_of_original)` returns `None` if it encounters
-  an Op that it isn't defined for. We have some custom logic to avoid this.
-
-  Args:
-    initial_dist_t: A tensor of the initial distribution.
-    target_dist_t: A tensor of the target distribution.
-
-  Returns:
-    The probability of sampling from the original distribution as a constant,
-    if it is a constant, or `None`.
-  """
-  init_static = tensor_util.constant_value(initial_dist_t)
-  target_static = tensor_util.constant_value(target_dist_t)
-
-  if init_static is None or target_static is None:
-    return None
-  else:
-    return np.min(target_static / init_static)
-
-
-def _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, class_values_ds,
-               seed):
-  """Filters a dataset based on per-class acceptance probabilities.
-
-  Args:
-    dataset: The dataset to be filtered.
-    acceptance_dist_ds: A dataset of acceptance probabilities.
-    initial_dist_ds: A dataset of the initial probability distribution, given or
-        estimated.
-    class_values_ds: A dataset of the corresponding classes.
-    seed: (Optional.) Python integer seed for the resampler.
-
-  Returns:
-    A dataset of (class value, data) after filtering.
-  """
-  def maybe_warn_on_large_rejection(accept_dist, initial_dist):
-    proportion_rejected = math_ops.reduce_sum((1 - accept_dist) * initial_dist)
-    return control_flow_ops.cond(
-        math_ops.less(proportion_rejected, .5),
-        lambda: accept_dist,
-        lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
-            accept_dist, [proportion_rejected, initial_dist, accept_dist],
-            message="Proportion of examples rejected by sampler is high: ",
-            summarize=100,
-            first_n=10))
-
-  acceptance_dist_ds = (dataset_ops.Dataset.zip((acceptance_dist_ds,
-                                                 initial_dist_ds))
-                        .map(maybe_warn_on_large_rejection))
-
-  def _gather_and_copy(class_val, acceptance_prob, data):
-    return class_val, array_ops.gather(acceptance_prob, class_val), data
-
-  current_probabilities_and_class_and_data_ds = dataset_ops.Dataset.zip(
-      (class_values_ds, acceptance_dist_ds, dataset)).map(_gather_and_copy)
-  filtered_ds = (
-      current_probabilities_and_class_and_data_ds
-      .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p))
-  return filtered_ds.map(lambda class_value, _, data: (class_value, data))
-
-
-def _estimate_initial_dist_ds(
-    target_dist_t, class_values_ds, dist_estimation_batch_size=32,
-    smoothing_constant=10):
-  num_classes = (target_dist_t.shape[0].value or
-                 array_ops.shape(target_dist_t)[0])
-  initial_examples_per_class_seen = array_ops.fill(
-      [num_classes], np.int64(smoothing_constant))
-
-  def update_estimate_and_tile(num_examples_per_class_seen, c):
-    updated_examples_per_class_seen, dist = _estimate_data_distribution(
-        c, num_examples_per_class_seen)
-    tiled_dist = array_ops.tile(
-        array_ops.expand_dims(dist, 0), [dist_estimation_batch_size, 1])
-    return updated_examples_per_class_seen, tiled_dist
-
-  initial_dist_ds = (class_values_ds.batch(dist_estimation_batch_size)
-                     .apply(scan_ops.scan(initial_examples_per_class_seen,
-                                          update_estimate_and_tile))
-                     .apply(batching.unbatch()))
-
-  return initial_dist_ds
-
-
-def _get_target_to_initial_ratio(initial_probs, target_probs):
-  # Add tiny to initial_probs to avoid divide by zero.
-  denom = (initial_probs + np.finfo(initial_probs.dtype.as_numpy_dtype).tiny)
-  return target_probs / denom
-
-
-def _estimate_data_distribution(c, num_examples_per_class_seen):
-  """Estimate data distribution as labels are seen.
-
-  Args:
-    c: The class labels.  Type `int32`, shape `[batch_size]`.
-    num_examples_per_class_seen: Type `int64`, shape `[num_classes]`,
-      containing counts.
-
-  Returns:
-    num_examples_per_lass_seen: Updated counts.  Type `int64`, shape
-      `[num_classes]`.
-    dist: The updated distribution.  Type `float32`, shape `[num_classes]`.
-  """
-  num_classes = num_examples_per_class_seen.get_shape()[0].value
-  # Update the class-count based on what labels are seen in batch.
-  num_examples_per_class_seen = math_ops.add(
-      num_examples_per_class_seen, math_ops.reduce_sum(
-          array_ops.one_hot(c, num_classes, dtype=dtypes.int64), 0))
-  init_prob_estimate = math_ops.truediv(
-      num_examples_per_class_seen,
-      math_ops.reduce_sum(num_examples_per_class_seen))
-  dist = math_ops.cast(init_prob_estimate, dtypes.float32)
-  return num_examples_per_class_seen, dist
-
-
-def _calculate_acceptance_probs_with_mixing(initial_probs, target_probs):
-  """Calculates the acceptance probabilities and mixing ratio.
-
-  In this case, we assume that we can *either* sample from the original data
-  distribution with probability `m`, or sample from a reshaped distribution
-  that comes from rejection sampling on the original distribution. This
-  rejection sampling is done on a per-class basis, with `a_i` representing the
-  probability of accepting data from class `i`.
-
-  This method is based on solving the following analysis for the reshaped
-  distribution:
-
-  Let F be the probability of a rejection (on any example).
-  Let p_i be the proportion of examples in the data in class i (init_probs)
-  Let a_i is the rate the rejection sampler should *accept* class i
-  Let t_i is the target proportion in the minibatches for class i (target_probs)
-
-  ```
-  F = sum_i(p_i * (1-a_i))
-    = 1 - sum_i(p_i * a_i)     using sum_i(p_i) = 1
-  ```
-
-  An example with class `i` will be accepted if `k` rejections occur, then an
-  example with class `i` is seen by the rejector, and it is accepted. This can
-  be written as follows:
-
-  ```
-  t_i = sum_k=0^inf(F^k * p_i * a_i)
-      = p_i * a_j / (1 - F)    using geometric series identity, since 0 <= F < 1
-      = p_i * a_i / sum_j(p_j * a_j)        using F from above
-  ```
-
-  Note that the following constraints hold:
-  ```
-  0 <= p_i <= 1, sum_i(p_i) = 1
-  0 <= a_i <= 1
-  0 <= t_i <= 1, sum_i(t_i) = 1
-  ```
-
-  A solution for a_i in terms of the other variables is the following:
-    ```a_i = (t_i / p_i) / max_i[t_i / p_i]```
-
-  If we try to minimize the amount of data rejected, we get the following:
-
-  M_max = max_i [ t_i / p_i ]
-  M_min = min_i [ t_i / p_i ]
-
-  The desired probability of accepting data if it comes from class `i`:
-
-  a_i = (t_i/p_i - m) / (M_max - m)
-
-  The desired probability of pulling a data element from the original dataset,
-  rather than the filtered one:
-
-  m = M_min
-
-  Args:
-    initial_probs: A Tensor of the initial probability distribution, given or
-      estimated.
-    target_probs: A Tensor of the corresponding classes.
-
-  Returns:
-    (A 1D Tensor with the per-class acceptance probabilities, the desired
-    probability of pull from the original distribution.)
-  """
-  ratio_l = _get_target_to_initial_ratio(initial_probs, target_probs)
-  max_ratio = math_ops.reduce_max(ratio_l)
-  min_ratio = math_ops.reduce_min(ratio_l)
-
-  # Target prob to sample from original distribution.
-  m = min_ratio
-
-  # TODO(joelshor): Simplify fraction, if possible.
-  a_i = (ratio_l - m) / (max_ratio - m)
-  return a_i, m
+  return resampling.rejection_resample(class_func, target_dist, initial_dist,
+                                       seed)
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index c52582cd35..0ca9fddb23 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -17,137 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import gen_dataset_ops
-
-
-class _ScanDataset(dataset_ops.UnaryDataset):
-  """A dataset that scans a function across its input."""
-
-  def __init__(self, input_dataset, initial_state, scan_func):
-    """See `scan()` for details."""
-    super(_ScanDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-
-    with ops.name_scope("initial_state"):
-      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
-      # values to tensors.
-      self._initial_state = nest.pack_sequence_as(initial_state, [
-          sparse_tensor.SparseTensor.from_value(t)
-          if sparse_tensor.is_sparse(t) else ops.convert_to_tensor(
-              t, name="component_%d" % i)
-          for i, t in enumerate(nest.flatten(initial_state))
-      ])
-
-    # Compute initial values for the state classes, shapes and types based on
-    # the initial state. The shapes may be refined by running `tf_scan_func` one
-    # or more times below.
-    self._state_classes = sparse.get_classes(self._initial_state)
-    self._state_shapes = nest.pack_sequence_as(
-        self._initial_state,
-        [t.get_shape() for t in nest.flatten(self._initial_state)])
-    self._state_types = nest.pack_sequence_as(
-        self._initial_state,
-        [t.dtype for t in nest.flatten(self._initial_state)])
-
-    # Will be populated by calling `tf_scan_func`.
-    self._output_classes = None
-    self._output_shapes = None
-    self._output_types = None
-
-    # Iteratively rerun the scan function until reaching a fixed point on
-    # `self._state_shapes`.
-    need_to_rerun = True
-    while need_to_rerun:
-
-      wrapped_func = dataset_ops.StructuredFunctionWrapper(
-          scan_func, "tf.contrib.data.scan()",
-          input_classes=(self._state_classes, input_dataset.output_classes),
-          input_shapes=(self._state_shapes, input_dataset.output_shapes),
-          input_types=(self._state_types, input_dataset.output_types),
-          add_to_graph=False)
-      if not (
-          isinstance(wrapped_func.output_types, collections.Sequence) and
-          len(wrapped_func.output_types) == 2):
-        raise TypeError("The scan function must return a pair comprising the "
-                        "new state and the output value.")
-
-      new_state_classes, self._output_classes = wrapped_func.output_classes
-
-      # Extract and validate class information from the returned values.
-      for new_state_class, state_class in zip(
-          nest.flatten(new_state_classes),
-          nest.flatten(self._state_classes)):
-        if not issubclass(new_state_class, state_class):
-          raise TypeError(
-              "The element classes for the new state must match the initial "
-              "state. Expected %s; got %s." %
-              (self._state_classes, new_state_classes))
-
-      # Extract and validate type information from the returned values.
-      new_state_types, self._output_types = wrapped_func.output_types
-      for new_state_type, state_type in zip(
-          nest.flatten(new_state_types), nest.flatten(self._state_types)):
-        if new_state_type != state_type:
-          raise TypeError(
-              "The element types for the new state must match the initial "
-              "state. Expected %s; got %s." %
-              (self._state_types, new_state_types))
-
-      # Extract shape information from the returned values.
-      new_state_shapes, self._output_shapes = wrapped_func.output_shapes
-
-      flat_state_shapes = nest.flatten(self._state_shapes)
-      flat_new_state_shapes = nest.flatten(new_state_shapes)
-      weakened_state_shapes = [
-          original.most_specific_compatible_shape(new)
-          for original, new in zip(flat_state_shapes, flat_new_state_shapes)
-      ]
-
-      need_to_rerun = False
-      for original_shape, weakened_shape in zip(flat_state_shapes,
-                                                weakened_state_shapes):
-        if original_shape.ndims is not None and (
-            weakened_shape.ndims is None or
-            original_shape.as_list() != weakened_shape.as_list()):
-          need_to_rerun = True
-          break
-
-      if need_to_rerun:
-        self._state_shapes = nest.pack_sequence_as(self._state_shapes,
-                                                   weakened_state_shapes)
-
-    self._scan_func = wrapped_func.function
-    self._scan_func.add_to_graph(ops.get_default_graph())
-
-  def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-    return gen_dataset_ops.scan_dataset(
-        input_t,
-        nest.flatten(sparse.serialize_sparse_tensors(self._initial_state)),
-        self._scan_func.captured_inputs,
-        f=self._scan_func,
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+from tensorflow.python.data.experimental.ops import scan_ops
+from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(None, "Use `tf.data.experimental.scan(...)`.")
 def scan(initial_state, scan_func):
   """A transformation that scans a function across an input dataset.
 
@@ -168,7 +42,4 @@ def scan(initial_state, scan_func):
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-  def _apply_fn(dataset):
-    return _ScanDataset(dataset, initial_state, scan_func)
-
-  return _apply_fn
+  return scan_ops.scan(initial_state, scan_func)
diff --git a/tensorflow/contrib/data/python/ops/shuffle_ops.py b/tensorflow/contrib/data/python/ops/shuffle_ops.py
index 985d1d87d0..329b34fdfe 100644
--- a/tensorflow/contrib/data/python/ops/shuffle_ops.py
+++ b/tensorflow/contrib/data/python/ops/shuffle_ops.py
@@ -17,54 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import random_seed
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_dataset_ops
-
-
-class _ShuffleAndRepeatDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that fuses `shuffle` and `repeat`."""
-
-  def __init__(self, input_dataset, buffer_size, count=None, seed=None):
-    super(_ShuffleAndRepeatDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-    self._buffer_size = ops.convert_to_tensor(
-        buffer_size, dtype=dtypes.int64, name="buffer_size")
-    if count is None:
-      self._count = constant_op.constant(-1, dtype=dtypes.int64, name="count")
-    else:
-      self._count = ops.convert_to_tensor(
-          count, dtype=dtypes.int64, name="count")
-    self._seed, self._seed2 = random_seed.get_seed(seed)
-
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    input_resource = self._input_dataset._as_variant_tensor()
-    return gen_dataset_ops.shuffle_and_repeat_dataset(
-        input_resource,
-        buffer_size=self._buffer_size,
-        count=self._count,
-        seed=self._seed,
-        seed2=self._seed2,
-        **dataset_ops.flat_structure(self))
-    # pylint: enable=protected-access
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+from tensorflow.python.data.experimental.ops import shuffle_ops
+from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(None,
+                        "Use `tf.data.experimental.shuffle_and_repeat(...)`.")
 def shuffle_and_repeat(buffer_size, count=None, seed=None):
   """Shuffles and repeats a Dataset returning a new permutation for each epoch.
 
@@ -93,8 +51,4 @@ def shuffle_and_repeat(buffer_size, count=None, seed=None):
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-
-  def _apply_fn(dataset):  # pylint: disable=missing-docstring
-    return _ShuffleAndRepeatDataset(dataset, buffer_size, count, seed)
-
-  return _apply_fn
+  return shuffle_ops.shuffle_and_repeat(buffer_size, count, seed)
diff --git a/tensorflow/contrib/data/python/ops/threadpool.py b/tensorflow/contrib/data/python/ops/threadpool.py
index f73c3fd9cb..20cceb4647 100644
--- a/tensorflow/contrib/data/python/ops/threadpool.py
+++ b/tensorflow/contrib/data/python/ops/threadpool.py
@@ -17,88 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
-from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
-from tensorflow.python.ops import resource_variable_ops
-
-_uid_counter = 0
-_uid_lock = threading.Lock()
-
-
-def _generate_shared_name(prefix):
-  with _uid_lock:
-    global _uid_counter
-    uid = _uid_counter
-    _uid_counter += 1
-  return "{}{}".format(prefix, uid)
-
-
-# TODO(b/73383364): Properly export in the `tf.contrib.data` API when stable
-# or make private / remove.
-class PrivateThreadPool(object):
-  """A stateful resource that represents a private thread pool."""
-
-  def __init__(self, num_threads, display_name=None,
-               max_intra_op_parallelism=1):
-    """Creates a `PrivateThreadPool` with the given number of threads."""
-    if context.executing_eagerly():
-      shared_name = _generate_shared_name("privatethreadpool")
-      self._resource = ged_ops.experimental_thread_pool_handle(
-          num_threads=num_threads,
-          max_intra_op_parallelism=max_intra_op_parallelism,
-          display_name=display_name,
-          shared_name=shared_name)
-      self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
-          handle=self._resource, handle_device=context.context().device_name)
-    else:
-      self._resource = ged_ops.experimental_thread_pool_handle(
-          num_threads=num_threads,
-          max_intra_op_parallelism=max_intra_op_parallelism,
-          display_name=display_name)
-
-
-class _ThreadPoolDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that acts as an identity, and sets a custom threadpool."""
-
-  def __init__(self, input_dataset, thread_pool):
-    super(_ThreadPoolDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-    self._thread_pool = thread_pool
-
-  def _as_variant_tensor(self):
-    return ged_ops.experimental_thread_pool_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._thread_pool._resource,  # pylint: disable=protected-access
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-
-# TODO(b/73383364): Properly export in the `tf.contrib.data` API when stable
-# or make private / remove.
-def override_threadpool(dataset, thread_pool):
-  """Returns a new dataset that uses the given thread pool for its operations.
-
-  Args:
-    dataset: A `tf.data.Dataset` object.
-    thread_pool: A `PrivateThreadPool` object.
-
-  Returns:
-    A dataset containing the same values as `dataset`, but which uses
-    `thread_pool` to compute any of its parallel operations (such as
-    `tf.data.Dataset.map`).
-  """
-  return _ThreadPoolDataset(dataset, thread_pool)
+# pylint: disable=unused-import
+from tensorflow.python.data.experimental.ops.threadpool import override_threadpool
+from tensorflow.python.data.experimental.ops.threadpool import PrivateThreadPool
diff --git a/tensorflow/contrib/data/python/ops/unique.py b/tensorflow/contrib/data/python/ops/unique.py
index ed363a7090..909d06c677 100644
--- a/tensorflow/contrib/data/python/ops/unique.py
+++ b/tensorflow/contrib/data/python/ops/unique.py
@@ -17,11 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import gen_experimental_dataset_ops
+from tensorflow.python.data.experimental.ops import unique as experimental_unique
+from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(None, "Use `tf.data.experimental.unique()`.")
 def unique():
   """Creates a `Dataset` from another `Dataset`, discarding duplicates.
 
@@ -39,39 +39,4 @@ def unique():
     A `Dataset` transformation function, which can be passed to
     `tf.data.Dataset.apply`.
   """
-
-  def _apply_fn(dataset):
-    return _UniqueDataset(dataset)
-
-  return _apply_fn
-
-
-class _UniqueDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` contains the unique elements from its input."""
-
-  def __init__(self, input_dataset):
-    """See `unique()` for details."""
-    super(_UniqueDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-    if input_dataset.output_types not in (dtypes.int32, dtypes.int64,
-                                          dtypes.string):
-      raise TypeError(
-          "`tf.contrib.data.unique()` only supports inputs with a single "
-          "`tf.int32`, `tf.int64`, or `tf.string` component.")
-
-  def _as_variant_tensor(self):
-    return gen_experimental_dataset_ops.experimental_unique_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  return experimental_unique.unique()
diff --git a/tensorflow/contrib/data/python/ops/writers.py b/tensorflow/contrib/data/python/ops/writers.py
index c455fdcba6..42fb69bf07 100644
--- a/tensorflow/contrib/data/python/ops/writers.py
+++ b/tensorflow/contrib/data/python/ops/writers.py
@@ -17,42 +17,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import convert
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.data.experimental.ops import writers
+from tensorflow.python.util import deprecation
 
 
-class TFRecordWriter(object):
+class TFRecordWriter(writers.TFRecordWriter):
   """Writes data to a TFRecord file."""
 
+  @deprecation.deprecated(
+      None, "Use `tf.data.experimental.TFRecordWriter(...)`.")
   def __init__(self, filename, compression_type=None):
-    self._filename = ops.convert_to_tensor(
-        filename, dtypes.string, name="filename")
-    self._compression_type = convert.optional_param_to_tensor(
-        "compression_type",
-        compression_type,
-        argument_default="",
-        argument_dtype=dtypes.string)
-
-  def write(self, dataset):
-    """Returns a `tf.Operation` to write a dataset to a file.
-
-    Args:
-      dataset: a `tf.data.Dataset` whose elements are to be written to a file
-
-    Returns:
-      A `tf.Operation` that, when run, writes contents of `dataset` to a file.
-    """
-    if not isinstance(dataset, dataset_ops.Dataset):
-      raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
-    if (dataset.output_types != dtypes.string or
-        dataset.output_shapes != tensor_shape.scalar()):
-      raise TypeError(
-          "`dataset` must produce scalar `DT_STRING` tensors whereas it "
-          "produces shape {0} and types {1}".format(dataset.output_shapes,
-                                                    dataset.output_types))
-    return gen_dataset_ops.dataset_to_tf_record(
-        dataset._as_variant_tensor(), self._filename, self._compression_type)  # pylint: disable=protected-access
+    super(TFRecordWriter, self).__init__(filename, compression_type)
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
index 8d949943b7..d48aa9c89b 100644
--- a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import warnings
 
-from tensorflow.contrib.data.python.ops import prefetching_ops
+from tensorflow.python.data.experimental.ops import prefetching_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest as data_nest
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 135095a979..3aed121233 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import prefetching_ops
+from tensorflow.python.data.experimental.ops import prefetching_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
@@ -54,7 +54,7 @@ class Iterator(iterator_ops.EagerIterator):
     """
     if isinstance(dataset, prefetching_ops._PrefetchToDeviceDataset):  # pylint: disable=protected-access
       raise TypeError(
-          "`tf.contrib.data.prefetch_to_device()` is not compatible with "
+          "`tf.data.experimental.prefetch_to_device()` is not compatible with "
           "`tf.contrib.eager.Iterator`. Use `for ... in dataset:` to iterate "
           "over the dataset instead.")
 
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index a753d77580..6a508fc6ba 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -24,11 +24,11 @@ import time
 import numpy as np
 
 from tensorflow.contrib import lookup
-from tensorflow.contrib.data.python.ops import prefetching_ops
-from tensorflow.contrib.data.python.ops import threadpool
-from tensorflow.contrib.data.python.ops import unique
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.python.data import Dataset
+from tensorflow.python.data.experimental.ops import prefetching_ops
+from tensorflow.python.data.experimental.ops import threadpool
+from tensorflow.python.data.experimental.ops import unique
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py b/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py
index 34a9984b0e..d85188de03 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py
@@ -169,11 +169,11 @@ class ImageNetInput(object):
 
     # Read the data from disk in parallel
     dataset = dataset.apply(
-        tf.contrib.data.parallel_interleave(
+        tf.data.experimental.parallel_interleave(
             fetch_dataset, cycle_length=self.num_parallel_calls, sloppy=True))
     if self.cache:
       dataset = dataset.cache().apply(
-          tf.contrib.data.shuffle_and_repeat(1024 * 16))
+          tf.data.experimental.shuffle_and_repeat(1024 * 16))
     else:
       dataset = dataset.shuffle(1024)
 
@@ -188,9 +188,11 @@ class ImageNetInput(object):
     # batch size. As long as this validation is done with consistent batch size,
     # exactly the same images will be used.
     dataset = dataset.apply(
-        tf.contrib.data.map_and_batch(
-            self.dataset_parser, batch_size=batch_size,
-            num_parallel_batches=self.num_cores, drop_remainder=True))
+        tf.data.experimental.map_and_batch(
+            self.dataset_parser,
+            batch_size=batch_size,
+            num_parallel_batches=self.num_cores,
+            drop_remainder=True))
 
     # Transpose for performance on TPU
     if self.transpose_input:
diff --git a/tensorflow/contrib/estimator/python/estimator/rnn_test.py b/tensorflow/contrib/estimator/python/estimator/rnn_test.py
index 1aebed348d..89506ee661 100644
--- a/tensorflow/contrib/estimator/python/estimator/rnn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/rnn_test.py
@@ -25,12 +25,12 @@ import tempfile
 import numpy as np
 import six
 
-from tensorflow.contrib.data.python.ops import readers
 from tensorflow.contrib.estimator.python.estimator import head as head_lib
 from tensorflow.contrib.estimator.python.estimator import rnn
 from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as seq_fc
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.estimator import model_fn
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.canned import parsing_utils
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 89b538d1ba..9e9345e875 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -23,8 +23,8 @@ import numpy as np
 import six
 
 from tensorflow.contrib import lookup
-from tensorflow.contrib.data.python.ops import counter
 from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import counter
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/stateless/BUILD b/tensorflow/contrib/stateless/BUILD
index dcbef2881d..a217397c1a 100644
--- a/tensorflow/contrib/stateless/BUILD
+++ b/tensorflow/contrib/stateless/BUILD
@@ -9,19 +9,13 @@ exports_files(["LICENSE"])
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 
-tf_gen_op_wrapper_py(
-    name = "stateless_random_ops",
-    out = "gen_stateless_random_ops.py",  # cmake chokes without this
-    deps = ["//tensorflow/core:stateless_random_ops_op_lib"],
-)
-
 py_library(
     name = "stateless",
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":stateless_random_ops",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:stateless_random_ops_gen",
         "//tensorflow/python:util",
     ],
 )
diff --git a/tensorflow/contrib/stateless/__init__.py b/tensorflow/contrib/stateless/__init__.py
index 0cca40f071..fe23fe0dd8 100644
--- a/tensorflow/contrib/stateless/__init__.py
+++ b/tensorflow/contrib/stateless/__init__.py
@@ -32,10 +32,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import ops
+
 # pylint: disable=wildcard-import
-from tensorflow.contrib.stateless.gen_stateless_random_ops import *
+from tensorflow.python.ops.gen_stateless_random_ops import *
 
-from tensorflow.python.framework import ops
 from tensorflow.python.util.all_util import remove_undocumented
 
 ops.NotDifferentiable("StatelessMultinomial")
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index d879170b68..c694e9c1bc 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import interleave_ops
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers
diff --git a/tensorflow/contrib/tpu/tpu_estimator.md b/tensorflow/contrib/tpu/tpu_estimator.md
index 639e708169..b6514e19dc 100644
--- a/tensorflow/contrib/tpu/tpu_estimator.md
+++ b/tensorflow/contrib/tpu/tpu_estimator.md
@@ -87,7 +87,7 @@ handle training:
           label = tf.cast(features["label"], tf.int32)
           return image, label
 
-        dataset = tf.contrib.data.TFRecordDataset(
+        dataset = tf.data.TFRecordDataset(
             filename, buffer_size=FLAGS.dataset_reader_buffer_size)
         dataset = dataset.map(parser).cache().repeat().batch(batch_size)
         images, labels = dataset.make_one_shot_iterator().get_next()
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index b565ebd073..00295f57f6 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -295,7 +295,6 @@ py_test(
     tags = ["notsan"],
     deps = [
         ":training_py",
-        "//tensorflow/contrib/data/python/kernel_tests/serialization:dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
@@ -305,6 +304,7 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "//tensorflow/python/data",
+        "//tensorflow/python/data/experimental/kernel_tests/serialization:dataset_serialization_test_base",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
index d9b0511a98..c1657fec7b 100644
--- a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
+++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
diff --git a/tensorflow/core/api_def/python_api/api_def_StatelessMultinomial.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatelessMultinomial.pbtxt
new file mode 100644
index 0000000000..d3c70190dd
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatelessMultinomial.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatelessMultinomial"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatelessRandomNormal.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatelessRandomNormal.pbtxt
new file mode 100644
index 0000000000..e294325fb8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatelessRandomNormal.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatelessRandomNormal"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatelessRandomUniform.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatelessRandomUniform.pbtxt
new file mode 100644
index 0000000000..95d414c54a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatelessRandomUniform.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatelessRandomUniform"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatelessTruncatedNormal.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatelessTruncatedNormal.pbtxt
new file mode 100644
index 0000000000..c72bdda94a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatelessTruncatedNormal.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatelessTruncatedNormal"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/examples/get_started/regression/test.py b/tensorflow/examples/get_started/regression/test.py
index 0b1477ad96..bb4db6700b 100644
--- a/tensorflow/examples/get_started/regression/test.py
+++ b/tensorflow/examples/get_started/regression/test.py
@@ -29,7 +29,7 @@ import tensorflow.examples.get_started.regression.imports85 as imports85
 sys.modules["imports85"] = imports85
 
 # pylint: disable=g-bad-import-order,g-import-not-at-top
-import tensorflow.contrib.data as data
+import tensorflow.data as data
 
 import tensorflow.examples.get_started.regression.dnn_regression as dnn_regression
 import tensorflow.examples.get_started.regression.linear_regression as linear_regression
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 9275ad767e..fe81254ef7 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1739,6 +1739,14 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "stateless_random_ops_gen",
+    visibility = [
+        "//tensorflow/contrib/stateless:__pkg__",
+        "//tensorflow/python/data/experimental/ops:__pkg__",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "list_ops_gen",
 )
@@ -3302,9 +3310,11 @@ py_library(
             "training/checkpointable/**/*.py",
             # The following targets have their own build rules (same name as the
             # file):
+            "training/basic_session_run_hooks.py",
             "training/checkpoint_management.py",
             "training/saveable_object.py",
             "training/saver.py",
+            "training/session_run_hook.py",
             "training/training_util.py",
         ],
     ),
@@ -3312,6 +3322,7 @@ py_library(
     deps = [
         ":array_ops",
         ":array_ops_gen",
+        ":basic_session_run_hooks",
         ":checkpoint_management",
         ":checkpoint_ops_gen",
         ":client",
@@ -3336,6 +3347,7 @@ py_library(
         ":saver",
         ":sdca_ops",
         ":session",
+        ":session_run_hook",
         ":sparse_ops",
         ":sparse_tensor",
         ":state_ops",
@@ -3379,6 +3391,28 @@ py_library(
     ],
 )
 
+py_library(
+    name = "session_run_hook",
+    srcs = ["training/session_run_hook.py"],
+    srcs_version = "PY2AND3",
+    deps = [":util"],
+)
+
+py_library(
+    name = "basic_session_run_hooks",
+    srcs = ["training/basic_session_run_hooks.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client",
+        ":framework",
+        ":platform",
+        ":protos_all_py",
+        ":session_run_hook",
+        ":training_util",
+        ":util",
+    ],
+)
+
 py_library(
     name = "saver",
     srcs = ["training/saver.py"],
diff --git a/tensorflow/python/data/BUILD b/tensorflow/python/data/BUILD
index 138141f4fc..e32eeecbb8 100644
--- a/tensorflow/python/data/BUILD
+++ b/tensorflow/python/data/BUILD
@@ -10,6 +10,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
         "//tensorflow/python/data/ops:multi_device_iterator_ops",
diff --git a/tensorflow/python/data/__init__.py b/tensorflow/python/data/__init__.py
index f8b561205e..7536ba668a 100644
--- a/tensorflow/python/data/__init__.py
+++ b/tensorflow/python/data/__init__.py
@@ -22,6 +22,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
+from tensorflow.python.data import experimental
 from tensorflow.python.data.ops.dataset_ops import Dataset
 from tensorflow.python.data.ops.iterator_ops import Iterator
 from tensorflow.python.data.ops.readers import FixedLengthRecordDataset
diff --git a/tensorflow/python/data/experimental/BUILD b/tensorflow/python/data/experimental/BUILD
new file mode 100644
index 0000000000..84e761d376
--- /dev/null
+++ b/tensorflow/python/data/experimental/BUILD
@@ -0,0 +1,16 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "experimental",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:dataset_ops",
+        "//tensorflow/python/data/experimental/ops:iterator_ops",
+    ],
+)
diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py
new file mode 100644
index 0000000000..2ac159d38a
--- /dev/null
+++ b/tensorflow/python/data/experimental/__init__.py
@@ -0,0 +1,109 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for building input pipelines.
+
+This module contains experimental `Dataset` sources and transformations that can
+be used in conjunction with the `tf.data.Dataset` API. Note that the
+`tf.data.experimental` API is not subject to the same backwards compatibility
+guarantees as `tf.data`, but we will provide deprecation advice in advance of
+removing existing functionality.
+
+See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
+
+@@Counter
+@@CheckpointInputPipelineHook
+@@CsvDataset
+@@Optional
+@@RandomDataset
+@@Reducer
+@@SqlDataset
+@@TFRecordWriter
+
+@@bucket_by_sequence_length
+@@choose_from_datasets
+@@copy_to_device
+@@dense_to_sparse_batch
+@@enumerate_dataset
+@@get_next_as_optional
+@@get_single_element
+@@group_by_reducer
+@@group_by_window
+@@ignore_errors
+@@latency_stats
+@@make_batched_features_dataset
+@@make_csv_dataset
+@@make_saveable_from_iterator
+@@map_and_batch
+@@parallel_interleave
+@@parse_example_dataset
+@@prefetch_to_device
+@@rejection_resample
+@@sample_from_datasets
+@@scan
+@@set_stats_aggregator
+@@shuffle_and_repeat
+@@StatsAggregator
+@@unbatch
+@@unique
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+
+from tensorflow.python.data.experimental.ops.batching import dense_to_sparse_batch
+from tensorflow.python.data.experimental.ops.batching import map_and_batch
+from tensorflow.python.data.experimental.ops.batching import unbatch
+from tensorflow.python.data.experimental.ops.counter import Counter
+from tensorflow.python.data.experimental.ops.enumerate_ops import enumerate_dataset
+from tensorflow.python.data.experimental.ops.error_ops import ignore_errors
+from tensorflow.python.data.experimental.ops.get_single_element import get_single_element
+from tensorflow.python.data.experimental.ops.grouping import bucket_by_sequence_length
+from tensorflow.python.data.experimental.ops.grouping import group_by_reducer
+from tensorflow.python.data.experimental.ops.grouping import group_by_window
+from tensorflow.python.data.experimental.ops.grouping import Reducer
+from tensorflow.python.data.experimental.ops.interleave_ops import choose_from_datasets
+from tensorflow.python.data.experimental.ops.interleave_ops import parallel_interleave
+from tensorflow.python.data.experimental.ops.interleave_ops import sample_from_datasets
+from tensorflow.python.data.experimental.ops.iterator_ops import CheckpointInputPipelineHook
+from tensorflow.python.data.experimental.ops.iterator_ops import make_saveable_from_iterator
+
+# Optimization constant that can be used to enable auto-tuning.
+from tensorflow.python.data.experimental.ops.optimization import AUTOTUNE
+
+from tensorflow.python.data.experimental.ops.parsing_ops import parse_example_dataset
+from tensorflow.python.data.experimental.ops.prefetching_ops import copy_to_device
+from tensorflow.python.data.experimental.ops.prefetching_ops import prefetch_to_device
+from tensorflow.python.data.experimental.ops.random_ops import RandomDataset
+from tensorflow.python.data.experimental.ops.readers import CsvDataset
+from tensorflow.python.data.experimental.ops.readers import make_batched_features_dataset
+from tensorflow.python.data.experimental.ops.readers import make_csv_dataset
+from tensorflow.python.data.experimental.ops.readers import SqlDataset
+from tensorflow.python.data.experimental.ops.resampling import rejection_resample
+from tensorflow.python.data.experimental.ops.scan_ops import scan
+from tensorflow.python.data.experimental.ops.shuffle_ops import shuffle_and_repeat
+from tensorflow.python.data.experimental.ops.stats_ops import latency_stats
+from tensorflow.python.data.experimental.ops.stats_ops import set_stats_aggregator
+from tensorflow.python.data.experimental.ops.stats_ops import StatsAggregator
+from tensorflow.python.data.experimental.ops.unique import unique
+from tensorflow.python.data.experimental.ops.writers import TFRecordWriter
+from tensorflow.python.data.ops.iterator_ops import get_next_as_optional
+from tensorflow.python.data.ops.optional_ops import Optional
+# pylint: enable=unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
new file mode 100644
index 0000000000..a46c30ed2e
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -0,0 +1,569 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_test(
+    name = "batch_dataset_op_test",
+    size = "medium",
+    srcs = ["batch_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",  # (b/79552534)
+        "no_pip",
+    ],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "bucketing_test",
+    size = "medium",
+    srcs = ["bucketing_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/experimental/ops:grouping",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "csv_dataset_op_test",
+    size = "medium",
+    srcs = ["csv_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/experimental/ops:error_ops",
+        "//tensorflow/python/data/experimental/ops:readers",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/eager:context",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "dataset_constructor_op_test",
+    size = "medium",
+    srcs = ["dataset_constructor_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "manual",
+        "nomac",  # b/62040583
+    ],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
+py_test(
+    name = "directed_interleave_dataset_test",
+    size = "medium",
+    srcs = ["directed_interleave_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python/data/experimental/ops:interleave_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "get_single_element_test",
+    size = "small",
+    srcs = ["get_single_element_test.py"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/experimental/ops:get_single_element",
+        "//tensorflow/python/data/experimental/ops:grouping",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "indexed_dataset_ops_test",
+    srcs = ["indexed_dataset_ops_test.py"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python/data/experimental/ops:indexed_dataset_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "interleave_dataset_op_test",
+    size = "medium",
+    srcs = ["interleave_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "notap",
+    ],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/experimental/ops:interleave_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "iterator_ops_test",
+    size = "small",
+    srcs = ["iterator_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/experimental/ops:iterator_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_test(
+    name = "map_dataset_op_test",
+    size = "medium",
+    srcs = ["map_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "noasan",  # times out
+        "optonly",
+    ],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:error_ops",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "filter_dataset_op_test",
+    size = "medium",
+    srcs = ["filter_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "map_defun_op_test",
+    size = "small",
+    srcs = ["map_defun_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/experimental/ops:map_defun",
+        "//tensorflow/python/data/kernel_tests:test_base",
+    ],
+)
+
+py_test(
+    name = "parsing_ops_test",
+    size = "small",
+    srcs = ["parsing_ops_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/experimental/ops:parsing_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "prefetching_ops_test",
+    size = "small",
+    srcs = ["prefetching_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/python/data/experimental/ops:prefetching_ops",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:function",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python/compat:compat",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+    tags = ["no_windows_gpu"],
+)
+
+py_test(
+    name = "range_dataset_op_test",
+    size = "small",
+    srcs = ["range_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/experimental/ops:counter",
+        "//tensorflow/python/data/experimental/ops:enumerate_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_library(
+    name = "reader_dataset_ops_test_base",
+    testonly = 1,
+    srcs = [
+        "reader_dataset_ops_test_base.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = [
+        "//tensorflow/python/data/experimental/kernel_tests:__pkg__",
+        "//tensorflow/python/data/experimental/kernel_tests/serialization:__pkg__",
+    ],
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:readers",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+py_test(
+    name = "reader_dataset_ops_test",
+    size = "medium",
+    srcs = ["reader_dataset_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":reader_dataset_ops_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python/data/experimental/ops:readers",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:nest",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "resample_test",
+    size = "medium",
+    srcs = ["resample_test.py"],
+    shard_count = 2,
+    srcs_version = "PY2AND3",
+    tags = [
+        "noasan",
+        "optonly",
+    ],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:resampling",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "scan_dataset_op_test",
+    size = "small",
+    srcs = ["scan_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/experimental/ops:scan_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "shuffle_dataset_op_test",
+    size = "medium",
+    srcs = ["shuffle_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "optonly",
+    ],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/experimental/ops:shuffle_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "sql_dataset_op_test_base",
+    srcs = ["sql_dataset_op_test_base.py"],
+    srcs_version = "PY2AND3",
+    visibility = [
+        "//tensorflow/python/data/experimental/kernel_tests:__pkg__",
+        "//tensorflow/python/data/experimental/kernel_tests/serialization:__pkg__",
+    ],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/experimental/ops:readers",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "@org_sqlite//:python",
+    ],
+)
+
+py_test(
+    name = "sql_dataset_op_test",
+    size = "small",
+    srcs = ["sql_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":sql_dataset_op_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+    ],
+)
+
+py_test(
+    name = "stats_dataset_ops_test",
+    size = "medium",
+    srcs = ["stats_dataset_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":reader_dataset_ops_test_base",
+        ":stats_dataset_test_base",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/experimental/ops:stats_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "stats_dataset_test_base",
+    srcs = ["stats_dataset_test_base.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/kernel_tests:test_base",
+    ],
+)
+
+py_test(
+    name = "threadpool_dataset_ops_test",
+    size = "small",
+    srcs = ["threadpool_dataset_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python/data/experimental/ops:threadpool",
+        "//tensorflow/python/data/experimental/ops:unique",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "unique_dataset_op_test",
+    size = "small",
+    srcs = ["unique_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:unique",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_test(
+    name = "writer_ops_test",
+    size = "small",
+    srcs = ["writer_ops_test.py"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:writers",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
similarity index 67%
rename from tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
index fed7de5f2b..8703b2810e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
@@ -23,8 +23,8 @@ import time
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import batching
 from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -32,7 +32,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
@@ -43,7 +42,6 @@ from tensorflow.python.util import compat
 
 class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
-
   def testDenseToSparseBatchDataset(self):
     components = np.random.randint(12, size=(100,)).astype(np.int32)
     iterator = (
@@ -302,128 +300,6 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(next_element)
 
-  def testBatchAndDropRemainder(self):
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).apply(
-            batching.batch_and_drop_remainder(batch_size))
-        .make_initializable_iterator())
-
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for test_batch_size in [1, 3, 7, 10]:
-        sess.run(iterator.initializer, feed_dict={batch_size: test_batch_size})
-        num_batches = 7 // test_batch_size
-        for i in range(num_batches):
-          result = sess.run(next_element)
-          for component, result_component in zip(components, result):
-            for j in range(test_batch_size):
-              self.assertAllEqual(component[(i * test_batch_size + j)],
-                                  result_component[j])
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
-
-  def testBatchAndDropRemainderSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(12).map(_sparse).apply(
-        batching.batch_and_drop_remainder(5)).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(2):
-        actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
-            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
-            dense_shape=[5, 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPaddedBatchAndDropRemainder(self):
-    els = []
-    for length in [3, 6, 9, 4, 12, 10, 2]:
-      els.append((np.array(length), np.arange(length) + 1,
-                  np.array(length * 2)))
-
-    dataset = dataset_ops.Dataset.from_tensors(els[0])
-    for el in els[1:]:
-      dataset = dataset.concatenate(dataset_ops.Dataset.from_tensors(el))
-
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (
-        dataset.apply(
-            batching.padded_batch_and_drop_remainder(
-                batch_size, ([], [None], []))).make_initializable_iterator())
-
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for test_batch_size in [1, 3, 7, 10]:
-        sess.run(iterator.initializer, feed_dict={batch_size: test_batch_size})
-        num_batches = 7 // test_batch_size
-        for i in range(num_batches):
-          result = sess.run(next_element)
-          for component_idx, result_component in enumerate(result):
-            for j in range(test_batch_size):
-              data_idx = i * test_batch_size + j
-              comp = result_component[j]
-              unpadded = comp[comp > 0]
-              if np.isscalar(comp):
-                # The boolean mask indexing above adds a dim back. Rm it.
-                unpadded = unpadded[0]
-              self.assertAllEqual(els[data_idx][component_idx], unpadded)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
-
-  def testPaddedBatchAndDropRemainderSparseError(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
-
-    with self.assertRaises(TypeError):
-      _ = dataset_ops.Dataset.range(10).map(_map_fn).apply(
-          batching.padded_batch_and_drop_remainder(5))
-
-  def testBatchAndDropRemainderShapeInference(self):
-    components = (array_ops.placeholder(dtypes.int32),
-                  (array_ops.placeholder(dtypes.int32, shape=[None]),
-                   array_ops.placeholder(dtypes.int32, shape=[20, 30])))
-
-    # Test with a statically known batch size.
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(components).apply(
-            batching.batch_and_drop_remainder(128)))
-
-    self.assertIs(None, dataset.output_shapes[0].ndims)
-    self.assertEqual([128], dataset.output_shapes[1][0].as_list())
-    self.assertEqual([128, 30], dataset.output_shapes[1][1].as_list())
-
-    # Test with a dynamic batch size: the static shape will be unknown, because
-    # `batch_size` is a placeholder.
-    batch_size = array_ops.placeholder(dtypes.int64)
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(components).apply(
-            batching.batch_and_drop_remainder(batch_size)))
-
-    self.assertIs(None, dataset.output_shapes[0].ndims)
-    self.assertEqual([None], dataset.output_shapes[1][0].as_list())
-    self.assertEqual([None, 30], dataset.output_shapes[1][1].as_list())
-
   @parameterized.named_parameters(
       ("Default", None, None),
       ("SequentialCalls", 1, None),
@@ -720,197 +596,6 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         self.assertAllEqual([element for _ in range(10)], sess.run(get_next))
 
 
-class RestructuredDatasetTest(test_base.DatasetTestBase):
-
-  def test_assert_element_shape(self):
-
-    def create_dataset(_):
-      return (array_ops.ones(2, dtype=dtypes.float32),
-              array_ops.zeros((3, 4), dtype=dtypes.int32))
-
-    dataset = dataset_ops.Dataset.range(5).map(create_dataset)
-    expected_shapes = (tensor_shape.TensorShape(2),
-                       tensor_shape.TensorShape((3, 4)))
-    self.assertEqual(expected_shapes, dataset.output_shapes)
-
-    result = dataset.apply(batching.assert_element_shape(expected_shapes))
-    self.assertEqual(expected_shapes, result.output_shapes)
-
-    iterator = result.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(5):
-        sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def test_assert_wrong_element_shape(self):
-
-    def create_dataset(_):
-      return (array_ops.ones(2, dtype=dtypes.float32),
-              array_ops.zeros((3, 4), dtype=dtypes.int32))
-
-    dataset = dataset_ops.Dataset.range(3).map(create_dataset)
-    wrong_shapes = (tensor_shape.TensorShape(2),
-                    tensor_shape.TensorShape((3, 10)))
-    with self.assertRaises(ValueError):
-      dataset.apply(batching.assert_element_shape(wrong_shapes))
-
-  def test_assert_element_shape_on_unknown_shape_dataset(self):
-
-    def create_unknown_shape_dataset(x):
-      return script_ops.py_func(
-          lambda _: (  # pylint: disable=g-long-lambda
-              np.ones(2, dtype=np.float32),
-              np.zeros((3, 4), dtype=np.int32)),
-          [x],
-          [dtypes.float32, dtypes.int32])
-
-    dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
-    unknown_shapes = (tensor_shape.TensorShape(None),
-                      tensor_shape.TensorShape(None))
-    self.assertEqual(unknown_shapes, dataset.output_shapes)
-
-    expected_shapes = (tensor_shape.TensorShape(2),
-                       tensor_shape.TensorShape((3, 4)))
-    result = dataset.apply(batching.assert_element_shape(expected_shapes))
-    self.assertEqual(expected_shapes, result.output_shapes)
-
-    iterator = result.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(5):
-        sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def test_assert_wrong_element_shape_on_unknown_shape_dataset(self):
-
-    def create_unknown_shape_dataset(x):
-      return script_ops.py_func(
-          lambda _: (  # pylint: disable=g-long-lambda
-              np.ones(2, dtype=np.float32),
-              np.zeros((3, 4), dtype=np.int32)),
-          [x],
-          [dtypes.float32, dtypes.int32])
-
-    dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
-    unknown_shapes = (tensor_shape.TensorShape(None),
-                      tensor_shape.TensorShape(None))
-    self.assertEqual(unknown_shapes, dataset.output_shapes)
-
-    wrong_shapes = (tensor_shape.TensorShape(2),
-                    tensor_shape.TensorShape((3, 10)))
-    iterator = (
-        dataset.apply(batching.assert_element_shape(wrong_shapes))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-
-  def test_assert_partial_element_shape(self):
-
-    def create_dataset(_):
-      return (array_ops.ones(2, dtype=dtypes.float32),
-              array_ops.zeros((3, 4), dtype=dtypes.int32))
-
-    dataset = dataset_ops.Dataset.range(5).map(create_dataset)
-    partial_expected_shape = (tensor_shape.TensorShape(None),       # Unknown shape
-                              tensor_shape.TensorShape((None, 4)))  # Partial shape
-    result = dataset.apply(
-        batching.assert_element_shape(partial_expected_shape))
-    # Partial shapes are merged with actual shapes:
-    actual_shapes = (tensor_shape.TensorShape(2),
-                     tensor_shape.TensorShape((3, 4)))
-    self.assertEqual(actual_shapes, result.output_shapes)
-
-    iterator = result.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(5):
-        sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def test_assert_wrong_partial_element_shape(self):
-
-    def create_dataset(_):
-      return (array_ops.ones(2, dtype=dtypes.float32),
-              array_ops.zeros((3, 4), dtype=dtypes.int32))
-
-    dataset = dataset_ops.Dataset.range(3).map(create_dataset)
-    wrong_shapes = (tensor_shape.TensorShape(2),
-                    tensor_shape.TensorShape((None, 10)))
-    with self.assertRaises(ValueError):
-      dataset.apply(batching.assert_element_shape(wrong_shapes))
-
-  def test_assert_partial_element_shape_on_unknown_shape_dataset(self):
-
-    def create_unknown_shape_dataset(x):
-      return script_ops.py_func(
-          lambda _: (  # pylint: disable=g-long-lambda
-              np.ones(2, dtype=np.float32),
-              np.zeros((3, 4), dtype=np.int32)),
-          [x],
-          [dtypes.float32, dtypes.int32])
-
-    dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
-    unknown_shapes = (tensor_shape.TensorShape(None),
-                      tensor_shape.TensorShape(None))
-    self.assertEqual(unknown_shapes, dataset.output_shapes)
-
-    expected_shapes = (tensor_shape.TensorShape(2),
-                       tensor_shape.TensorShape((None, 4)))
-    result = dataset.apply(batching.assert_element_shape(expected_shapes))
-    self.assertEqual(expected_shapes, result.output_shapes)
-
-    iterator = result.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(5):
-        sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def test_assert_wrong_partial_element_shape_on_unknown_shape_dataset(self):
-
-    def create_unknown_shape_dataset(x):
-      return script_ops.py_func(
-          lambda _: (  # pylint: disable=g-long-lambda
-              np.ones(2, dtype=np.float32),
-              np.zeros((3, 4), dtype=np.int32)),
-          [x],
-          [dtypes.float32, dtypes.int32])
-
-    dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
-    unknown_shapes = (tensor_shape.TensorShape(None),
-                      tensor_shape.TensorShape(None))
-    self.assertEqual(unknown_shapes, dataset.output_shapes)
-
-    wrong_shapes = (tensor_shape.TensorShape(2),
-                    tensor_shape.TensorShape((None, 10)))
-    iterator = (
-        dataset.apply(batching.assert_element_shape(wrong_shapes))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-
-
 class UnbatchDatasetBenchmark(test.Benchmark):
 
   def benchmarkNativeUnbatch(self):
diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/python/data/experimental/kernel_tests/bucketing_test.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
rename to tensorflow/python/data/experimental/kernel_tests/bucketing_test.py
index ae401f786c..153a03989b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/bucketing_test.py
@@ -21,7 +21,7 @@ import random
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import grouping
+from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
diff --git a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_op_test.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/csv_dataset_op_test.py
index 5b3c512b64..4ee1779710 100644
--- a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_op_test.py
@@ -27,9 +27,9 @@ import zlib
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import error_ops
-from tensorflow.contrib.data.python.ops import readers
 from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import error_ops
+from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.eager import context
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py b/tensorflow/python/data/experimental/kernel_tests/dataset_constructor_op_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/dataset_constructor_op_test.py
index 722e87e555..3fc7157bc5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/dataset_constructor_op_test.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import batching
+from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/python/data/experimental/kernel_tests/dataset_serialization_test_base.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py
rename to tensorflow/python/data/experimental/kernel_tests/dataset_serialization_test_base.py
index 595cecef4d..7f435b8239 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/dataset_serialization_test_base.py
@@ -22,7 +22,7 @@ import os
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
diff --git a/tensorflow/contrib/data/python/kernel_tests/directed_interleave_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/directed_interleave_dataset_test.py
rename to tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
index bc10c21472..796a692c56 100644
--- a/tensorflow/contrib/data/python/kernel_tests/directed_interleave_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import interleave_ops
+from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
@@ -84,7 +84,7 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
 
     # Use chi-squared test to assert that the observed distribution matches the
     # expected distribution. Based on the implementation in
-    # "tensorflow/python/kernel_tests/multinomial_op_test.py".
+    # "third_party/tensorflow/python/kernel_tests/multinomial_op_test.py".
     for probs in [[.85, .05, .1], rand_probs, [1.]]:
       probs = np.asarray(probs)
       classes = len(probs)
diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
index 6d01bf585c..c6ee88c676 100644
--- a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
@@ -21,8 +21,8 @@ import time
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import optimization
 from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
similarity index 76%
rename from tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
rename to tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
index cc22ea1df7..8c07afbac5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
@@ -18,10 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 from absl.testing import parameterized
-import numpy as np
 
-from tensorflow.contrib.data.python.ops import get_single_element
-from tensorflow.contrib.data.python.ops import grouping
+from tensorflow.python.data.experimental.ops import get_single_element
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
@@ -69,32 +67,6 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
         with self.assertRaisesRegexp(error, error_msg):
           sess.run(element, feed_dict={skip_t: skip, take_t: take})
 
-  @parameterized.named_parameters(
-      ("SumZero", 0),
-      ("SumOne", 1),
-      ("SumFive", 5),
-      ("SumTen", 10),
-  )
-  def testReduceDataset(self, stop):
-    def init_fn(_):
-      return np.int64(0)
-
-    def reduce_fn(state, value):
-      return state + value
-
-    def finalize_fn(state):
-      return state
-
-    sum_reducer = grouping.Reducer(init_fn, reduce_fn, finalize_fn)
-
-    stop_t = array_ops.placeholder(dtypes.int64, shape=[])
-    dataset = dataset_ops.Dataset.range(stop_t)
-    element = get_single_element.reduce_dataset(dataset, sum_reducer)
-
-    with self.cached_session() as sess:
-      value = sess.run(element, feed_dict={stop_t: stop})
-      self.assertEqual(stop * (stop - 1) / 2, value)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
index d4d3d4adb2..c93a8353ce 100644
--- a/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import unittest
 
-from tensorflow.contrib.data.python.ops import indexed_dataset_ops
+from tensorflow.python.data.experimental.ops import indexed_dataset_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/interleave_dataset_op_test.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/interleave_dataset_op_test.py
index 28bd670ab5..560902caad 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/interleave_dataset_op_test.py
@@ -24,7 +24,7 @@ import time
 
 from six.moves import zip_longest
 
-from tensorflow.contrib.data.python.ops import interleave_ops
+from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/iterator_ops_test.py
similarity index 98%
rename from tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/iterator_ops_test.py
index 58a1d7c93b..94393d6d4b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/iterator_ops_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import iterator_ops
+from tensorflow.python.data.experimental.ops import iterator_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_dataset_op_test.py
similarity index 98%
rename from tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/map_dataset_op_test.py
index 385c4ef6ea..2f0bd1456b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_dataset_op_test.py
@@ -24,11 +24,11 @@ import time
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import error_ops
-from tensorflow.contrib.data.python.ops import optimization
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import error_ops
+from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
index 751e6d5b30..612ee332c4 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 
 import time
 
-from tensorflow.contrib.data.python.ops import map_defun
 from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import map_defun
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
similarity index 81%
rename from tensorflow/contrib/data/python/kernel_tests/optimization/BUILD
rename to tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
index d7b5edcd9a..68f73bddb5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
@@ -12,9 +12,9 @@ py_test(
     srcs = ["assert_next_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
@@ -26,12 +26,12 @@ py_test(
     srcs = ["hoist_random_uniform_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
@@ -44,11 +44,11 @@ py_test(
     srcs = ["latency_all_edges_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/kernel_tests:stats_dataset_test_base",
-        "//tensorflow/contrib/data/python/ops:optimization",
-        "//tensorflow/contrib/data/python/ops:stats_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/experimental/kernel_tests:stats_dataset_test_base",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:stats_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -59,7 +59,6 @@ py_test(
     srcs = ["map_vectorization_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -68,6 +67,7 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:session",
+        "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
@@ -81,12 +81,12 @@ py_test(
     srcs = ["map_and_filter_fusion_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
@@ -99,12 +99,12 @@ py_test(
     srcs = ["map_parallelization_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
@@ -120,11 +120,11 @@ py_test(
         "optonly",
     ],
     deps = [
-        "//tensorflow/contrib/data/python/ops:batching",
-        "//tensorflow/contrib/data/python/ops:interleave_ops",
-        "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:interleave_ops",
+        "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
@@ -137,11 +137,11 @@ py_test(
     srcs = ["noop_elimination_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:batching",
-        "//tensorflow/contrib/data/python/ops:interleave_ops",
-        "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:interleave_ops",
+        "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
@@ -154,9 +154,9 @@ py_test(
     srcs = ["optimize_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/assert_next_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_op_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/optimization/assert_next_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_op_test.py
index fe1b5280ba..45b77b5c20 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/assert_next_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_op_test.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/hoist_random_uniform_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
similarity index 98%
rename from tensorflow/contrib/data/python/kernel_tests/optimization/hoist_random_uniform_test.py
rename to tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
index b43efb5c7c..3cd9753665 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/hoist_random_uniform_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
-from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/latency_all_edges_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
similarity index 91%
rename from tensorflow/contrib/data/python/kernel_tests/optimization/latency_all_edges_test.py
rename to tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
index e4f18222fd..45623876ae 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/latency_all_edges_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
@@ -17,9 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests import stats_dataset_test_base
-from tensorflow.contrib.data.python.ops import optimization
-from tensorflow.contrib.data.python.ops import stats_ops
+from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops import stats_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py
rename to tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
index e9e3fc81e5..a439635716 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
-from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/map_parallelization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/optimization/map_parallelization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
index f7907eb890..334d8e3778 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/map_parallelization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
-from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
index a5ea85f454..d47492753e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
@@ -22,8 +22,8 @@ import time
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import optimization
 from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/model_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_op_test.py
similarity index 98%
rename from tensorflow/contrib/data/python/kernel_tests/optimization/model_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_op_test.py
index 33c250ab2a..a9f2ce8c03 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/model_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_op_test.py
@@ -21,8 +21,8 @@ import time
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.ops import math_ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/noop_elimination_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/optimization/noop_elimination_test.py
rename to tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
index b9e60cfa4e..092e0ff62a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/noop_elimination_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/optimize_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_op_test.py
similarity index 98%
rename from tensorflow/contrib/data/python/kernel_tests/optimization/optimize_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_op_test.py
index 04f499f8c5..eb661796c0 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimization/optimize_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_op_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/data/python/kernel_tests/parsing_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/parsing_ops_test.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/parsing_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/parsing_ops_test.py
index 66ccaceea5..13f924b656 100644
--- a/tensorflow/contrib/data/python/kernel_tests/parsing_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parsing_ops_test.py
@@ -22,9 +22,9 @@ import copy
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import parsing_ops as contrib_parsing_ops
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.experimental.ops import parsing_ops as contrib_parsing_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
@@ -846,6 +846,5 @@ class ParseExampleTest(test_base.DatasetTestBase):
                       "allow_missing to be True."))
 
 
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetching_ops_test.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/prefetching_ops_test.py
index 7a6a7a709a..7d7b842c17 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/prefetching_ops_test.py
@@ -19,9 +19,9 @@ from __future__ import print_function
 
 import threading
 
-from tensorflow.contrib.data.python.ops import prefetching_ops
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.compat import compat
+from tensorflow.python.data.experimental.ops import prefetching_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/range_dataset_op_test.py
similarity index 95%
rename from tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/range_dataset_op_test.py
index 2e901587f4..22412c3965 100644
--- a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/range_dataset_op_test.py
@@ -17,8 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import counter
-from tensorflow.contrib.data.python.ops import enumerate_ops
+from tensorflow.python.data.experimental.ops import counter
+from tensorflow.python.data.experimental.ops import enumerate_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test.py
index 66ed547b6d..a02f4bd14f 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test.py
@@ -23,8 +23,8 @@ import zlib
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests import reader_dataset_ops_test_base
-from tensorflow.contrib.data.python.ops import readers
+from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
+from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import nest
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test_base.py b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test_base.py
rename to tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
index f443b5501b..b6ab80d132 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
@@ -22,9 +22,9 @@ import gzip
 import os
 import zlib
 
-from tensorflow.contrib.data.python.ops import readers
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers as core_readers
diff --git a/tensorflow/contrib/data/python/kernel_tests/resample_test.py b/tensorflow/python/data/experimental/kernel_tests/resample_test.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/resample_test.py
rename to tensorflow/python/data/experimental/kernel_tests/resample_test.py
index 32474bd411..775648c943 100644
--- a/tensorflow/contrib/data/python/kernel_tests/resample_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/resample_test.py
@@ -23,7 +23,7 @@ from absl.testing import parameterized
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.contrib.data.python.ops import resampling
+from tensorflow.python.data.experimental.ops import resampling
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/scan_dataset_op_test.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/scan_dataset_op_test.py
index bdf80eae4e..78ec80de23 100644
--- a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/scan_dataset_op_test.py
@@ -21,7 +21,7 @@ import itertools
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import scan_ops
+from tensorflow.python.data.experimental.ops import scan_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
similarity index 90%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/BUILD
rename to tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
index aa89674c6e..20c02a5366 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
@@ -13,7 +13,6 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -24,6 +23,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data/experimental/ops:iterator_ops",
         "//tensorflow/python/data/ops:iterator_ops",
         "//third_party/py/numpy",
     ],
@@ -37,10 +37,10 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -81,9 +81,9 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:readers",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/experimental/ops:readers",
     ],
 )
 
@@ -126,8 +126,8 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/kernel_tests:reader_dataset_ops_test_base",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/kernel_tests:reader_dataset_ops_test_base",
         "//tensorflow/python/data/ops:readers",
     ],
 )
@@ -160,8 +160,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:grouping",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:grouping",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -174,8 +174,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:grouping",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:grouping",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -189,9 +189,9 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:error_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:error_ops",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -222,9 +222,9 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -258,8 +258,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -288,10 +288,10 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:interleave_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/experimental/ops:interleave_ops",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -326,8 +326,8 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/kernel_tests:reader_dataset_ops_test_base",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/kernel_tests:reader_dataset_ops_test_base",
     ],
 )
 
@@ -370,8 +370,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:interleave_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:interleave_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -384,8 +384,8 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:scan_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:scan_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -411,10 +411,10 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
-        "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python/data/experimental/ops:iterator_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -427,8 +427,8 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:shuffle_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:shuffle_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -441,10 +441,10 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python/data/experimental/ops:iterator_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -457,11 +457,11 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/kernel_tests:sql_dataset_op_test_base",
-        "//tensorflow/contrib/data/python/ops:readers",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/experimental/kernel_tests:sql_dataset_op_test_base",
+        "//tensorflow/python/data/experimental/ops:readers",
     ],
 )
 
@@ -473,10 +473,10 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:stats_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/experimental/ops:stats_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -490,8 +490,8 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/kernel_tests:reader_dataset_ops_test_base",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/kernel_tests:reader_dataset_ops_test_base",
         "//tensorflow/python/data/ops:readers",
     ],
 )
@@ -505,8 +505,8 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/kernel_tests:reader_dataset_ops_test_base",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/kernel_tests:reader_dataset_ops_test_base",
         "//tensorflow/python/data/ops:readers",
     ],
 )
@@ -519,8 +519,8 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -534,8 +534,8 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test_base",
-        "//tensorflow/contrib/data/python/ops:unique",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:unique",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/batch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/batch_dataset_serialization_test.py
similarity index 94%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/batch_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/batch_dataset_serialization_test.py
index af87d8b608..d72a6df14c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/batch_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/batch_dataset_serialization_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import batching
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/cache_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/cache_dataset_serialization_test.py
similarity index 98%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/cache_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/cache_dataset_serialization_test.py
index 1b6059ccbc..2bcf77f5d8 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/cache_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/cache_dataset_serialization_test.py
@@ -21,7 +21,7 @@ import os
 
 from absl.testing import parameterized
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/concatenate_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/concatenate_dataset_serialization_test.py
similarity index 94%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/concatenate_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/concatenate_dataset_serialization_test.py
index 96f13d75a3..c075dff8cb 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/concatenate_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/concatenate_dataset_serialization_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/csv_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/csv_dataset_serialization_test.py
similarity index 93%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/csv_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/csv_dataset_serialization_test.py
index 247f2046ea..d4983492e7 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/csv_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/csv_dataset_serialization_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 import gzip
 import os
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import readers
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_constructor_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_constructor_serialization_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/dataset_constructor_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/dataset_constructor_serialization_test.py
index 2139b5c33d..41a095fb1a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_constructor_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_constructor_serialization_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.platform import test
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
new file mode 100644
index 0000000000..7f435b8239
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
@@ -0,0 +1,692 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Base class for testing serializable datasets."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.util import nest
+
+
+def remove_variants(get_next_op):
+  # TODO(b/72408568): Remove this once session.run can get
+  # variant tensors.
+  """Remove variants from a nest structure, so sess.run will execute."""
+
+  def _remove_variant(x):
+    if isinstance(x, ops.Tensor) and x.dtype == dtypes.variant:
+      return ()
+    else:
+      return x
+
+  return nest.map_structure(_remove_variant, get_next_op)
+
+
+class DatasetSerializationTestBase(test.TestCase):
+  """Base class for testing serializable datasets."""
+
+  def tearDown(self):
+    self._delete_ckpt()
+
+  # TODO(b/72657739): Remove sparse_tensor argument, which is to test the
+  # (deprecated) saveable `SparseTensorSliceDataset`, once the API
+  # `from_sparse_tensor_slices()`and related tests are deleted.
+  def run_core_tests(self, ds_fn1, ds_fn2, num_outputs, sparse_tensors=False):
+    """Runs the core tests.
+
+    Args:
+      ds_fn1: 0-argument function that returns a Dataset.
+      ds_fn2: 0-argument function that returns a Dataset different from
+        ds_fn1. If None, verify_restore_in_modified_graph test is not run.
+      num_outputs: Total number of outputs expected from this Dataset.
+      sparse_tensors: Whether dataset is built from SparseTensor(s).
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    self.verify_unused_iterator(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_fully_used_iterator(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_exhausted_iterator(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_init_before_restore(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_multiple_breaks(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_reset_restored_iterator(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_restore_in_empty_graph(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    if ds_fn2:
+      self.verify_restore_in_modified_graph(
+          ds_fn1, ds_fn2, num_outputs, sparse_tensors=sparse_tensors)
+
+  def verify_unused_iterator(self,
+                             ds_fn,
+                             num_outputs,
+                             sparse_tensors=False,
+                             verify_exhausted=True):
+    """Verifies that saving and restoring an unused iterator works.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    self.verify_run_with_breaks(
+        ds_fn, [0],
+        num_outputs,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+  def verify_fully_used_iterator(self, ds_fn, num_outputs,
+                                 sparse_tensors=False):
+    """Verifies that saving and restoring a fully used iterator works.
+
+    Note that this only checks saving and restoring an iterator from which
+    `num_outputs` items have been produced but does not check for an
+    exhausted iterator, i.e., one from which an OutOfRange error has been
+    returned.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      sparse_tensors: See `run_core_tests`.
+
+    Raises:
+      AssertionError if test fails.
+    """
+    self.verify_run_with_breaks(
+        ds_fn, [num_outputs], num_outputs, sparse_tensors=sparse_tensors)
+
+  def verify_exhausted_iterator(self, ds_fn, num_outputs, sparse_tensors=False):
+    """Verifies that saving and restoring an exhausted iterator works.
+
+    An exhausted iterator is one which has returned an OutOfRange error.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      sparse_tensors: See `run_core_tests`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    self.gen_outputs(
+        ds_fn, [],
+        num_outputs,
+        verify_exhausted=True,
+        sparse_tensors=sparse_tensors)
+    actual = self.gen_outputs(
+        ds_fn, [],
+        0,
+        ckpt_saved=True,
+        verify_exhausted=True,
+        sparse_tensors=sparse_tensors)
+    self.assertEqual(len(actual), 0)
+
+  def verify_init_before_restore(self,
+                                 ds_fn,
+                                 num_outputs,
+                                 sparse_tensors=False,
+                                 verify_exhausted=True):
+    """Verifies that restoring into an already initialized iterator works.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    self.verify_run_with_breaks(
+        ds_fn,
+        self.gen_break_points(num_outputs),
+        num_outputs,
+        init_before_restore=True,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+  def verify_multiple_breaks(self,
+                             ds_fn,
+                             num_outputs,
+                             num_breaks=10,
+                             sparse_tensors=False,
+                             verify_exhausted=True):
+    """Attempts to save/restore at multiple break points.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      num_breaks: The number of break points. These are uniformly spread in
+        [0, num_outputs] both inclusive.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    self.verify_run_with_breaks(
+        ds_fn,
+        self.gen_break_points(num_outputs, num_breaks),
+        num_outputs,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+  def verify_reset_restored_iterator(self,
+                                     ds_fn,
+                                     num_outputs,
+                                     break_point=None,
+                                     sparse_tensors=False,
+                                     verify_exhausted=True):
+    """Attempts to re-initialize a restored iterator.
+
+    This is useful when restoring a training checkpoint during validation.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      break_point: Break point. Optional. Defaults to num_outputs/2.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    break_point = num_outputs // 2 if not break_point else break_point
+
+    # Collect ground truth containing all outputs.
+    expected = self.gen_outputs(
+        ds_fn, [],
+        num_outputs,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+    # Skip some items and save checkpoint.
+    self.gen_outputs(
+        ds_fn, [],
+        break_point,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=False)
+
+    actual = []
+    # Restore from checkpoint and then run init_op.
+    with ops.Graph().as_default() as g:
+      saver = self._import_meta_graph()
+      init_op, get_next_op = self._get_iterator_ops_from_collection(
+          ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
+      with self.session(graph=g) as sess:
+        self._restore(saver, sess)
+        self._initialize(init_op, sess)
+        for _ in range(num_outputs):
+          actual.append(sess.run(get_next_op))
+        if verify_exhausted:
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+    self.match(expected, actual)
+
+  def verify_restore_in_modified_graph(self,
+                                       ds_fn1,
+                                       ds_fn2,
+                                       num_outputs,
+                                       break_point=None,
+                                       sparse_tensors=False,
+                                       verify_exhausted=True):
+    """Attempts to restore an iterator in a modified graph.
+
+    Builds an input pipeline using ds_fn1, runs it for `break_point` steps
+    and saves a checkpoint. Then builds a new graph using ds_fn2, restores
+    the checkpoint from ds_fn1 and verifies that the restore is successful.
+
+    Args:
+      ds_fn1: See `run_core_tests`.
+      ds_fn2: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      break_point: Break point. Optional. Defaults to num_outputs/2.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    break_point = num_outputs // 2 if not break_point else break_point
+
+    # Skip `break_point` items and store the remaining produced from ds_fn1
+    # in `expected`.
+    self.gen_outputs(
+        ds_fn1, [],
+        break_point,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=False)
+    expected = self.gen_outputs(
+        ds_fn1, [],
+        num_outputs - break_point,
+        ckpt_saved=True,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+    # Generate `break_point` items from ds_fn1 and save checkpoint.
+    self.gen_outputs(
+        ds_fn1, [],
+        break_point,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=False)
+
+    actual = []
+    # Build graph for ds_fn2 but load checkpoint for ds_fn1.
+    with ops.Graph().as_default() as g:
+      _, get_next_op, saver = self._build_graph(
+          ds_fn2, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
+      with self.session(graph=g) as sess:
+        self._restore(saver, sess)
+        for _ in range(num_outputs - break_point):
+          actual.append(sess.run(get_next_op))
+        if verify_exhausted:
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+
+    self.match(expected, actual)
+
+  def verify_restore_in_empty_graph(self,
+                                    ds_fn,
+                                    num_outputs,
+                                    break_point=None,
+                                    sparse_tensors=False,
+                                    verify_exhausted=True):
+    """Attempts to restore an iterator in an empty graph.
+
+    Builds an input pipeline using ds_fn, runs it for `break_point` steps
+    and saves a checkpoint. Then builds a new empty graph, restores
+    the checkpoint from ds_fn and verifies that the restore is successful.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      break_point: Break point. Optional. Defaults to num_outputs/2.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    break_point = num_outputs // 2 if not break_point else break_point
+
+    # Skip `break_point` items and store the remaining produced from ds_fn
+    # in `expected`.
+    self.gen_outputs(
+        ds_fn, [],
+        break_point,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=False)
+    expected = self.gen_outputs(
+        ds_fn, [],
+        num_outputs - break_point,
+        ckpt_saved=True,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+    # Generate `break_point` items from ds_fn and save checkpoint.
+    self.gen_outputs(
+        ds_fn, [],
+        break_point,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=False)
+
+    actual = []
+    # Build an empty graph but load checkpoint for ds_fn.
+    with ops.Graph().as_default() as g:
+      get_next_op, saver = self._build_empty_graph(
+          ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
+      with self.session(graph=g) as sess:
+        self._restore(saver, sess)
+        for _ in range(num_outputs - break_point):
+          actual.append(sess.run(get_next_op))
+        if verify_exhausted:
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+
+    self.match(expected, actual)
+
+  def verify_error_on_save(self,
+                           ds_fn,
+                           num_outputs,
+                           error,
+                           break_point=None,
+                           sparse_tensors=False):
+    """Attempts to save a non-saveable iterator.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      error: Declared error when trying to save iterator.
+      break_point: Break point. Optional. Defaults to num_outputs/2.
+      sparse_tensors: See `run_core_tests`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+
+    break_point = num_outputs // 2 if not break_point else break_point
+    with ops.Graph().as_default() as g:
+      init_op, get_next_op, saver = self._build_graph(
+          ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
+      with self.session(graph=g) as sess:
+        self._initialize(init_op, sess)
+        for _ in range(break_point):
+          sess.run(get_next_op)
+        with self.assertRaises(error):
+          self._save(sess, saver)
+
+  def verify_run_with_breaks(self,
+                             ds_fn,
+                             break_points,
+                             num_outputs,
+                             init_before_restore=False,
+                             sparse_tensors=False,
+                             verify_exhausted=True):
+    """Verifies that ds_fn() produces the same outputs with and without breaks.
+
+    1. Builds a Dataset using `ds_fn` and produces `num_outputs` items from it
+       *without* stopping at break points.
+    2. Builds a Dataset using `ds_fn` and produces `num_outputs` items from it
+       with stopping at break points.
+
+    Deep matches outputs from 1 and 2.
+
+    Args:
+      ds_fn: See `gen_outputs`.
+      break_points: See `gen_outputs`.
+      num_outputs: See `gen_outputs`.
+      init_before_restore: See `gen_outputs`.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    expected = self.gen_outputs(
+        ds_fn, [],
+        num_outputs,
+        init_before_restore=init_before_restore,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+    actual = self.gen_outputs(
+        ds_fn,
+        break_points,
+        num_outputs,
+        init_before_restore=init_before_restore,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+    self.match(expected, actual)
+
+  def gen_outputs(self,
+                  ds_fn,
+                  break_points,
+                  num_outputs,
+                  ckpt_saved=False,
+                  init_before_restore=False,
+                  sparse_tensors=False,
+                  verify_exhausted=True,
+                  save_checkpoint_at_end=True):
+    """Generates elements from input dataset while stopping at break points.
+
+    Produces `num_outputs` outputs and saves the state of the iterator in the
+    Saver checkpoint.
+
+    Args:
+      ds_fn: 0-argument function that returns the dataset.
+      break_points: A list of integers. For each `break_point` in
+        `break_points`, we produce outputs till `break_point` number of items
+        have been produced and then checkpoint the state. The current graph
+        and session are destroyed and a new graph and session are used to
+        produce outputs till next checkpoint or till `num_outputs` elements
+        have been produced. `break_point` must be <= `num_outputs`.
+      num_outputs: The total number of outputs to produce from the iterator.
+      ckpt_saved: Whether a checkpoint already exists. If False, we build the
+        graph from ds_fn.
+      init_before_restore: Whether init should be called before saver.restore.
+        This is just so that we can verify that restoring an already initialized
+        iterator works.
+      sparse_tensors:  Whether dataset is built from SparseTensor(s).
+      verify_exhausted: Whether to verify that the iterator has been exhausted
+        after producing `num_outputs` elements.
+      save_checkpoint_at_end: Whether to save a checkpoint after producing all
+        outputs. If False, checkpoints are saved each break point but not at the
+        end. Note that checkpoints overwrite each other so there is always only
+        a single checkpoint available. Defaults to True.
+
+    Returns:
+      A list of `num_outputs` items.
+    """
+    outputs = []
+
+    def get_ops():
+      if ckpt_saved:
+        saver = self._import_meta_graph()
+        init_op, get_next_op = self._get_iterator_ops_from_collection(
+            ds_fn, sparse_tensors=sparse_tensors)
+      else:
+        init_op, get_next_op, saver = self._build_graph(
+            ds_fn, sparse_tensors=sparse_tensors)
+      return init_op, get_next_op, saver
+
+    for i in range(len(break_points) + 1):
+      with ops.Graph().as_default() as g:
+        init_op, get_next_op, saver = get_ops()
+        get_next_op = remove_variants(get_next_op)
+        with self.session(graph=g) as sess:
+          if ckpt_saved:
+            if init_before_restore:
+              self._initialize(init_op, sess)
+            self._restore(saver, sess)
+          else:
+            self._initialize(init_op, sess)
+          start = break_points[i - 1] if i > 0 else 0
+          end = break_points[i] if i < len(break_points) else num_outputs
+          num_iters = end - start
+          for _ in range(num_iters):
+            outputs.append(sess.run(get_next_op))
+          if i == len(break_points) and verify_exhausted:
+            with self.assertRaises(errors.OutOfRangeError):
+              sess.run(get_next_op)
+          if save_checkpoint_at_end or i < len(break_points):
+            self._save(sess, saver)
+            ckpt_saved = True
+
+    return outputs
+
+  def match(self, expected, actual):
+    """Matches nested structures.
+
+    Recursively matches shape and values of `expected` and `actual`.
+    Handles scalars, numpy arrays and other python sequence containers
+    e.g. list, dict.
+
+    Args:
+      expected: Nested structure 1.
+      actual: Nested structure 2.
+
+    Raises:
+      AssertionError if matching fails.
+    """
+    if isinstance(expected, np.ndarray):
+      expected = expected.tolist()
+    if isinstance(actual, np.ndarray):
+      actual = actual.tolist()
+    self.assertEqual(type(expected), type(actual))
+
+    if nest.is_sequence(expected):
+      self.assertEqual(len(expected), len(actual))
+      if isinstance(expected, dict):
+        for key1, key2 in zip(sorted(expected), sorted(actual)):
+          self.assertEqual(key1, key2)
+          self.match(expected[key1], actual[key2])
+      else:
+        for item1, item2 in zip(expected, actual):
+          self.match(item1, item2)
+    else:
+      self.assertEqual(expected, actual)
+
+  def does_not_match(self, expected, actual):
+    with self.assertRaises(AssertionError):
+      self.match(expected, actual)
+
+  def gen_break_points(self, num_outputs, num_samples=10):
+    """Generates `num_samples` breaks points in [0, num_outputs]."""
+    return np.linspace(0, num_outputs, num_samples, dtype=int)
+
+  def _build_graph(self, ds_fn, sparse_tensors=False):
+    iterator = ds_fn().make_initializable_iterator()
+
+    saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+    init_op = iterator.initializer
+    if sparse_tensors:
+      get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+    else:
+      get_next = iterator.get_next()
+    self._add_iterator_ops_to_collection(init_op, get_next, ds_fn,
+                                         sparse_tensors)
+    saver = saver_lib.Saver(allow_empty=True)
+    return init_op, get_next, saver
+
+  def _build_empty_graph(self, ds_fn, sparse_tensors=False):
+    iterator = iterator_ops.Iterator.from_structure(
+        self._get_output_types(ds_fn),
+        output_shapes=self._get_output_shapes(ds_fn),
+        output_classes=self._get_output_classes(ds_fn))
+    saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+    if sparse_tensors:
+      get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+    else:
+      get_next = iterator.get_next()
+    saver = saver_lib.Saver(allow_empty=True)
+    return get_next, saver
+
+  def _add_iterator_ops_to_collection(self,
+                                      init_op,
+                                      get_next,
+                                      ds_fn,
+                                      sparse_tensors=False):
+    ops.add_to_collection("iterator_ops", init_op)
+    # `get_next` may be a tuple e.g. in TensorSliceDataset. Since Collections
+    # do not support tuples we flatten the tensors and restore the shape in
+    # `_get_iterator_ops_from_collection`.
+    if sparse_tensors:  # specific for deprecated `from_sparse_tensor_slices`.
+      ops.add_to_collection("iterator_ops", get_next.indices)
+      ops.add_to_collection("iterator_ops", get_next.values)
+      ops.add_to_collection("iterator_ops", get_next.dense_shape)
+      return
+
+    get_next_list = nest.flatten(get_next)
+    for i, output_class in enumerate(
+        nest.flatten(self._get_output_classes(ds_fn))):
+      if output_class is sparse_tensor.SparseTensor:
+        ops.add_to_collection("iterator_ops", get_next_list[i].indices)
+        ops.add_to_collection("iterator_ops", get_next_list[i].values)
+        ops.add_to_collection("iterator_ops", get_next_list[i].dense_shape)
+      else:
+        ops.add_to_collection("iterator_ops", get_next_list[i])
+
+  def _get_iterator_ops_from_collection(self, ds_fn, sparse_tensors=False):
+    all_ops = ops.get_collection("iterator_ops")
+    if sparse_tensors:  # specific for deprecated `from_sparse_tensor_slices`.
+      init_op, indices, values, dense_shape = all_ops
+      return init_op, sparse_tensor.SparseTensor(indices, values, dense_shape)
+    get_next_list = []
+    i = 1
+    for output_class in nest.flatten(self._get_output_classes(ds_fn)):
+      if output_class is sparse_tensor.SparseTensor:
+        indices, values, dense_shape = all_ops[i:i + 3]
+        i += 3
+        get_next_list.append(
+            sparse_tensor.SparseTensor(indices, values, dense_shape))
+      else:
+        get_next_list.append(all_ops[i])
+        i += 1
+    return all_ops[0], nest.pack_sequence_as(
+        self._get_output_types(ds_fn), get_next_list)
+
+  def _get_output_types(self, ds_fn):
+    with ops.Graph().as_default():
+      return ds_fn().output_types
+
+  def _get_output_shapes(self, ds_fn):
+    with ops.Graph().as_default():
+      return ds_fn().output_shapes
+
+  def _get_output_classes(self, ds_fn):
+    with ops.Graph().as_default():
+      return ds_fn().output_classes
+
+  def _ckpt_path(self):
+    return os.path.join(self.get_temp_dir(), "iterator")
+
+  def _latest_ckpt(self):
+    return checkpoint_management.latest_checkpoint(self.get_temp_dir())
+
+  def _save(self, sess, saver):
+    saver.save(sess, self._ckpt_path())
+
+  def _restore(self, saver, sess):
+    sess.run(lookup_ops.tables_initializer())
+    saver.restore(sess, self._latest_ckpt())
+
+  def _initialize(self, init_op, sess):
+    sess.run(variables.global_variables_initializer())
+    sess.run(lookup_ops.tables_initializer())
+    sess.run(init_op)
+
+  def _import_meta_graph(self):
+    meta_file_path = self._ckpt_path() + ".meta"
+    return saver_lib.import_meta_graph(meta_file_path)
+
+  def _delete_ckpt(self):
+    # Remove all checkpoint files.
+    prefix = self._ckpt_path()
+    pattern = prefix + "*"
+    files = gfile.Glob(pattern)
+    map(gfile.Remove, files)
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/filter_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/filter_dataset_serialization_test.py
similarity index 95%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/filter_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/filter_dataset_serialization_test.py
index 7c170078a1..225f6cbac0 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/filter_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/filter_dataset_serialization_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import math_ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/fixed_length_record_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/fixed_length_record_dataset_serialization_test.py
similarity index 89%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/fixed_length_record_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/fixed_length_record_dataset_serialization_test.py
index 34392d88d4..70caf3e0d5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/fixed_length_record_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/fixed_length_record_dataset_serialization_test.py
@@ -17,8 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests import reader_dataset_ops_test_base
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/flat_map_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/flat_map_dataset_serialization_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/flat_map_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/flat_map_dataset_serialization_test.py
index 16051ffd3f..c30534a9e9 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/flat_map_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/flat_map_dataset_serialization_test.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/group_by_reducer_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/group_by_reducer_serialization_test.py
similarity index 93%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/group_by_reducer_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/group_by_reducer_serialization_test.py
index 571e0899bb..169c8845d0 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/group_by_reducer_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/group_by_reducer_serialization_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import grouping
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/group_by_window_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/group_by_window_serialization_test.py
similarity index 93%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/group_by_window_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/group_by_window_serialization_test.py
index f86af4084e..e5bc76288e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/group_by_window_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/group_by_window_serialization_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import grouping
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/ignore_errors_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/ignore_errors_serialization_test.py
similarity index 90%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/ignore_errors_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/ignore_errors_serialization_test.py
index 65ae9923b8..df1f43129a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/ignore_errors_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/ignore_errors_serialization_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import error_ops
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import error_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/interleave_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/interleave_dataset_serialization_test.py
similarity index 96%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/interleave_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/interleave_dataset_serialization_test.py
index 243f6405a1..0c1d40ce39 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/interleave_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/interleave_dataset_serialization_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import sparse_ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
similarity index 94%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
index c9cd211328..166ffa99ca 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 
 import math
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import batching
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/map_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/map_dataset_serialization_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/map_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/map_dataset_serialization_test.py
index ab783e5cce..b93156a96c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/map_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/map_dataset_serialization_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/optimize_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py
similarity index 89%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/optimize_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py
index d5c03495e3..ed4a1da596 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/optimize_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py
@@ -17,8 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/padded_batch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/padded_batch_dataset_serialization_test.py
similarity index 95%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/padded_batch_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/padded_batch_dataset_serialization_test.py
index 9ac42a461a..6f72b24673 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/padded_batch_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/padded_batch_dataset_serialization_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import string_ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/parallel_interleave_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/parallel_interleave_dataset_serialization_test.py
similarity index 95%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/parallel_interleave_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/parallel_interleave_dataset_serialization_test.py
index 1f8a584df9..b8f38e8a28 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/parallel_interleave_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/parallel_interleave_dataset_serialization_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import interleave_ops
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import sparse_ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/parallel_map_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/parallel_map_dataset_serialization_test.py
similarity index 96%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/parallel_map_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/parallel_map_dataset_serialization_test.py
index 3fb7605be1..a0bdd4fa59 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/parallel_map_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/parallel_map_dataset_serialization_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -65,7 +65,7 @@ class ParallelMapDatasetSerializationTest(
     for ds_fn in [self._build_ds, self._build_ds_with_prefetch]:
       self.run_core_tests(
           ds_fn,
-          lambda: ds_fn(multiplier=15.0),
+          lambda: ds_fn(multiplier=15.0),  # pylint: disable=cell-var-from-loop
           self._num_outputs)
 
   def testSaveStatefulFunction(self):
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/parse_example_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/parse_example_dataset_serialization_test.py
similarity index 90%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/parse_example_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/parse_example_dataset_serialization_test.py
index d3fa84e74c..a0dd6960b0 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/parse_example_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/parse_example_dataset_serialization_test.py
@@ -17,8 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests import reader_dataset_ops_test_base
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/prefetch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/prefetch_dataset_serialization_test.py
similarity index 93%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/prefetch_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/prefetch_dataset_serialization_test.py
index c802402461..00d74c0025 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/prefetch_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/prefetch_dataset_serialization_test.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/range_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/range_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
index 6341190847..ef99d01c73 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/range_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/sample_from_datasets_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/sample_from_datasets_serialization_test.py
similarity index 90%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/sample_from_datasets_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/sample_from_datasets_serialization_test.py
index fdb35ea624..c23c1ecdfb 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/sample_from_datasets_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/sample_from_datasets_serialization_test.py
@@ -17,8 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import interleave_ops
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/scan_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/scan_dataset_serialization_test.py
similarity index 89%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/scan_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/scan_dataset_serialization_test.py
index af9ef48c0f..5f50160619 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/scan_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/scan_dataset_serialization_test.py
@@ -17,8 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import scan_ops
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import scan_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/sequence_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/sequence_dataset_serialization_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/sequence_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/sequence_dataset_serialization_test.py
index 2afebca0f5..fe99a3d3d9 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/sequence_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/sequence_dataset_serialization_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/serialization_integration_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py
similarity index 97%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/serialization_integration_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py
index 6aac50ecd9..88d5c896c9 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/serialization_integration_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/shuffle_and_repeat_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_and_repeat_dataset_serialization_test.py
similarity index 89%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/shuffle_and_repeat_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_and_repeat_dataset_serialization_test.py
index f199ec835e..f847ac19f9 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/shuffle_and_repeat_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_and_repeat_dataset_serialization_test.py
@@ -17,8 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import shuffle_ops
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import shuffle_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/shuffle_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_dataset_serialization_test.py
similarity index 96%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/shuffle_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_dataset_serialization_test.py
index a59fa94d66..a04f1ddafc 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/shuffle_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_dataset_serialization_test.py
@@ -17,8 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/sql_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/sql_dataset_serialization_test.py
similarity index 88%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/sql_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/sql_dataset_serialization_test.py
index 93b26ed58a..b179770ce3 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/sql_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/sql_dataset_serialization_test.py
@@ -19,9 +19,9 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.data.python.kernel_tests import sql_dataset_op_test_base
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import readers
+from tensorflow.python.data.experimental.kernel_tests import sql_dataset_op_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/stats_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py
similarity index 96%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/stats_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py
index a10f85263a..ef7061b190 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/stats_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py
@@ -17,8 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import stats_ops
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import stats_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/textline_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/textline_dataset_serialization_test.py
similarity index 90%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/textline_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/textline_dataset_serialization_test.py
index 2483787f44..c87a7443a7 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/textline_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/textline_dataset_serialization_test.py
@@ -17,8 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests import reader_dataset_ops_test_base
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/tf_record_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/tf_record_dataset_serialization_test.py
similarity index 95%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/tf_record_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/tf_record_dataset_serialization_test.py
index 55a6257a27..f0dcc131d4 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/tf_record_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/tf_record_dataset_serialization_test.py
@@ -21,8 +21,8 @@ import gzip
 import os
 import zlib
 
-from tensorflow.contrib.data.python.kernel_tests import reader_dataset_ops_test_base
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/unbatch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/unbatch_dataset_serialization_test.py
similarity index 91%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/unbatch_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/unbatch_dataset_serialization_test.py
index b2a5a8a20d..528598dfe4 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/unbatch_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/unbatch_dataset_serialization_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import batching
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/unique_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/unique_dataset_serialization_test.py
similarity index 89%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/unique_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/unique_dataset_serialization_test.py
index 22f15b8846..e2862af4d6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/unique_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/unique_dataset_serialization_test.py
@@ -17,8 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import unique
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import unique
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/zip_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/zip_dataset_serialization_test.py
similarity index 94%
rename from tensorflow/contrib/data/python/kernel_tests/serialization/zip_dataset_serialization_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/zip_dataset_serialization_test.py
index 340a6ff72e..4ea6131c22 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/zip_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/zip_dataset_serialization_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization_integration_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization_integration_test.py
new file mode 100644
index 0000000000..88d5c896c9
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization_integration_test.py
@@ -0,0 +1,85 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Integration test for dataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import saver as saver_lib
+
+
+class SerializationIntegrationTest(test.TestCase):
+
+  def _build_input_pipeline(self, name, num_outputs):
+    with ops.name_scope(name):
+      ds = dataset_ops.Dataset.range(num_outputs).shuffle(
+          10, reshuffle_each_iteration=False).prefetch(10)
+      iterator = ds.make_initializable_iterator()
+      saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
+      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+      return iterator.initializer, iterator.get_next()
+
+  def _build_graph(self, num_pipelines, num_outputs):
+    init_ops = []
+    get_next_ops = []
+    for i in range(num_pipelines):
+      name = "input_pipeline_%d" % i
+      init_op, get_next_op = self._build_input_pipeline(name, num_outputs)
+      init_ops.append(init_op)
+      get_next_ops.append(get_next_op)
+    saver = saver_lib.Saver()
+    return init_ops, get_next_ops, saver
+
+  def _ckpt_path(self):
+    return os.path.join(self.get_temp_dir(), "iterator")
+
+  def testConcurrentSaves(self):
+    num_pipelines = 100
+    num_outputs = 100
+    break_point = 10
+    all_outputs = [[] for _ in range(num_pipelines)]
+    with ops.Graph().as_default() as g:
+      init_ops, get_next_ops, saver = self._build_graph(num_pipelines,
+                                                        num_outputs)
+      with self.session(graph=g) as sess:
+        sess.run(init_ops)
+        for _ in range(break_point):
+          output = sess.run(get_next_ops)
+          for i in range(num_pipelines):
+            all_outputs[i].append(output[i])
+        saver.save(sess, self._ckpt_path())
+
+    with ops.Graph().as_default() as g:
+      init_ops, get_next_ops, saver = self._build_graph(num_pipelines,
+                                                        num_outputs)
+      with self.session(graph=g) as sess:
+        saver.restore(sess, self._ckpt_path())
+        for _ in range(num_outputs - break_point):
+          output = sess.run(get_next_ops)
+          for i in range(num_pipelines):
+            all_outputs[i].append(output[i])
+
+    for output in all_outputs:
+      self.assertSequenceEqual(sorted(output), range(num_outputs))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_dataset_op_test.py
similarity index 98%
rename from tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/shuffle_dataset_op_test.py
index c97002a255..50895b5945 100644
--- a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_dataset_op_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import shuffle_ops
+from tensorflow.python.data.experimental.ops import shuffle_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
diff --git a/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_op_test.py
similarity index 99%
rename from tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/sql_dataset_op_test.py
index 52823d3fca..301f75488a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_op_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.kernel_tests import sql_dataset_op_test_base
+from tensorflow.python.data.experimental.kernel_tests import sql_dataset_op_test_base
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test_base.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_op_test_base.py
similarity index 98%
rename from tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test_base.py
rename to tensorflow/python/data/experimental/kernel_tests/sql_dataset_op_test_base.py
index 319a2ea263..a135c357f0 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_op_test_base.py
@@ -23,7 +23,7 @@ import os
 
 import sqlite3
 
-from tensorflow.contrib.data.python.ops import readers
+from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
similarity index 98%
rename from tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
index be8ae5e955..6761fbd16b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.kernel_tests import stats_dataset_test_base
-from tensorflow.contrib.data.python.ops import stats_ops
+from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base
+from tensorflow.python.data.experimental.ops import stats_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
similarity index 100%
rename from tensorflow/contrib/data/python/kernel_tests/stats_dataset_test_base.py
rename to tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
diff --git a/tensorflow/contrib/data/python/kernel_tests/threadpool_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/threadpool_dataset_ops_test.py
similarity index 96%
rename from tensorflow/contrib/data/python/kernel_tests/threadpool_dataset_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/threadpool_dataset_ops_test.py
index 08de3a9143..4432dcb05a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/threadpool_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/threadpool_dataset_ops_test.py
@@ -22,8 +22,8 @@ import threading
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import threadpool
-from tensorflow.contrib.data.python.ops import unique
+from tensorflow.python.data.experimental.ops import threadpool
+from tensorflow.python.data.experimental.ops import unique
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/unique_dataset_op_test.py
similarity index 98%
rename from tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/unique_dataset_op_test.py
index 8856ce5afb..b5a0b20f3f 100644
--- a/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unique_dataset_op_test.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import unique
+from tensorflow.python.data.experimental.ops import unique
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/writer_ops_test.py
similarity index 98%
rename from tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/writer_ops_test.py
index fca546a570..25a2e63ba1 100644
--- a/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/writer_ops_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.data.python.ops import writers
+from tensorflow.python.data.experimental.ops import writers
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD
new file mode 100644
index 0000000000..915d399f1b
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/BUILD
@@ -0,0 +1,377 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
+)
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+
+py_library(
+    name = "counter",
+    srcs = ["counter.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":scan_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_library(
+    name = "get_single_element",
+    srcs = ["get_single_element.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "iterator_ops",
+    srcs = [
+        "iterator_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:basic_session_run_hooks",
+        "//tensorflow/python:checkpoint_management",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:saver",
+        "//tensorflow/python:session_run_hook",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:optional_ops",
+    ],
+)
+
+py_library(
+    name = "random_ops",
+    srcs = [
+        "random_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "readers",
+    srcs = [
+        "readers.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":batching",
+        ":interleave_ops",
+        ":optimization",
+        ":parsing_ops",
+        ":shuffle_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:convert",
+        "//tensorflow/python/data/util:nest",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "shuffle_ops",
+    srcs = [
+        "shuffle_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_library(
+    name = "batching",
+    srcs = ["batching.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":get_single_element",
+        ":grouping",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:convert",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "enumerate_ops",
+    srcs = ["enumerate_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_library(
+    name = "error_ops",
+    srcs = ["error_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "grouping",
+    srcs = ["grouping.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "interleave_ops",
+    srcs = ["interleave_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":random_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:stateless_random_ops_gen",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "optimization",
+    srcs = ["optimization.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "parsing_ops",
+    srcs = ["parsing_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
+py_library(
+    name = "map_defun",
+    srcs = ["map_defun.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
+py_library(
+    name = "resampling",
+    srcs = ["resampling.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":batching",
+        ":interleave_ops",
+        ":scan_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "scan_ops",
+    srcs = ["scan_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "stats_ops",
+    srcs = ["stats_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "threadpool",
+    srcs = ["threadpool.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_library(
+    name = "unique",
+    srcs = [
+        "unique.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "writers",
+    srcs = [
+        "writers.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_library(
+    name = "indexed_dataset_ops",
+    srcs = ["indexed_dataset_ops.py"],
+    deps = [
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "prefetching_ops",
+    srcs = ["prefetching_ops.py"],
+    deps = [
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "dataset_ops",
+    deps = [
+        ":batching",
+        ":counter",
+        ":enumerate_ops",
+        ":error_ops",
+        ":get_single_element",
+        ":grouping",
+        ":indexed_dataset_ops",
+        ":interleave_ops",
+        ":map_defun",
+        ":optimization",
+        ":prefetching_ops",
+        ":readers",
+        ":resampling",
+        ":scan_ops",
+        ":shuffle_ops",
+        ":stats_ops",
+        ":threadpool",
+        ":unique",
+        ":writers",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
diff --git a/tensorflow/python/data/experimental/ops/batching.py b/tensorflow/python/data/experimental/ops/batching.py
new file mode 100644
index 0000000000..d42af9e7e9
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/batching.py
@@ -0,0 +1,669 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Batching dataset transformations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.experimental.ops import get_single_element
+from tensorflow.python.data.experimental.ops import grouping
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import convert
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+def batch_window(dataset):
+  """Batches a window of tensors.
+
+  Args:
+    dataset: the input dataset.
+
+  Returns:
+    A `Tensor` representing the batch of the entire input dataset.
+  """
+  if isinstance(dataset.output_classes, tuple):
+    raise TypeError("Input dataset expected to have a single component")
+  if dataset.output_classes is ops.Tensor:
+    return _batch_dense_window(dataset)
+  elif dataset.output_classes is sparse_tensor.SparseTensor:
+    return _batch_sparse_window(dataset)
+  else:
+    raise TypeError("Unsupported dataset type: %s" % dataset.output_classes)
+
+
+def _batch_dense_window(dataset):
+  """Batches a window of dense tensors."""
+
+  def key_fn(_):
+    return np.int64(0)
+
+  def shape_init_fn(_):
+    return array_ops.shape(first_element)
+
+  def shape_reduce_fn(state, value):
+    check_ops.assert_equal(state, array_ops.shape(value))
+    return state
+
+  def finalize_fn(state):
+    return state
+
+  if dataset.output_shapes.is_fully_defined():
+    shape = dataset.output_shapes
+  else:
+    first_element = get_single_element.get_single_element(dataset.take(1))
+    shape_reducer = grouping.Reducer(shape_init_fn, shape_reduce_fn,
+                                     finalize_fn)
+    shape = get_single_element.get_single_element(
+        dataset.apply(grouping.group_by_reducer(key_fn, shape_reducer)))
+
+  def batch_init_fn(_):
+    batch_shape = array_ops.concat([[0], shape], 0)
+    return gen_array_ops.empty(batch_shape, dtype=dataset.output_types)
+
+  def batch_reduce_fn(state, value):
+    return array_ops.concat([state, [value]], 0)
+
+  batch_reducer = grouping.Reducer(batch_init_fn, batch_reduce_fn, finalize_fn)
+  return get_single_element.get_single_element(
+      dataset.apply(grouping.group_by_reducer(key_fn, batch_reducer)))
+
+
+def _batch_sparse_window(dataset):
+  """Batches a window of sparse tensors."""
+
+  def key_fn(_):
+    return np.int64(0)
+
+  def shape_init_fn(_):
+    return first_element.dense_shape
+
+  def shape_reduce_fn(state, value):
+    check_ops.assert_equal(state, value.dense_shape)
+    return state
+
+  def finalize_fn(state):
+    return state
+
+  if dataset.output_shapes.is_fully_defined():
+    shape = dataset.output_shapes
+  else:
+    first_element = get_single_element.get_single_element(dataset.take(1))
+    shape_reducer = grouping.Reducer(shape_init_fn, shape_reduce_fn,
+                                     finalize_fn)
+    shape = get_single_element.get_single_element(
+        dataset.apply(grouping.group_by_reducer(key_fn, shape_reducer)))
+
+  def batch_init_fn(_):
+    indices_shape = array_ops.concat([[0], [array_ops.size(shape) + 1]], 0)
+    return sparse_tensor.SparseTensor(
+        indices=gen_array_ops.empty(indices_shape, dtype=dtypes.int64),
+        values=constant_op.constant([], shape=[0], dtype=dataset.output_types),
+        dense_shape=array_ops.concat(
+            [np.array([0], dtype=np.int64),
+             math_ops.cast(shape, dtypes.int64)], 0))
+
+  def batch_reduce_fn(state, value):
+    return sparse_ops.sparse_concat(0, [state, value])
+
+  def reshape_fn(value):
+    return sparse_ops.sparse_reshape(
+        value,
+        array_ops.concat([np.array([1], dtype=np.int64), value.dense_shape], 0))
+
+  batch_reducer = grouping.Reducer(batch_init_fn, batch_reduce_fn, finalize_fn)
+  return get_single_element.get_single_element(
+      dataset.map(reshape_fn).apply(
+          grouping.group_by_reducer(key_fn, batch_reducer)))
+
+
+@tf_export("data.experimental.dense_to_sparse_batch")
+def dense_to_sparse_batch(batch_size, row_shape):
+  """A transformation that batches ragged elements into `tf.SparseTensor`s.
+
+  Like `Dataset.padded_batch()`, this transformation combines multiple
+  consecutive elements of the dataset, which might have different
+  shapes, into a single element. The resulting element has three
+  components (`indices`, `values`, and `dense_shape`), which
+  comprise a `tf.SparseTensor` that represents the same data. The
+  `row_shape` represents the dense shape of each row in the
+  resulting `tf.SparseTensor`, to which the effective batch size is
+  prepended. For example:
+
+  ```python
+  # NOTE: The following examples use `{ ... }` to represent the
+  # contents of a dataset.
+  a = { ['a', 'b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'd'] }
+
+  a.apply(tf.data.experimental.dense_to_sparse_batch(
+      batch_size=2, row_shape=[6])) ==
+  {
+      ([[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],  # indices
+       ['a', 'b', 'c', 'a', 'b'],                 # values
+       [2, 6]),                                   # dense_shape
+      ([[0, 0], [0, 1], [0, 2], [0, 3]],
+       ['a', 'b', 'c', 'd'],
+       [1, 6])
+  }
+  ```
+
+  Args:
+    batch_size: A `tf.int64` scalar `tf.Tensor`, representing the
+      number of consecutive elements of this dataset to combine in a
+      single batch.
+    row_shape: A `tf.TensorShape` or `tf.int64` vector tensor-like
+      object representing the equivalent dense shape of a row in the
+      resulting `tf.SparseTensor`. Each element of this dataset must
+      have the same rank as `row_shape`, and must have size less
+      than or equal to `row_shape` in each dimension.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    return _DenseToSparseBatchDataset(dataset, batch_size, row_shape)
+
+  return _apply_fn
+
+
+def padded_batch_window(dataset, padded_shape, padding_value=None):
+  """Batches a window of tensors with padding.
+
+  Args:
+    dataset: the input dataset.
+    padded_shape: (Optional.) `tf.TensorShape` or `tf.int64` vector tensor-like
+      object representing the shape to which the input elements should be padded
+      prior to batching. Any unknown dimensions (e.g. `tf.Dimension(None)` in a
+      `tf.TensorShape` or `-1` in a tensor-like object) will be padded to the
+      maximum size of that dimension in each batch.
+    padding_value: (Optional.) A scalar-shaped `tf.Tensor`, representing the
+      padding value to use. Defaults are `0` for numeric types and the empty
+      string for string types. If `dataset` contains `tf.SparseTensor`, this
+      value is ignored.
+
+  Returns:
+    A `Tensor` representing the batch of the entire input dataset.
+
+  Raises:
+    ValueError: if invalid arguments are provided.
+  """
+  if not issubclass(dataset.output_classes,
+                    (ops.Tensor, sparse_tensor.SparseTensor)):
+    raise TypeError("Input dataset expected to have a single tensor component")
+  if issubclass(dataset.output_classes, (ops.Tensor)):
+    return _padded_batch_dense_window(dataset, padded_shape, padding_value)
+  elif issubclass(dataset.output_classes, (sparse_tensor.SparseTensor)):
+    if padding_value is not None:
+      raise ValueError("Padding value not allowed for sparse tensors")
+    return _padded_batch_sparse_window(dataset, padded_shape)
+  else:
+    raise TypeError("Unsupported dataset type: %s" % dataset.output_classes)
+
+
+def _padded_batch_dense_window(dataset, padded_shape, padding_value=None):
+  """Batches a window of dense tensors with padding."""
+
+  padded_shape = math_ops.cast(
+      convert.partial_shape_to_tensor(padded_shape), dtypes.int32)
+
+  def key_fn(_):
+    return np.int64(0)
+
+  def max_init_fn(_):
+    return padded_shape
+
+  def max_reduce_fn(state, value):
+    """Computes the maximum shape to pad to."""
+    condition = math_ops.reduce_all(
+        math_ops.logical_or(
+            math_ops.less_equal(array_ops.shape(value), padded_shape),
+            math_ops.equal(padded_shape, -1)))
+    assert_op = control_flow_ops.Assert(condition, [
+        "Actual shape greater than padded shape: ",
+        array_ops.shape(value), padded_shape
+    ])
+    with ops.control_dependencies([assert_op]):
+      return math_ops.maximum(state, array_ops.shape(value))
+
+  def finalize_fn(state):
+    return state
+
+  # Compute the padded shape.
+  max_reducer = grouping.Reducer(max_init_fn, max_reduce_fn, finalize_fn)
+  padded_shape = get_single_element.get_single_element(
+      dataset.apply(grouping.group_by_reducer(key_fn, max_reducer)))
+
+  if padding_value is None:
+    if dataset.output_types == dtypes.string:
+      padding_value = ""
+    elif dataset.output_types == dtypes.bool:
+      padding_value = False
+    elif dataset.output_types == dtypes.variant:
+      raise TypeError("Unable to create padding for field of type 'variant'")
+    else:
+      padding_value = 0
+
+  def batch_init_fn(_):
+    batch_shape = array_ops.concat(
+        [np.array([0], dtype=np.int32), padded_shape], 0)
+    return gen_array_ops.empty(batch_shape, dtype=dataset.output_types)
+
+  def batch_reduce_fn(state, value):
+    return array_ops.concat([state, [value]], 0)
+
+  def pad_fn(value):
+    shape = array_ops.shape(value)
+    left = array_ops.zeros_like(shape)
+    right = padded_shape - shape
+    return array_ops.pad(
+        value, array_ops.stack([left, right], 1), constant_values=padding_value)
+
+  batch_reducer = grouping.Reducer(batch_init_fn, batch_reduce_fn, finalize_fn)
+  return get_single_element.get_single_element(
+      dataset.map(pad_fn).apply(
+          grouping.group_by_reducer(key_fn, batch_reducer)))
+
+
+def _padded_batch_sparse_window(dataset, padded_shape):
+  """Batches a window of sparse tensors with padding."""
+
+  def key_fn(_):
+    return np.int64(0)
+
+  def max_init_fn(_):
+    return convert.partial_shape_to_tensor(padded_shape)
+
+  def max_reduce_fn(state, value):
+    """Computes the maximum shape to pad to."""
+    condition = math_ops.reduce_all(
+        math_ops.logical_or(
+            math_ops.less_equal(value.dense_shape, padded_shape),
+            math_ops.equal(padded_shape, -1)))
+    assert_op = control_flow_ops.Assert(condition, [
+        "Actual shape greater than padded shape: ", value.dense_shape,
+        padded_shape
+    ])
+    with ops.control_dependencies([assert_op]):
+      return math_ops.maximum(state, value.dense_shape)
+
+  def finalize_fn(state):
+    return state
+
+  # Compute the padded shape.
+  max_reducer = grouping.Reducer(max_init_fn, max_reduce_fn, finalize_fn)
+  padded_shape = get_single_element.get_single_element(
+      dataset.apply(grouping.group_by_reducer(key_fn, max_reducer)))
+
+  def batch_init_fn(_):
+    indices_shape = array_ops.concat([[0], [array_ops.size(padded_shape) + 1]],
+                                     0)
+    return sparse_tensor.SparseTensor(
+        indices=gen_array_ops.empty(indices_shape, dtype=dtypes.int64),
+        values=constant_op.constant([], shape=[0], dtype=dataset.output_types),
+        dense_shape=array_ops.concat(
+            [np.array([0], dtype=np.int64), padded_shape], 0))
+
+  def batch_reduce_fn(state, value):
+    padded_value = sparse_tensor.SparseTensor(
+        indices=value.indices, values=value.values, dense_shape=padded_shape)
+    reshaped_value = sparse_ops.sparse_reshape(
+        padded_value,
+        array_ops.concat(
+            [np.array([1], dtype=np.int64), padded_value.dense_shape], 0))
+    return sparse_ops.sparse_concat(0, [state, reshaped_value])
+
+  reducer = grouping.Reducer(batch_init_fn, batch_reduce_fn, finalize_fn)
+  return get_single_element.get_single_element(
+      dataset.apply(grouping.group_by_reducer(key_fn, reducer)))
+
+
+class _UnbatchDataset(dataset_ops.UnaryDataset):
+  """A dataset that splits the elements of its input into multiple elements."""
+
+  def __init__(self, input_dataset):
+    """See `unbatch()` for more details."""
+    super(_UnbatchDataset, self).__init__(input_dataset)
+    flat_shapes = nest.flatten(input_dataset.output_shapes)
+    if any(s.ndims == 0 for s in flat_shapes):
+      raise ValueError("Cannot unbatch an input with scalar components.")
+    known_batch_dim = tensor_shape.Dimension(None)
+    for s in flat_shapes:
+      try:
+        known_batch_dim = known_batch_dim.merge_with(s[0])
+      except ValueError:
+        raise ValueError("Cannot unbatch an input whose components have "
+                         "different batch sizes.")
+    self._input_dataset = input_dataset
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.unbatch_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        **dataset_ops.flat_structure(self))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return nest.map_structure(lambda s: s[1:],
+                              self._input_dataset.output_shapes)
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+@tf_export("data.experimental.unbatch")
+def unbatch():
+  """Splits elements of a dataset into multiple elements on the batch dimension.
+
+  For example, if elements of the dataset are shaped `[B, a0, a1, ...]`,
+  where `B` may vary for each input element, then for each element in the
+  dataset, the unbatched dataset will contain `B` consecutive elements
+  of shape `[a0, a1, ...]`.
+
+  ```python
+  # NOTE: The following example uses `{ ... }` to represent the contents
+  # of a dataset.
+  a = { ['a', 'b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'd'] }
+
+  a.apply(tf.data.experimental.unbatch()) == {
+      'a', 'b', 'c', 'a', 'b', 'a', 'b', 'c', 'd'}
+  ```
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    if not sparse.any_sparse(dataset.output_classes):
+      return _UnbatchDataset(dataset)
+
+    # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
+    # are normalized to the rank-1 dense representation, so that the
+    # sparse-oblivious unbatching logic will slice them
+    # appropriately. This leads to a somewhat inefficient re-encoding step
+    # for all SparseTensor components.
+    # TODO(mrry): Consider optimizing this in future
+    # if it turns out to be a bottleneck.
+    def normalize(arg, *rest):
+      if rest:
+        return sparse.serialize_many_sparse_tensors((arg,) + rest)
+      else:
+        return sparse.serialize_many_sparse_tensors(arg)
+
+    normalized_dataset = dataset.map(normalize)
+
+    # NOTE(mrry): Our `map()` has lost information about the sparseness
+    # of any SparseTensor components, so re-apply the structure of the
+    # original dataset.
+    restructured_dataset = _RestructuredDataset(
+        normalized_dataset,
+        dataset.output_types,
+        dataset.output_shapes,
+        dataset.output_classes,
+        allow_unsafe_cast=True)
+    return _UnbatchDataset(restructured_dataset)
+
+  return _apply_fn
+
+
+class _DenseToSparseBatchDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that batches ragged dense elements into `tf.SparseTensor`s."""
+
+  def __init__(self, input_dataset, batch_size, row_shape):
+    """See `Dataset.dense_to_sparse_batch()` for more details."""
+    super(_DenseToSparseBatchDataset, self).__init__(input_dataset)
+    if not isinstance(input_dataset.output_types, dtypes.DType):
+      raise TypeError("DenseToSparseDataset requires an input whose elements "
+                      "have a single component, whereas the input has %r." %
+                      input_dataset.output_types)
+    self._input_dataset = input_dataset
+    self._batch_size = batch_size
+    self._row_shape = row_shape
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.dense_to_sparse_batch_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._batch_size,
+        row_shape=convert.partial_shape_to_tensor(self._row_shape),
+        **dataset_ops.flat_structure(self))
+
+  @property
+  def output_classes(self):
+    return sparse_tensor.SparseTensor
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.vector(None).concatenate(self._row_shape)
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+class _RestructuredDataset(dataset_ops.UnaryDataset):
+  """An internal helper for changing the structure and shape of a dataset."""
+
+  def __init__(self,
+               dataset,
+               output_types,
+               output_shapes=None,
+               output_classes=None,
+               allow_unsafe_cast=False):
+    """Creates a new dataset with the given output types and shapes.
+
+    The given `dataset` must have a structure that is convertible:
+    * `dataset.output_types` must be the same as `output_types` module nesting.
+    * Each shape in `dataset.output_shapes` must be compatible with each shape
+      in `output_shapes` (if given).
+
+    Note: This helper permits "unsafe casts" for shapes, equivalent to using
+    `tf.Tensor.set_shape()` where domain-specific knowledge is available.
+
+    Args:
+      dataset: A `Dataset` object.
+      output_types: A nested structure of `tf.DType` objects.
+      output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects.
+        If omitted, the shapes will be inherited from `dataset`.
+      output_classes: (Optional.) A nested structure of class types.
+        If omitted, the class types will be inherited from `dataset`.
+      allow_unsafe_cast: (Optional.) If `True`, the caller may switch the
+        reported output types and shapes of the restructured dataset, e.g. to
+        switch a sparse tensor represented as `tf.variant` to its user-visible
+        type and shape.
+
+    Raises:
+      ValueError: If either `output_types` or `output_shapes` is not compatible
+        with the structure of `dataset`.
+    """
+    super(_RestructuredDataset, self).__init__(dataset)
+    self._input_dataset = dataset
+
+    if not allow_unsafe_cast:
+      # Validate that the types are compatible.
+      output_types = nest.map_structure(dtypes.as_dtype, output_types)
+      flat_original_types = nest.flatten(dataset.output_types)
+      flat_new_types = nest.flatten(output_types)
+      if flat_original_types != flat_new_types:
+        raise ValueError(
+            "Dataset with output types %r cannot be restructured to have "
+            "output types %r" % (dataset.output_types, output_types))
+
+    self._output_types = output_types
+
+    if output_shapes is None:
+      # Inherit shapes from the original `dataset`.
+      self._output_shapes = nest.pack_sequence_as(output_types,
+                                                  nest.flatten(
+                                                      dataset.output_shapes))
+    else:
+      if not allow_unsafe_cast:
+        # Validate that the shapes are compatible.
+        nest.assert_same_structure(output_types, output_shapes)
+        flat_original_shapes = nest.flatten(dataset.output_shapes)
+        flat_new_shapes = nest.flatten_up_to(output_types, output_shapes)
+
+        for original_shape, new_shape in zip(flat_original_shapes,
+                                             flat_new_shapes):
+          if not original_shape.is_compatible_with(new_shape):
+            raise ValueError(
+                "Dataset with output shapes %r cannot be restructured to have "
+                "incompatible output shapes %r" % (dataset.output_shapes,
+                                                   output_shapes))
+      self._output_shapes = nest.map_structure_up_to(
+          output_types, tensor_shape.as_shape, output_shapes)
+    if output_classes is None:
+      # Inherit class types from the original `dataset`.
+      self._output_classes = nest.pack_sequence_as(output_types,
+                                                   nest.flatten(
+                                                       dataset.output_classes))
+    else:
+      self._output_classes = output_classes
+
+  def _as_variant_tensor(self):
+    return self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
+
+  @property
+  def output_classes(self):
+    return self._output_classes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+
+class _MapAndBatchDataset(dataset_ops.MapDataset):
+  """A `Dataset` that maps a function over a batch of elements."""
+
+  def __init__(self, input_dataset, map_func, batch_size, num_parallel_calls,
+               drop_remainder):
+    """See `Dataset.map()` for details."""
+    super(_MapAndBatchDataset, self).__init__(input_dataset, map_func)
+    self._batch_size_t = ops.convert_to_tensor(
+        batch_size, dtype=dtypes.int64, name="batch_size")
+    self._num_parallel_calls_t = ops.convert_to_tensor(
+        num_parallel_calls, dtype=dtypes.int64, name="num_parallel_calls")
+    self._drop_remainder_t = ops.convert_to_tensor(
+        drop_remainder, dtype=dtypes.bool, name="drop_remainder")
+
+    self._batch_size = batch_size
+    self._drop_remainder = drop_remainder
+
+  def _as_variant_tensor(self):
+    # pylint: disable=protected-access
+    input_resource = self._input_dataset._as_variant_tensor()
+    return gen_dataset_ops.map_and_batch_dataset_v2(
+        input_resource,
+        self._map_func.captured_inputs,
+        f=self._map_func,
+        batch_size=self._batch_size_t,
+        num_parallel_calls=self._num_parallel_calls_t,
+        drop_remainder=self._drop_remainder_t,
+        **dataset_ops.flat_structure(self))
+    # pylint: enable=protected-access
+
+  @property
+  def output_shapes(self):
+    dim = self._batch_size if self._drop_remainder else None
+    return nest.pack_sequence_as(self._output_shapes, [
+        tensor_shape.vector(dim).concatenate(s)
+        for s in nest.flatten(self._output_shapes)
+    ])
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+
+@tf_export("data.experimental.map_and_batch")
+def map_and_batch(map_func,
+                  batch_size,
+                  num_parallel_batches=None,
+                  drop_remainder=False,
+                  num_parallel_calls=None):
+  """Fused implementation of `map` and `batch`.
+
+  Maps `map_func` across `batch_size` consecutive elements of this dataset
+  and then combines them into a batch. Functionally, it is equivalent to `map`
+  followed by `batch`. However, by fusing the two transformations together, the
+  implementation can be more efficient. Surfacing this transformation in the API
+  is temporary. Once automatic input pipeline optimization is implemented,
+  the fusing of `map` and `batch` will happen automatically and this API will be
+  deprecated.
+
+  Args:
+    map_func: A function mapping a nested structure of tensors to another
+      nested structure of tensors.
+    batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      consecutive elements of this dataset to combine in a single batch.
+    num_parallel_batches: (Optional.) A `tf.int64` scalar `tf.Tensor`,
+      representing the number of batches to create in parallel. On one hand,
+      higher values can help mitigate the effect of stragglers. On the other
+      hand, higher values can increase contention if CPU is scarce.
+    drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
+      whether the last batch should be dropped in case its size is smaller than
+      desired; the default behavior is not to drop the smaller batch.
+    num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
+        representing the number of elements to process in parallel. If not
+        specified, `batch_size * num_parallel_batches` elements will be
+        processed in parallel.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+
+  Raises:
+    ValueError: If both `num_parallel_batches` and `num_parallel_calls` are
+      specified.
+  """
+
+  if num_parallel_batches is None and num_parallel_calls is None:
+    num_parallel_calls = batch_size
+  elif num_parallel_batches is not None and num_parallel_calls is None:
+    num_parallel_calls = batch_size * num_parallel_batches
+  elif num_parallel_batches is not None and num_parallel_calls is not None:
+    raise ValueError("The `num_parallel_batches` and `num_parallel_calls` "
+                     "arguments are mutually exclusive.")
+
+  def _apply_fn(dataset):
+    return _MapAndBatchDataset(dataset, map_func, batch_size,
+                               num_parallel_calls, drop_remainder)
+
+  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/counter.py b/tensorflow/python/data/experimental/ops/counter.py
new file mode 100644
index 0000000000..42200eaef9
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/counter.py
@@ -0,0 +1,55 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Counter Dataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import scan_ops
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.Counter")
+def Counter(start=0, step=1, dtype=dtypes.int64):
+  """Creates a `Dataset` that counts from `start` in steps of size `step`.
+
+  For example:
+
+  ```python
+  Dataset.count() == [0, 1, 2, ...)
+  Dataset.count(2) == [2, 3, ...)
+  Dataset.count(2, 5) == [2, 7, 12, ...)
+  Dataset.count(0, -1) == [0, -1, -2, ...)
+  Dataset.count(10, -1) == [10, 9, ...)
+  ```
+
+  Args:
+    start: (Optional.) The starting value for the counter. Defaults to 0.
+    step: (Optional.) The step size for the counter. Defaults to 1.
+    dtype: (Optional.) The data type for counter elements. Defaults to
+      `tf.int64`.
+
+  Returns:
+    A `Dataset` of scalar `dtype` elements.
+  """
+  with ops.name_scope("counter"):
+    start = ops.convert_to_tensor(start, dtype=dtype, name="start")
+    step = ops.convert_to_tensor(step, dtype=dtype, name="step")
+    return dataset_ops.Dataset.from_tensors(0).repeat(None).apply(
+        scan_ops.scan(start, lambda state, _: (state + step, state)))
diff --git a/tensorflow/python/data/experimental/ops/enumerate_ops.py b/tensorflow/python/data/experimental/ops/enumerate_ops.py
new file mode 100644
index 0000000000..a1af98f552
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/enumerate_ops.py
@@ -0,0 +1,60 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Enumerate dataset transformations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.enumerate_dataset")
+def enumerate_dataset(start=0):
+  """A transformation that enumerate the elements of a dataset.
+
+  It is Similar to python's `enumerate`.
+  For example:
+
+  ```python
+  # NOTE: The following examples use `{ ... }` to represent the
+  # contents of a dataset.
+  a = { 1, 2, 3 }
+  b = { (7, 8), (9, 10) }
+
+  # The nested structure of the `datasets` argument determines the
+  # structure of elements in the resulting dataset.
+  a.apply(tf.data.experimental.enumerate(start=5)) == { (5, 1), (6, 2), (7, 3) }
+  b.apply(tf.data.experimental.enumerate()) == { (0, (7, 8)), (1, (9, 10)) }
+  ```
+
+  Args:
+    start: A `tf.int64` scalar `tf.Tensor`, representing the start
+      value for enumeration.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    max_value = np.iinfo(dtypes.int64.as_numpy_dtype).max
+    return dataset_ops.Dataset.zip((dataset_ops.Dataset.range(start, max_value),
+                                    dataset))
+
+  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/error_ops.py b/tensorflow/python/data/experimental/ops/error_ops.py
new file mode 100644
index 0000000000..82e274b70c
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/error_ops.py
@@ -0,0 +1,78 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ignore_errors dataset transformations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.ignore_errors")
+def ignore_errors():
+  """Creates a `Dataset` from another `Dataset` and silently ignores any errors.
+
+  Use this transformation to produce a dataset that contains the same elements
+  as the input, but silently drops any elements that caused an error. For
+  example:
+
+  ```python
+  dataset = tf.data.Dataset.from_tensor_slices([1., 2., 0., 4.])
+
+  # Computing `tf.check_numerics(1. / 0.)` will raise an InvalidArgumentError.
+  dataset = dataset.map(lambda x: tf.check_numerics(1. / x, "error"))
+
+  # Using `ignore_errors()` will drop the element that causes an error.
+  dataset =
+      dataset.apply(tf.data.experimental.ignore_errors())  # ==> {1., 0.5, 0.2}
+  ```
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    return _IgnoreErrorsDataset(dataset)
+
+  return _apply_fn
+
+
+class _IgnoreErrorsDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that silently ignores errors when computing its input."""
+
+  def __init__(self, input_dataset):
+    """See `Dataset.ignore_errors()` for details."""
+    super(_IgnoreErrorsDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+
+  def _as_variant_tensor(self):
+    return gen_experimental_dataset_ops.experimental_ignore_errors_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        **dataset_ops.flat_structure(self))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
diff --git a/tensorflow/python/data/experimental/ops/get_single_element.py b/tensorflow/python/data/experimental/ops/get_single_element.py
new file mode 100644
index 0000000000..132526166c
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/get_single_element.py
@@ -0,0 +1,72 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for Datasets and Iterators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.get_single_element")
+def get_single_element(dataset):
+  """Returns the single element in `dataset` as a nested structure of tensors.
+
+  This function enables you to use a `tf.data.Dataset` in a stateless
+  "tensor-in tensor-out" expression, without creating a `tf.data.Iterator`.
+  This can be useful when your preprocessing transformations are expressed
+  as a `Dataset`, and you want to use the transformation at serving time.
+  For example:
+
+  ```python
+  input_batch = tf.placeholder(tf.string, shape=[BATCH_SIZE])
+
+  def preprocessing_fn(input_str):
+    # ...
+    return image, label
+
+  dataset = (tf.data.Dataset.from_tensor_slices(input_batch)
+             .map(preprocessing_fn, num_parallel_calls=BATCH_SIZE)
+             .batch(BATCH_SIZE))
+
+  image_batch, label_batch = tf.data.experimental.get_single_element(dataset)
+  ```
+
+  Args:
+    dataset: A `tf.data.Dataset` object containing a single element.
+
+  Returns:
+    A nested structure of `tf.Tensor` objects, corresponding to the single
+    element of `dataset`.
+
+  Raises:
+    TypeError: if `dataset` is not a `tf.data.Dataset` object.
+    InvalidArgumentError (at runtime): if `dataset` does not contain exactly
+      one element.
+  """
+  if not isinstance(dataset, dataset_ops.Dataset):
+    raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
+
+  nested_ret = nest.pack_sequence_as(
+      dataset.output_types, gen_dataset_ops.dataset_to_single_element(
+          dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          **dataset_ops.flat_structure(dataset)))
+  return sparse.deserialize_sparse_tensors(
+      nested_ret, dataset.output_types, dataset.output_shapes,
+      dataset.output_classes)
diff --git a/tensorflow/python/data/experimental/ops/grouping.py b/tensorflow/python/data/experimental/ops/grouping.py
new file mode 100644
index 0000000000..18ba583220
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/grouping.py
@@ -0,0 +1,551 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Grouping dataset transformations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.group_by_reducer")
+def group_by_reducer(key_func, reducer):
+  """A transformation that groups elements and performs a reduction.
+
+  This transformation maps element of a dataset to a key using `key_func` and
+  groups the elements by key. The `reducer` is used to process each group; its
+  `init_func` is used to initialize state for each group when it is created, the
+  `reduce_func` is used to update the state every time an element is mapped to
+  the matching group, and the `finalize_func` is used to map the final state to
+  an output value.
+
+  Args:
+    key_func: A function mapping a nested structure of tensors
+      (having shapes and types defined by `self.output_shapes` and
+      `self.output_types`) to a scalar `tf.int64` tensor.
+    reducer: An instance of `Reducer`, which captures the reduction logic using
+      the `init_func`, `reduce_func`, and `finalize_func` functions.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    return _GroupByReducerDataset(dataset, key_func, reducer)
+
+  return _apply_fn
+
+
+@tf_export("data.experimental.group_by_window")
+def group_by_window(key_func,
+                    reduce_func,
+                    window_size=None,
+                    window_size_func=None):
+  """A transformation that groups windows of elements by key and reduces them.
+
+  This transformation maps each consecutive element in a dataset to a key
+  using `key_func` and groups the elements by key. It then applies
+  `reduce_func` to at most `window_size_func(key)` elements matching the same
+  key. All except the final window for each key will contain
+  `window_size_func(key)` elements; the final window may be smaller.
+
+  You may provide either a constant `window_size` or a window size determined by
+  the key through `window_size_func`.
+
+  Args:
+    key_func: A function mapping a nested structure of tensors
+      (having shapes and types defined by `self.output_shapes` and
+      `self.output_types`) to a scalar `tf.int64` tensor.
+    reduce_func: A function mapping a key and a dataset of up to `window_size`
+      consecutive elements matching that key to another dataset.
+    window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      consecutive elements matching the same key to combine in a single
+      batch, which will be passed to `reduce_func`. Mutually exclusive with
+      `window_size_func`.
+    window_size_func: A function mapping a key to a `tf.int64` scalar
+      `tf.Tensor`, representing the number of consecutive elements matching
+      the same key to combine in a single batch, which will be passed to
+      `reduce_func`. Mutually exclusive with `window_size`.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+
+  Raises:
+    ValueError: if neither or both of {`window_size`, `window_size_func`} are
+      passed.
+  """
+  if (window_size is not None and window_size_func or
+      not (window_size is not None or window_size_func)):
+    raise ValueError("Must pass either window_size or window_size_func.")
+
+  if window_size is not None:
+
+    def constant_window_func(unused_key):
+      return ops.convert_to_tensor(window_size, dtype=dtypes.int64)
+
+    window_size_func = constant_window_func
+
+  assert window_size_func is not None
+
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    return _GroupByWindowDataset(dataset, key_func, reduce_func,
+                                 window_size_func)
+
+  return _apply_fn
+
+
+@tf_export("data.experimental.bucket_by_sequence_length")
+def bucket_by_sequence_length(element_length_func,
+                              bucket_boundaries,
+                              bucket_batch_sizes,
+                              padded_shapes=None,
+                              padding_values=None,
+                              pad_to_bucket_boundary=False,
+                              no_padding=False):
+  """A transformation that buckets elements in a `Dataset` by length.
+
+  Elements of the `Dataset` are grouped together by length and then are padded
+  and batched.
+
+  This is useful for sequence tasks in which the elements have variable length.
+  Grouping together elements that have similar lengths reduces the total
+  fraction of padding in a batch which increases training step efficiency.
+
+  Args:
+    element_length_func: function from element in `Dataset` to `tf.int32`,
+      determines the length of the element, which will determine the bucket it
+      goes into.
+    bucket_boundaries: `list<int>`, upper length boundaries of the buckets.
+    bucket_batch_sizes: `list<int>`, batch size per bucket. Length should be
+      `len(bucket_boundaries) + 1`.
+    padded_shapes: Nested structure of `tf.TensorShape` to pass to
+      `tf.data.Dataset.padded_batch`. If not provided, will use
+      `dataset.output_shapes`, which will result in variable length dimensions
+      being padded out to the maximum length in each batch.
+    padding_values: Values to pad with, passed to
+      `tf.data.Dataset.padded_batch`. Defaults to padding with 0.
+    pad_to_bucket_boundary: bool, if `False`, will pad dimensions with unknown
+      size to maximum length in batch. If `True`, will pad dimensions with
+      unknown size to bucket boundary minus 1 (i.e., the maximum length in each
+      bucket), and caller must ensure that the source `Dataset` does not contain
+      any elements with length longer than `max(bucket_boundaries)`.
+    no_padding: `bool`, indicates whether to pad the batch features (features
+      need to be either of type `tf.SparseTensor` or of same shape).
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+
+  Raises:
+    ValueError: if `len(bucket_batch_sizes) != len(bucket_boundaries) + 1`.
+  """
+  with ops.name_scope("bucket_by_seq_length"):
+    if len(bucket_batch_sizes) != (len(bucket_boundaries) + 1):
+      raise ValueError(
+          "len(bucket_batch_sizes) must equal len(bucket_boundaries) + 1")
+
+    batch_sizes = constant_op.constant(bucket_batch_sizes, dtype=dtypes.int64)
+
+    def element_to_bucket_id(*args):
+      """Return int64 id of the length bucket for this element."""
+      seq_length = element_length_func(*args)
+
+      boundaries = list(bucket_boundaries)
+      buckets_min = [np.iinfo(np.int32).min] + boundaries
+      buckets_max = boundaries + [np.iinfo(np.int32).max]
+      conditions_c = math_ops.logical_and(
+          math_ops.less_equal(buckets_min, seq_length),
+          math_ops.less(seq_length, buckets_max))
+      bucket_id = math_ops.reduce_min(array_ops.where(conditions_c))
+
+      return bucket_id
+
+    def window_size_fn(bucket_id):
+      # The window size is set to the batch size for this bucket
+      window_size = batch_sizes[bucket_id]
+      return window_size
+
+    def make_padded_shapes(shapes, none_filler=None):
+      padded = []
+      for shape in nest.flatten(shapes):
+        shape = tensor_shape.TensorShape(shape)
+        shape = [
+            none_filler if d.value is None else d
+            for d in shape
+        ]
+        padded.append(shape)
+      return nest.pack_sequence_as(shapes, padded)
+
+    def batching_fn(bucket_id, grouped_dataset):
+      """Batch elements in dataset."""
+      batch_size = window_size_fn(bucket_id)
+      if no_padding:
+        return grouped_dataset.batch(batch_size)
+      none_filler = None
+      if pad_to_bucket_boundary:
+        err_msg = ("When pad_to_bucket_boundary=True, elements must have "
+                   "length < max(bucket_boundaries).")
+        check = check_ops.assert_less(
+            bucket_id,
+            constant_op.constant(len(bucket_batch_sizes) - 1,
+                                 dtype=dtypes.int64),
+            message=err_msg)
+        with ops.control_dependencies([check]):
+          boundaries = constant_op.constant(bucket_boundaries,
+                                            dtype=dtypes.int64)
+          bucket_boundary = boundaries[bucket_id]
+          none_filler = bucket_boundary - 1
+      shapes = make_padded_shapes(
+          padded_shapes or grouped_dataset.output_shapes,
+          none_filler=none_filler)
+      return grouped_dataset.padded_batch(batch_size, shapes, padding_values)
+
+    def _apply_fn(dataset):
+      return dataset.apply(
+          group_by_window(element_to_bucket_id, batching_fn,
+                          window_size_func=window_size_fn))
+
+    return _apply_fn
+
+
+def _map_x_dataset(map_func):
+  """A transformation that maps `map_func` across its input.
+
+  This transformation is similar to `tf.data.Dataset.map`, but in addition to
+  supporting dense and sparse tensor inputs, it also supports dataset inputs.
+
+  Args:
+    map_func: A function mapping a nested structure of tensors and/or datasets
+      (having shapes and types defined by `self.output_shapes` and
+     `self.output_types`) to another nested structure of tensors and/or
+     datasets.
+
+  Returns:
+    Dataset: A `Dataset`.
+  """
+
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    return _MapXDataset(dataset, map_func)
+
+  return _apply_fn
+
+
+class _GroupByReducerDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that groups its input and performs a reduction."""
+
+  def __init__(self, input_dataset, key_func, reducer):
+    """See `group_by_reducer()` for details."""
+    super(_GroupByReducerDataset, self).__init__(input_dataset)
+
+    self._input_dataset = input_dataset
+
+    self._make_key_func(key_func, input_dataset)
+    self._make_init_func(reducer.init_func)
+    self._make_reduce_func(reducer.reduce_func, input_dataset)
+    self._make_finalize_func(reducer.finalize_func)
+
+  def _make_key_func(self, key_func, input_dataset):
+    """Make wrapping Defun for key_func."""
+    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+        key_func, "tf.data.experimental.group_by_reducer()", input_dataset)
+    if not (
+        wrapped_func.output_types == dtypes.int64 and
+        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+      raise ValueError(
+          "`key_func` must return a single tf.int64 tensor. "
+          "Got type=%s and shape=%s"
+          % (wrapped_func.output_types, wrapped_func.output_shapes))
+    self._key_func = wrapped_func.function
+
+  def _make_init_func(self, init_func):
+    """Make wrapping Defun for init_func."""
+    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+        init_func,
+        "tf.data.experimental.group_by_reducer()",
+        input_classes=ops.Tensor,
+        input_shapes=tensor_shape.scalar(),
+        input_types=dtypes.int64)
+    self._init_func = wrapped_func.function
+    self._state_classes = wrapped_func.output_classes
+    self._state_shapes = wrapped_func.output_shapes
+    self._state_types = wrapped_func.output_types
+
+  def _make_reduce_func(self, reduce_func, input_dataset):
+    """Make wrapping Defun for reduce_func."""
+
+    # Iteratively rerun the reduce function until reaching a fixed point on
+    # `self._state_shapes`.
+    need_to_rerun = True
+    while need_to_rerun:
+
+      wrapped_func = dataset_ops.StructuredFunctionWrapper(
+          reduce_func,
+          "tf.data.experimental.group_by_reducer()",
+          input_classes=(self._state_classes, input_dataset.output_classes),
+          input_shapes=(self._state_shapes, input_dataset.output_shapes),
+          input_types=(self._state_types, input_dataset.output_types),
+          add_to_graph=False)
+
+      # Extract and validate class information from the returned values.
+      for new_state_class, state_class in zip(
+          nest.flatten(wrapped_func.output_classes),
+          nest.flatten(self._state_classes)):
+        if not issubclass(new_state_class, state_class):
+          raise TypeError(
+              "The element classes for the new state must match the initial "
+              "state. Expected %s; got %s." %
+              (self._state_classes, wrapped_func.output_classes))
+
+      # Extract and validate type information from the returned values.
+      for new_state_type, state_type in zip(
+          nest.flatten(wrapped_func.output_types),
+          nest.flatten(self._state_types)):
+        if new_state_type != state_type:
+          raise TypeError(
+              "The element types for the new state must match the initial "
+              "state. Expected %s; got %s." %
+              (self._state_types, wrapped_func.output_types))
+
+      # Extract shape information from the returned values.
+      flat_state_shapes = nest.flatten(self._state_shapes)
+      flat_new_state_shapes = nest.flatten(wrapped_func.output_shapes)
+      weakened_state_shapes = [
+          original.most_specific_compatible_shape(new)
+          for original, new in zip(flat_state_shapes, flat_new_state_shapes)
+      ]
+
+      need_to_rerun = False
+      for original_shape, weakened_shape in zip(flat_state_shapes,
+                                                weakened_state_shapes):
+        if original_shape.ndims is not None and (
+            weakened_shape.ndims is None or
+            original_shape.as_list() != weakened_shape.as_list()):
+          need_to_rerun = True
+          break
+
+      if need_to_rerun:
+        self._state_shapes = nest.pack_sequence_as(self._state_shapes,
+                                                   weakened_state_shapes)
+
+    self._reduce_func = wrapped_func.function
+    self._reduce_func.add_to_graph(ops.get_default_graph())
+
+  def _make_finalize_func(self, finalize_func):
+    """Make wrapping Defun for finalize_func."""
+    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+        finalize_func,
+        "tf.data.experimental.group_by_reducer()",
+        input_classes=self._state_classes,
+        input_shapes=self._state_shapes,
+        input_types=self._state_types)
+    self._finalize_func = wrapped_func.function
+    self._output_classes = wrapped_func.output_classes
+    self._output_shapes = wrapped_func.output_shapes
+    self._output_types = wrapped_func.output_types
+
+  @property
+  def output_classes(self):
+    return self._output_classes
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.group_by_reducer_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._key_func.captured_inputs,
+        self._init_func.captured_inputs,
+        self._reduce_func.captured_inputs,
+        self._finalize_func.captured_inputs,
+        key_func=self._key_func,
+        init_func=self._init_func,
+        reduce_func=self._reduce_func,
+        finalize_func=self._finalize_func,
+        **dataset_ops.flat_structure(self))
+
+
+class _GroupByWindowDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that groups its input and performs a windowed reduction."""
+
+  def __init__(self, input_dataset, key_func, reduce_func, window_size_func):
+    """See `group_by_window()` for details."""
+    super(_GroupByWindowDataset, self).__init__(input_dataset)
+
+    self._input_dataset = input_dataset
+
+    self._make_key_func(key_func, input_dataset)
+    self._make_reduce_func(reduce_func, input_dataset)
+    self._make_window_size_func(window_size_func)
+
+  def _make_window_size_func(self, window_size_func):
+    """Make wrapping Defun for window_size_func."""
+    def window_size_func_wrapper(key):
+      return ops.convert_to_tensor(window_size_func(key), dtype=dtypes.int64)
+    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+        window_size_func_wrapper,
+        "tf.data.experimental.group_by_window()",
+        input_classes=ops.Tensor,
+        input_shapes=tensor_shape.scalar(),
+        input_types=dtypes.int64)
+    if not (
+        wrapped_func.output_types == dtypes.int64 and
+        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+      raise ValueError(
+          "`window_size_func` must return a single tf.int64 scalar tensor.")
+    self._window_size_func = wrapped_func.function
+
+  def _make_key_func(self, key_func, input_dataset):
+    """Make wrapping Defun for key_func."""
+    def key_func_wrapper(*args):
+      return ops.convert_to_tensor(key_func(*args), dtype=dtypes.int64)
+    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+        key_func_wrapper, "tf.data.experimental.group_by_window()",
+        input_dataset)
+    if not (
+        wrapped_func.output_types == dtypes.int64 and
+        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+      raise ValueError(
+          "`key_func` must return a single tf.int64 scalar tensor.")
+    self._key_func = wrapped_func.function
+
+  def _make_reduce_func(self, reduce_func, input_dataset):
+    """Make wrapping Defun for reduce_func."""
+    nested_dataset = dataset_ops._NestedDatasetComponent(input_dataset)  # pylint: disable=protected-access
+    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+        reduce_func,
+        "tf.data.experimental.reduce_by_window()",
+        input_classes=(ops.Tensor, nested_dataset),
+        input_shapes=(tensor_shape.scalar(), nested_dataset),
+        input_types=(dtypes.int64, nested_dataset),
+        experimental_nested_dataset_support=True)
+    if not isinstance(
+        wrapped_func.output_classes, dataset_ops._NestedDatasetComponent):  # pylint: disable=protected-access
+      raise TypeError("`reduce_func` must return a `Dataset` object.")
+    self._output_classes = wrapped_func.output_classes.output_classes
+    self._output_types = wrapped_func.output_types.output_types
+    self._output_shapes = wrapped_func.output_shapes.output_shapes
+    self._reduce_func = wrapped_func.function
+
+  @property
+  def output_classes(self):
+    return self._output_classes
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.group_by_window_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._key_func.captured_inputs,
+        self._reduce_func.captured_inputs,
+        self._window_size_func.captured_inputs,
+        key_func=self._key_func,
+        reduce_func=self._reduce_func,
+        window_size_func=self._window_size_func,
+        **dataset_ops.flat_structure(self))
+
+
+@tf_export("data.experimental.Reducer")
+class Reducer(object):
+  """A reducer is used for reducing a set of elements.
+
+  A reducer is represented as a tuple of the three functions:
+    1) initialization function: key => initial state
+    2) reduce function: (old state, input) => new state
+    3) finalization function: state => result
+  """
+
+  def __init__(self, init_func, reduce_func, finalize_func):
+    self._init_func = init_func
+    self._reduce_func = reduce_func
+    self._finalize_func = finalize_func
+
+  @property
+  def init_func(self):
+    return self._init_func
+
+  @property
+  def reduce_func(self):
+    return self._reduce_func
+
+  @property
+  def finalize_func(self):
+    return self._finalize_func
+
+
+class _MapXDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that maps a function over elements in its input."""
+
+  def __init__(self, input_dataset, map_func):
+    """See `map_x_dataset()` for details."""
+    super(_MapXDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+
+    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+        map_func,
+        "tf.data.experimental.map_x_dataset()",
+        input_dataset,
+        experimental_nested_dataset_support=True)
+    self._output_classes = wrapped_func.output_classes
+    self._output_shapes = wrapped_func.output_shapes
+    self._output_types = wrapped_func.output_types
+    self._map_func = wrapped_func.function
+
+  def _as_variant_tensor(self):
+    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
+    return gen_dataset_ops.map_dataset(
+        input_t,
+        self._map_func.captured_inputs,
+        f=self._map_func,
+        **dataset_ops.flat_structure(self))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
diff --git a/tensorflow/contrib/data/python/ops/indexed_dataset_ops.py b/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
similarity index 100%
rename from tensorflow/contrib/data/python/ops/indexed_dataset_ops.py
rename to tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
diff --git a/tensorflow/python/data/experimental/ops/interleave_ops.py b/tensorflow/python/data/experimental/ops/interleave_ops.py
new file mode 100644
index 0000000000..a3c094859e
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/interleave_ops.py
@@ -0,0 +1,262 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Non-deterministic dataset transformations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import random_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
+from tensorflow.python.ops import gen_stateless_random_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.parallel_interleave")
+def parallel_interleave(map_func,
+                        cycle_length,
+                        block_length=1,
+                        sloppy=False,
+                        buffer_output_elements=None,
+                        prefetch_input_elements=None):
+  """A parallel version of the `Dataset.interleave()` transformation.
+
+  `parallel_interleave()` maps `map_func` across its input to produce nested
+  datasets, and outputs their elements interleaved. Unlike
+  `tf.data.Dataset.interleave`, it gets elements from `cycle_length` nested
+  datasets in parallel, which increases the throughput, especially in the
+  presence of stragglers. Furthermore, the `sloppy` argument can be used to
+  improve performance, by relaxing the requirement that the outputs are produced
+  in a deterministic order, and allowing the implementation to skip over nested
+  datasets whose elements are not readily available when requested.
+
+  Example usage:
+
+  ```python
+  # Preprocess 4 files concurrently.
+  filenames = tf.data.Dataset.list_files("/path/to/data/train*.tfrecords")
+  dataset = filenames.apply(
+      tf.data.experimental.parallel_interleave(
+          lambda filename: tf.data.TFRecordDataset(filename),
+          cycle_length=4))
+  ```
+
+  WARNING: If `sloppy` is `True`, the order of produced elements is not
+  deterministic.
+
+  Args:
+    map_func: A function mapping a nested structure of tensors to a `Dataset`.
+    cycle_length: The number of input `Dataset`s to interleave from in parallel.
+    block_length: The number of consecutive elements to pull from an input
+      `Dataset` before advancing to the next input `Dataset`.
+    sloppy: If false, elements are produced in deterministic order. Otherwise,
+      the implementation is allowed, for the sake of expediency, to produce
+      elements in a non-deterministic order.
+    buffer_output_elements: The number of elements each iterator being
+      interleaved should buffer (similar to the `.prefetch()` transformation for
+      each interleaved iterator).
+    prefetch_input_elements: The number of input elements to transform to
+      iterators before they are needed for interleaving.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+  def _apply_fn(dataset):
+    return readers.ParallelInterleaveDataset(
+        dataset, map_func, cycle_length, block_length, sloppy,
+        buffer_output_elements, prefetch_input_elements)
+
+  return _apply_fn
+
+
+class _DirectedInterleaveDataset(dataset_ops.Dataset):
+  """A substitute for `Dataset.interleave()` on a fixed list of datasets."""
+
+  def __init__(self, selector_input, data_inputs):
+    self._selector_input = selector_input
+    self._data_inputs = list(data_inputs)
+
+    for data_input in data_inputs[1:]:
+      if (data_input.output_types != data_inputs[0].output_types or
+          data_input.output_classes != data_inputs[0].output_classes):
+        raise TypeError("All datasets must have the same type and class.")
+
+  def _as_variant_tensor(self):
+    # pylint: disable=protected-access
+    return (
+        gen_experimental_dataset_ops.experimental_directed_interleave_dataset(
+            self._selector_input._as_variant_tensor(), [
+                data_input._as_variant_tensor()
+                for data_input in self._data_inputs
+            ], **dataset_ops.flat_structure(self)))
+    # pylint: enable=protected-access
+
+  def _inputs(self):
+    return [self._selector_input] + self._data_inputs
+
+  @property
+  def output_classes(self):
+    return self._data_inputs[0].output_classes
+
+  @property
+  def output_shapes(self):
+    ret = self._data_inputs[0].output_shapes
+    for data_input in self._data_inputs[1:]:
+      ret = nest.pack_sequence_as(ret, [
+          ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
+              nest.flatten(ret), nest.flatten(data_input.output_shapes))
+      ])
+    return ret
+
+  @property
+  def output_types(self):
+    return self._data_inputs[0].output_types
+
+
+@tf_export("data.experimental.sample_from_datasets")
+def sample_from_datasets(datasets, weights=None, seed=None):
+  """Samples elements at random from the datasets in `datasets`.
+
+  Args:
+    datasets: A list of `tf.data.Dataset` objects with compatible structure.
+    weights: (Optional.) A list of `len(datasets)` floating-point values where
+      `weights[i]` represents the probability with which an element should be
+      sampled from `datasets[i]`, or a `tf.data.Dataset` object where each
+      element is such a list. Defaults to a uniform distribution across
+      `datasets`.
+    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+      random seed that will be used to create the distribution. See
+      `tf.set_random_seed` for behavior.
+
+  Returns:
+    A dataset that interleaves elements from `datasets` at random, according to
+    `weights` if provided, otherwise with uniform probability.
+
+  Raises:
+    TypeError: If the `datasets` or `weights` arguments have the wrong type.
+    ValueError: If the `weights` argument is specified and does not match the
+      length of the `datasets` element.
+  """
+  num_datasets = len(datasets)
+  if not isinstance(weights, dataset_ops.Dataset):
+    if weights is None:
+      # Select inputs with uniform probability.
+      logits = [[1.0] * num_datasets]
+
+    else:
+      # Use the given `weights` as the probability of choosing the respective
+      # input.
+      weights = ops.convert_to_tensor(weights, name="weights")
+      if weights.dtype not in (dtypes.float32, dtypes.float64):
+        raise TypeError("`weights` must be convertible to a tensor of "
+                        "`tf.float32` or `tf.float64` elements.")
+      if not weights.shape.is_compatible_with([num_datasets]):
+        raise ValueError(
+            "`weights` must be a vector of length `len(datasets)`.")
+
+      # The `stateless_multinomial()` op expects log-probabilities, as opposed
+      # to weights.
+      logits = array_ops.expand_dims(math_ops.log(weights, name="logits"), 0)
+
+    # NOTE(mrry): We only specialize when `weights` is not a `Dataset`. When it
+    # is a `Dataset`, it is possible that evaluating it has a side effect the
+    # user depends on.
+    if len(datasets) == 1:
+      return datasets[0]
+
+    def select_dataset_constant_logits(seed):
+      return array_ops.squeeze(
+          gen_stateless_random_ops.stateless_multinomial(logits, 1, seed=seed),
+          axis=[0, 1])
+
+    selector_input = dataset_ops.MapDataset(
+        random_ops.RandomDataset(seed).batch(2),
+        select_dataset_constant_logits,
+        use_inter_op_parallelism=False)
+
+  else:
+    # Use each element of the given `weights` dataset as the probability of
+    # choosing the respective input.
+
+    # The `stateless_multinomial()` op expects log-probabilities, as opposed to
+    # weights.
+    logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits"))
+
+    def select_dataset_varying_logits(logits, seed):
+      return array_ops.squeeze(
+          gen_stateless_random_ops.stateless_multinomial(logits, 1, seed=seed),
+          axis=[0, 1])
+
+    logits_and_seeds = dataset_ops.Dataset.zip(
+        (logits_ds, random_ops.RandomDataset(seed).batch(2)))
+    selector_input = dataset_ops.MapDataset(
+        logits_and_seeds,
+        select_dataset_varying_logits,
+        use_inter_op_parallelism=False)
+
+  return _DirectedInterleaveDataset(selector_input, datasets)
+
+
+@tf_export("data.experimental.choose_from_datasets")
+def choose_from_datasets(datasets, choice_dataset):
+  """Creates a dataset that deterministically chooses elements from `datasets`.
+
+  For example, given the following datasets:
+
+  ```python
+  datasets = [tf.data.Dataset.from_tensors("foo").repeat(),
+              tf.data.Dataset.from_tensors("bar").repeat(),
+              tf.data.Dataset.from_tensors("baz").repeat()]
+
+  # Define a dataset containing `[0, 1, 2, 0, 1, 2, 0, 1, 2]`.
+  choice_dataset = tf.data.Dataset.range(3).repeat(3)
+
+  result = tf.data.experimental.choose_from_datasets(datasets, choice_dataset)
+  ```
+
+  The elements of `result` will be:
+
+  ```
+  "foo", "bar", "baz", "foo", "bar", "baz", "foo", "bar", "baz"
+  ```
+
+  Args:
+    datasets: A list of `tf.data.Dataset` objects with compatible structure.
+    choice_dataset: A `tf.data.Dataset` of scalar `tf.int64` tensors between
+      `0` and `len(datasets) - 1`.
+
+  Returns:
+    A dataset that interleaves elements from `datasets` according to the values
+    of `choice_dataset`.
+
+  Raises:
+    TypeError: If the `datasets` or `choice_dataset` arguments have the wrong
+      type.
+  """
+  if not (choice_dataset.output_types == dtypes.int64
+          and choice_dataset.output_shapes.is_compatible_with(
+              tensor_shape.scalar())
+          and choice_dataset.output_classes == ops.Tensor):
+    raise TypeError("`choice_dataset` must be a dataset of scalar "
+                    "`tf.int64` tensors.")
+  return _DirectedInterleaveDataset(choice_dataset, datasets)
diff --git a/tensorflow/python/data/experimental/ops/iterator_ops.py b/tensorflow/python/data/experimental/ops/iterator_ops.py
new file mode 100644
index 0000000000..72d7d58f06
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/iterator_ops.py
@@ -0,0 +1,268 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Iterator ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import optional_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.make_saveable_from_iterator")
+def make_saveable_from_iterator(iterator):
+  """Returns a SaveableObject for saving/restore iterator state using Saver.
+
+  Args:
+    iterator: Iterator.
+
+  For example:
+
+  ```python
+  with tf.Graph().as_default():
+    ds = tf.data.Dataset.range(10)
+    iterator = ds.make_initializable_iterator()
+    # Build the iterator SaveableObject.
+    saveable_obj = tf.data.experimental.make_saveable_from_iterator(iterator)
+    # Add the SaveableObject to the SAVEABLE_OBJECTS collection so
+    # it can be automatically saved using Saver.
+    tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable_obj)
+    saver = tf.train.Saver()
+
+    while continue_training:
+      ... Perform training ...
+      if should_save_checkpoint:
+        saver.save()
+  ```
+
+  Note: When restoring the iterator, the existing iterator state is completely
+  discarded. This means that any changes you may have made to the Dataset
+  graph will be discarded as well! This includes the new Dataset graph
+  that you may have built during validation. So, while running validation,
+  make sure to run the initializer for the validation input pipeline after
+  restoring the checkpoint.
+
+  Note: Not all iterators support checkpointing yet. Attempting to save the
+  state of an unsupported iterator will throw an error.
+  """
+  return _Saveable(iterator._iterator_resource)  # pylint: disable=protected-access
+
+
+class _Saveable(saver_lib.BaseSaverBuilder.SaveableObject):
+  """SaveableObject for saving/restoring iterator state."""
+
+  def __init__(self, iterator_resource):
+    serialized_iterator = gen_dataset_ops.serialize_iterator(iterator_resource)
+    specs = [
+        saver_lib.BaseSaverBuilder.SaveSpec(serialized_iterator, "",
+                                            iterator_resource.name + "-state")
+    ]
+    super(_Saveable, self).__init__(iterator_resource, specs,
+                                    iterator_resource.name)
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    with ops.colocate_with(self.op):
+      return gen_dataset_ops.deserialize_iterator(self.op, restored_tensors[0])
+
+
+@tf_export("data.experimental.CheckpointInputPipelineHook")
+class CheckpointInputPipelineHook(session_run_hook.SessionRunHook):
+  """Checkpoints input pipeline state every N steps or seconds.
+
+  This hook saves the state of the iterators in the `Graph` so that when
+  training is resumed the input pipeline continues from where it left off.
+  This could potentially avoid overfitting in certain pipelines where the
+  number of training steps per eval are small compared to the dataset
+  size or if the training pipeline is pre-empted.
+
+  Differences from `CheckpointSaverHook`:
+  1. Saves only the input pipelines in the "iterators" collection and not the
+     global variables or other saveable objects.
+  2. Does not write the `GraphDef` and `MetaGraphDef` to the summary.
+
+  Example of checkpointing the training pipeline:
+
+  ```python
+  est = tf.estimator.Estimator(model_fn)
+  while True:
+    est.train(
+        train_input_fn,
+        hooks=[tf.data.experimental.CheckpointInputPipelineHook(est)],
+        steps=train_steps_per_eval)
+    # Note: We do not pass the hook here.
+    metrics = est.evaluate(eval_input_fn)
+    if should_stop_the_training(metrics):
+      break
+  ```
+
+  This hook should be used if the input pipeline state needs to be saved
+  separate from the model checkpoint. Doing so may be useful for a few reasons:
+  1. The input pipeline checkpoint may be large, if there are large shuffle
+     or prefetch buffers for instance, and may bloat the checkpoint size.
+  2. If the input pipeline is shared between training and validation, restoring
+     the checkpoint during validation may override the validation input
+     pipeline.
+
+  For saving the input pipeline checkpoint alongside the model weights use
+  `tf.data.experimental.make_saveable_from_iterator` directly to create a
+  `SaveableObject` and add to the `SAVEABLE_OBJECTS` collection. Note, however,
+  that you will need to be careful not to restore the training iterator during
+  eval. You can do that by not adding the iterator to the SAVEABLE_OBJECTS
+  collector when building the eval graph.
+  """
+
+  def __init__(self, estimator):
+    """Initializes a `CheckpointInputPipelineHook`.
+
+    Args:
+      estimator: Estimator.
+
+    Raises:
+      ValueError: One of `save_steps` or `save_secs` should be set.
+      ValueError: At most one of saver or scaffold should be set.
+    """
+    # `checkpoint_basename` is "input.ckpt" for non-distributed pipelines or
+    # of the form "input_<task_type>_<task_id>.ckpt" for distributed pipelines.
+    # Note: The default `checkpoint_basename` used by `CheckpointSaverHook` is
+    # "model.ckpt". We intentionally choose the input pipeline checkpoint prefix
+    # to be different to avoid conflicts with the model checkpoint.
+
+    # pylint: disable=protected-access
+    checkpoint_prefix = "input"
+    if estimator._config.num_worker_replicas > 1:
+      # Distributed setting.
+      suffix = "_{}_{}".format(estimator._config.task_type,
+                               estimator._config.task_id)
+      checkpoint_prefix += suffix
+    # pylint: enable=protected-access
+
+    # We use a composition paradigm instead of inheriting from
+    # `CheckpointSaverHook` because `Estimator` does an `isinstance` check
+    # to check whether a `CheckpointSaverHook` is already present in the list
+    # of hooks and if not, adds one. Inheriting from `CheckpointSaverHook`
+    # would thwart this behavior. This hook checkpoints *only the iterators*
+    # and not the graph variables.
+    self._checkpoint_saver_hook = basic_session_run_hooks.CheckpointSaverHook(
+        estimator.model_dir,
+        save_secs=estimator._config.save_checkpoints_secs,  # pylint: disable=protected-access
+        save_steps=estimator._config.save_checkpoints_steps,  # pylint: disable=protected-access
+        checkpoint_basename=checkpoint_prefix + ".ckpt")
+
+    # Name for the protocol buffer file that will contain the list of most
+    # recent checkpoints stored as a `CheckpointState` protocol buffer.
+    # This file, kept in the same directory as the checkpoint files, is
+    # automatically managed by the `Saver` to keep track of recent checkpoints.
+    # The default name used by the `Saver` for this file is "checkpoint". Here
+    # we use the name "checkpoint_<checkpoint_prefix>" so that in case the
+    # `checkpoint_dir` is the same as the model checkpoint directory, there are
+    # no conflicts during restore.
+    self._latest_filename = "checkpoint_" + checkpoint_prefix
+    self._first_run = True
+
+  def begin(self):
+    # Build a Saver that saves all iterators in the `GLOBAL_ITERATORS`
+    # collection if no `Saver` or `Scaffold` is provided.
+    # pylint: disable=protected-access
+    if (self._checkpoint_saver_hook._saver is None and
+        self._checkpoint_saver_hook._scaffold is None):
+      iterators = ops.get_collection(iterator_ops.GLOBAL_ITERATORS)
+      saveables = [_Saveable(i) for i in iterators]
+      self._checkpoint_saver_hook._saver = _CustomSaver(saveables,
+                                                        self._latest_filename)
+    # pylint: enable=protected-access
+    self._checkpoint_saver_hook.begin()
+
+  def _restore_or_save_initial_ckpt(self, session):
+    # Ideally this should be run in after_create_session but is not for the
+    # following reason:
+    # Currently there is no way of enforcing an order of running the
+    # `SessionRunHooks`. Hence it is possible that the `_DatasetInitializerHook`
+    # is run *after* this hook. That is troublesome because
+    # 1. If a checkpoint exists and this hook restores it, the initializer hook
+    #    will override it.
+    # 2. If no checkpoint exists, this hook will try to save an initialized
+    #    iterator which will result in an exception.
+    #
+    # As a temporary fix we enter the following implicit contract between this
+    # hook and the _DatasetInitializerHook.
+    # 1. The _DatasetInitializerHook initializes the iterator in the call to
+    #    after_create_session.
+    # 2. This hook saves the iterator on the first call to `before_run()`, which
+    #    is guaranteed to happen after `after_create_session()` of all hooks
+    #    have been run.
+
+    # Check if there is an existing checkpoint. If so, restore from it.
+    # pylint: disable=protected-access
+    latest_checkpoint_path = checkpoint_management.latest_checkpoint(
+        self._checkpoint_saver_hook._checkpoint_dir,
+        latest_filename=self._latest_filename)
+    if latest_checkpoint_path:
+      self._checkpoint_saver_hook._get_saver().restore(session,
+                                                       latest_checkpoint_path)
+    else:
+      # The checkpoint saved here is the state at step "global_step".
+      # Note: We do not save the GraphDef or MetaGraphDef here.
+      global_step = session.run(self._checkpoint_saver_hook._global_step_tensor)
+      self._checkpoint_saver_hook._save(session, global_step)
+      self._checkpoint_saver_hook._timer.update_last_triggered_step(global_step)
+    # pylint: enable=protected-access
+
+  def before_run(self, run_context):
+    if self._first_run:
+      self._restore_or_save_initial_ckpt(run_context.session)
+      self._first_run = False
+    return self._checkpoint_saver_hook.before_run(run_context)
+
+  def after_run(self, run_context, run_values):
+    self._checkpoint_saver_hook.after_run(run_context, run_values)
+
+  def end(self, session):
+    self._checkpoint_saver_hook.end(session)
+
+
+class _CustomSaver(saver_lib.Saver):
+  """`Saver` with a different default `latest_filename`.
+
+  This is used in the `CheckpointInputPipelineHook` to avoid conflicts with
+  the model ckpt saved by the `CheckpointSaverHook`.
+  """
+
+  def __init__(self, var_list, latest_filename):
+    super(_CustomSaver, self).__init__(var_list)
+    self._latest_filename = latest_filename
+
+  def save(self,
+           sess,
+           save_path,
+           global_step=None,
+           latest_filename=None,
+           meta_graph_suffix="meta",
+           write_meta_graph=True,
+           write_state=True,
+           strip_default_attrs=False):
+    return super(_CustomSaver, self).save(
+        sess, save_path, global_step, latest_filename or self._latest_filename,
+        meta_graph_suffix, write_meta_graph, write_state, strip_default_attrs)
+
+
+tf_export("data.experimental.Optional")(optional_ops.Optional)
+tf_export("data.experimental.get_next_as_optional")(
+    iterator_ops.get_next_as_optional)
diff --git a/tensorflow/contrib/data/python/ops/map_defun.py b/tensorflow/python/data/experimental/ops/map_defun.py
similarity index 100%
rename from tensorflow/contrib/data/python/ops/map_defun.py
rename to tensorflow/python/data/experimental/ops/map_defun.py
diff --git a/tensorflow/contrib/data/python/ops/optimization.py b/tensorflow/python/data/experimental/ops/optimization.py
similarity index 100%
rename from tensorflow/contrib/data/python/ops/optimization.py
rename to tensorflow/python/data/experimental/ops/optimization.py
diff --git a/tensorflow/python/data/experimental/ops/parsing_ops.py b/tensorflow/python/data/experimental/ops/parsing_ops.py
new file mode 100644
index 0000000000..6615b9022a
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/parsing_ops.py
@@ -0,0 +1,152 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental `dataset` API for parsing example."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+class _ParseExampleDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that parses `example` dataset into a `dict` dataset."""
+
+  def __init__(self, input_dataset, features, num_parallel_calls):
+    super(_ParseExampleDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    if not all(types == dtypes.string
+               for types in nest.flatten(input_dataset.output_types)):
+      raise TypeError("Input dataset should be a dataset of vectors of strings")
+    self._num_parallel_calls = num_parallel_calls
+    # pylint: disable=protected-access
+    self._features = parsing_ops._prepend_none_dimension(features)
+    # sparse_keys and dense_keys come back sorted here.
+    (sparse_keys, sparse_types, dense_keys, dense_types, dense_defaults,
+     dense_shapes) = parsing_ops._features_to_raw_params(
+         self._features, [
+             parsing_ops.VarLenFeature, parsing_ops.SparseFeature,
+             parsing_ops.FixedLenFeature, parsing_ops.FixedLenSequenceFeature
+         ])
+    # TODO(b/112859642): Pass sparse_index and sparse_values for SparseFeature.
+    (_, dense_defaults_vec, sparse_keys, sparse_types, dense_keys, dense_shapes,
+     dense_shape_as_shape) = parsing_ops._process_raw_parameters(
+         None, dense_defaults, sparse_keys, sparse_types, dense_keys,
+         dense_types, dense_shapes)
+    # pylint: enable=protected-access
+    self._sparse_keys = sparse_keys
+    self._sparse_types = sparse_types
+    self._dense_keys = dense_keys
+    self._dense_defaults = dense_defaults_vec
+    self._dense_shapes = dense_shapes
+    self._dense_types = dense_types
+    dense_output_shapes = [
+        self._input_dataset.output_shapes.concatenate(shape)
+        for shape in dense_shape_as_shape
+    ]
+    sparse_output_shapes = [
+        self._input_dataset.output_shapes.concatenate([None])
+        for _ in range(len(sparse_keys))
+    ]
+
+    self._output_shapes = dict(
+        zip(self._dense_keys + self._sparse_keys,
+            dense_output_shapes + sparse_output_shapes))
+    self._output_types = dict(
+        zip(self._dense_keys + self._sparse_keys,
+            self._dense_types + self._sparse_types))
+    self._output_classes = dict(
+        zip(self._dense_keys + self._sparse_keys,
+            [ops.Tensor for _ in range(len(self._dense_defaults))] +
+            [sparse_tensor.SparseTensor for _ in range(len(self._sparse_keys))
+            ]))
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.parse_example_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._num_parallel_calls,
+        self._dense_defaults,
+        self._sparse_keys,
+        self._dense_keys,
+        self._sparse_types,
+        self._dense_shapes,
+        **dataset_ops.flat_structure(self))
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+  @property
+  def output_classes(self):
+    return self._output_classes
+
+
+# TODO(b/111553342): add arguments names and example names as well.
+@tf_export("data.experimental.parse_example_dataset")
+def parse_example_dataset(features, num_parallel_calls=1):
+  """A transformation that parses `Example` protos into a `dict` of tensors.
+
+  Parses a number of serialized `Example` protos given in `serialized`. We refer
+  to `serialized` as a batch with `batch_size` many entries of individual
+  `Example` protos.
+
+  This op parses serialized examples into a dictionary mapping keys to `Tensor`
+  and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`,
+  `SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature`
+  and `SparseFeature` is mapped to a `SparseTensor`, and each
+  `FixedLenFeature` is mapped to a `Tensor`. See `tf.parse_example` for more
+  details about feature dictionaries.
+
+  Args:
+   features: A `dict` mapping feature keys to `FixedLenFeature`,
+     `VarLenFeature`, and `SparseFeature` values.
+   num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
+      representing the number of parsing processes to call in parallel.
+
+  Returns:
+    A dataset transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+
+  Raises:
+    ValueError: if features argument is None.
+  """
+  if features is None:
+    raise ValueError("Missing: features was %s." % features)
+
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    out_dataset = _ParseExampleDataset(dataset, features, num_parallel_calls)
+    if any([
+        isinstance(feature, parsing_ops.SparseFeature)
+        for _, feature in features.items()
+    ]):
+      # pylint: disable=protected-access
+      # pylint: disable=g-long-lambda
+      out_dataset = out_dataset.map(
+          lambda x: parsing_ops._construct_sparse_tensors_for_sparse_features(
+              features, x), num_parallel_calls=num_parallel_calls)
+    return out_dataset
+
+  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/prefetching_ops.py b/tensorflow/python/data/experimental/ops/prefetching_ops.py
new file mode 100644
index 0000000000..48d7136f95
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/prefetching_ops.py
@@ -0,0 +1,531 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrapper for prefetching_ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as framework_device
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+def function_buffering_resource(string_arg,
+                                target_device,
+                                f,
+                                buffer_size,
+                                output_types,
+                                container="",
+                                shared_name=None,
+                                name=None):
+  """Creates a FunctionBufferingResource.
+
+  A FunctionBufferingResource fills up a buffer by calling a function `f` on
+  `target_device`. `f` should take in only a single string argument as input.
+
+  Args:
+    string_arg: The single string argument to the function.
+    target_device: The device to run `f` on.
+    f: The function to be executed.
+    buffer_size: Size of the buffer to be populated.
+    output_types: The output types generated by the function.
+    container: (Optional) string. Defaults to "".
+    shared_name: (Optional) string.
+    name: (Optional) string to name the op.
+
+  Returns:
+    Handle to a FunctionBufferingResource.
+  """
+  if shared_name is None:
+    shared_name = ""
+  return ged_ops.experimental_function_buffering_resource(
+      string_arg=string_arg,
+      target_device=target_device,
+      shared_name=shared_name,
+      f=f,
+      buffer_size=buffer_size,
+      container=container,
+      name=name,
+      output_types=output_types)
+
+
+def function_buffering_resource_get_next(function_buffer_resource,
+                                         output_types,
+                                         name=None):
+  return ged_ops.experimental_function_buffering_resource_get_next(
+      function_buffer_resource=function_buffer_resource,
+      output_types=output_types,
+      name=name)
+
+
+def function_buffering_resource_reset(function_buffer_resource, name=None):
+  return ged_ops.experimental_function_buffering_resource_reset(
+      function_buffer_resource=function_buffer_resource, name=name)
+
+
+# pylint: disable=protected-access
+class _PrefetchToDeviceIterator(object):
+  """A replacement for `tf.data.Iterator` that prefetches to another device.
+
+  Args:
+    input_dataset: The input dataset
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    device: A fully specified device string where we want to prefetch to
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               one_shot,
+               device,
+               buffer_size,
+               shared_name=None):
+    self._input_dataset = input_dataset
+    self._get_next_call_count = 0
+    self._one_shot = one_shot
+    if shared_name is None:
+      shared_name = ""
+
+    if self._one_shot:
+      self._input_iterator = input_dataset.make_one_shot_iterator()
+    else:
+      self._input_iterator = iterator_ops.Iterator.from_structure(
+          self._input_dataset.output_types, self._input_dataset.output_shapes,
+          shared_name, self._input_dataset.output_classes)
+    input_iterator_handle = self._input_iterator.string_handle()
+
+    @function.Defun(dtypes.string)
+    def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, self._input_iterator.output_types,
+          self._input_iterator.output_shapes,
+          self._input_iterator.output_classes)
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+
+    iterator_device = ged_ops.experimental_iterator_get_device(
+        self._input_iterator._iterator_resource)
+
+    with ops.device(device):
+      self._buffering_resource = function_buffering_resource(
+          f=_prefetch_fn,
+          target_device=iterator_device,
+          string_arg=input_iterator_handle,
+          buffer_size=buffer_size,
+          shared_name=shared_name,
+          output_types=nest.flatten(
+              sparse.as_dense_types(self._input_dataset.output_types,
+                                    self._input_dataset.output_classes)))
+
+    if not self._one_shot:
+      reset_op = function_buffering_resource_reset(self._buffering_resource)
+      with ops.control_dependencies([reset_op]):
+        self._initializer = self._input_iterator.make_initializer(
+            self._input_dataset)
+
+  def get_next(self, name=None):
+    """See `tf.data.Iterator.get_next`."""
+    self._get_next_call_count += 1
+    if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
+      warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
+
+    flat_ret = ged_ops.experimental_function_buffering_resource_get_next(
+        self._buffering_resource,
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        name=name)
+
+    ret = sparse.deserialize_sparse_tensors(
+        nest.pack_sequence_as(self.output_types, flat_ret),
+        self.output_types, self.output_shapes, self.output_classes)
+
+    for tensor, shape in zip(
+        nest.flatten(ret), nest.flatten(self.output_shapes)):
+      if isinstance(tensor, ops.Tensor):
+        tensor.set_shape(shape)
+
+    return ret
+
+  @property
+  def initializer(self):
+    if self._one_shot:
+      raise NotImplementedError("Can't initialize a one_shot_iterator")
+    return self._initializer
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
+  """A replacement for `tf.data.Iterator` that prefetches to another device.
+
+  Args:
+    input_dataset: The input dataset
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    device: A fully specified device string where we want to prefetch to
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               device,
+               buffer_size):
+    with ops.device("/device:CPU:0"):
+      super(_PrefetchToDeviceEagerIterator, self).__init__(input_dataset)
+      input_iterator_handle = gen_dataset_ops.iterator_to_string_handle(
+          self._resource)
+
+    self._device = device
+
+    @function.Defun(dtypes.string)
+    def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, self.output_types, self.output_shapes, self.output_classes)
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+
+    _prefetch_fn.add_to_graph(None)
+
+    with ops.device(device):
+      self._buffering_resource = function_buffering_resource(
+          f=_prefetch_fn,
+          output_types=self._flat_output_types,
+          target_device=ged_ops.experimental_iterator_get_device(
+              self._resource),
+          string_arg=input_iterator_handle,
+          buffer_size=buffer_size,
+          shared_name=iterator_ops._generate_shared_name(
+              "function_buffer_resource"))
+
+  def _next_internal(self):
+    """Returns a nested structure of `tf.Tensor`s containing the next element.
+    """
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
+      with ops.device(self._device):
+        ret = ged_ops.experimental_function_buffering_resource_get_next(
+            function_buffer_resource=self._buffering_resource,
+            output_types=self._flat_output_types)
+      return sparse.deserialize_sparse_tensors(
+          nest.pack_sequence_as(self._output_types, ret), self._output_types,
+          self._output_shapes, self._output_classes)
+# pylint: enable=protected-access
+
+
+class _PrefetchToDeviceDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` whose iterator prefetches elements to another device."""
+
+  def __init__(self, input_dataset, device, buffer_size):
+    super(_PrefetchToDeviceDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._device = device
+    self._buffer_size = buffer_size if buffer_size is not None else 1
+
+  # The static analysis cannot tell that the eager iterator's superclass has
+  # a `next()` method.
+  # pylint: disable=non-iterator-returned
+  def __iter__(self):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
+
+    The returned iterator implements the Python iterator protocol and therefore
+    can only be used in eager mode.
+
+    Returns:
+      An `Iterator` over the elements of this dataset.
+
+    Raises:
+      RuntimeError: If eager execution is enabled.
+    """
+    if context.executing_eagerly():
+      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
+                                            self._buffer_size)
+    else:
+      raise RuntimeError("dataset.__iter__() is only supported when eager "
+                         "execution is enabled.")
+  # pylint: enable=non-iterator-returned
+
+  def make_one_shot_iterator(self):
+    if context.executing_eagerly():
+      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
+                                            self._buffer_size)
+    else:
+      return _PrefetchToDeviceIterator(self._input_dataset, one_shot=True,
+                                       device=self._device,
+                                       buffer_size=self._buffer_size)
+
+  def make_initializable_iterator(self, shared_name=None):
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=False,
+        device=self._device,
+        buffer_size=self._buffer_size,
+        shared_name=shared_name)
+
+  def _as_variant_tensor(self):
+    # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
+    # transformation methods is called.
+    # TODO(mrry): Investigate support for chaining further transformations after
+    # the prefetch, including GPU support.
+    raise NotImplementedError("`prefetch_to_device()` must be the last "
+                              "transformation in a dataset pipeline.")
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+
+@tf_export("data.experimental.prefetch_to_device")
+def prefetch_to_device(device, buffer_size=None):
+  """A transformation that prefetches dataset values to the given `device`.
+
+  NOTE: Although the transformation creates a `tf.data.Dataset`, the
+  transformation must be the final `Dataset` in the input pipeline.
+
+  Args:
+    device: A string. The name of a device to which elements will be prefetched.
+    buffer_size: (Optional.) The number of elements to buffer on `device`.
+      Defaults to an automatically chosen value.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+  def _apply_fn(dataset):
+    return _PrefetchToDeviceDataset(dataset, device, buffer_size)
+
+  return _apply_fn
+
+
+@tf_export("data.experimental.copy_to_device")
+def copy_to_device(target_device, source_device="/cpu:0"):
+  """A transformation that copies dataset elements to the given `target_device`.
+
+  Args:
+    target_device: The name of a device to which elements will be copied.
+    source_device: The original device on which `input_dataset` will be placed.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    return _CopyToDeviceDataset(
+        dataset, target_device=target_device, source_device=source_device)
+
+  return _apply_fn
+
+
+# TODO(rohanj): Use the _input_hostmem attr on the RemoteCall ops to indicate
+# all inputs to the Op are in host memory, thereby avoiding some unnecessary
+# Sends and Recvs.
+class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that copies elements to another device."""
+
+  def __init__(self, input_dataset, target_device, source_device="/cpu:0"):
+    """Constructs a _CopyToDeviceDataset.
+
+    Args:
+      input_dataset: `Dataset` to be copied
+      target_device: The name of the device to which elements would be copied.
+      source_device: Device where input_dataset would be placed.
+    """
+    super(_CopyToDeviceDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._target_device = target_device
+    spec = framework_device.DeviceSpec().from_string(self._target_device)
+    self._is_gpu_target = (spec.device_type == "GPU")
+    self._source_device_string = source_device
+    self._source_device = ops.convert_to_tensor(source_device)
+
+    self._flat_output_shapes = nest.flatten(
+        sparse.as_dense_shapes(self._input_dataset.output_shapes,
+                               self._input_dataset.output_classes))
+    self._flat_output_types = nest.flatten(
+        sparse.as_dense_types(self._input_dataset.output_types,
+                              self._input_dataset.output_classes))
+
+    @function.Defun()
+    def _init_func():
+      """Creates an iterator for the input dataset.
+
+      Returns:
+        A `string` tensor that encapsulates the iterator created.
+      """
+      # pylint: disable=protected-access
+      ds_variant = self._input_dataset._as_variant_tensor()
+      resource = gen_dataset_ops.anonymous_iterator(
+          output_types=self._flat_output_types,
+          output_shapes=self._flat_output_shapes)
+      with ops.control_dependencies(
+          [gen_dataset_ops.make_iterator(ds_variant, resource)]):
+        return gen_dataset_ops.iterator_to_string_handle(resource)
+
+    @function.Defun()
+    def _remote_init_func():
+      return functional_ops.remote_call(
+          target=self._source_device,
+          args=_init_func.captured_inputs,
+          Tout=[dtypes.string],
+          f=_init_func)
+
+    self._init_func = _remote_init_func
+    self._init_captured_args = _remote_init_func.captured_inputs
+
+    @function.Defun(dtypes.string)
+    def _next_func(string_handle):
+      """Calls get_next for created iterator.
+
+      Args:
+        string_handle: An iterator string handle created by _init_func
+      Returns:
+        The elements generated from `input_dataset`
+      """
+      with ops.device(self._source_device_string):
+        iterator = iterator_ops.Iterator.from_string_handle(
+            string_handle, self.output_types, self.output_shapes,
+            self.output_classes)
+      ret = iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+
+    @function.Defun(dtypes.string)
+    def _remote_next_func(string_handle):
+      return functional_ops.remote_call(
+          target=self._source_device,
+          args=[string_handle] + _next_func.captured_inputs,
+          Tout=self._flat_output_types,
+          f=_next_func)
+
+    self._next_func = _remote_next_func
+    self._next_captured_args = _remote_next_func.captured_inputs
+
+    @function.Defun(dtypes.string)
+    def _finalize_func(string_handle):
+      """Destroys the iterator resource created.
+
+      Args:
+        string_handle: An iterator string handle created by _init_func
+      Returns:
+        Tensor constant 0
+      """
+      iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
+          string_handle,
+          output_types=self._flat_output_types,
+          output_shapes=self._flat_output_shapes)
+      with ops.control_dependencies([
+          resource_variable_ops.destroy_resource_op(
+              iterator_resource, ignore_lookup_error=True)]):
+        return array_ops.constant(0, dtypes.int64)
+
+    @function.Defun(dtypes.string)
+    def _remote_finalize_func(string_handle):
+      return functional_ops.remote_call(
+          target=self._source_device,
+          args=[string_handle] + _finalize_func.captured_inputs,
+          Tout=[dtypes.int64],
+          f=_finalize_func)
+
+    self._finalize_func = _remote_finalize_func
+    self._finalize_captured_args = _remote_finalize_func.captured_inputs
+
+    g = ops.get_default_graph()
+    _remote_init_func.add_to_graph(g)
+    _remote_next_func.add_to_graph(g)
+    _remote_finalize_func.add_to_graph(g)
+    # pylint: enable=protected-scope
+
+  # The one_shot_iterator implementation needs a 0 arg _make_dataset function
+  # that thereby captures all the inputs required to create the dataset. Since
+  # there are strings that are inputs to the GeneratorDataset which can't be
+  # placed on a GPU, this fails for the GPU case. Therefore, disabling it for
+  # GPU
+  def make_one_shot_iterator(self):
+    if self._is_gpu_target:
+      raise ValueError("Cannot create a one shot iterator when using "
+                       "`tf.data.experimental.copy_to_device()` on GPU. Please "
+                       "use `Dataset.make_initializable_iterator()` instead.")
+    else:
+      return super(_CopyToDeviceDataset, self).make_one_shot_iterator()
+
+  def _as_variant_tensor(self):
+    with ops.device(self._target_device):
+      return gen_dataset_ops.generator_dataset(
+          self._init_captured_args,
+          self._next_captured_args,
+          self._finalize_captured_args,
+          init_func=self._init_func,
+          next_func=self._next_func,
+          finalize_func=self._finalize_func,
+          output_types=self._flat_output_types,
+          output_shapes=self._flat_output_shapes)
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
diff --git a/tensorflow/python/data/experimental/ops/random_ops.py b/tensorflow/python/data/experimental/ops/random_ops.py
new file mode 100644
index 0000000000..e3a2aeab31
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/random_ops.py
@@ -0,0 +1,54 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Datasets for random number generators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import random_seed
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.RandomDataset")
+class RandomDataset(dataset_ops.DatasetSource):
+  """A `Dataset` of pseudorandom values."""
+
+  def __init__(self, seed=None):
+    """A `Dataset` of pseudorandom values."""
+    super(RandomDataset, self).__init__()
+    self._seed, self._seed2 = random_seed.get_seed(seed)
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.random_dataset(
+        seed=self._seed,
+        seed2=self._seed2,
+        **dataset_ops.flat_structure(self))
+
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.scalar()
+
+  @property
+  def output_types(self):
+    return dtypes.int64
diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py
new file mode 100644
index 0000000000..3b2d094514
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/readers.py
@@ -0,0 +1,904 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for reader Datasets."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import csv
+
+import numpy as np
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import interleave_ops
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops import parsing_ops
+from tensorflow.python.data.experimental.ops import shuffle_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers as core_readers
+from tensorflow.python.data.util import convert
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.util.tf_export import tf_export
+
+_ACCEPTABLE_CSV_TYPES = (dtypes.float32, dtypes.float64, dtypes.int32,
+                         dtypes.int64, dtypes.string)
+
+
+def _is_valid_int32(str_val):
+  try:
+    # Checks equality to prevent int32 overflow
+    return dtypes.int32.as_numpy_dtype(str_val) == dtypes.int64.as_numpy_dtype(
+        str_val)
+  except (ValueError, OverflowError):
+    return False
+
+
+def _is_valid_int64(str_val):
+  try:
+    dtypes.int64.as_numpy_dtype(str_val)
+    return True
+  except (ValueError, OverflowError):
+    return False
+
+
+def _is_valid_float(str_val, float_dtype):
+  try:
+    return float_dtype.as_numpy_dtype(str_val) < np.inf
+  except ValueError:
+    return False
+
+
+def _infer_type(str_val, na_value, prev_type):
+  """Given a string, infers its tensor type.
+
+  Infers the type of a value by picking the least 'permissive' type possible,
+  while still allowing the previous type inference for this column to be valid.
+
+  Args:
+    str_val: String value to infer the type of.
+    na_value: Additional string to recognize as a NA/NaN CSV value.
+    prev_type: Type previously inferred based on values of this column that
+      we've seen up till now.
+  Returns:
+    Inferred dtype.
+  """
+  if str_val in ("", na_value):
+    # If the field is null, it gives no extra information about its type
+    return prev_type
+
+  type_list = [
+      dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64, dtypes.string
+  ]  # list of types to try, ordered from least permissive to most
+
+  type_functions = [
+      _is_valid_int32,
+      _is_valid_int64,
+      lambda str_val: _is_valid_float(str_val, dtypes.float32),
+      lambda str_val: _is_valid_float(str_val, dtypes.float64),
+      lambda str_val: True,
+  ]  # Corresponding list of validation functions
+
+  for i in range(len(type_list)):
+    validation_fn = type_functions[i]
+    if validation_fn(str_val) and (prev_type is None or
+                                   prev_type in type_list[:i + 1]):
+      return type_list[i]
+
+
+def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header):
+  """Generator that yields rows of CSV file(s) in order."""
+  for fn in filenames:
+    with file_io.FileIO(fn, "r") as f:
+      rdr = csv.reader(
+          f,
+          delimiter=field_delim,
+          quoting=csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE)
+      if header:
+        next(rdr)  # Skip header lines
+
+      for csv_row in rdr:
+        if len(csv_row) != num_cols:
+          raise ValueError(
+              "Problem inferring types: CSV row has different number of fields "
+              "than expected.")
+        yield csv_row
+
+
+def _infer_column_defaults(filenames, num_cols, field_delim, use_quote_delim,
+                           na_value, header, num_rows_for_inference,
+                           select_columns):
+  """Infers column types from the first N valid CSV records of files."""
+  if select_columns is None:
+    select_columns = range(num_cols)
+  inferred_types = [None] * len(select_columns)
+
+  for i, csv_row in enumerate(
+      _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header)):
+    if num_rows_for_inference is not None and i >= num_rows_for_inference:
+      break
+
+    for j, col_index in enumerate(select_columns):
+      inferred_types[j] = _infer_type(csv_row[col_index], na_value,
+                                      inferred_types[j])
+
+  # Replace None's with a default type
+  inferred_types = [t or dtypes.string for t in inferred_types]
+  # Default to 0 or '' for null values
+  return [
+      constant_op.constant([0 if t is not dtypes.string else ""], dtype=t)
+      for t in inferred_types
+  ]
+
+
+def _infer_column_names(filenames, field_delim, use_quote_delim):
+  """Infers column names from first rows of files."""
+  csv_kwargs = {
+      "delimiter": field_delim,
+      "quoting": csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE
+  }
+  with file_io.FileIO(filenames[0], "r") as f:
+    try:
+      column_names = next(csv.reader(f, **csv_kwargs))
+    except StopIteration:
+      raise ValueError(("Received StopIteration when reading the header line "
+                        "of %s.  Empty file?") % filenames[0])
+
+  for name in filenames[1:]:
+    with file_io.FileIO(name, "r") as f:
+      try:
+        if next(csv.reader(f, **csv_kwargs)) != column_names:
+          raise ValueError(
+              "Files have different column names in the header row.")
+      except StopIteration:
+        raise ValueError(("Received StopIteration when reading the header line "
+                          "of %s.  Empty file?") % filenames[0])
+  return column_names
+
+
+def _get_sorted_col_indices(select_columns, column_names):
+  """Transforms select_columns argument into sorted column indices."""
+  names_to_indices = {n: i for i, n in enumerate(column_names)}
+  num_cols = len(column_names)
+  for i, v in enumerate(select_columns):
+    if isinstance(v, int):
+      if v < 0 or v >= num_cols:
+        raise ValueError(
+            "Column index %d specified in select_columns out of valid range." %
+            v)
+      continue
+    if v not in names_to_indices:
+      raise ValueError(
+          "Value '%s' specified in select_columns not a valid column index or "
+          "name." % v)
+    select_columns[i] = names_to_indices[v]
+
+  # Sort and ensure there are no duplicates
+  result = sorted(set(select_columns))
+  if len(result) != len(select_columns):
+    raise ValueError("select_columns contains duplicate columns")
+  return result
+
+
+def _maybe_shuffle_and_repeat(
+    dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed):
+  """Optionally shuffle and repeat dataset, as requested."""
+  if num_epochs != 1 and shuffle:
+    # Use shuffle_and_repeat for perf
+    return dataset.apply(
+        shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
+                                       shuffle_seed))
+  elif shuffle:
+    return dataset.shuffle(shuffle_buffer_size, shuffle_seed)
+  elif num_epochs != 1:
+    return dataset.repeat(num_epochs)
+  return dataset
+
+
+def make_tf_record_dataset(file_pattern,
+                           batch_size,
+                           parser_fn=None,
+                           num_epochs=None,
+                           shuffle=True,
+                           shuffle_buffer_size=None,
+                           shuffle_seed=None,
+                           prefetch_buffer_size=optimization.AUTOTUNE,
+                           num_parallel_reads=None,
+                           num_parallel_parser_calls=None,
+                           drop_final_batch=False):
+  """Reads and optionally parses TFRecord files into a dataset.
+
+  Provides common functionality such as batching, optional parsing, shuffling,
+  and performant defaults.
+
+  Args:
+    file_pattern: List of files or patterns of TFRecord file paths.
+      See `tf.gfile.Glob` for pattern rules.
+    batch_size: An int representing the number of records to combine
+      in a single batch.
+    parser_fn: (Optional.) A function accepting string input to parse
+      and process the record contents. This function must map records
+      to components of a fixed shape, so they may be batched. By
+      default, uses the record contents unmodified.
+    num_epochs: (Optional.) An int specifying the number of times this
+      dataset is repeated.  If None (the default), cycles through the
+      dataset forever.
+    shuffle: (Optional.) A bool that indicates whether the input
+      should be shuffled. Defaults to `True`.
+    shuffle_buffer_size: (Optional.) Buffer size to use for
+      shuffling. A large buffer size ensures better shuffling, but
+      increases memory usage and startup time.
+    shuffle_seed: (Optional.) Randomization seed to use for shuffling.
+    prefetch_buffer_size: (Optional.) An int specifying the number of
+      feature batches to prefetch for performance improvement.
+      Defaults to auto-tune. Set to 0 to disable prefetching.
+    num_parallel_reads: (Optional.) Number of threads used to read
+      records from files. By default or if set to a value >1, the
+      results will be interleaved.
+    num_parallel_parser_calls: (Optional.) Number of parallel
+      records to parse in parallel. Defaults to an automatic selection.
+    drop_final_batch: (Optional.) Whether the last batch should be
+      dropped in case its size is smaller than `batch_size`; the
+      default behavior is not to drop the smaller batch.
+
+  Returns:
+    A dataset, where each element matches the output of `parser_fn`
+    except it will have an additional leading `batch-size` dimension,
+    or a `batch_size`-length 1-D tensor of strings if `parser_fn` is
+    unspecified.
+  """
+  files = dataset_ops.Dataset.list_files(
+      file_pattern, shuffle=shuffle, seed=shuffle_seed)
+
+  if num_parallel_reads is None:
+    # Note: We considered auto-tuning this value, but there is a concern
+    # that this affects the mixing of records from different files, which
+    # could affect training convergence/accuracy, so we are defaulting to
+    # a constant for now.
+    num_parallel_reads = 24
+  dataset = core_readers.TFRecordDataset(
+      files, num_parallel_reads=num_parallel_reads)
+
+  if shuffle_buffer_size is None:
+    # TODO(josh11b): Auto-tune this value when not specified
+    shuffle_buffer_size = 10000
+  dataset = _maybe_shuffle_and_repeat(
+      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)
+
+  # NOTE(mrry): We set `drop_final_batch=True` when `num_epochs is None` to
+  # improve the shape inference, because it makes the batch dimension static.
+  # It is safe to do this because in that case we are repeating the input
+  # indefinitely, and all batches will be full-sized.
+  drop_final_batch = drop_final_batch or num_epochs is None
+
+  if parser_fn is None:
+    dataset = dataset.batch(batch_size, drop_remainder=drop_final_batch)
+  else:
+    # TODO(josh11b): if num_parallel_parser_calls is None, use some function
+    # of num cores instead of map_and_batch's default behavior of one batch.
+    dataset = dataset.apply(batching.map_and_batch(
+        parser_fn, batch_size, num_parallel_calls=num_parallel_parser_calls,
+        drop_remainder=drop_final_batch))
+
+  if prefetch_buffer_size == 0:
+    return dataset
+  else:
+    return dataset.prefetch(buffer_size=prefetch_buffer_size)
+
+
+@tf_export("data.experimental.make_csv_dataset")
+def make_csv_dataset(
+    file_pattern,
+    batch_size,
+    column_names=None,
+    column_defaults=None,
+    label_name=None,
+    select_columns=None,
+    field_delim=",",
+    use_quote_delim=True,
+    na_value="",
+    header=True,
+    num_epochs=None,
+    shuffle=True,
+    shuffle_buffer_size=10000,
+    shuffle_seed=None,
+    prefetch_buffer_size=optimization.AUTOTUNE,
+    num_parallel_reads=1,
+    sloppy=False,
+    num_rows_for_inference=100,
+    compression_type=None,
+):
+  """Reads CSV files into a dataset.
+
+  Reads CSV files into a dataset, where each element is a (features, labels)
+  tuple that corresponds to a batch of CSV rows. The features dictionary
+  maps feature column names to `Tensor`s containing the corresponding
+  feature data, and labels is a `Tensor` containing the batch's label data.
+
+  Args:
+    file_pattern: List of files or patterns of file paths containing CSV
+      records. See `tf.gfile.Glob` for pattern rules.
+    batch_size: An int representing the number of records to combine
+      in a single batch.
+    column_names: An optional list of strings that corresponds to the CSV
+      columns, in order. One per column of the input record. If this is not
+      provided, infers the column names from the first row of the records.
+      These names will be the keys of the features dict of each dataset element.
+    column_defaults: A optional list of default values for the CSV fields. One
+      item per selected column of the input record. Each item in the list is
+      either a valid CSV dtype (float32, float64, int32, int64, or string), or a
+      `Tensor` with one of the aforementioned types. The tensor can either be
+      a scalar default value (if the column is optional), or an empty tensor (if
+      the column is required). If a dtype is provided instead of a tensor, the
+      column is also treated as required. If this list is not provided, tries
+      to infer types based on reading the first num_rows_for_inference rows of
+      files specified, and assumes all columns are optional, defaulting to `0`
+      for numeric values and `""` for string values. If both this and
+      `select_columns` are specified, these must have the same lengths, and
+      `column_defaults` is assumed to be sorted in order of increasing column
+      index.
+    label_name: A optional string corresponding to the label column. If
+      provided, the data for this column is returned as a separate `Tensor` from
+      the features dictionary, so that the dataset complies with the format
+      expected by a `tf.Estimator.train` or `tf.Estimator.evaluate` input
+      function.
+    select_columns: An optional list of integer indices or string column
+      names, that specifies a subset of columns of CSV data to select. If
+      column names are provided, these must correspond to names provided in
+      `column_names` or inferred from the file header lines. When this argument
+      is specified, only a subset of CSV columns will be parsed and returned,
+      corresponding to the columns specified. Using this results in faster
+      parsing and lower memory usage. If both this and `column_defaults` are
+      specified, these must have the same lengths, and `column_defaults` is
+      assumed to be sorted in order of increasing column index.
+    field_delim: An optional `string`. Defaults to `","`. Char delimiter to
+      separate fields in a record.
+    use_quote_delim: An optional bool. Defaults to `True`. If false, treats
+      double quotation marks as regular characters inside of the string fields.
+    na_value: Additional string to recognize as NA/NaN.
+    header: A bool that indicates whether the first rows of provided CSV files
+      correspond to header lines with column names, and should not be included
+      in the data.
+    num_epochs: An int specifying the number of times this dataset is repeated.
+      If None, cycles through the dataset forever.
+    shuffle: A bool that indicates whether the input should be shuffled.
+    shuffle_buffer_size: Buffer size to use for shuffling. A large buffer size
+      ensures better shuffling, but increases memory usage and startup time.
+    shuffle_seed: Randomization seed to use for shuffling.
+    prefetch_buffer_size: An int specifying the number of feature
+      batches to prefetch for performance improvement. Recommended value is the
+      number of batches consumed per training step. Defaults to auto-tune.
+
+    num_parallel_reads: Number of threads used to read CSV records from files.
+      If >1, the results will be interleaved.
+    sloppy: If `True`, reading performance will be improved at
+      the cost of non-deterministic ordering. If `False`, the order of elements
+      produced is deterministic prior to shuffling (elements are still
+      randomized if `shuffle=True`. Note that if the seed is set, then order
+      of elements after shuffling is deterministic). Defaults to `False`.
+    num_rows_for_inference: Number of rows of a file to use for type inference
+      if record_defaults is not provided. If None, reads all the rows of all
+      the files. Defaults to 100.
+    compression_type: (Optional.) A `tf.string` scalar evaluating to one of
+      `""` (no compression), `"ZLIB"`, or `"GZIP"`. Defaults to no compression.
+
+  Returns:
+    A dataset, where each element is a (features, labels) tuple that corresponds
+    to a batch of `batch_size` CSV rows. The features dictionary maps feature
+    column names to `Tensor`s containing the corresponding column data, and
+    labels is a `Tensor` containing the column data for the label column
+    specified by `label_name`.
+
+  Raises:
+    ValueError: If any of the arguments is malformed.
+  """
+  # Create dataset of all matching filenames
+  filenames = _get_file_names(file_pattern, False)
+  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
+  if shuffle:
+    dataset = dataset.shuffle(len(filenames), shuffle_seed)
+
+  # Clean arguments; figure out column names and defaults
+
+  if column_names is None:
+    if not header:
+      raise ValueError("Cannot infer column names without a header line.")
+    # If column names are not provided, infer from the header lines
+    column_names = _infer_column_names(filenames, field_delim, use_quote_delim)
+  if len(column_names) != len(set(column_names)):
+    raise ValueError("Cannot have duplicate column names.")
+
+  if select_columns is not None:
+    select_columns = _get_sorted_col_indices(select_columns, column_names)
+
+  if column_defaults is not None:
+    column_defaults = [
+        constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
+        for x in column_defaults
+    ]
+  else:
+    # If column defaults are not provided, infer from records at graph
+    # construction time
+    column_defaults = _infer_column_defaults(
+        filenames, len(column_names), field_delim, use_quote_delim, na_value,
+        header, num_rows_for_inference, select_columns)
+
+  if select_columns is not None and len(column_defaults) != len(select_columns):
+    raise ValueError(
+        "If specified, column_defaults and select_columns must have same "
+        "length."
+    )
+  if select_columns is not None and len(column_names) > len(select_columns):
+    # Pick the relevant subset of column names
+    column_names = [column_names[i] for i in select_columns]
+
+  if label_name is not None and label_name not in column_names:
+    raise ValueError("`label_name` provided must be one of the columns.")
+
+  def filename_to_dataset(filename):
+    return CsvDataset(
+        filename,
+        record_defaults=column_defaults,
+        field_delim=field_delim,
+        use_quote_delim=use_quote_delim,
+        na_value=na_value,
+        select_cols=select_columns,
+        header=header,
+        compression_type=compression_type,
+    )
+
+  def map_fn(*columns):
+    """Organizes columns into a features dictionary.
+
+    Args:
+      *columns: list of `Tensor`s corresponding to one csv record.
+    Returns:
+      An OrderedDict of feature names to values for that particular record. If
+      label_name is provided, extracts the label feature to be returned as the
+      second element of the tuple.
+    """
+    features = collections.OrderedDict(zip(column_names, columns))
+    if label_name is not None:
+      label = features.pop(label_name)
+      return features, label
+    return features
+
+  # Read files sequentially (if num_parallel_reads=1) or in parallel
+  dataset = dataset.apply(
+      interleave_ops.parallel_interleave(
+          filename_to_dataset, cycle_length=num_parallel_reads, sloppy=sloppy))
+
+  dataset = _maybe_shuffle_and_repeat(
+      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)
+
+  # Apply batch before map for perf, because map has high overhead relative
+  # to the size of the computation in each map.
+  # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to
+  # improve the shape inference, because it makes the batch dimension static.
+  # It is safe to do this because in that case we are repeating the input
+  # indefinitely, and all batches will be full-sized.
+  dataset = dataset.batch(batch_size=batch_size,
+                          drop_remainder=num_epochs is None)
+  dataset = dataset_ops.MapDataset(
+      dataset, map_fn, use_inter_op_parallelism=False)
+  dataset = dataset.prefetch(prefetch_buffer_size)
+
+  return dataset
+
+
+_DEFAULT_READER_BUFFER_SIZE_BYTES = 4 * 1024 * 1024  # 4 MB
+
+
+@tf_export("data.experimental.CsvDataset")
+class CsvDataset(dataset_ops.DatasetSource):
+  """A Dataset comprising lines from one or more CSV files."""
+
+  def __init__(self,
+               filenames,
+               record_defaults,
+               compression_type=None,
+               buffer_size=None,
+               header=False,
+               field_delim=",",
+               use_quote_delim=True,
+               na_value="",
+               select_cols=None):
+    """Creates a `CsvDataset` by reading and decoding CSV files.
+
+    The elements of this dataset correspond to records from the file(s).
+    RFC 4180 format is expected for CSV files
+    (https://tools.ietf.org/html/rfc4180)
+    Note that we allow leading and trailing spaces with int or float field.
+
+
+    For example, suppose we have a file 'my_file0.csv' with four CSV columns of
+    different data types:
+    ```
+    abcdefg,4.28E10,5.55E6,12
+    hijklmn,-5.3E14,,2
+    ```
+
+    We can construct a CsvDataset from it as follows:
+    ```python
+    dataset = tf.data.experimental.CsvDataset(
+        "my_file*.csv",
+        [tf.float32,  # Required field, use dtype or empty tensor
+         tf.constant([0.0], dtype=tf.float32),  # Optional field, default to 0.0
+         tf.int32,  # Required field, use dtype or empty tensor
+         ],
+        select_cols=[1,2,3]  # Only parse last three columns
+    )
+    ```
+
+    The expected output of its iterations is:
+    ```python
+    next_element = dataset.make_one_shot_iterator().get_next()
+    with tf.Session() as sess:
+      while True:
+        try:
+          print(sess.run(next_element))
+        except tf.errors.OutOfRangeError:
+          break
+
+    >> (4.28e10, 5.55e6, 12)
+    >> (-5.3e14, 0.0, 2)
+    ```
+
+    Args:
+      filenames: A `tf.string` tensor containing one or more filenames.
+      record_defaults: A list of default values for the CSV fields. Each item in
+        the list is either a valid CSV `DType` (float32, float64, int32, int64,
+        string), or a `Tensor` object with one of the above types. One per
+        column of CSV data, with either a scalar `Tensor` default value for the
+        column if it is optional, or `DType` or empty `Tensor` if required. If
+        both this and `select_columns` are specified, these must have the same
+        lengths, and `column_defaults` is assumed to be sorted in order of
+        increasing column index.
+      compression_type: (Optional.) A `tf.string` scalar evaluating to one of
+        `""` (no compression), `"ZLIB"`, or `"GZIP"`. Defaults to no
+        compression.
+      buffer_size: (Optional.) A `tf.int64` scalar denoting the number of bytes
+        to buffer while reading files. Defaults to 4MB.
+      header: (Optional.) A `tf.bool` scalar indicating whether the CSV file(s)
+        have header line(s) that should be skipped when parsing. Defaults to
+        `False`.
+      field_delim: (Optional.) A `tf.string` scalar containing the delimiter
+        character that separates fields in a record. Defaults to `","`.
+      use_quote_delim: (Optional.) A `tf.bool` scalar. If `False`, treats
+        double quotation marks as regular characters inside of string fields
+        (ignoring RFC 4180, Section 2, Bullet 5). Defaults to `True`.
+      na_value: (Optional.) A `tf.string` scalar indicating a value that will
+        be treated as NA/NaN.
+      select_cols: (Optional.) A sorted list of column indices to select from
+        the input data. If specified, only this subset of columns will be
+        parsed. Defaults to parsing all columns.
+    """
+    super(CsvDataset, self).__init__()
+    self._filenames = ops.convert_to_tensor(
+        filenames, dtype=dtypes.string, name="filenames")
+    self._compression_type = convert.optional_param_to_tensor(
+        "compression_type",
+        compression_type,
+        argument_default="",
+        argument_dtype=dtypes.string)
+    record_defaults = [
+        constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
+        for x in record_defaults
+    ]
+    self._record_defaults = ops.convert_n_to_tensor(
+        record_defaults, name="record_defaults")
+    self._buffer_size = convert.optional_param_to_tensor(
+        "buffer_size", buffer_size, _DEFAULT_READER_BUFFER_SIZE_BYTES)
+    self._header = ops.convert_to_tensor(
+        header, dtype=dtypes.bool, name="header")
+    self._field_delim = ops.convert_to_tensor(
+        field_delim, dtype=dtypes.string, name="field_delim")
+    self._use_quote_delim = ops.convert_to_tensor(
+        use_quote_delim, dtype=dtypes.bool, name="use_quote_delim")
+    self._na_value = ops.convert_to_tensor(
+        na_value, dtype=dtypes.string, name="na_value")
+    self._select_cols = convert.optional_param_to_tensor(
+        "select_cols",
+        select_cols,
+        argument_default=[],
+        argument_dtype=dtypes.int64,
+    )
+    self._output_shapes = tuple(
+        tensor_shape.scalar() for _ in range(len(record_defaults)))
+    self._output_types = tuple(d.dtype for d in self._record_defaults)
+    self._output_classes = tuple(
+        ops.Tensor for _ in range(len(record_defaults)))
+
+  def _as_variant_tensor(self):
+    # Constructs graph node for the dataset op.
+    return gen_experimental_dataset_ops.experimental_csv_dataset(
+        filenames=self._filenames,
+        record_defaults=self._record_defaults,
+        buffer_size=self._buffer_size,
+        header=self._header,
+        output_shapes=self._output_shapes,
+        field_delim=self._field_delim,
+        use_quote_delim=self._use_quote_delim,
+        na_value=self._na_value,
+        select_cols=self._select_cols,
+        compression_type=self._compression_type,
+    )
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_classes(self):
+    return self._output_classes
+
+
+@tf_export("data.experimental.make_batched_features_dataset")
+def make_batched_features_dataset(file_pattern,
+                                  batch_size,
+                                  features,
+                                  reader=core_readers.TFRecordDataset,
+                                  label_key=None,
+                                  reader_args=None,
+                                  num_epochs=None,
+                                  shuffle=True,
+                                  shuffle_buffer_size=10000,
+                                  shuffle_seed=None,
+                                  prefetch_buffer_size=optimization.AUTOTUNE,
+                                  reader_num_threads=1,
+                                  parser_num_threads=2,
+                                  sloppy_ordering=False,
+                                  drop_final_batch=False):
+  """Returns a `Dataset` of feature dictionaries from `Example` protos.
+
+  If label_key argument is provided, returns a `Dataset` of tuple
+  comprising of feature dictionaries and label.
+
+  Example:
+
+  ```
+  serialized_examples = [
+    features {
+      feature { key: "age" value { int64_list { value: [ 0 ] } } }
+      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
+      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
+    },
+    features {
+      feature { key: "age" value { int64_list { value: [] } } }
+      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
+      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
+    }
+  ]
+  ```
+
+  We can use arguments:
+
+  ```
+  features: {
+    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
+    "gender": FixedLenFeature([], dtype=tf.string),
+    "kws": VarLenFeature(dtype=tf.string),
+  }
+  ```
+
+  And the expected output is:
+
+  ```python
+  {
+    "age": [[0], [-1]],
+    "gender": [["f"], ["f"]],
+    "kws": SparseTensor(
+      indices=[[0, 0], [0, 1], [1, 0]],
+      values=["code", "art", "sports"]
+      dense_shape=[2, 2]),
+  }
+  ```
+
+  Args:
+    file_pattern: List of files or patterns of file paths containing
+      `Example` records. See `tf.gfile.Glob` for pattern rules.
+    batch_size: An int representing the number of records to combine
+      in a single batch.
+    features: A `dict` mapping feature keys to `FixedLenFeature` or
+      `VarLenFeature` values. See `tf.parse_example`.
+    reader: A function or class that can be
+      called with a `filenames` tensor and (optional) `reader_args` and returns
+      a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`.
+    label_key: (Optional) A string corresponding to the key labels are stored in
+      `tf.Examples`. If provided, it must be one of the `features` key,
+      otherwise results in `ValueError`.
+    reader_args: Additional arguments to pass to the reader class.
+    num_epochs: Integer specifying the number of times to read through the
+      dataset. If None, cycles through the dataset forever. Defaults to `None`.
+    shuffle: A boolean, indicates whether the input should be shuffled. Defaults
+      to `True`.
+    shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity
+      ensures better shuffling but would increase memory usage and startup time.
+    shuffle_seed: Randomization seed to use for shuffling.
+    prefetch_buffer_size: Number of feature batches to prefetch in order to
+      improve performance. Recommended value is the number of batches consumed
+      per training step. Defaults to auto-tune.
+    reader_num_threads: Number of threads used to read `Example` records. If >1,
+      the results will be interleaved.
+    parser_num_threads: Number of threads to use for parsing `Example` tensors
+      into a dictionary of `Feature` tensors.
+    sloppy_ordering: If `True`, reading performance will be improved at
+      the cost of non-deterministic ordering. If `False`, the order of elements
+      produced is deterministic prior to shuffling (elements are still
+      randomized if `shuffle=True`. Note that if the seed is set, then order
+      of elements after shuffling is deterministic). Defaults to `False`.
+    drop_final_batch: If `True`, and the batch size does not evenly divide the
+      input dataset size, the final smaller batch will be dropped. Defaults to
+      `False`.
+
+  Returns:
+    A dataset of `dict` elements, (or a tuple of `dict` elements and label).
+    Each `dict` maps feature keys to `Tensor` or `SparseTensor` objects.
+
+  Raises:
+    ValueError: If `label_key` is not one of the `features` keys.
+  """
+  # Create dataset of all matching filenames
+  filenames = _get_file_names(file_pattern, False)
+  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
+  if shuffle:
+    dataset = dataset.shuffle(len(filenames), shuffle_seed)
+
+  # Read `Example` records from files as tensor objects.
+  if reader_args is None:
+    reader_args = []
+
+  # Read files sequentially (if reader_num_threads=1) or in parallel
+  dataset = dataset.apply(
+      interleave_ops.parallel_interleave(
+          lambda filename: reader(filename, *reader_args),
+          cycle_length=reader_num_threads,
+          sloppy=sloppy_ordering))
+
+  # Extract values if the `Example` tensors are stored as key-value tuples.
+  if dataset.output_types == (dtypes.string, dtypes.string):
+    dataset = dataset_ops.MapDataset(
+        dataset, lambda _, v: v, use_inter_op_parallelism=False)
+
+  # Apply dataset repeat and shuffle transformations.
+  dataset = _maybe_shuffle_and_repeat(
+      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)
+
+  # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to
+  # improve the shape inference, because it makes the batch dimension static.
+  # It is safe to do this because in that case we are repeating the input
+  # indefinitely, and all batches will be full-sized.
+  dataset = dataset.batch(
+      batch_size, drop_remainder=drop_final_batch or num_epochs is None)
+
+  # Parse `Example` tensors to a dictionary of `Feature` tensors.
+  dataset = dataset.apply(
+      parsing_ops.parse_example_dataset(
+          features, num_parallel_calls=parser_num_threads))
+
+  if label_key:
+    if label_key not in features:
+      raise ValueError(
+          "The `label_key` provided (%r) must be one of the `features` keys." %
+          label_key)
+    dataset = dataset.map(lambda x: (x, x.pop(label_key)))
+
+  dataset = dataset.prefetch(prefetch_buffer_size)
+  return dataset
+
+
+def _get_file_names(file_pattern, shuffle):
+  """Parse list of file names from pattern, optionally shuffled.
+
+  Args:
+    file_pattern: File glob pattern, or list of glob patterns.
+    shuffle: Whether to shuffle the order of file names.
+
+  Returns:
+    List of file names matching `file_pattern`.
+
+  Raises:
+    ValueError: If `file_pattern` is empty, or pattern matches no files.
+  """
+  if isinstance(file_pattern, list):
+    if not file_pattern:
+      raise ValueError("File pattern is empty.")
+    file_names = []
+    for entry in file_pattern:
+      file_names.extend(gfile.Glob(entry))
+  else:
+    file_names = list(gfile.Glob(file_pattern))
+
+  if not file_names:
+    raise ValueError("No files match %s." % file_pattern)
+
+  # Sort files so it will be deterministic for unit tests.
+  if not shuffle:
+    file_names = sorted(file_names)
+  return file_names
+
+
+@tf_export("data.experimental.SqlDataset")
+class SqlDataset(dataset_ops.DatasetSource):
+  """A `Dataset` consisting of the results from a SQL query."""
+
+  def __init__(self, driver_name, data_source_name, query, output_types):
+    """Creates a `SqlDataset`.
+
+    `SqlDataset` allows a user to read data from the result set of a SQL query.
+    For example:
+
+    ```python
+    dataset = tf.data.experimental.SqlDataset("sqlite", "/foo/bar.sqlite3",
+                                              "SELECT name, age FROM people",
+                                              (tf.string, tf.int32))
+    iterator = dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+    # Prints the rows of the result set of the above query.
+    while True:
+      try:
+        print(sess.run(next_element))
+      except tf.errors.OutOfRangeError:
+        break
+    ```
+
+    Args:
+      driver_name: A 0-D `tf.string` tensor containing the database type.
+        Currently, the only supported value is 'sqlite'.
+      data_source_name: A 0-D `tf.string` tensor containing a connection string
+        to connect to the database.
+      query: A 0-D `tf.string` tensor containing the SQL query to execute.
+      output_types: A tuple of `tf.DType` objects representing the types of the
+        columns returned by `query`.
+    """
+    super(SqlDataset, self).__init__()
+    self._driver_name = ops.convert_to_tensor(
+        driver_name, dtype=dtypes.string, name="driver_name")
+    self._data_source_name = ops.convert_to_tensor(
+        data_source_name, dtype=dtypes.string, name="data_source_name")
+    self._query = ops.convert_to_tensor(
+        query, dtype=dtypes.string, name="query")
+    self._output_types = output_types
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.sql_dataset(self._driver_name,
+                                       self._data_source_name, self._query,
+                                       nest.flatten(self.output_types),
+                                       nest.flatten(self.output_shapes))
+
+  @property
+  def output_classes(self):
+    return nest.map_structure(lambda _: ops.Tensor, self._output_types)
+
+  @property
+  def output_shapes(self):
+    return nest.map_structure(lambda _: tensor_shape.TensorShape([]),
+                              self._output_types)
+
+  @property
+  def output_types(self):
+    return self._output_types
diff --git a/tensorflow/python/data/experimental/ops/resampling.py b/tensorflow/python/data/experimental/ops/resampling.py
new file mode 100644
index 0000000000..3a3040ae9a
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/resampling.py
@@ -0,0 +1,296 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Resampling dataset transformations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import interleave_ops
+from tensorflow.python.data.experimental.ops import scan_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.rejection_resample")
+def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
+  """A transformation that resamples a dataset to achieve a target distribution.
+
+  **NOTE** Resampling is performed via rejection sampling; some fraction
+  of the input values will be dropped.
+
+  Args:
+    class_func: A function mapping an element of the input dataset to a scalar
+      `tf.int32` tensor. Values should be in `[0, num_classes)`.
+    target_dist: A floating point type tensor, shaped `[num_classes]`.
+    initial_dist: (Optional.)  A floating point type tensor, shaped
+      `[num_classes]`.  If not provided, the true class distribution is
+      estimated live in a streaming fashion.
+    seed: (Optional.) Python integer seed for the resampler.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist")
+    class_values_ds = dataset.map(class_func)
+
+    # Get initial distribution.
+    if initial_dist is not None:
+      initial_dist_t = ops.convert_to_tensor(initial_dist, name="initial_dist")
+      acceptance_dist, prob_of_original = (
+          _calculate_acceptance_probs_with_mixing(initial_dist_t,
+                                                  target_dist_t))
+      initial_dist_ds = dataset_ops.Dataset.from_tensors(
+          initial_dist_t).repeat()
+      acceptance_dist_ds = dataset_ops.Dataset.from_tensors(
+          acceptance_dist).repeat()
+      prob_of_original_ds = dataset_ops.Dataset.from_tensors(
+          prob_of_original).repeat()
+    else:
+      initial_dist_ds = _estimate_initial_dist_ds(
+          target_dist_t, class_values_ds)
+      acceptance_and_original_prob_ds = initial_dist_ds.map(
+          lambda initial: _calculate_acceptance_probs_with_mixing(  # pylint: disable=g-long-lambda
+              initial, target_dist_t))
+      acceptance_dist_ds = acceptance_and_original_prob_ds.map(
+          lambda accept_prob, _: accept_prob)
+      prob_of_original_ds = acceptance_and_original_prob_ds.map(
+          lambda _, prob_original: prob_original)
+    filtered_ds = _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds,
+                             class_values_ds, seed)
+    # Prefetch filtered dataset for speed.
+    filtered_ds = filtered_ds.prefetch(3)
+
+    prob_original_static = _get_prob_original_static(
+        initial_dist_t, target_dist_t) if initial_dist is not None else None
+    if prob_original_static == 1:
+      return dataset_ops.Dataset.zip((class_values_ds, dataset))
+    elif prob_original_static == 0:
+      return filtered_ds
+    else:
+      return interleave_ops.sample_from_datasets(
+          [dataset_ops.Dataset.zip((class_values_ds, dataset)), filtered_ds],
+          weights=prob_of_original_ds.map(lambda prob: [(prob, 1.0 - prob)]),
+          seed=seed)
+
+  return _apply_fn
+
+
+def _get_prob_original_static(initial_dist_t, target_dist_t):
+  """Returns the static probability of sampling from the original.
+
+  `tensor_util.constant_value(prob_of_original)` returns `None` if it encounters
+  an Op that it isn't defined for. We have some custom logic to avoid this.
+
+  Args:
+    initial_dist_t: A tensor of the initial distribution.
+    target_dist_t: A tensor of the target distribution.
+
+  Returns:
+    The probability of sampling from the original distribution as a constant,
+    if it is a constant, or `None`.
+  """
+  init_static = tensor_util.constant_value(initial_dist_t)
+  target_static = tensor_util.constant_value(target_dist_t)
+
+  if init_static is None or target_static is None:
+    return None
+  else:
+    return np.min(target_static / init_static)
+
+
+def _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, class_values_ds,
+               seed):
+  """Filters a dataset based on per-class acceptance probabilities.
+
+  Args:
+    dataset: The dataset to be filtered.
+    acceptance_dist_ds: A dataset of acceptance probabilities.
+    initial_dist_ds: A dataset of the initial probability distribution, given or
+        estimated.
+    class_values_ds: A dataset of the corresponding classes.
+    seed: (Optional.) Python integer seed for the resampler.
+
+  Returns:
+    A dataset of (class value, data) after filtering.
+  """
+  def maybe_warn_on_large_rejection(accept_dist, initial_dist):
+    proportion_rejected = math_ops.reduce_sum((1 - accept_dist) * initial_dist)
+    return control_flow_ops.cond(
+        math_ops.less(proportion_rejected, .5),
+        lambda: accept_dist,
+        lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
+            accept_dist, [proportion_rejected, initial_dist, accept_dist],
+            message="Proportion of examples rejected by sampler is high: ",
+            summarize=100,
+            first_n=10))
+
+  acceptance_dist_ds = (dataset_ops.Dataset.zip((acceptance_dist_ds,
+                                                 initial_dist_ds))
+                        .map(maybe_warn_on_large_rejection))
+
+  def _gather_and_copy(class_val, acceptance_prob, data):
+    return class_val, array_ops.gather(acceptance_prob, class_val), data
+
+  current_probabilities_and_class_and_data_ds = dataset_ops.Dataset.zip(
+      (class_values_ds, acceptance_dist_ds, dataset)).map(_gather_and_copy)
+  filtered_ds = (
+      current_probabilities_and_class_and_data_ds
+      .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p))
+  return filtered_ds.map(lambda class_value, _, data: (class_value, data))
+
+
+def _estimate_initial_dist_ds(
+    target_dist_t, class_values_ds, dist_estimation_batch_size=32,
+    smoothing_constant=10):
+  num_classes = (target_dist_t.shape[0].value or
+                 array_ops.shape(target_dist_t)[0])
+  initial_examples_per_class_seen = array_ops.fill(
+      [num_classes], np.int64(smoothing_constant))
+
+  def update_estimate_and_tile(num_examples_per_class_seen, c):
+    updated_examples_per_class_seen, dist = _estimate_data_distribution(
+        c, num_examples_per_class_seen)
+    tiled_dist = array_ops.tile(
+        array_ops.expand_dims(dist, 0), [dist_estimation_batch_size, 1])
+    return updated_examples_per_class_seen, tiled_dist
+
+  initial_dist_ds = (class_values_ds.batch(dist_estimation_batch_size)
+                     .apply(scan_ops.scan(initial_examples_per_class_seen,
+                                          update_estimate_and_tile))
+                     .apply(batching.unbatch()))
+
+  return initial_dist_ds
+
+
+def _get_target_to_initial_ratio(initial_probs, target_probs):
+  # Add tiny to initial_probs to avoid divide by zero.
+  denom = (initial_probs + np.finfo(initial_probs.dtype.as_numpy_dtype).tiny)
+  return target_probs / denom
+
+
+def _estimate_data_distribution(c, num_examples_per_class_seen):
+  """Estimate data distribution as labels are seen.
+
+  Args:
+    c: The class labels.  Type `int32`, shape `[batch_size]`.
+    num_examples_per_class_seen: Type `int64`, shape `[num_classes]`,
+      containing counts.
+
+  Returns:
+    num_examples_per_lass_seen: Updated counts.  Type `int64`, shape
+      `[num_classes]`.
+    dist: The updated distribution.  Type `float32`, shape `[num_classes]`.
+  """
+  num_classes = num_examples_per_class_seen.get_shape()[0].value
+  # Update the class-count based on what labels are seen in batch.
+  num_examples_per_class_seen = math_ops.add(
+      num_examples_per_class_seen, math_ops.reduce_sum(
+          array_ops.one_hot(c, num_classes, dtype=dtypes.int64), 0))
+  init_prob_estimate = math_ops.truediv(
+      num_examples_per_class_seen,
+      math_ops.reduce_sum(num_examples_per_class_seen))
+  dist = math_ops.cast(init_prob_estimate, dtypes.float32)
+  return num_examples_per_class_seen, dist
+
+
+def _calculate_acceptance_probs_with_mixing(initial_probs, target_probs):
+  """Calculates the acceptance probabilities and mixing ratio.
+
+  In this case, we assume that we can *either* sample from the original data
+  distribution with probability `m`, or sample from a reshaped distribution
+  that comes from rejection sampling on the original distribution. This
+  rejection sampling is done on a per-class basis, with `a_i` representing the
+  probability of accepting data from class `i`.
+
+  This method is based on solving the following analysis for the reshaped
+  distribution:
+
+  Let F be the probability of a rejection (on any example).
+  Let p_i be the proportion of examples in the data in class i (init_probs)
+  Let a_i is the rate the rejection sampler should *accept* class i
+  Let t_i is the target proportion in the minibatches for class i (target_probs)
+
+  ```
+  F = sum_i(p_i * (1-a_i))
+    = 1 - sum_i(p_i * a_i)     using sum_i(p_i) = 1
+  ```
+
+  An example with class `i` will be accepted if `k` rejections occur, then an
+  example with class `i` is seen by the rejector, and it is accepted. This can
+  be written as follows:
+
+  ```
+  t_i = sum_k=0^inf(F^k * p_i * a_i)
+      = p_i * a_j / (1 - F)    using geometric series identity, since 0 <= F < 1
+      = p_i * a_i / sum_j(p_j * a_j)        using F from above
+  ```
+
+  Note that the following constraints hold:
+  ```
+  0 <= p_i <= 1, sum_i(p_i) = 1
+  0 <= a_i <= 1
+  0 <= t_i <= 1, sum_i(t_i) = 1
+  ```
+
+  A solution for a_i in terms of the other variables is the following:
+    ```a_i = (t_i / p_i) / max_i[t_i / p_i]```
+
+  If we try to minimize the amount of data rejected, we get the following:
+
+  M_max = max_i [ t_i / p_i ]
+  M_min = min_i [ t_i / p_i ]
+
+  The desired probability of accepting data if it comes from class `i`:
+
+  a_i = (t_i/p_i - m) / (M_max - m)
+
+  The desired probability of pulling a data element from the original dataset,
+  rather than the filtered one:
+
+  m = M_min
+
+  Args:
+    initial_probs: A Tensor of the initial probability distribution, given or
+      estimated.
+    target_probs: A Tensor of the corresponding classes.
+
+  Returns:
+    (A 1D Tensor with the per-class acceptance probabilities, the desired
+    probability of pull from the original distribution.)
+  """
+  ratio_l = _get_target_to_initial_ratio(initial_probs, target_probs)
+  max_ratio = math_ops.reduce_max(ratio_l)
+  min_ratio = math_ops.reduce_min(ratio_l)
+
+  # Target prob to sample from original distribution.
+  m = min_ratio
+
+  # TODO(joelshor): Simplify fraction, if possible.
+  a_i = (ratio_l - m) / (max_ratio - m)
+  return a_i, m
diff --git a/tensorflow/python/data/experimental/ops/scan_ops.py b/tensorflow/python/data/experimental/ops/scan_ops.py
new file mode 100644
index 0000000000..e05e7c5a18
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/scan_ops.py
@@ -0,0 +1,177 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Scan dataset transformation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+class _ScanDataset(dataset_ops.UnaryDataset):
+  """A dataset that scans a function across its input."""
+
+  def __init__(self, input_dataset, initial_state, scan_func):
+    """See `scan()` for details."""
+    super(_ScanDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+
+    with ops.name_scope("initial_state"):
+      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
+      # values to tensors.
+      self._initial_state = nest.pack_sequence_as(initial_state, [
+          sparse_tensor.SparseTensor.from_value(t)
+          if sparse_tensor.is_sparse(t) else ops.convert_to_tensor(
+              t, name="component_%d" % i)
+          for i, t in enumerate(nest.flatten(initial_state))
+      ])
+
+    # Compute initial values for the state classes, shapes and types based on
+    # the initial state. The shapes may be refined by running `tf_scan_func` one
+    # or more times below.
+    self._state_classes = sparse.get_classes(self._initial_state)
+    self._state_shapes = nest.pack_sequence_as(
+        self._initial_state,
+        [t.get_shape() for t in nest.flatten(self._initial_state)])
+    self._state_types = nest.pack_sequence_as(
+        self._initial_state,
+        [t.dtype for t in nest.flatten(self._initial_state)])
+
+    # Will be populated by calling `tf_scan_func`.
+    self._output_classes = None
+    self._output_shapes = None
+    self._output_types = None
+
+    # Iteratively rerun the scan function until reaching a fixed point on
+    # `self._state_shapes`.
+    need_to_rerun = True
+    while need_to_rerun:
+
+      wrapped_func = dataset_ops.StructuredFunctionWrapper(
+          scan_func,
+          "tf.data.experimental.scan()",
+          input_classes=(self._state_classes, input_dataset.output_classes),
+          input_shapes=(self._state_shapes, input_dataset.output_shapes),
+          input_types=(self._state_types, input_dataset.output_types),
+          add_to_graph=False)
+      if not (
+          isinstance(wrapped_func.output_types, collections.Sequence) and
+          len(wrapped_func.output_types) == 2):
+        raise TypeError("The scan function must return a pair comprising the "
+                        "new state and the output value.")
+
+      new_state_classes, self._output_classes = wrapped_func.output_classes
+
+      # Extract and validate class information from the returned values.
+      for new_state_class, state_class in zip(
+          nest.flatten(new_state_classes),
+          nest.flatten(self._state_classes)):
+        if not issubclass(new_state_class, state_class):
+          raise TypeError(
+              "The element classes for the new state must match the initial "
+              "state. Expected %s; got %s." %
+              (self._state_classes, new_state_classes))
+
+      # Extract and validate type information from the returned values.
+      new_state_types, self._output_types = wrapped_func.output_types
+      for new_state_type, state_type in zip(
+          nest.flatten(new_state_types), nest.flatten(self._state_types)):
+        if new_state_type != state_type:
+          raise TypeError(
+              "The element types for the new state must match the initial "
+              "state. Expected %s; got %s." %
+              (self._state_types, new_state_types))
+
+      # Extract shape information from the returned values.
+      new_state_shapes, self._output_shapes = wrapped_func.output_shapes
+
+      flat_state_shapes = nest.flatten(self._state_shapes)
+      flat_new_state_shapes = nest.flatten(new_state_shapes)
+      weakened_state_shapes = [
+          original.most_specific_compatible_shape(new)
+          for original, new in zip(flat_state_shapes, flat_new_state_shapes)
+      ]
+
+      need_to_rerun = False
+      for original_shape, weakened_shape in zip(flat_state_shapes,
+                                                weakened_state_shapes):
+        if original_shape.ndims is not None and (
+            weakened_shape.ndims is None or
+            original_shape.as_list() != weakened_shape.as_list()):
+          need_to_rerun = True
+          break
+
+      if need_to_rerun:
+        self._state_shapes = nest.pack_sequence_as(self._state_shapes,
+                                                   weakened_state_shapes)
+
+    self._scan_func = wrapped_func.function
+    self._scan_func.add_to_graph(ops.get_default_graph())
+
+  def _as_variant_tensor(self):
+    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
+    return gen_dataset_ops.scan_dataset(
+        input_t,
+        nest.flatten(sparse.serialize_sparse_tensors(self._initial_state)),
+        self._scan_func.captured_inputs,
+        f=self._scan_func,
+        **dataset_ops.flat_structure(self))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+
+@tf_export("data.experimental.scan")
+def scan(initial_state, scan_func):
+  """A transformation that scans a function across an input dataset.
+
+  This transformation is a stateful relative of `tf.data.Dataset.map`.
+  In addition to mapping `scan_func` across the elements of the input dataset,
+  `scan()` accumulates one or more state tensors, whose initial values are
+  `initial_state`.
+
+  Args:
+    initial_state: A nested structure of tensors, representing the initial state
+      of the accumulator.
+    scan_func: A function that maps `(old_state, input_element)` to
+      `(new_state, output_element). It must take two arguments and return a
+      pair of nested structures of tensors. The `new_state` must match the
+      structure of `initial_state`.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+  def _apply_fn(dataset):
+    return _ScanDataset(dataset, initial_state, scan_func)
+
+  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/shuffle_ops.py b/tensorflow/python/data/experimental/ops/shuffle_ops.py
new file mode 100644
index 0000000000..a4307212da
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/shuffle_ops.py
@@ -0,0 +1,102 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental shuffle ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import random_seed
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+class _ShuffleAndRepeatDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that fuses `shuffle` and `repeat`."""
+
+  def __init__(self, input_dataset, buffer_size, count=None, seed=None):
+    super(_ShuffleAndRepeatDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._buffer_size = ops.convert_to_tensor(
+        buffer_size, dtype=dtypes.int64, name="buffer_size")
+    if count is None:
+      self._count = constant_op.constant(-1, dtype=dtypes.int64, name="count")
+    else:
+      self._count = ops.convert_to_tensor(
+          count, dtype=dtypes.int64, name="count")
+    self._seed, self._seed2 = random_seed.get_seed(seed)
+
+  def _as_variant_tensor(self):
+    # pylint: disable=protected-access
+    input_resource = self._input_dataset._as_variant_tensor()
+    return gen_dataset_ops.shuffle_and_repeat_dataset(
+        input_resource,
+        buffer_size=self._buffer_size,
+        count=self._count,
+        seed=self._seed,
+        seed2=self._seed2,
+        **dataset_ops.flat_structure(self))
+    # pylint: enable=protected-access
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+@tf_export("data.experimental.shuffle_and_repeat")
+def shuffle_and_repeat(buffer_size, count=None, seed=None):
+  """Shuffles and repeats a Dataset returning a new permutation for each epoch.
+
+  `dataset.apply(tf.data.experimental.shuffle_and_repeat(buffer_size, count))`
+
+  is equivalent to
+
+  `dataset.shuffle(buffer_size, reshuffle_each_iteration=True).repeat(count)`
+
+  The difference is that the latter dataset is not serializable. So,
+  if you need to checkpoint an input pipeline with reshuffling you must use
+  this implementation.
+
+  Args:
+    buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
+      maximum number elements that will be buffered when prefetching.
+    count: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+      number of times the dataset should be repeated. The default behavior
+      (if `count` is `None` or `-1`) is for the dataset be repeated
+      indefinitely.
+    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+      random seed that will be used to create the distribution. See
+      `tf.set_random_seed` for behavior.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):  # pylint: disable=missing-docstring
+    return _ShuffleAndRepeatDataset(dataset, buffer_size, count, seed)
+
+  return _apply_fn
diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/python/data/experimental/ops/stats_ops.py
similarity index 92%
rename from tensorflow/contrib/data/python/ops/stats_ops.py
rename to tensorflow/python/data/experimental/ops/stats_ops.py
index bc47c5989d..c918d223e8 100644
--- a/tensorflow/contrib/data/python/ops/stats_ops.py
+++ b/tensorflow/python/data/experimental/ops/stats_ops.py
@@ -21,8 +21,10 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("data.experimental.StatsAggregator")
 class StatsAggregator(object):
   """A stateful resource that aggregates statistics from one or more iterators.
 
@@ -34,7 +36,7 @@ class StatsAggregator(object):
 
   ```python
   dataset = ...
-  dataset = dataset.apply(stats_ops.latency_stats("total_bytes"))
+  dataset = dataset.apply(tf.data.experimental.latency_stats("total_bytes"))
   ```
 
   To associate a `StatsAggregator` with a `tf.data.Dataset` object, use
@@ -46,7 +48,7 @@ class StatsAggregator(object):
 
   # Apply `set_stats_aggregator` to associate `dataset` with `stats_aggregator`.
   dataset = dataset.apply(
-      tf.contrib.data.set_stats_aggregator(stats_aggregator))
+      tf.data.experimental.set_stats_aggregator(stats_aggregator))
   iterator = dataset.make_one_shot_iterator()
   ```
 
@@ -111,11 +113,12 @@ class _SetStatsAggregatorDataset(dataset_ops.UnaryDataset):
     return self._input_dataset.output_classes
 
 
+@tf_export("data.experimental.set_stats_aggregator")
 def set_stats_aggregator(stats_aggregator):
   """Set the given `stats_aggregator` for aggregating the input dataset stats.
 
   Args:
-    stats_aggregator: A `tf.contrib.data.StatsAggregator` object.
+    stats_aggregator: A `tf.data.experimental.StatsAggregator` object.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -128,8 +131,8 @@ def set_stats_aggregator(stats_aggregator):
   return _apply_fn
 
 
-# TODO(b/38416882): Properly export in the `tf.contrib.data` API when stable
-# or make private / remove.
+# TODO(b/38416882): Properly export in the `tf.data.experimental` API when
+# stable or make private / remove.
 def bytes_produced_stats(tag):
   """Records the number of bytes produced by each element of the input dataset.
 
@@ -152,6 +155,7 @@ def bytes_produced_stats(tag):
   return _apply_fn
 
 
+@tf_export("data.experimental.latency_stats")
 def latency_stats(tag):
   """Records the latency of producing each element of the input dataset.
 
diff --git a/tensorflow/python/data/experimental/ops/threadpool.py b/tensorflow/python/data/experimental/ops/threadpool.py
new file mode 100644
index 0000000000..3ea017c6e8
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/threadpool.py
@@ -0,0 +1,104 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for controlling threading in `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
+from tensorflow.python.ops import resource_variable_ops
+
+_uid_counter = 0
+_uid_lock = threading.Lock()
+
+
+def _generate_shared_name(prefix):
+  with _uid_lock:
+    global _uid_counter
+    uid = _uid_counter
+    _uid_counter += 1
+  return "{}{}".format(prefix, uid)
+
+
+# TODO(b/73383364): Properly export in the `tf.data.experimental` API when
+# stable or make private / remove.
+class PrivateThreadPool(object):
+  """A stateful resource that represents a private thread pool."""
+
+  def __init__(self, num_threads, display_name=None,
+               max_intra_op_parallelism=1):
+    """Creates a `PrivateThreadPool` with the given number of threads."""
+    if context.executing_eagerly():
+      shared_name = _generate_shared_name("privatethreadpool")
+      self._resource = ged_ops.experimental_thread_pool_handle(
+          num_threads=num_threads,
+          max_intra_op_parallelism=max_intra_op_parallelism,
+          display_name=display_name,
+          shared_name=shared_name)
+      self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
+          handle=self._resource, handle_device=context.context().device_name)
+    else:
+      self._resource = ged_ops.experimental_thread_pool_handle(
+          num_threads=num_threads,
+          max_intra_op_parallelism=max_intra_op_parallelism,
+          display_name=display_name)
+
+
+class _ThreadPoolDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that acts as an identity, and sets a custom threadpool."""
+
+  def __init__(self, input_dataset, thread_pool):
+    super(_ThreadPoolDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._thread_pool = thread_pool
+
+  def _as_variant_tensor(self):
+    return ged_ops.experimental_thread_pool_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._thread_pool._resource,  # pylint: disable=protected-access
+        **dataset_ops.flat_structure(self))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+
+# TODO(b/73383364): Properly export in the `tf.data.experimental` API when
+# stable or make private / remove.
+def override_threadpool(dataset, thread_pool):
+  """Returns a new dataset that uses the given thread pool for its operations.
+
+  Args:
+    dataset: A `tf.data.Dataset` object.
+    thread_pool: A `PrivateThreadPool` object.
+
+  Returns:
+    A dataset containing the same values as `dataset`, but which uses
+    `thread_pool` to compute any of its parallel operations (such as
+    `tf.data.Dataset.map`).
+  """
+  return _ThreadPoolDataset(dataset, thread_pool)
diff --git a/tensorflow/python/data/experimental/ops/unique.py b/tensorflow/python/data/experimental/ops/unique.py
new file mode 100644
index 0000000000..2a7775c456
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/unique.py
@@ -0,0 +1,79 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unique element dataset transformations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import gen_experimental_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.unique")
+def unique():
+  """Creates a `Dataset` from another `Dataset`, discarding duplicates.
+
+  Use this transformation to produce a dataset that contains one instance of
+  each unique element in the input. For example:
+
+  ```python
+  dataset = tf.data.Dataset.from_tensor_slices([1, 37, 2, 37, 2, 1])
+
+  # Using `unique()` will drop the duplicate elements.
+  dataset = dataset.apply(tf.data.experimental.unique())  # ==> { 1, 37, 2 }
+  ```
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    return _UniqueDataset(dataset)
+
+  return _apply_fn
+
+
+class _UniqueDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` contains the unique elements from its input."""
+
+  def __init__(self, input_dataset):
+    """See `unique()` for details."""
+    super(_UniqueDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    if input_dataset.output_types not in (dtypes.int32, dtypes.int64,
+                                          dtypes.string):
+      raise TypeError(
+          "`tf.data.experimental.unique()` only supports inputs with a single "
+          "`tf.int32`, `tf.int64`, or `tf.string` component.")
+
+  def _as_variant_tensor(self):
+    return gen_experimental_dataset_ops.experimental_unique_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        **dataset_ops.flat_structure(self))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
diff --git a/tensorflow/python/data/experimental/ops/writers.py b/tensorflow/python/data/experimental/ops/writers.py
new file mode 100644
index 0000000000..994447cb4d
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/writers.py
@@ -0,0 +1,60 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for tf.data writers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import convert
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.TFRecordWriter")
+class TFRecordWriter(object):
+  """Writes data to a TFRecord file."""
+
+  def __init__(self, filename, compression_type=None):
+    self._filename = ops.convert_to_tensor(
+        filename, dtypes.string, name="filename")
+    self._compression_type = convert.optional_param_to_tensor(
+        "compression_type",
+        compression_type,
+        argument_default="",
+        argument_dtype=dtypes.string)
+
+  def write(self, dataset):
+    """Returns a `tf.Operation` to write a dataset to a file.
+
+    Args:
+      dataset: a `tf.data.Dataset` whose elements are to be written to a file
+
+    Returns:
+      A `tf.Operation` that, when run, writes contents of `dataset` to a file.
+    """
+    if not isinstance(dataset, dataset_ops.Dataset):
+      raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
+    if (dataset.output_types != dtypes.string or
+        dataset.output_shapes != tensor_shape.scalar()):
+      raise TypeError(
+          "`dataset` must produce scalar `DT_STRING` tensors whereas it "
+          "produces shape {0} and types {1}".format(dataset.output_shapes,
+                                                    dataset.output_types))
+    return gen_dataset_ops.dataset_to_tf_record(
+        dataset._as_variant_tensor(), self._filename, self._compression_type)  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 6bba72a8e9..3b9d3a639d 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -889,8 +889,8 @@ class Dataset(object):
       will be padded out to the maximum length of all elements in that
       dimension.
 
-    See also `tf.contrib.data.dense_to_sparse_batch`, which combines elements
-    that may have different shapes into a `tf.SparseTensor`.
+    See also `tf.data.experimental.dense_to_sparse_batch`, which combines
+    elements that may have different shapes into a `tf.SparseTensor`.
 
     Args:
       batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
diff --git a/tensorflow/python/data/ops/optional_ops.py b/tensorflow/python/data/ops/optional_ops.py
index 3bbebd7878..aca989e03a 100644
--- a/tensorflow/python/data/ops/optional_ops.py
+++ b/tensorflow/python/data/ops/optional_ops.py
@@ -31,7 +31,7 @@ class Optional(object):
 
   An `Optional` can represent the result of an operation that may fail as a
   value, rather than raising an exception and halting execution. For example,
-  `tf.contrib.data.get_next_as_optional` returns an `Optional` that either
+  `tf.data.experimental.get_next_as_optional` returns an `Optional` that either
   contains the next value from a `tf.data.Iterator` if one exists, or a "none"
   value that indicates the end of the sequence has been reached.
   """
@@ -111,7 +111,7 @@ class Optional(object):
 
 
 class _OptionalImpl(Optional):
-  """Concrete implementation of `tf.contrib.data.Optional`.
+  """Concrete implementation of `tf.data.experimental.Optional`.
 
   NOTE(mrry): This implementation is kept private, to avoid defining
   `Optional.__init__()` in the public API.
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index b0f26631f9..d08da6704c 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -129,7 +129,7 @@ class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
 
   def __init__(self, input_dataset, map_func, cycle_length, block_length,
                sloppy, buffer_output_elements, prefetch_input_elements):
-    """See `tf.contrib.data.parallel_interleave()` for details."""
+    """See `tf.data.experimental.parallel_interleave()` for details."""
     super(ParallelInterleaveDataset, self).__init__(input_dataset, map_func,
                                                     cycle_length, block_length)
     self._sloppy = ops.convert_to_tensor(
@@ -158,7 +158,7 @@ class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
     # pylint: enable=protected-access
 
   def _transformation_name(self):
-    return "tf.contrib.data.parallel_interleave()"
+    return "tf.data.experimental.parallel_interleave()"
 
 
 @tf_export("data.TFRecordDataset")
diff --git a/tensorflow/python/debug/examples/debug_tflearn_iris.py b/tensorflow/python/debug/examples/debug_tflearn_iris.py
index 019f13c450..f9bb3148fb 100644
--- a/tensorflow/python/debug/examples/debug_tflearn_iris.py
+++ b/tensorflow/python/debug/examples/debug_tflearn_iris.py
@@ -94,13 +94,15 @@ def main(_):
         "sepal_length", "sepal_width", "petal_length", "petal_width", "label"]
     batch_size = 32
     def training_input_fn():
-      return tf.contrib.data.make_csv_dataset(
-          [training_data_path], batch_size,
-          column_names=column_names, label_name="label")
+      return tf.data.experimental.make_csv_dataset([training_data_path],
+                                                   batch_size,
+                                                   column_names=column_names,
+                                                   label_name="label")
     def test_input_fn():
-      return tf.contrib.data.make_csv_dataset(
-          [test_data_path], batch_size,
-          column_names=column_names, label_name="label")
+      return tf.data.experimental.make_csv_dataset([test_data_path],
+                                                   batch_size,
+                                                   column_names=column_names,
+                                                   label_name="label")
     feature_columns = [tf.feature_column.numeric_column(feature)
                        for feature in column_names[:-1]]
 
diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl
index 5ce5410e0b..533a138a39 100644
--- a/tensorflow/python/tools/api/generator/api_init_files.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files.bzl
@@ -8,6 +8,7 @@ TENSORFLOW_API_INIT_FILES = [
     "bitwise/__init__.py",
     "compat/__init__.py",
     "data/__init__.py",
+    "data/experimental/__init__.py",
     "debugging/__init__.py",
     "distributions/__init__.py",
     "dtypes/__init__.py",
diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
index 587eb232f5..0747424eab 100644
--- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
@@ -8,6 +8,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "bitwise/__init__.py",
     "compat/__init__.py",
     "data/__init__.py",
+    "data/experimental/__init__.py",
     "debugging/__init__.py",
     "distributions/__init__.py",
     "dtypes/__init__.py",
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-checkpoint-input-pipeline-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-checkpoint-input-pipeline-hook.pbtxt
new file mode 100644
index 0000000000..03c16cda8b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-checkpoint-input-pipeline-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.data.experimental.CheckpointInputPipelineHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.iterator_ops.CheckpointInputPipelineHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'estimator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt
new file mode 100644
index 0000000000..3eeaa1b185
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.data.experimental.CsvDataset.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
new file mode 100644
index 0000000000..0c0405ee02
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -0,0 +1,127 @@
+path: "tensorflow.data.experimental.CsvDataset"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_types"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filenames\', \'record_defaults\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \',\', \'True\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'transformation_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch"
+    argspec: "args=[\'self\', \'batch_size\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "cache"
+    argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], "
+  }
+  member_method {
+    name: "concatenate"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "filter"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flat_map"
+    argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_generator"
+    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_sparse_tensor_slices"
+    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_tensor_slices"
+    argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_tensors"
+    argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interleave"
+    argspec: "args=[\'self\', \'map_func\', \'cycle_length\', \'block_length\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "list_files"
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "make_initializable_iterator"
+    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_one_shot_iterator"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "map"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "padded_batch"
+    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+  }
+  member_method {
+    name: "prefetch"
+    argspec: "args=[\'self\', \'buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "range"
+    argspec: "args=[], varargs=args, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "repeat"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "shuffle"
+    argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "skip"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "take"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "window"
+    argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "zip"
+    argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional.pbtxt
new file mode 100644
index 0000000000..b4c9459098
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional.pbtxt
@@ -0,0 +1,28 @@
+path: "tensorflow.data.experimental.Optional"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.optional_ops.Optional\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "value_structure"
+    mtype: "<class \'abc.abstractproperty\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "has_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "none_from_structure"
+    argspec: "args=[\'value_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt
new file mode 100644
index 0000000000..2991b12f64
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.data.experimental.RandomDataset.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
new file mode 100644
index 0000000000..bce0be4b17
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -0,0 +1,127 @@
+path: "tensorflow.data.experimental.RandomDataset"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_types"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'transformation_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch"
+    argspec: "args=[\'self\', \'batch_size\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "cache"
+    argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], "
+  }
+  member_method {
+    name: "concatenate"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "filter"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flat_map"
+    argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_generator"
+    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_sparse_tensor_slices"
+    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_tensor_slices"
+    argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_tensors"
+    argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interleave"
+    argspec: "args=[\'self\', \'map_func\', \'cycle_length\', \'block_length\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "list_files"
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "make_initializable_iterator"
+    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_one_shot_iterator"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "map"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "padded_batch"
+    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+  }
+  member_method {
+    name: "prefetch"
+    argspec: "args=[\'self\', \'buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "range"
+    argspec: "args=[], varargs=args, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "repeat"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "shuffle"
+    argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "skip"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "take"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "window"
+    argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "zip"
+    argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-reducer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-reducer.pbtxt
new file mode 100644
index 0000000000..6b477a8a72
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-reducer.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.data.experimental.Reducer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.grouping.Reducer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "finalize_func"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "init_func"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reduce_func"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'init_func\', \'reduce_func\', \'finalize_func\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt
new file mode 100644
index 0000000000..948e99ef86
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.data.experimental.SqlDataset.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
new file mode 100644
index 0000000000..8aeae92d96
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -0,0 +1,127 @@
+path: "tensorflow.data.experimental.SqlDataset"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_types"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'driver_name\', \'data_source_name\', \'query\', \'output_types\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'transformation_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch"
+    argspec: "args=[\'self\', \'batch_size\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "cache"
+    argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], "
+  }
+  member_method {
+    name: "concatenate"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "filter"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flat_map"
+    argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_generator"
+    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_sparse_tensor_slices"
+    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_tensor_slices"
+    argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_tensors"
+    argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interleave"
+    argspec: "args=[\'self\', \'map_func\', \'cycle_length\', \'block_length\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "list_files"
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "make_initializable_iterator"
+    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_one_shot_iterator"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "map"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "padded_batch"
+    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+  }
+  member_method {
+    name: "prefetch"
+    argspec: "args=[\'self\', \'buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "range"
+    argspec: "args=[], varargs=args, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "repeat"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "shuffle"
+    argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "skip"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "take"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "window"
+    argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "zip"
+    argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt
new file mode 100644
index 0000000000..0bcc8cf3e8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt
@@ -0,0 +1,13 @@
+path: "tensorflow.data.experimental.StatsAggregator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_ops.StatsAggregator\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_summary"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-t-f-record-writer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-t-f-record-writer.pbtxt
new file mode 100644
index 0000000000..6f9d18a701
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-t-f-record-writer.pbtxt
@@ -0,0 +1,13 @@
+path: "tensorflow.data.experimental.TFRecordWriter"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.writers.TFRecordWriter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filename\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
new file mode 100644
index 0000000000..b14585f8d7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -0,0 +1,139 @@
+path: "tensorflow.data.experimental"
+tf_module {
+  member {
+    name: "CheckpointInputPipelineHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CsvDataset"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "Optional"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RandomDataset"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "Reducer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SqlDataset"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "StatsAggregator"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFRecordWriter"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "Counter"
+    argspec: "args=[\'start\', \'step\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \"<dtype: \'int64\'>\"], "
+  }
+  member_method {
+    name: "bucket_by_sequence_length"
+    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
+  }
+  member_method {
+    name: "choose_from_datasets"
+    argspec: "args=[\'datasets\', \'choice_dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "copy_to_device"
+    argspec: "args=[\'target_device\', \'source_device\'], varargs=None, keywords=None, defaults=[\'/cpu:0\'], "
+  }
+  member_method {
+    name: "dense_to_sparse_batch"
+    argspec: "args=[\'batch_size\', \'row_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "enumerate_dataset"
+    argspec: "args=[\'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "get_next_as_optional"
+    argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_single_element"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group_by_reducer"
+    argspec: "args=[\'key_func\', \'reducer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group_by_window"
+    argspec: "args=[\'key_func\', \'reduce_func\', \'window_size\', \'window_size_func\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "ignore_errors"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latency_stats"
+    argspec: "args=[\'tag\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_batched_features_dataset"
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
+  }
+  member_method {
+    name: "make_csv_dataset"
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\'], "
+  }
+  member_method {
+    name: "make_saveable_from_iterator"
+    argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "map_and_batch"
+    argspec: "args=[\'map_func\', \'batch_size\', \'num_parallel_batches\', \'drop_remainder\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "parallel_interleave"
+    argspec: "args=[\'map_func\', \'cycle_length\', \'block_length\', \'sloppy\', \'buffer_output_elements\', \'prefetch_input_elements\'], varargs=None, keywords=None, defaults=[\'1\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "parse_example_dataset"
+    argspec: "args=[\'features\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "prefetch_to_device"
+    argspec: "args=[\'device\', \'buffer_size\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rejection_resample"
+    argspec: "args=[\'class_func\', \'target_dist\', \'initial_dist\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "sample_from_datasets"
+    argspec: "args=[\'datasets\', \'weights\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "scan"
+    argspec: "args=[\'initial_state\', \'scan_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_stats_aggregator"
+    argspec: "args=[\'stats_aggregator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "shuffle_and_repeat"
+    argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "unbatch"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unique"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
index 56fb270a49..e205157523 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
@@ -20,4 +20,8 @@ tf_module {
     name: "TextLineDataset"
     mtype: "<class \'abc.ABCMeta\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-checkpoint-input-pipeline-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-checkpoint-input-pipeline-hook.pbtxt
new file mode 100644
index 0000000000..03c16cda8b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-checkpoint-input-pipeline-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.data.experimental.CheckpointInputPipelineHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.iterator_ops.CheckpointInputPipelineHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'estimator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt
new file mode 100644
index 0000000000..3eeaa1b185
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.data.experimental.CsvDataset.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
new file mode 100644
index 0000000000..0c0405ee02
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -0,0 +1,127 @@
+path: "tensorflow.data.experimental.CsvDataset"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_types"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filenames\', \'record_defaults\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \',\', \'True\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'transformation_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch"
+    argspec: "args=[\'self\', \'batch_size\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "cache"
+    argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], "
+  }
+  member_method {
+    name: "concatenate"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "filter"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flat_map"
+    argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_generator"
+    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_sparse_tensor_slices"
+    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_tensor_slices"
+    argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_tensors"
+    argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interleave"
+    argspec: "args=[\'self\', \'map_func\', \'cycle_length\', \'block_length\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "list_files"
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "make_initializable_iterator"
+    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_one_shot_iterator"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "map"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "padded_batch"
+    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+  }
+  member_method {
+    name: "prefetch"
+    argspec: "args=[\'self\', \'buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "range"
+    argspec: "args=[], varargs=args, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "repeat"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "shuffle"
+    argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "skip"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "take"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "window"
+    argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "zip"
+    argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional.pbtxt
new file mode 100644
index 0000000000..b4c9459098
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional.pbtxt
@@ -0,0 +1,28 @@
+path: "tensorflow.data.experimental.Optional"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.optional_ops.Optional\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "value_structure"
+    mtype: "<class \'abc.abstractproperty\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "has_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "none_from_structure"
+    argspec: "args=[\'value_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt
new file mode 100644
index 0000000000..2991b12f64
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.data.experimental.RandomDataset.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
new file mode 100644
index 0000000000..bce0be4b17
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -0,0 +1,127 @@
+path: "tensorflow.data.experimental.RandomDataset"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_types"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'transformation_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch"
+    argspec: "args=[\'self\', \'batch_size\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "cache"
+    argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], "
+  }
+  member_method {
+    name: "concatenate"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "filter"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flat_map"
+    argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_generator"
+    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_sparse_tensor_slices"
+    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_tensor_slices"
+    argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_tensors"
+    argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interleave"
+    argspec: "args=[\'self\', \'map_func\', \'cycle_length\', \'block_length\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "list_files"
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "make_initializable_iterator"
+    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_one_shot_iterator"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "map"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "padded_batch"
+    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+  }
+  member_method {
+    name: "prefetch"
+    argspec: "args=[\'self\', \'buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "range"
+    argspec: "args=[], varargs=args, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "repeat"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "shuffle"
+    argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "skip"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "take"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "window"
+    argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "zip"
+    argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-reducer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-reducer.pbtxt
new file mode 100644
index 0000000000..6b477a8a72
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-reducer.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.data.experimental.Reducer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.grouping.Reducer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "finalize_func"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "init_func"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reduce_func"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'init_func\', \'reduce_func\', \'finalize_func\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt
new file mode 100644
index 0000000000..948e99ef86
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.data.experimental.SqlDataset.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
new file mode 100644
index 0000000000..8aeae92d96
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -0,0 +1,127 @@
+path: "tensorflow.data.experimental.SqlDataset"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_types"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'driver_name\', \'data_source_name\', \'query\', \'output_types\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'transformation_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch"
+    argspec: "args=[\'self\', \'batch_size\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "cache"
+    argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], "
+  }
+  member_method {
+    name: "concatenate"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "filter"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flat_map"
+    argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_generator"
+    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_sparse_tensor_slices"
+    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_tensor_slices"
+    argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_tensors"
+    argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interleave"
+    argspec: "args=[\'self\', \'map_func\', \'cycle_length\', \'block_length\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "list_files"
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "make_initializable_iterator"
+    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_one_shot_iterator"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "map"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "padded_batch"
+    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+  }
+  member_method {
+    name: "prefetch"
+    argspec: "args=[\'self\', \'buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "range"
+    argspec: "args=[], varargs=args, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "repeat"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "shuffle"
+    argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "skip"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "take"
+    argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "window"
+    argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "zip"
+    argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt
new file mode 100644
index 0000000000..0bcc8cf3e8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt
@@ -0,0 +1,13 @@
+path: "tensorflow.data.experimental.StatsAggregator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_ops.StatsAggregator\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_summary"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-t-f-record-writer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-t-f-record-writer.pbtxt
new file mode 100644
index 0000000000..6f9d18a701
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-t-f-record-writer.pbtxt
@@ -0,0 +1,13 @@
+path: "tensorflow.data.experimental.TFRecordWriter"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.writers.TFRecordWriter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filename\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
new file mode 100644
index 0000000000..b14585f8d7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -0,0 +1,139 @@
+path: "tensorflow.data.experimental"
+tf_module {
+  member {
+    name: "CheckpointInputPipelineHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CsvDataset"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "Optional"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RandomDataset"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "Reducer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SqlDataset"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "StatsAggregator"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFRecordWriter"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "Counter"
+    argspec: "args=[\'start\', \'step\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \"<dtype: \'int64\'>\"], "
+  }
+  member_method {
+    name: "bucket_by_sequence_length"
+    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
+  }
+  member_method {
+    name: "choose_from_datasets"
+    argspec: "args=[\'datasets\', \'choice_dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "copy_to_device"
+    argspec: "args=[\'target_device\', \'source_device\'], varargs=None, keywords=None, defaults=[\'/cpu:0\'], "
+  }
+  member_method {
+    name: "dense_to_sparse_batch"
+    argspec: "args=[\'batch_size\', \'row_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "enumerate_dataset"
+    argspec: "args=[\'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "get_next_as_optional"
+    argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_single_element"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group_by_reducer"
+    argspec: "args=[\'key_func\', \'reducer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group_by_window"
+    argspec: "args=[\'key_func\', \'reduce_func\', \'window_size\', \'window_size_func\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "ignore_errors"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latency_stats"
+    argspec: "args=[\'tag\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_batched_features_dataset"
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
+  }
+  member_method {
+    name: "make_csv_dataset"
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\'], "
+  }
+  member_method {
+    name: "make_saveable_from_iterator"
+    argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "map_and_batch"
+    argspec: "args=[\'map_func\', \'batch_size\', \'num_parallel_batches\', \'drop_remainder\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "parallel_interleave"
+    argspec: "args=[\'map_func\', \'cycle_length\', \'block_length\', \'sloppy\', \'buffer_output_elements\', \'prefetch_input_elements\'], varargs=None, keywords=None, defaults=[\'1\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "parse_example_dataset"
+    argspec: "args=[\'features\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "prefetch_to_device"
+    argspec: "args=[\'device\', \'buffer_size\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rejection_resample"
+    argspec: "args=[\'class_func\', \'target_dist\', \'initial_dist\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "sample_from_datasets"
+    argspec: "args=[\'datasets\', \'weights\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "scan"
+    argspec: "args=[\'initial_state\', \'scan_func\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_stats_aggregator"
+    argspec: "args=[\'stats_aggregator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "shuffle_and_repeat"
+    argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "unbatch"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unique"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
index 56fb270a49..e205157523 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
@@ -20,4 +20,8 @@ tf_module {
     name: "TextLineDataset"
     mtype: "<class \'abc.ABCMeta\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
 }
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 3a1c4a45d4..164b3d8303 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -64,8 +64,6 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
     "//tensorflow/contrib/compiler:xla",
     "//tensorflow/contrib/constrained_optimization:constrained_optimization_pip",
-    "//tensorflow/contrib/data/python/kernel_tests/serialization:dataset_serialization_test_base",
-    "//tensorflow/contrib/data/python/kernel_tests:stats_dataset_test_base",
     "//tensorflow/contrib/eager/python/examples:examples_pip",
     "//tensorflow/contrib/eager/python:evaluator",
     "//tensorflow/contrib/gan:gan",
@@ -106,6 +104,8 @@ COMMON_PIP_DEPS = [
     "//tensorflow/python:meta_graph_testdata",
     "//tensorflow/python:spectral_ops_test_util",
     "//tensorflow/python:util_example_parser_configuration",
+    "//tensorflow/python/data/experimental/kernel_tests/serialization:dataset_serialization_test_base",
+    "//tensorflow/python/data/experimental/kernel_tests:stats_dataset_test_base",
     "//tensorflow/python/data/kernel_tests:test_base",
     "//tensorflow/python/debug:debug_pip",
     "//tensorflow/python/eager:eager_pip",
-- 
GitLab


From 80f8931682aeaae89786f0940892a6557b4cfd67 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 17:05:45 -0700
Subject: [PATCH 268/570] Mark bfloat16 as supported for
 ExponentialMovingAverage.

PiperOrigin-RevId: 215307701
---
 tensorflow/python/training/moving_averages.py |  9 ++++---
 .../python/training/moving_averages_test.py   | 27 +++++++++++++++++++
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 177a7ddfa5..041266da3e 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -372,13 +372,13 @@ class ExponentialMovingAverage(object):
 
     Args:
       var_list: A list of Variable or Tensor objects. The variables
-        and Tensors must be of types float16, float32, or float64.
+        and Tensors must be of types bfloat16, float16, float32, or float64.
 
     Returns:
       An Operation that updates the moving averages.
 
     Raises:
-      TypeError: If the arguments are not all float16, float32, or float64.
+      TypeError: If the arguments are not an allowed type.
       ValueError: If the moving average of one of the variables is already
         being computed.
     """
@@ -387,8 +387,9 @@ class ExponentialMovingAverage(object):
       var_list = variables.trainable_variables()
     zero_debias_true = set()  # set of vars to set `zero_debias=True`
     for var in var_list:
-      if var.dtype.base_dtype not in [dtypes.float16, dtypes.float32,
-                                      dtypes.float64]:
+      if var.dtype.base_dtype not in [
+          dtypes.bfloat16, dtypes.float16, dtypes.float32, dtypes.float64
+      ]:
         raise TypeError("The variables must be half, float, or double: %s" %
                         var.name)
 
diff --git a/tensorflow/python/training/moving_averages_test.py b/tensorflow/python/training/moving_averages_test.py
index 93991d0e14..bb2fca66e3 100644
--- a/tensorflow/python/training/moving_averages_test.py
+++ b/tensorflow/python/training/moving_averages_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -110,6 +111,32 @@ class MovingAveragesTest(test.TestCase):
       denominator_2 = denominator_1 * decay + weight_2 * (1.0 - decay)
       self.assertAllClose(numerator_2 / denominator_2, wma_array)
 
+  def testWeightedMovingAverageBfloat16(self):
+    bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
+    with self.cached_session() as sess:
+      decay = 0.5
+      weight = array_ops.placeholder(dtypes.bfloat16, [])
+      val = array_ops.placeholder(dtypes.bfloat16, [])
+
+      wma = moving_averages.weighted_moving_average(val, decay, weight)
+      variables.global_variables_initializer().run()
+
+      # Get the first weighted moving average.
+      val_1 = 3.0
+      weight_1 = 4.0
+      wma_array = sess.run(wma, feed_dict={val: val_1, weight: weight_1})
+      numerator_1 = val_1 * weight_1 * (1.0 - decay)
+      denominator_1 = weight_1 * (1.0 - decay)
+      self.assertAllClose(numerator_1 / denominator_1, wma_array)
+
+      # Get the second weighted moving average.
+      val_2 = 11.0
+      weight_2 = 22.0
+      wma_array = sess.run(wma, feed_dict={val: val_2, weight: weight_2})
+      numerator_2 = numerator_1 * decay + val_2 * weight_2 * (1.0 - decay)
+      denominator_2 = denominator_1 * decay + weight_2 * (1.0 - decay)
+      self.assertAllClose(bfloat16(numerator_2 / denominator_2), wma_array)
+
 
 def _Repeat(value, dim):
   if dim == 1:
-- 
GitLab


From 7dc5f7caa959c70d5ca948f7b0fc5abfea9a5935 Mon Sep 17 00:00:00 2001
From: "Xiaoming (Jason) Cui" <xiaoming.cui@intel.com>
Date: Mon, 1 Oct 2018 17:18:28 -0700
Subject: [PATCH 269/570]  Minor changes, hanged  CHECK_GE to DCHECK_GE due to
 code policy change

---
 tensorflow/core/common_runtime/process_util.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index 4570496637..e1dc08d645 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -65,7 +65,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
 #ifdef _OPENMP
     mkl_intra_op = omp_get_max_threads();
 #endif  // _OPENMP
-    CHECK_GE(mkl_intra_op, 1);
+    DCHECK_GE(mkl_intra_op, 1);
     const int32 mkl_inter_op = std::max(
         (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
     VLOG(0)
-- 
GitLab


From bfbe2bbe6a83a4acfa8f87aa5c8228e74b37bb61 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Mon, 1 Oct 2018 17:18:24 -0700
Subject: [PATCH 270/570] [tf.data] More robust solution for input pipeline
 <--> performance model coordination.

PiperOrigin-RevId: 215309735
---
 tensorflow/core/framework/dataset.h           | 12 +--
 tensorflow/core/framework/model.cc            | 83 ++++++++---------
 tensorflow/core/framework/model.h             | 42 +++++----
 .../kernels/data/map_and_batch_dataset_op.cc  | 90 ++++++++++---------
 .../data/parallel_interleave_dataset_op.cc    | 86 +++++++++---------
 .../kernels/data/parallel_map_iterator.cc     | 77 ++++++++--------
 6 files changed, 201 insertions(+), 189 deletions(-)

diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 697e0604bf..8c1151cb56 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -657,15 +657,15 @@ class DatasetBaseIterator : public IteratorBase {
   // When performance modeling is enabled, this method adds a tunable parameter
   // to the model node corresponding to this iterator.
   //
-  // The performance modeling logic may use `value` to set the value of the
+  // The performance modeling logic may use `state` to set the value of the
   // tunable parameter at any point during the lifetime of this iterator. When
-  // it does, it notifies `cond_var`.
+  // it does, it acquires `state->mu` and notifies `state->cond_var`.
   void AddTunableParameter(IteratorContext* ctx, const string& name,
-                           std::atomic<int64>* value, int64 min, int64 max,
-                           condition_variable* cond_var) {
+                           std::shared_ptr<model::SharedState> state, int64 min,
+                           int64 max) {
     if (ctx->model()) {
-      ctx->model()->AddTunableParameter(prefix(), name, value, min, max,
-                                        cond_var);
+      ctx->model()->AddTunableParameter(prefix(), name, std::move(state), min,
+                                        max);
     }
   }
 
diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc
index b0330ec990..bfdb3a6658 100644
--- a/tensorflow/core/framework/model.cc
+++ b/tensorflow/core/framework/model.cc
@@ -296,12 +296,12 @@ void Model::AddProcessingTime(const string& name, int64 delta) {
 
 void Model::AddTunableParameter(const string& node_name,
                                 const string& parameter_name,
-                                std::atomic<int64>* value, int64 min, int64 max,
-                                condition_variable* cond_var) {
+                                std::shared_ptr<SharedState> state, int64 min,
+                                int64 max) {
   tf_shared_lock l(mu_);
   auto node = *gtl::FindOrNull(lookup_table_, node_name);
   DCHECK(node);
-  node->add_tunable_param(parameter_name, value, min, max, cond_var);
+  node->add_tunable_param(parameter_name, std::move(state), min, max);
 }
 
 // The optimization algorithm starts by setting all tunable parallelism
@@ -311,54 +311,55 @@ void Model::AddTunableParameter(const string& node_name,
 // is less than or equal to the processing time needed to produce an element
 // divided by CPU budget.
 void Model::Optimize(int64 cpu_budget) {
-  tf_shared_lock lock(mu_);
   std::vector<std::shared_ptr<Model::Node::Tunable>> tunables;
-  const int64 processing_time = ProcessingTime();
-  tunables = CollectTunables();
-  for (auto tunable : tunables) {
-    tunable->value = 1;
-  }
-  while (true) {
-    const int64 output_time = OutputTime();
-    bool all_tunables = true;
-    for (auto& tunable : tunables) {
-      if (tunable->value < tunable->max) {
-        all_tunables = false;
+  {
+    tf_shared_lock lock(mu_);
+    const int64 processing_time = ProcessingTime();
+    tunables = CollectTunables();
+    for (auto tunable : tunables) {
+      tunable->value = 1;
+    }
+    while (true) {
+      const int64 output_time = OutputTime();
+      bool all_tunables = true;
+      for (auto& tunable : tunables) {
+        if (tunable->value < tunable->max) {
+          all_tunables = false;
+          break;
+        }
+      }
+      if (output_time < processing_time / cpu_budget || all_tunables) {
         break;
       }
-    }
-    if (output_time < processing_time / cpu_budget || all_tunables) {
-      break;
-    }
-    int64 best_delta = -1;
-    Model::Node::Tunable* best_tunable = nullptr;
-    for (auto& tunable : tunables) {
-      if (tunable->value == tunable->max) {
-        continue;
+      int64 best_delta = -1;
+      Model::Node::Tunable* best_tunable = nullptr;
+      for (auto& tunable : tunables) {
+        if (tunable->value == tunable->max) {
+          continue;
+        }
+        tunable->value++;
+        int64 delta = output_time - OutputTime();
+        if (delta > best_delta) {
+          best_delta = delta;
+          best_tunable = tunable.get();
+        }
+        tunable->value--;
       }
-      tunable->value++;
-      int64 delta = output_time - OutputTime();
-      if (delta > best_delta) {
-        best_delta = delta;
-        best_tunable = tunable.get();
+      if (!best_tunable) {
+        // NOTE: This can happen because we are performing the optimization
+        // while the model data is changing. If this becomes an issue, we should
+        // look into performing the optimization using a model snapshot.
+        break;
       }
-      tunable->value--;
+      best_tunable->value++;
     }
-    if (!best_tunable) {
-      // NOTE: This can happen because we are performing the optimization
-      // while the model data is changing. If this becomes an issue, we should
-      // look into performing the optimization using a model snapshot.
-      break;
-    }
-    best_tunable->value++;
   }
   VLOG(2) << "Number of knobs: " << tunables.size();
   for (auto& tunable : tunables) {
     VLOG(2) << "Setting tunable parameter: " << tunable->value;
-    tunable->value_ptr->store(tunable->value);
-    if (tunable->cond_var) {
-      tunable->cond_var->notify_all();
-    }
+    mutex_lock l(*tunable->state->mu);
+    tunable->state->value = tunable->value;
+    tunable->state->cond_var->notify_all();
   }
 }
 
diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index 26402f5cd3..eae0fa70e8 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -33,6 +33,19 @@ namespace tensorflow {
 namespace data {
 namespace model {
 
+// Represents thread-safe state that can be shared between an input pipeline and
+// the performance model.
+struct SharedState {
+ public:
+  explicit SharedState(int64 value, std::shared_ptr<mutex> mu,
+                       std::shared_ptr<condition_variable> cond_var)
+      : value(value), mu(std::move(mu)), cond_var(std::move(cond_var)) {}
+
+  std::shared_ptr<mutex> mu;
+  std::shared_ptr<condition_variable> cond_var;
+  int64 value;
+};
+
 // Abstract representation of a TensorFlow input pipeline that can be used
 // for collecting runtime information and optimizing performance. It collects
 // runtime information about execution of the input pipeline that is used to
@@ -62,8 +75,8 @@ class Model {
   // Adds a tunable parameter for the given node.
   void AddTunableParameter(const string& node_name,
                            const string& parameter_name,
-                           std::atomic<int64>* value, int64 min, int64 max,
-                           condition_variable* cond_var) LOCKS_EXCLUDED(mu_);
+                           std::shared_ptr<SharedState> value, int64 min,
+                           int64 max) LOCKS_EXCLUDED(mu_);
 
   // Runs optimization.
   void Optimize(int64 cpu_budget) LOCKS_EXCLUDED(mu_);
@@ -109,13 +122,8 @@ class Model {
    public:
     // Represents a tunable parameter.
     struct Tunable {
-      Tunable(std::atomic<int64>* value, int64 min, int64 max,
-              condition_variable* cond_var)
-          : value(*value),
-            min(min),
-            max(max),
-            value_ptr(value),
-            cond_var(cond_var) {}
+      Tunable(std::shared_ptr<SharedState> state, int64 min, int64 max)
+          : value(state->value), min(min), max(max), state(std::move(state)) {}
 
       // Identifies the model value of the parameter. This can be different from
       // the actual value (e.g. during optimization search).
@@ -127,12 +135,8 @@ class Model {
       // Identifies the maximum value of the parameter.
       int64 max;
 
-      // Points to the actual value of the parameter. Not owned.
-      std::atomic<int64>* value_ptr;
-
-      // If non-null, this condition variable is notified when the model updates
-      // the actual value of the parameter (via `value_ptr`). Not owned.
-      condition_variable* cond_var;
+      // Shared state of the parameter.
+      std::shared_ptr<SharedState> state;
     };
 
     Node(int64 id, const string& name, std::shared_ptr<Node> output)
@@ -158,12 +162,12 @@ class Model {
     }
 
     // Adds a tunable parameter.
-    void add_tunable_param(const string& name, std::atomic<int64>* value,
-                           int64 min, int64 max, condition_variable* cond_var)
-        LOCKS_EXCLUDED(mu_) {
+    void add_tunable_param(const string& name,
+                           std::shared_ptr<SharedState> state, int64 min,
+                           int64 max) LOCKS_EXCLUDED(mu_) {
       mutex_lock l(mu_);
       tunable_params_[name] =
-          std::make_shared<Tunable>(value, min, max, cond_var);
+          std::make_shared<Tunable>(std::move(state), min, max);
     }
 
     // Returns the unique node ID.
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index b4c7f9e510..bf08970560 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -187,29 +187,31 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            num_parallel_calls_(params.dataset->num_parallel_calls_) {}
+            mu_(std::make_shared<mutex>()),
+            cond_var_(std::make_shared<condition_variable>()),
+            num_parallel_calls_(std::make_shared<model::SharedState>(
+                params.dataset->num_parallel_calls_, mu_, cond_var_)) {}
 
       ~Iterator() override {
-        mutex_lock l(mu_);
+        mutex_lock l(*mu_);
         // Cancel the runner thread.
         cancelled_ = true;
-        cond_var_.notify_all();
+        cond_var_->notify_all();
         // Wait for all in-flight calls to complete.
         while (num_calls_ > 0) {
-          cond_var_.wait(l);
+          cond_var_->wait(l);
         }
       }
 
       Status Initialize(IteratorContext* ctx) override {
-        mutex_lock l(mu_);
+        mutex_lock l(*mu_);
         AddConstantParameter(ctx, "batch_size", dataset()->batch_size_);
-        if (num_parallel_calls_ == kAutoTune) {
-          num_parallel_calls_ = 1;
-          AddTunableParameter(ctx, "parallelism",
-                              &num_parallel_calls_ /* value */, 1 /* min */,
-                              port::NumSchedulableCPUs() /* max */, &cond_var_);
+        if (num_parallel_calls_->value == kAutoTune) {
+          num_parallel_calls_->value = 1;
+          AddTunableParameter(ctx, "parallelism", num_parallel_calls_, 1,
+                              port::NumSchedulableCPUs());
         } else {
-          AddConstantParameter(ctx, "parallelism", num_parallel_calls_);
+          AddConstantParameter(ctx, "parallelism", num_parallel_calls_->value);
         }
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
@@ -221,27 +223,27 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                              bool* end_of_sequence) override {
         std::shared_ptr<BatchResult> result;
         {
-          mutex_lock l(mu_);
+          mutex_lock l(*mu_);
           EnsureRunnerThreadStarted(ctx);
           while (batch_results_.empty() ||
                  batch_results_.front()->num_calls > 0) {
             RecordStop(ctx);
-            cond_var_.wait(l);
+            cond_var_->wait(l);
             RecordStart(ctx);
           }
           std::swap(result, batch_results_.front());
           batch_results_.pop_front();
-          cond_var_.notify_all();
+          cond_var_->notify_all();
         }
         return ProcessResult(ctx, result, out_tensors, end_of_sequence);
       }
 
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
-        mutex_lock l(mu_);
+        mutex_lock l(*mu_);
         // Wait for all in-flight calls to complete.
         while (num_calls_ > 0) {
-          cond_var_.wait(l);
+          cond_var_->wait(l);
         }
         CHECK_EQ(num_calls_, 0);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
@@ -257,7 +259,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
-        mutex_lock l(mu_);
+        mutex_lock l(*mu_);
         TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         TF_RETURN_IF_ERROR(
             reader->ReadScalar(full_name("call_counter"), &call_counter_));
@@ -298,7 +300,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       void Callback(const std::shared_ptr<IteratorContext>& ctx,
                     const std::shared_ptr<BatchResult>& result,
                     const std::shared_ptr<std::vector<Tensor>>& return_values,
-                    int64 offset, const Status& status) LOCKS_EXCLUDED(mu_) {
+                    int64 offset, const Status& status) LOCKS_EXCLUDED(*mu_) {
         result->UpdateStatus(status);
         if (status.ok()) {
           EnsureOutputAllocated(ctx, result, return_values);
@@ -334,16 +336,16 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       }
 
       void CallCompleted(const std::shared_ptr<BatchResult>& result)
-          LOCKS_EXCLUDED(mu_) {
-        mutex_lock l(mu_);
+          LOCKS_EXCLUDED(*mu_) {
+        mutex_lock l(*mu_);
         num_calls_--;
         result->num_calls--;
-        cond_var_.notify_all();
+        cond_var_->notify_all();
       }
 
       void CallFunction(std::shared_ptr<IteratorContext> ctx,
                         const std::shared_ptr<BatchResult>& result,
-                        int64 offset) LOCKS_EXCLUDED(mu_) {
+                        int64 offset) LOCKS_EXCLUDED(*mu_) {
         // Get the next input element.
         std::vector<Tensor> input_element;
         bool end_of_input;
@@ -400,7 +402,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       }
 
       void EnsureRunnerThreadStarted(IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
           std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
           runner_thread_.reset(ctx->env()->StartThread(
@@ -476,14 +478,14 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       }
 
       void RunnerThread(const std::shared_ptr<IteratorContext>& ctx)
-          LOCKS_EXCLUDED(mu_) {
+          LOCKS_EXCLUDED(*mu_) {
         std::vector<std::pair<std::shared_ptr<BatchResult>, int64>> new_calls;
         RecordStart(ctx.get());
         auto stop_cleanup =
             gtl::MakeCleanup([this, &ctx]() { RecordStop(ctx.get()); });
-        new_calls.reserve(num_parallel_calls_);
-        auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(mu_) -> bool {
-          int64 num_parallel_calls = num_parallel_calls_;
+        new_calls.reserve(num_parallel_calls_->value);
+        auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
+          int64 num_parallel_calls = num_parallel_calls_->value;
           int64 max_batch_results =
               (num_parallel_calls + dataset()->batch_size_ - 1) /
               dataset()->batch_size_;
@@ -494,10 +496,10 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         };
         while (true) {
           {
-            mutex_lock l(mu_);
+            mutex_lock l(*mu_);
             while (!cancelled_ && busy()) {
               RecordStop(ctx.get());
-              cond_var_.wait(l);
+              cond_var_->wait(l);
               RecordStart(ctx.get());
             }
 
@@ -524,7 +526,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       }
 
       Status ReadBatchResult(IteratorContext* ctx, IteratorStateReader* reader,
-                             size_t index) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                             size_t index) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         batch_results_.emplace_back(new BatchResult(dataset()->batch_size_));
         std::shared_ptr<BatchResult> result = batch_results_.back();
         string prefix = strings::StrCat("batch_results_", index);
@@ -569,7 +571,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       }
 
       Status ReadStatus(IteratorStateReader* reader, const string& prefix,
-                        Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                        Status* status) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         int64 code_int;
         TF_RETURN_IF_ERROR(reader->ReadScalar(
             full_name(strings::StrCat(prefix, "_code")), &code_int));
@@ -587,7 +589,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       }
 
       Status WriteBatchResult(IteratorStateWriter* writer, size_t index)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         std::shared_ptr<BatchResult> result = batch_results_[index];
         string prefix = strings::StrCat("batch_results_", index);
         mutex_lock l(result->mu);
@@ -628,7 +630,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       }
 
       Status WriteStatus(IteratorStateWriter* writer, const string& prefix,
-                         const Status& status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                         const Status& status) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
                                 static_cast<int64>(status.code())));
@@ -642,24 +644,24 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       // Used for coordination between the main thread, the runner thread, and
       // the callback threads.
-      mutex mu_;
+      const std::shared_ptr<mutex> mu_;
       // Used for coordination between the main thread, the runner thread, and
       // the callback threads. In particular, the runner thread should only
-      // schedule new calls when the number of in-flight calls is less than the
-      // user specified level of parallelism and there are slots available in
-      // the `batch_results_` buffer.
-      condition_variable cond_var_;
+      // schedule new calls when the number of in-flight calls is less than
+      // `num_parallel_calls_->value` and there are slots available in the
+      // `batch_results_` buffer.
+      const std::shared_ptr<condition_variable> cond_var_;
       // Identifies the maximum number of parallel calls.
-      std::atomic<int64> num_parallel_calls_;
+      const std::shared_ptr<model::SharedState> num_parallel_calls_;
       // Counts the number of outstanding calls for this batch.
-      int64 num_calls_ GUARDED_BY(mu_) = 0;
+      int64 num_calls_ GUARDED_BY(*mu_) = 0;
       // Counts the total number of calls.
-      int64 call_counter_ GUARDED_BY(mu_) = 0;
+      int64 call_counter_ GUARDED_BY(*mu_) = 0;
       std::unique_ptr<IteratorBase> input_impl_;
       // Buffer for storing the (intermediate) batch results.
-      std::deque<std::shared_ptr<BatchResult>> batch_results_ GUARDED_BY(mu_);
-      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(mu_);
-      bool cancelled_ GUARDED_BY(mu_) = false;
+      std::deque<std::shared_ptr<BatchResult>> batch_results_ GUARDED_BY(*mu_);
+      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+      bool cancelled_ GUARDED_BY(*mu_) = false;
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 2bb38bf0b9..6b6b3d6ab9 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -1217,7 +1217,10 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            num_parallel_calls_(params.dataset->num_parallel_calls_),
+            mu_(std::make_shared<mutex>()),
+            cond_var_(std::make_shared<condition_variable>()),
+            num_parallel_calls_(std::make_shared<model::SharedState>(
+                params.dataset->num_parallel_calls_, mu_, cond_var_)),
             args_list_(params.dataset->cycle_length_),
             current_elements_(params.dataset->cycle_length_),
             element_in_use_(params.dataset->cycle_length_, false),
@@ -1227,25 +1230,24 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
                 false /* low_latency_hint */)) {}
 
       ~Iterator() override {
-        mutex_lock l(mu_);
+        mutex_lock l(*mu_);
         // Cancel the runner thread.
         cancelled_ = true;
-        cond_var_.notify_all();
+        cond_var_->notify_all();
         // Wait for all in-flight calls to complete.
         while (num_calls_ > 0) {
-          cond_var_.wait(l);
+          cond_var_->wait(l);
         }
       }
 
       Status Initialize(IteratorContext* ctx) override {
-        mutex_lock l(mu_);
-        if (num_parallel_calls_ == kAutoTune) {
-          num_parallel_calls_ = 1;
-          AddTunableParameter(ctx, "parallelism",
-                              &num_parallel_calls_ /* value */, 1 /* min */,
-                              dataset()->cycle_length_ /* max */, &cond_var_);
+        mutex_lock l(*mu_);
+        if (num_parallel_calls_->value == kAutoTune) {
+          num_parallel_calls_->value = 1;
+          AddTunableParameter(ctx, "parallelism", num_parallel_calls_, 1,
+                              dataset()->cycle_length_);
         } else {
-          AddConstantParameter(ctx, "parallelism", num_parallel_calls_);
+          AddConstantParameter(ctx, "parallelism", num_parallel_calls_->value);
         }
         AddConstantParameter(ctx, "cycle_length", dataset()->cycle_length_);
         TF_RETURN_IF_ERROR(
@@ -1259,12 +1261,12 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
         std::shared_ptr<InvocationResult> result;
         do {
           {
-            mutex_lock l(mu_);
+            mutex_lock l(*mu_);
             EnsureRunnerThreadStarted(ctx);
             while (invocation_results_.empty() &&
                    (!end_of_input_ || num_open_ > 0)) {
               RecordStop(ctx);
-              cond_var_.wait(l);
+              cond_var_->wait(l);
               RecordStart(ctx);
             }
             if (!invocation_results_.empty()) {
@@ -1274,7 +1276,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
               *end_of_sequence = true;
               return Status::OK();
             }
-            cond_var_.notify_all();
+            cond_var_->notify_all();
           }
           RecordStop(ctx);
           result->notification.WaitForNotification();
@@ -1290,10 +1292,10 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
 
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
-        mutex_lock l(mu_);
+        mutex_lock l(*mu_);
         // Wait for all in-flight calls to complete.
         while (num_calls_ > 0) {
-          cond_var_.wait(l);
+          cond_var_->wait(l);
         }
         CHECK_EQ(num_calls_, 0);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
@@ -1331,7 +1333,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
-        mutex_lock l(mu_);
+        mutex_lock l(*mu_);
         TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         int64 invocation_results_size;
         TF_RETURN_IF_ERROR(reader->ReadScalar(
@@ -1384,7 +1386,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       };
 
       void EnsureRunnerThreadStarted(IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
           std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
           runner_thread_.reset(ctx->env()->StartThread(
@@ -1401,7 +1403,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       void FetchOutputs(
           const std::shared_ptr<IteratorContext>& ctx, int64 cycle_index,
           const std::vector<std::shared_ptr<InvocationResult>>& results)
-          LOCKS_EXCLUDED(mu_) {
+          LOCKS_EXCLUDED(*mu_) {
         RecordStart(ctx.get());
         auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
         bool end_of_input = false;
@@ -1424,14 +1426,14 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
         if (end_of_input) {
           current_elements_[cycle_index].reset();
         }
-        mutex_lock l(mu_);
+        mutex_lock l(*mu_);
         element_in_use_[cycle_index] = false;
         num_calls_--;
         if (end_of_input) {
           args_list_[cycle_index].clear();
           num_open_--;
         }
-        cond_var_.notify_all();
+        cond_var_->notify_all();
       }
 
       // Method responsible for 1) creating iterators out of input elements, 2)
@@ -1442,20 +1444,20 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       void RunnerThread(const std::shared_ptr<IteratorContext>& ctx) {
         RecordStart(ctx.get());
         auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
-        auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(mu_) -> bool {
+        auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
           return element_in_use_[cycle_index_] ||
-                 num_calls_ >= num_parallel_calls_ ||
+                 num_calls_ >= num_parallel_calls_->value ||
                  invocation_results_.size() >=
                      dataset()->cycle_length_ * dataset()->block_length_;
         };
         while (true) {
-          mutex_lock l(mu_);
+          mutex_lock l(*mu_);
           // Wait until this thread is cancelled, the end of input has been
           // reached, or the cycle element at the `cycle_index_` position is
           // not in use and there is space in the `invocation_results_` queue.
           while (!cancelled_ && (!end_of_input_ || num_open_ > 0) && busy()) {
             RecordStop(ctx.get());
-            cond_var_.wait(l);
+            cond_var_->wait(l);
             RecordStart(ctx.get());
           }
 
@@ -1509,13 +1511,13 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
             }
             cycle_index_ = (cycle_index_ + 1) % dataset()->cycle_length_;
           }
-          cond_var_.notify_all();
+          cond_var_->notify_all();
         }
       }
 
       Status WriteStatusLocked(IteratorStateWriter* writer, size_t index,
                                const Status& status)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         TF_RETURN_IF_ERROR(writer->WriteScalar(
             CodeKey(index), static_cast<int64>(status.code())));
         if (!status.ok()) {
@@ -1526,7 +1528,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       }
 
       Status ReadStatusLocked(IteratorStateReader* reader, size_t index,
-                              Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                              Status* status) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         int64 code_int;
         TF_RETURN_IF_ERROR(reader->ReadScalar(CodeKey(index), &code_int));
         error::Code code = static_cast<error::Code>(code_int);
@@ -1553,7 +1555,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       }
 
       Status WriteCurrentElements(IteratorStateWriter* writer)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         for (int idx = 0; idx < current_elements_.size(); idx++) {
           if (current_elements_[idx]) {
             TF_RETURN_IF_ERROR(SaveInput(writer, current_elements_[idx]));
@@ -1572,7 +1574,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
 
       Status ReadCurrentElements(IteratorContext* ctx,
                                  IteratorStateReader* reader)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         for (int idx = 0; idx < current_elements_.size(); idx++) {
           if (reader->Contains(
                   full_name(strings::StrCat("args_size[", idx, "]")))) {
@@ -1600,7 +1602,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
 
       // Used for coordination between the main thread, the runner thread, and
       // the worker threads.
-      mutex mu_;
+      const std::shared_ptr<mutex> mu_;
 
       // Used for coordination between the main thread, the runner thread, and
       // the worker threads. In particular, the runner thread should only
@@ -1608,45 +1610,45 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       // user specified level of parallelism, there are slots available in the
       // `invocation_results_` buffer, the current cycle element is not in use,
       // and there are elements left to be fetched.
-      condition_variable cond_var_;
+      const std::shared_ptr<condition_variable> cond_var_;
 
       // Identifies the maximum number of parallel calls.
-      std::atomic<int64> num_parallel_calls_;
+      const std::shared_ptr<model::SharedState> num_parallel_calls_;
 
       // Iterator for input elements.
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(*mu_);
 
       // Identifies current cycle element.
       int64 cycle_index_ = 0;
 
       // Arguments for creating an iterator for cycle elements.
-      std::vector<std::vector<Tensor>> args_list_ GUARDED_BY(mu_);
+      std::vector<std::vector<Tensor>> args_list_ GUARDED_BY(*mu_);
 
       // Iterators for the current cycle elements. Concurrent access is
       // protected by `element_in_use_`.
       std::vector<std::unique_ptr<IteratorBase>> current_elements_;
 
       // Identifies cycle elements that are in use by worker threads.
-      std::vector<bool> element_in_use_ GUARDED_BY(mu_);
+      std::vector<bool> element_in_use_ GUARDED_BY(*mu_);
 
       // Buffer for storing the invocation results.
       std::deque<std::shared_ptr<InvocationResult>> invocation_results_
-          GUARDED_BY(mu_);
+          GUARDED_BY(*mu_);
 
       // Identifies whether end of input has been reached.
-      bool end_of_input_ GUARDED_BY(mu_) = false;
+      bool end_of_input_ GUARDED_BY(*mu_) = false;
 
       // Identifies the number of open iterators.
-      int64 num_open_ GUARDED_BY(mu_) = 0;
+      int64 num_open_ GUARDED_BY(*mu_) = 0;
 
       // Identifies the number of outstanding calls.
-      int64 num_calls_ GUARDED_BY(mu_) = 0;
+      int64 num_calls_ GUARDED_BY(*mu_) = 0;
 
       std::unique_ptr<thread::ThreadPool> thread_pool_;
-      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(mu_);
+      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
 
       // Identifies whether background activity should be cancelled.
-      bool cancelled_ GUARDED_BY(mu_) = false;
+      bool cancelled_ GUARDED_BY(*mu_) = false;
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index da067a4e6f..13bd4b6036 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -40,30 +40,32 @@ class ParallelMapIterator : public DatasetBaseIterator {
         input_dataset_(input_dataset),
         init_func_(std::move(init_func)),
         map_func_(std::move(map_func)),
-        num_parallel_calls_(num_parallel_calls) {}
+        mu_(std::make_shared<mutex>()),
+        cond_var_(std::make_shared<condition_variable>()),
+        num_parallel_calls_(std::make_shared<model::SharedState>(
+            num_parallel_calls, mu_, cond_var_)) {}
 
   ~ParallelMapIterator() override {
-    mutex_lock l(mu_);
+    mutex_lock l(*mu_);
     // Cancel the runner thread.
     cancelled_ = true;
-    cond_var_.notify_all();
+    cond_var_->notify_all();
     // Wait for all in-flight calls to complete.
     while (num_calls_ > 0) {
-      cond_var_.wait(l);
+      cond_var_->wait(l);
     }
   }
 
   Status Initialize(IteratorContext* ctx) override {
-    mutex_lock l(mu_);
-    if (num_parallel_calls_ == kAutoTune) {
-      num_parallel_calls_ = 1;
+    mutex_lock l(*mu_);
+    if (num_parallel_calls_->value == kAutoTune) {
+      num_parallel_calls_->value = 1;
       // TODO(jsimsa): Surface the number of threads used by `ctx->runner()` and
       // use it here for the maximum.
-      AddTunableParameter(ctx, "parallelism", &num_parallel_calls_ /* value */,
-                          1 /* min */, port::NumSchedulableCPUs() /* max */,
-                          &cond_var_);
+      AddTunableParameter(ctx, "parallelism", num_parallel_calls_, 1,
+                          port::NumSchedulableCPUs());
     } else {
-      AddConstantParameter(ctx, "parallelism", num_parallel_calls_);
+      AddConstantParameter(ctx, "parallelism", num_parallel_calls_->value);
     }
     TF_RETURN_IF_ERROR(
         input_dataset_->MakeIterator(ctx, prefix(), &input_impl_));
@@ -77,16 +79,16 @@ class ParallelMapIterator : public DatasetBaseIterator {
                          bool* end_of_sequence) override {
     std::shared_ptr<InvocationResult> result;
     {
-      mutex_lock l(mu_);
+      mutex_lock l(*mu_);
       EnsureRunnerThreadStarted(ctx);
       while (invocation_results_.empty()) {
         RecordStop(ctx);
-        cond_var_.wait(l);
+        cond_var_->wait(l);
         RecordStart(ctx);
       }
       std::swap(result, invocation_results_.front());
       invocation_results_.pop_front();
-      cond_var_.notify_all();
+      cond_var_->notify_all();
     }
     RecordStop(ctx);
     result->notification.WaitForNotification();
@@ -96,10 +98,10 @@ class ParallelMapIterator : public DatasetBaseIterator {
 
  protected:
   Status SaveInternal(IteratorStateWriter* writer) override {
-    mutex_lock l(mu_);
+    mutex_lock l(*mu_);
     // Wait for all in-flight calls to complete.
     while (num_calls_ > 0) {
-      cond_var_.wait(l);
+      cond_var_->wait(l);
     }
     CHECK_EQ(num_calls_, 0);
     TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
@@ -128,7 +130,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
 
   Status RestoreInternal(IteratorContext* ctx,
                          IteratorStateReader* reader) override {
-    mutex_lock l(mu_);
+    mutex_lock l(*mu_);
     TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
     int64 invocation_results_size;
     TF_RETURN_IF_ERROR(reader->ReadScalar(
@@ -175,7 +177,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   };
 
   void EnsureRunnerThreadStarted(IteratorContext* ctx)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
     if (!runner_thread_) {
       std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
       runner_thread_.reset(ctx->env()->StartThread(
@@ -185,18 +187,18 @@ class ParallelMapIterator : public DatasetBaseIterator {
   }
 
   void CallCompleted(const std::shared_ptr<InvocationResult>& result)
-      LOCKS_EXCLUDED(mu_) {
+      LOCKS_EXCLUDED(*mu_) {
     {
-      mutex_lock l(mu_);
+      mutex_lock l(*mu_);
       num_calls_--;
-      cond_var_.notify_all();
+      cond_var_->notify_all();
     }
     result->notification.Notify();
   }
 
   void CallFunction(const std::shared_ptr<IteratorContext>& ctx,
                     const std::shared_ptr<InvocationResult>& result)
-      LOCKS_EXCLUDED(mu_) {
+      LOCKS_EXCLUDED(*mu_) {
     // Get the next input element.
     std::vector<Tensor> input_element;
     result->status =
@@ -239,18 +241,18 @@ class ParallelMapIterator : public DatasetBaseIterator {
     RecordStart(ctx.get());
     auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
     std::vector<std::shared_ptr<InvocationResult>> new_calls;
-    new_calls.reserve(num_parallel_calls_);
-    auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(mu_) -> bool {
-      int64 num_parallel_calls = num_parallel_calls_;
+    new_calls.reserve(num_parallel_calls_->value);
+    auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
+      int64 num_parallel_calls = num_parallel_calls_->value;
       return num_calls_ >= num_parallel_calls ||
              invocation_results_.size() >= num_parallel_calls;
     };
     while (true) {
       {
-        mutex_lock l(mu_);
+        mutex_lock l(*mu_);
         while (!cancelled_ && busy()) {
           RecordStop(ctx.get());
-          cond_var_.wait(l);
+          cond_var_->wait(l);
           RecordStart(ctx.get());
         }
         if (cancelled_) {
@@ -261,7 +263,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
           new_calls.push_back(invocation_results_.back());
           num_calls_++;
         }
-        cond_var_.notify_all();
+        cond_var_->notify_all();
       }
       for (const auto& call : new_calls) {
         CallFunction(ctx, call);
@@ -271,7 +273,8 @@ class ParallelMapIterator : public DatasetBaseIterator {
   }
 
   Status WriteStatusLocked(IteratorStateWriter* writer, size_t index,
-                           const Status& status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                           const Status& status)
+      EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
     TF_RETURN_IF_ERROR(
         writer->WriteScalar(CodeKey(index), static_cast<int64>(status.code())));
     if (!status.ok()) {
@@ -282,7 +285,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   }
 
   Status ReadStatusLocked(IteratorStateReader* reader, size_t index,
-                          Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                          Status* status) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
     int64 code_int;
     TF_RETURN_IF_ERROR(reader->ReadScalar(CodeKey(index), &code_int));
     error::Code code = static_cast<error::Code>(code_int);
@@ -312,23 +315,23 @@ class ParallelMapIterator : public DatasetBaseIterator {
   const std::function<Status(IteratorContext*)> init_func_;
   const ParallelMapIteratorFunction map_func_;
   // Used for coordination between the main thread and the runner thread.
-  mutex mu_;
+  const std::shared_ptr<mutex> mu_;
   // Used for coordination between the main thread and the runner thread. In
   // particular, the runner thread should only schedule new calls when the
   // number of in-flight calls is less than the user specified level of
   // parallelism and there are slots available in the `invocation_results_`
   // buffer.
-  condition_variable cond_var_;
+  const std::shared_ptr<condition_variable> cond_var_;
   // Identifies the maximum number of parallel calls.
-  std::atomic<int64> num_parallel_calls_;
+  const std::shared_ptr<model::SharedState> num_parallel_calls_;
   // Counts the number of outstanding calls.
-  int64 num_calls_ GUARDED_BY(mu_) = 0;
+  int64 num_calls_ GUARDED_BY(*mu_) = 0;
   std::unique_ptr<IteratorBase> input_impl_;
   // Buffer for storing the invocation results.
   std::deque<std::shared_ptr<InvocationResult>> invocation_results_
-      GUARDED_BY(mu_);
-  std::unique_ptr<Thread> runner_thread_ GUARDED_BY(mu_);
-  bool cancelled_ GUARDED_BY(mu_) = false;
+      GUARDED_BY(*mu_);
+  std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+  bool cancelled_ GUARDED_BY(*mu_) = false;
 };
 
 }  // namespace
-- 
GitLab


From 9a23e9251ecba026471ff77a5bbbc802a2889a10 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Mon, 1 Oct 2018 17:26:37 -0700
Subject: [PATCH 271/570] [tf.data] Adding `tf.data.Options()`,
 `tf.data.Dataset.options()`, and `tf.data.Dataset.with_options()` to make it
 possible to respectively represent, get, and set options, such as
 optimization configuration, of a tf.data input pipeline.

PiperOrigin-RevId: 215310764
---
 .../core/kernels/data/optimize_dataset_op.cc  |  16 +-
 .../optimization/hoist_random_uniform_test.py |  11 +-
 .../optimization/latency_all_edges_test.py    |   7 +-
 .../map_and_filter_fusion_test.py             |  27 +-
 .../optimization/map_parallelization_test.py  |   6 +-
 .../optimization/map_vectorization_test.py    |  14 +-
 .../optimization/model_dataset_op_test.py     |  20 +-
 .../optimization/noop_elimination_test.py     |   4 +-
 .../optimization/optimize_dataset_op_test.py  |  45 ++-
 .../data/experimental/ops/optimization.py     |  61 +---
 tensorflow/python/data/kernel_tests/BUILD     |  18 +-
 .../data/kernel_tests/dataset_ops_test.py     | 158 ++++++++++-
 tensorflow/python/data/ops/dataset_ops.py     | 268 +++++++++++++++++-
 .../golden/v1/tensorflow.data.-dataset.pbtxt  |   8 +
 ...ow.data.-fixed-length-record-dataset.pbtxt |   8 +
 .../golden/v1/tensorflow.data.-options.pbtxt  |  57 ++++
 .../tensorflow.data.-t-f-record-dataset.pbtxt |   8 +
 .../tensorflow.data.-text-line-dataset.pbtxt  |   8 +
 ...rflow.data.experimental.-csv-dataset.pbtxt |   8 +
 ...ow.data.experimental.-random-dataset.pbtxt |   8 +
 ...rflow.data.experimental.-sql-dataset.pbtxt |   8 +
 .../tools/api/golden/v1/tensorflow.data.pbtxt |   4 +
 .../golden/v2/tensorflow.data.-dataset.pbtxt  |   8 +
 ...ow.data.-fixed-length-record-dataset.pbtxt |   8 +
 .../golden/v2/tensorflow.data.-options.pbtxt  |  57 ++++
 .../tensorflow.data.-t-f-record-dataset.pbtxt |   8 +
 .../tensorflow.data.-text-line-dataset.pbtxt  |   8 +
 ...rflow.data.experimental.-csv-dataset.pbtxt |   8 +
 ...ow.data.experimental.-random-dataset.pbtxt |   8 +
 ...rflow.data.experimental.-sql-dataset.pbtxt |   8 +
 .../tools/api/golden/v2/tensorflow.data.pbtxt |   4 +
 31 files changed, 742 insertions(+), 147 deletions(-)
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt

diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
index d5b725eac9..1cb7caa738 100644
--- a/tensorflow/core/kernels/data/optimize_dataset_op.cc
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -154,12 +154,8 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        IteratorContext::Params params;
-        params.env = ctx->env();
-        params.runner = *(ctx->runner());
-        params.stats_aggregator_getter = ctx->stats_aggregator_getter();
+        IteratorContext::Params params = ctx->params();
         params.lib = dataset()->lib_;
-        params.allocator_getter = ctx->allocator_getter();
         return dataset()->optimized_input_->MakeIterator(
             IteratorContext(params), prefix(), &input_impl_);
       }
@@ -167,14 +163,10 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
-        IteratorContext::Params params;
-        params.env = ctx->env();
-        params.runner = *(ctx->runner());
-        params.stats_aggregator_getter = ctx->stats_aggregator_getter();
+        IteratorContext::Params params = ctx->params();
         params.lib = dataset()->lib_;
-        params.allocator_getter = ctx->allocator_getter();
-        IteratorContext iter_ctx(params);
-        return input_impl_->GetNext(&iter_ctx, out_tensors, end_of_sequence);
+        return input_impl_->GetNext(IteratorContext(params), out_tensors,
+                                    end_of_sequence);
       }
 
      protected:
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
index 3cd9753665..81437c0aec 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
@@ -64,7 +64,9 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
         optimization.assert_next(
             ["Zip[0]", "Map"] if will_optimize else ["Map"])).map(function)
 
-    dataset = dataset.apply(optimization.optimize(["hoist_random_uniform"]))
+    options = dataset_ops.Options()
+    options.experimental_hoist_random_uniform = True
+    dataset = dataset.with_options(options)
     self._testDataset(dataset)
 
   def testAdditionalInputs(self):
@@ -77,9 +79,10 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
           [], minval=1, maxval=10, dtype=dtypes.float32, seed=42)
 
     dataset = dataset_ops.Dataset.range(5).apply(
-        optimization.assert_next(
-            ["Zip[0]", "Map"])).map(random_with_capture).apply(
-                optimization.optimize(["hoist_random_uniform"]))
+        optimization.assert_next(["Zip[0]", "Map"])).map(random_with_capture)
+    options = dataset_ops.Options()
+    options.experimental_hoist_random_uniform = True
+    dataset = dataset.with_options(options)
     self._testDataset(dataset)
 
   def _testDataset(self, dataset):
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
index 45623876ae..26fec0414e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
@@ -28,14 +28,15 @@ from tensorflow.python.platform import test
 class OptimizeStatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
 
   def testLatencyStatsOptimization(self):
-
     stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.from_tensors(1).apply(
         optimization.assert_next(
             ["LatencyStats", "Map", "LatencyStats", "Prefetch",
              "LatencyStats"])).map(lambda x: x * x).prefetch(1).apply(
-                 stats_ops.set_stats_aggregator(stats_aggregator)).apply(
-                     optimization.optimize(["latency_all_edges"]))
+                 stats_ops.set_stats_aggregator(stats_aggregator))
+    options = dataset_ops.Options()
+    options.experimental_latency_all_edges = True
+    dataset = dataset.with_options(options)
     iterator = dataset.make_initializable_iterator()
     get_next = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
index a439635716..7f8a4e6406 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
@@ -72,7 +72,10 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
     for function in functions:
       dataset = dataset.map(function)
 
-    dataset = dataset.prefetch(0).apply(optimization.optimize(["map_fusion"]))
+    dataset = dataset.prefetch(0)
+    options = dataset_ops.Options()
+    options.experimental_map_fusion = True
+    dataset = dataset.with_options(options)
     iterator = dataset.make_one_shot_iterator()
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -124,9 +127,10 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
   def testMapFilterFusion(self, function, predicate):
     dataset = dataset_ops.Dataset.range(10).apply(
         optimization.assert_next(
-            ["Map",
-             "FilterByLastComponent"])).map(function).filter(predicate).apply(
-                 optimization.optimize(["map_and_filter_fusion"]))
+            ["Map", "FilterByLastComponent"])).map(function).filter(predicate)
+    options = dataset_ops.Options()
+    options.experimental_map_and_filter_fusion = True
+    dataset = dataset.with_options(options)
     self._testMapAndFilter(dataset, function, predicate)
 
   def _testMapAndFilter(self, dataset, function, predicate):
@@ -156,10 +160,11 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     # We are currently not supporting functions with additional inputs.
     dataset = dataset_ops.Dataset.range(10).apply(
-        optimization.assert_next(
-            ["Map", "Filter"])).map(function).filter(predicate).apply(
-                optimization.optimize(["map_and_filter_fusion"]))
-
+        optimization.assert_next(["Map",
+                                  "Filter"])).map(function).filter(predicate)
+    options = dataset_ops.Options()
+    options.experimental_map_and_filter_fusion = True
+    dataset = dataset.with_options(options)
     self._testMapAndFilter(dataset, function, predicate)
 
   @staticmethod
@@ -197,8 +202,10 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
     for predicate in predicates:
       dataset = dataset.filter(predicate)
 
-    dataset = dataset.prefetch(0).apply(
-        optimization.optimize(["filter_fusion"]))
+    dataset = dataset.prefetch(0)
+    options = dataset_ops.Options()
+    options.experimental_filter_fusion = True
+    dataset = dataset.with_options(options)
     iterator = dataset.make_one_shot_iterator()
     get_next = iterator.get_next()
     with self.cached_session() as sess:
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
index 334d8e3778..ce9c9bc47b 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
@@ -62,8 +62,10 @@ class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase):
   def testMapParallelization(self, function, should_optimize):
     next_nodes = ["ParallelMap"] if should_optimize else ["Map"]
     dataset = dataset_ops.Dataset.range(5).apply(
-        optimization.assert_next(next_nodes)).map(function).apply(
-            optimization.optimize(["map_parallelization"]))
+        optimization.assert_next(next_nodes)).map(function)
+    options = dataset_ops.Options()
+    options.experimental_map_parallelization = True
+    dataset = dataset.with_options(options)
     iterator = dataset.make_one_shot_iterator()
     get_next = iterator.get_next()
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
index d47492753e..32ebc49c40 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
@@ -69,10 +69,11 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
           map_fn, num_parallel_calls=num_parallel_calls).batch(batch_size)
 
     unoptimized = _make_dataset([map_node_name, "Batch"])
-    optimized = _make_dataset(["Batch", map_node_name] if expect_optimized else
-                              [map_node_name, "Batch"]).apply(
-                                  optimization.optimize(["map_vectorization"]))
-
+    optimized = _make_dataset(["Batch", map_node_name]
+                              if expect_optimized else [map_node_name, "Batch"])
+    options = dataset_ops.Options()
+    options.experimental_map_vectorization = True
+    optimized = optimized.with_options(options)
     return unoptimized, optimized
 
   @parameterized.named_parameters(
@@ -179,7 +180,10 @@ class MapVectorizationBenchmark(test.Benchmark):
     unoptimized = input_dataset.map(map_fn).batch(batch_size)
     unoptimized_op = unoptimized.make_one_shot_iterator().get_next()
 
-    optimized = unoptimized.apply(optimization.optimize(["map_vectorization"]))
+    optimized = input_dataset.map(map_fn).batch(batch_size)
+    options = dataset_ops.Options()
+    options.experimental_map_vectorization = True
+    optimized = optimized.with_options(options)
     optimized_op = optimized.make_one_shot_iterator().get_next()
 
     unoptimized_time = self._run(
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_op_test.py
index a9f2ce8c03..82516356df 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_op_test.py
@@ -37,7 +37,9 @@ class ModelDatasetTest(test_base.DatasetTestBase):
                                                 np.random.rand(4 * k,
                                                                1))).repeat()
     dataset = dataset.map(math_ops.matmul)
-    iterator = dataset.apply(optimization.model()).make_one_shot_iterator()
+    options = dataset_ops.Options()
+    options.experimental_autotune = True
+    iterator = dataset.with_options(options).make_one_shot_iterator()
     get_next = iterator.get_next()
 
     deltas = []
@@ -61,7 +63,9 @@ class ModelDatasetTest(test_base.DatasetTestBase):
                                                                1))).repeat()
     dataset = dataset.map(
         math_ops.matmul, num_parallel_calls=optimization.AUTOTUNE)
-    iterator = dataset.apply(optimization.model()).make_one_shot_iterator()
+    options = dataset_ops.Options()
+    options.experimental_autotune = True
+    iterator = dataset.with_options(options).make_one_shot_iterator()
     get_next = iterator.get_next()
 
     deltas = []
@@ -89,7 +93,9 @@ class ModelDatasetTest(test_base.DatasetTestBase):
             math_ops.matmul,
             num_parallel_calls=optimization.AUTOTUNE,
             batch_size=batch_size))
-    iterator = dataset.apply(optimization.model()).make_one_shot_iterator()
+    options = dataset_ops.Options()
+    options.experimental_autotune = True
+    iterator = dataset.with_options(options).make_one_shot_iterator()
     get_next = iterator.get_next()
 
     deltas = []
@@ -116,7 +122,9 @@ class ModelDatasetTest(test_base.DatasetTestBase):
         lambda _: dataset,
         cycle_length=10,
         num_parallel_calls=optimization.AUTOTUNE)
-    iterator = dataset.apply(optimization.model()).make_one_shot_iterator()
+    options = dataset_ops.Options()
+    options.experimental_autotune = True
+    iterator = dataset.with_options(options).make_one_shot_iterator()
     get_next = iterator.get_next()
 
     deltas = []
@@ -161,7 +169,9 @@ class ModelDatasetTest(test_base.DatasetTestBase):
         lambda _: dataset, cycle_length=2)
 
     dataset = dataset.map(f3, num_parallel_calls=optimization.AUTOTUNE)
-    iterator = dataset.apply(optimization.model()).make_one_shot_iterator()
+    options = dataset_ops.Options()
+    options.experimental_autotune = True
+    iterator = dataset.with_options(options).make_one_shot_iterator()
     get_next = iterator.get_next()
 
     deltas = []
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
index 092e0ff62a..fb0640fe9f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
@@ -40,7 +40,9 @@ class NoopEliminationTest(test_base.DatasetTestBase):
             ["FiniteRepeat", "FiniteSkip", "Prefetch", "Prefetch"]))
     dataset = dataset.repeat(some_tensor).skip(5).prefetch(0).take(-1).skip(
         0).repeat(1).prefetch(0)
-    dataset = dataset.apply(optimization.optimize(["noop_elimination"]))
+    options = dataset_ops.Options()
+    options.experimental_noop_elimination = True
+    dataset = dataset.with_options(options)
 
     iterator = dataset.make_one_shot_iterator()
     get_next = iterator.get_next()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_op_test.py
index eb661796c0..760cd8cc4e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_op_test.py
@@ -33,23 +33,10 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
 
   def testOptimizationDefault(self):
     dataset = dataset_ops.Dataset.range(10).apply(
-        optimization.assert_next(
-            ["Map", "Batch"])).map(lambda x: x * x).batch(10).apply(
-                optimization.optimize())
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([x * x for x in range(10)], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testOptimizationEmpty(self):
-    dataset = dataset_ops.Dataset.range(10).apply(
-        optimization.assert_next(
-            ["Map", "Batch"])).map(lambda x: x * x).batch(10).apply(
-                optimization.optimize([]))
-    iterator = dataset.make_one_shot_iterator()
+        optimization.assert_next(["Map",
+                                  "Batch"])).map(lambda x: x * x).batch(10)
+    iterator = dataset.with_options(
+        dataset_ops.Options()).make_one_shot_iterator()
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -60,8 +47,10 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
   def testOptimizationFusion(self):
     dataset = dataset_ops.Dataset.range(10).apply(
         optimization.assert_next(
-            ["MapAndBatch"])).map(lambda x: x * x).batch(10).apply(
-                optimization.optimize(["map_and_batch_fusion"]))
+            ["MapAndBatch"])).map(lambda x: x * x).batch(10)
+    options = dataset_ops.Options()
+    options.experimental_map_and_batch_fusion = True
+    dataset = dataset.with_options(options)
     iterator = dataset.make_one_shot_iterator()
     get_next = iterator.get_next()
 
@@ -72,8 +61,10 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
 
   def testOptimizationStatefulFunction(self):
     dataset = dataset_ops.Dataset.range(10).map(
-        lambda _: random_ops.random_uniform([])).batch(10).apply(
-            optimization.optimize(["map_and_batch_fusion"]))
+        lambda _: random_ops.random_uniform([])).batch(10)
+    options = dataset_ops.Options()
+    options.experimental_map_and_batch_fusion = True
+    dataset = dataset.with_options(options)
     iterator = dataset.make_one_shot_iterator()
     get_next = iterator.get_next()
 
@@ -82,8 +73,10 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
 
   def testOptimizationLargeInputFromTensor(self):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None))
-    dataset = dataset_ops.Dataset.from_tensors(input_t).apply(
-        optimization.optimize())
+    dataset = dataset_ops.Dataset.from_tensors(input_t)
+    options = dataset_ops.Options()
+    options.experimental_map_and_batch_fusion = True
+    dataset = dataset.with_options(options)
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -94,8 +87,10 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
 
   def testOptimizationLargeInputFromTensorSlices(self):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None, None))
-    dataset = dataset_ops.Dataset.from_tensor_slices(input_t).apply(
-        optimization.optimize())
+    dataset = dataset_ops.Dataset.from_tensor_slices(input_t)
+    options = dataset_ops.Options()
+    options.experimental_map_and_batch_fusion = True
+    dataset = dataset.with_options(options)
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
     get_next = iterator.get_next()
diff --git a/tensorflow/python/data/experimental/ops/optimization.py b/tensorflow/python/data/experimental/ops/optimization.py
index 30348ede36..276dde8383 100644
--- a/tensorflow/python/data/experimental/ops/optimization.py
+++ b/tensorflow/python/data/experimental/ops/optimization.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops
 
 # A constant that can be used to enable auto-tuning.
@@ -58,7 +57,7 @@ def model():
 
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
-    return _ModelDataset(dataset)
+    return dataset_ops._ModelDataset(dataset)  # pylint: disable=protected-access
 
   return _apply_fn
 
@@ -78,7 +77,7 @@ def optimize(optimizations=None):
 
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
-    return _OptimizeDataset(dataset, optimizations)
+    return dataset_ops._OptimizeDataset(dataset, optimizations)  # pylint: disable=protected-access
 
   return _apply_fn
 
@@ -113,59 +112,3 @@ class _AssertNextDataset(dataset_ops.UnaryDataset):
   def output_types(self):
     return self._input_dataset.output_types
 
-
-class _ModelDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that acts as an identity, and models performance."""
-
-  def __init__(self, input_dataset):
-    """See `optimize()` for details."""
-    super(_ModelDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.model_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-
-class _OptimizeDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that acts as an identity, and applies optimizations."""
-
-  def __init__(self, input_dataset, optimizations):
-    """See `optimize()` for details."""
-    super(_OptimizeDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-    if optimizations is None:
-      optimizations = []
-    self._optimizations = ops.convert_to_tensor(
-        optimizations, dtype=dtypes.string, name="optimizations")
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.optimize_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._optimizations,
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index cadfe7f9e0..bf76860aa4 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -115,8 +115,10 @@ tf_py_test(
     srcs = ["dataset_ops_test.py"],
     additional_deps = [
         ":test_base",
-        "//tensorflow/core:protos_all_py",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
@@ -172,20 +174,6 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
-    name = "inputs_test",
-    size = "small",
-    srcs = ["inputs_test.py"],
-    additional_deps = [
-        ":test_base",
-        "@absl_py//absl/testing:parameterized",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
 tf_py_test(
     name = "interleave_dataset_op_test",
     size = "small",
diff --git a/tensorflow/python/data/kernel_tests/dataset_ops_test.py b/tensorflow/python/data/kernel_tests/dataset_ops_test.py
index f115f9d9c7..b9f8875b9f 100644
--- a/tensorflow/python/data/kernel_tests/dataset_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_ops_test.py
@@ -18,13 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+import numpy as np
+
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.platform import test
 
 
-class DatasetOpsTest(test_base.DatasetTestBase):
+class DatasetOpsTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def testAsSerializedGraph(self):
     dataset = dataset_ops.Dataset.range(10)
@@ -33,6 +40,155 @@ class DatasetOpsTest(test_base.DatasetTestBase):
           sess.run(dataset._as_serialized_graph()))
       self.assertTrue(any([node.op != "RangeDataset" for node in graph.node]))
 
+  @staticmethod
+  def make_apply_fn(dataset):
+
+    def apply_fn(dataset):
+
+      def _apply_fn(dataset):
+        return dataset.cache()
+
+      return dataset.apply(_apply_fn)
+
+    return apply_fn
+
+  @staticmethod
+  def make_gen():
+
+    def gen():
+      yield 42
+
+    return gen
+
+  @staticmethod
+  def make_interleave_fn(dataset, num_parallel_calls=None):
+
+    def interleave_fn(dataset):
+      return dataset.interleave(
+          lambda x: dataset_ops.Dataset.range(0),
+          cycle_length=2,
+          num_parallel_calls=num_parallel_calls)
+
+    return interleave_fn
+
+  @parameterized.named_parameters(
+      ("FixedLengthRecord", readers.FixedLengthRecordDataset("", 42)),
+      ("FromGenerator",
+       dataset_ops.Dataset.from_generator(make_gen.__func__(), dtypes.int32),
+       1),
+      ("FromSparseTensorSlices",
+       dataset_ops.Dataset.from_sparse_tensor_slices(
+           sparse_tensor.SparseTensor(
+               indices=np.array([[0, 0], [1, 0], [2, 0]]),
+               values=np.array([0, 0, 0]),
+               dense_shape=np.array([3, 1])))),
+      ("FromTensors", dataset_ops.Dataset.from_tensors([42])),
+      ("FromTensorSlices", dataset_ops.Dataset.from_tensors([42])),
+      ("Range", dataset_ops.Dataset.range(10)),
+      ("TextLine", readers.TextLineDataset("")),
+      ("TFRecord", readers.TFRecordDataset(""), 1),
+  )
+  def testDatasetSourceInputs(self, dataset, num_inputs=0):
+    self.assertEqual(num_inputs, len(dataset._inputs()))
+
+  @parameterized.named_parameters(
+      ("Apply", make_apply_fn.__func__(dataset_ops.Dataset.range(0)),
+       dataset_ops.Dataset.range(0)),
+      ("Batch", lambda x: x.batch(10), dataset_ops.Dataset.range(0)),
+      ("Cache", lambda x: x.cache(), dataset_ops.Dataset.range(0)),
+      ("Filter", lambda x: x.filter(lambda x: True),
+       dataset_ops.Dataset.range(0)),
+      ("FlatMap", lambda x: x.flat_map(lambda x: dataset_ops.Dataset.range(0)),
+       dataset_ops.Dataset.range(0)),
+      ("Interleave", make_interleave_fn.__func__(dataset_ops.Dataset.range(0)),
+       dataset_ops.Dataset.range(0)),
+      ("Map", lambda x: x.map(lambda x: x), dataset_ops.Dataset.range(0)),
+      ("PaddedBatch", lambda x: x.padded_batch(10, []),
+       dataset_ops.Dataset.range(0)),
+      ("ParallelInterleave",
+       make_interleave_fn.__func__(dataset_ops.Dataset.range(0), 2),
+       dataset_ops.Dataset.range(0)),
+      ("ParallelMap", lambda x: x.map(lambda x: x, num_parallel_calls=2),
+       dataset_ops.Dataset.range(0)),
+      ("Repeat", lambda x: x.repeat(), dataset_ops.Dataset.range(0)),
+      ("Shuffle", lambda x: x.shuffle(10), dataset_ops.Dataset.range(0)),
+      ("Skip", lambda x: x.skip(1), dataset_ops.Dataset.range(0)),
+      ("Take", lambda x: x.take(1), dataset_ops.Dataset.range(0)),
+      ("Window", lambda x: x.window(10), dataset_ops.Dataset.range(0)),
+  )
+  def testUnaryTransformationInputs(self, dataset_fn, input_dataset):
+    self.assertEqual([input_dataset], dataset_fn(input_dataset)._inputs())
+
+  @parameterized.named_parameters(
+      ("Concatenate", lambda x, y: x.concatenate(y),
+       dataset_ops.Dataset.range(0), dataset_ops.Dataset.range(1)))
+  def testBinaryTransformationInputs(self, dataset_fn, input1, input2):
+    self.assertEqual([input1, input2], dataset_fn(input1, input2)._inputs())
+
+  @parameterized.named_parameters(
+      ("ZipOne", dataset_ops.Dataset.zip, (dataset_ops.Dataset.range(0))),
+      ("ZipNest", dataset_ops.Dataset.zip,
+       (dataset_ops.Dataset.range(0),
+        (dataset_ops.Dataset.range(1), dataset_ops.Dataset.range(2)))),
+      ("ZipTuple", dataset_ops.Dataset.zip,
+       (dataset_ops.Dataset.range(0), dataset_ops.Dataset.range(1))))
+  def testVariadicTransformationInputs(self, dataset_fn, input_datasets):
+    self.assertEqual(
+        nest.flatten(input_datasets),
+        dataset_fn(input_datasets)._inputs())
+
+  def testCollectInputs(self):
+    ds1 = dataset_ops.Dataset.range(0)
+    ds2 = ds1.concatenate(ds1)
+    ds3 = dataset_ops.Dataset.zip((ds2, ds1, ds2))
+
+    inputs = []
+    queue = [ds3]
+    while queue:
+      ds = queue[0]
+      queue = queue[1:]
+      queue.extend(ds._inputs())
+      inputs.append(ds)
+
+    self.assertEqual(5, inputs.count(ds1))
+    self.assertEqual(2, inputs.count(ds2))
+    self.assertEqual(1, inputs.count(ds3))
+
+  def testOptionsDefault(self):
+    ds = dataset_ops.Dataset.range(0)
+    self.assertEqual(dataset_ops.Options(), ds.options())
+
+  def testOptionsOnce(self):
+    options = dataset_ops.Options()
+    ds = dataset_ops.Dataset.range(0).with_options(options).cache()
+    self.assertEqual(options, ds.options())
+
+  def testOptionsTwiceSame(self):
+    options = dataset_ops.Options()
+    options.experimental_autotune = True
+    ds = dataset_ops.Dataset.range(0).with_options(options).with_options(
+        options)
+    self.assertEqual(options, ds.options())
+
+  def testOptionsTwiceDifferent(self):
+    options1 = dataset_ops.Options()
+    options1.experimental_autotune = True
+    options2 = dataset_ops.Options()
+    options2.experimental_filter_fusion = False
+    ds = dataset_ops.Dataset.range(0).with_options(options1).with_options(
+        options2)
+    self.assertTrue(ds.options().experimental_autotune)
+    self.assertFalse(ds.options().experimental_filter_fusion)
+
+  def testOptionsTwiceDifferentError(self):
+    options1 = dataset_ops.Options()
+    options1.experimental_autotune = True
+    options2 = dataset_ops.Options()
+    options2.experimental_autotune = False
+    with self.assertRaisesRegexp(ValueError,
+                                 "Cannot merge incompatible values of option"):
+      dataset_ops.Dataset.range(0).with_options(options1).with_options(options2)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 3b9d3a639d..46ce191f7b 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -86,6 +86,18 @@ class Dataset(object):
 
     raise NotImplementedError("Dataset._inputs")
 
+  def options(self):
+    """Returns the options for this dataset.
+
+    Returns:
+      A `tf.data.Options` object representing the dataset options.
+    """
+    for input_dataset in self._inputs():
+      options = input_dataset.options()
+      if options is not None:
+        return options
+    return Options()
+
   def make_initializable_iterator(self, shared_name=None):
     """Creates an `Iterator` for enumerating the elements of this dataset.
 
@@ -114,6 +126,13 @@ class Dataset(object):
       raise RuntimeError(
           "dataset.make_initializable_iterator is not supported when eager "
           "execution is enabled.")
+    dataset = self
+    options = self.options()
+    static_optimizations = options._static_optimizations()  # pylint: disable=protected-access
+    if static_optimizations:
+      dataset = _OptimizeDataset(dataset, static_optimizations)
+    if options.experimental_autotune:
+      dataset = _ModelDataset(dataset)
     if shared_name is None:
       shared_name = ""
     if compat.forward_compatible(2018, 8, 3):
@@ -123,11 +142,12 @@ class Dataset(object):
       iterator_resource = gen_dataset_ops.iterator(
           container="", shared_name=shared_name, **flat_structure(self))
     with ops.colocate_with(iterator_resource):
-      initializer = gen_dataset_ops.make_iterator(self._as_variant_tensor(),
-                                                  iterator_resource)
+      initializer = gen_dataset_ops.make_iterator(
+          dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          iterator_resource)
     return iterator_ops.Iterator(iterator_resource, initializer,
-                                 self.output_types, self.output_shapes,
-                                 self.output_classes)
+                                 dataset.output_types, dataset.output_shapes,
+                                 dataset.output_classes)
 
   def __iter__(self):
     """Creates an `Iterator` for enumerating the elements of this dataset.
@@ -162,7 +182,14 @@ class Dataset(object):
     # a 0-argument function.
     @function.Defun(capture_by_value=True)
     def _make_dataset():
-      return self._as_variant_tensor()  # pylint: disable=protected-access
+      dataset = self
+      options = self.options()
+      static_optimizations = options._static_optimizations()  # pylint: disable=protected-access
+      if static_optimizations:
+        dataset = _OptimizeDataset(dataset, static_optimizations)
+      if options.experimental_autotune:
+        dataset = _ModelDataset(dataset)
+      return dataset._as_variant_tensor()  # pylint: disable=protected-access
 
     try:
       _make_dataset.add_to_graph(ops.get_default_graph())
@@ -1325,6 +1352,146 @@ class Dataset(object):
         output_shapes,
         output_classes)
 
+  def with_options(self, options):
+    """Returns a new `tf.data.Dataset` with the given options set.
+
+    The options are "global" in the sense they apply to the entire input
+    pipeline in which the `with_options` transformation is used. If options are
+    set multiple times, they are merged if possible (see
+    `tf.data.Options.merge()` for details).
+
+    Args:
+      options: A `tf.data.Options` that identifies the options the use.
+
+    Returns:
+      Dataset: A `Dataset` with the given options.
+
+    Raises:
+      ValueError: if options are set more than once
+    """
+    return _OptionsDataset(self, options)
+
+
+@tf_export("data.Options")
+class Options(object):
+  """Represents options for tf.data.Dataset.
+
+  An `Options` object can be for instance used to control which static
+  optimizations to apply or whether to use performance modeling to dynamically
+  tune the parallelism of operations such as `tf.data.Dataset.map` or
+  `tf.data.Dataset.interleave`.
+  """
+  for _name, _ty, _docstring in [
+      ("experimental_autotune", bool,
+       "Whether to dynamically adjust the values of tunable parameters (e.g. "
+       "degrees of parallelism)."),
+      ("experimental_filter_fusion", bool,
+       "Whether to fuse filter transformations."),
+      ("experimental_hoist_random_uniform", bool,
+       "Whether to hoist `tf.random_uniform()` ops out of map transformations."
+      ),
+      ("experimental_latency_all_edges", bool,
+       "Whether to add latency measurements on all edges."),
+      ("experimental_map_and_batch_fusion", bool,
+       "Whether to fuse map and batch transformations."),
+      ("experimental_map_and_filter_fusion", bool,
+       "Whether to fuse map and filter transformations."),
+      ("experimental_map_fusion", bool, "Whether to fuse map transformations."),
+      ("experimental_map_parallelization", bool,
+       "Whether to parallelize stateless map transformations."),
+      ("experimental_map_vectorization", bool,
+       "Whether to vectorize map transformations."),
+      ("experimental_noop_elimination", bool,
+       "Whether to eliminate no-op transformations."),
+      ("experimental_shuffle_and_repeat_fusion", bool,
+       "Whether to fuse shuffle and repeat transformations."),
+  ]:
+
+    def _make_getter(name):  # pylint: disable=no-self-argument
+
+      def getter(self):
+        return getattr(self, "_" + name)
+
+      return getter
+
+    def _make_setter(name, ty):  # pylint: disable=no-self-argument
+
+      def setter(self, value):
+        if not isinstance(value, ty):
+          raise TypeError(
+              "Attempting to set the option %s to incompatible value: %r" %
+              (name, value))
+        setattr(self, "_" + name, value)
+
+      return setter
+
+    vars()["_" + _name] = None
+    vars()[_name] = property(
+        _make_getter(_name), _make_setter(_name, _ty), None, _docstring)
+
+  def __init__(self):
+    pass
+
+  def __eq__(self, other):
+    if isinstance(other, self.__class__):
+      return self.__dict__ == other.__dict__
+    else:
+      return False
+
+  def __ne__(self, other):
+    return not self.__eq__(other)
+
+  def _static_optimizations(self):
+    """Produces the list of enabled static optimizations."""
+    experimental_optimizations = [
+        "filter_fusion", "hoist_random_uniform", "latency_all_edges",
+        "map_and_batch_fusion", "map_and_filter_fusion", "map_fusion",
+        "map_parallelization", "map_vectorization", "noop_elimination",
+        "shuffle_and_repeat_fusion"
+    ]
+    result = []
+    for exp_opt in experimental_optimizations:
+      if getattr(self, "experimental_" + exp_opt):
+        result.append(exp_opt)
+    return result
+
+  def merge(self, options):
+    """Merges itself with the given `tf.data.Options`.
+
+    The given `tf.data.Options` can be merged as long as there does not exist an
+    attribute that is set to different values in `self` and `options`.
+
+    Args:
+      options: a `tf.data.Options` to merge with
+
+    Raises:
+      ValueError: if the given `tf.data.Options` cannot be merged
+
+    Returns:
+      New `tf.data.Options()` object which is the result of merging self with
+      the input `tf.data.Options`.
+    """
+    result = Options()
+    for other in [self, options]:
+      for name in [
+          "experimental_autotune", "experimental_filter_fusion",
+          "experimental_hoist_random_uniform", "experimental_latency_all_edges",
+          "experimental_map_and_batch_fusion",
+          "experimental_map_and_filter_fusion", "experimental_map_fusion",
+          "experimental_map_parallelization", "experimental_map_vectorization",
+          "experimental_noop_elimination",
+          "experimental_shuffle_and_repeat_fusion"
+      ]:
+        this = getattr(result, name)
+        that = getattr(other, name)
+        if that is not None:
+          if this is None:
+            setattr(result, name, that)
+          elif this != that:
+            raise ValueError(
+                "Cannot merge incompatible values of option: %s" % (name))
+    return result
+
 
 class DatasetSource(Dataset):
   """Abstract class representing a dataset with no inputs."""
@@ -1664,6 +1831,9 @@ class StructuredFunctionWrapper(object):
           flat_classes.append(component)
           flat_shapes.append(component)
           flat_types.append(component)
+          if t.options() is not None:  # pylint: disable=protected-access
+            warnings.warn("Encountered a nested dataset with options. These "
+                          "options will not be applied to the outer dataset.")
         else:
           try:
             t = ops.convert_to_tensor(t)
@@ -2703,3 +2873,91 @@ class WindowDataset(UnaryDataset):
   @property
   def output_types(self):
     return self._output_types
+
+
+class _OptionsDataset(UnaryDataset):
+  """An identity `Dataset` that stores options."""
+
+  def __init__(self, input_dataset, options):
+    super(_OptionsDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._options = input_dataset.options()
+    if self._options:
+      self._options = self._options.merge(options)
+    else:
+      self._options = options
+
+  def _as_variant_tensor(self):
+    return self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
+
+  def options(self):
+    return self._options
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+class _ModelDataset(UnaryDataset):
+  """A `Dataset` that acts as an identity, and models performance."""
+
+  def __init__(self, input_dataset):
+    """See `optimize()` for details."""
+    super(_ModelDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.model_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        **flat_structure(self))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+class _OptimizeDataset(UnaryDataset):
+  """A `Dataset` that acts as an identity, and applies optimizations."""
+
+  def __init__(self, input_dataset, optimizations):
+    """See `optimize()` for details."""
+    super(_OptimizeDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    if optimizations is None:
+      optimizations = []
+    self._optimizations = ops.convert_to_tensor(
+        optimizations, dtype=dtypes.string, name="optimizations")
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.optimize_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._optimizations,
+        **flat_structure(self))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
index 825afb622f..8b7f63e43e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
@@ -78,6 +78,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -118,6 +122,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
index cdad5f6360..a7bfa82c65 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -119,6 +123,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
new file mode 100644
index 0000000000..d15dccc173
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
@@ -0,0 +1,57 @@
+path: "tensorflow.data.Options"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Options\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "experimental_autotune"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_filter_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_hoist_random_uniform"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_latency_all_edges"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_map_and_batch_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_map_and_filter_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_map_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_map_parallelization"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_map_vectorization"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_noop_elimination"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_shuffle_and_repeat_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "merge"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
index df41bff1b5..7b7a9ebaf0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -119,6 +123,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
index 028bcc2ce9..2817f900e1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -119,6 +123,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
index 0c0405ee02..2520e28a3c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -80,6 +80,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -120,6 +124,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
index bce0be4b17..1dd53b1eab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -80,6 +80,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -120,6 +124,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
index 8aeae92d96..8fdd9dc52e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -80,6 +80,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -120,6 +124,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
index e205157523..3023276a1d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
@@ -12,6 +12,10 @@ tf_module {
     name: "Iterator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Options"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFRecordDataset"
     mtype: "<class \'abc.ABCMeta\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
index 825afb622f..8b7f63e43e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
@@ -78,6 +78,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -118,6 +122,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
index cdad5f6360..a7bfa82c65 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -119,6 +123,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
new file mode 100644
index 0000000000..d15dccc173
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
@@ -0,0 +1,57 @@
+path: "tensorflow.data.Options"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Options\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "experimental_autotune"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_filter_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_hoist_random_uniform"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_latency_all_edges"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_map_and_batch_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_map_and_filter_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_map_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_map_parallelization"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_map_vectorization"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_noop_elimination"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_shuffle_and_repeat_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "merge"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
index df41bff1b5..7b7a9ebaf0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -119,6 +123,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
index 028bcc2ce9..2817f900e1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -119,6 +123,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
index 0c0405ee02..2520e28a3c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -80,6 +80,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -120,6 +124,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
index bce0be4b17..1dd53b1eab 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -80,6 +80,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -120,6 +124,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
index 8aeae92d96..8fdd9dc52e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -80,6 +80,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "options"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "padded_batch"
     argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
@@ -120,6 +124,10 @@ tf_class {
     name: "window"
     argspec: "args=[\'self\', \'size\', \'shift\', \'stride\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'False\'], "
   }
+  member_method {
+    name: "with_options"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zip"
     argspec: "args=[\'datasets\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
index e205157523..3023276a1d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
@@ -12,6 +12,10 @@ tf_module {
     name: "Iterator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Options"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFRecordDataset"
     mtype: "<class \'abc.ABCMeta\'>"
-- 
GitLab


From bacf1949f92bb1daa9e5c8a31cc6924e532551e9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 17:33:55 -0700
Subject: [PATCH 272/570] [XLA] Add kAllToAll and kCollectivePermute to
 EffectiveOperandPrecisionIsOutputPrecision list.

PiperOrigin-RevId: 215311766
---
 tensorflow/compiler/xla/service/bfloat16_support.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/compiler/xla/service/bfloat16_support.cc b/tensorflow/compiler/xla/service/bfloat16_support.cc
index 23645346e6..5b48f10505 100644
--- a/tensorflow/compiler/xla/service/bfloat16_support.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_support.cc
@@ -78,8 +78,10 @@ bool BFloat16Support::EffectiveOperandPrecisionIsOutputPrecision(
     const HloInstruction& hlo, int64 operand_index) {
   switch (hlo.opcode()) {
     case HloOpcode::kAbs:
+    case HloOpcode::kAllToAll:
     case HloOpcode::kBroadcast:
     case HloOpcode::kClamp:
+    case HloOpcode::kCollectivePermute:
     case HloOpcode::kConcatenate:
     case HloOpcode::kConvert:
     case HloOpcode::kCopy:
-- 
GitLab


From beede8525be5386451bf0098992c37416d1864db Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Mon, 1 Oct 2018 17:45:22 -0700
Subject: [PATCH 273/570] Make Keras/TPU more robust to closed TF sessions.

PiperOrigin-RevId: 215313156
---
 .../contrib/tpu/python/tpu/keras_support.py   | 278 ++++++++++--------
 1 file changed, 155 insertions(+), 123 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index 696656e840..a3a7fd8bb0 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -46,6 +46,7 @@ from __future__ import print_function
 
 import abc
 import collections
+import contextlib
 import re
 import sys
 import time
@@ -94,21 +95,56 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 
 
+# TODO(b/114775106): temporary shim to optionally initialize the TPU
+# This increases the odds our session is initialized, but shouldn't be needed.
+def _maybe_initialize_tpu(session):
+  """Initialize the TPU if it has not already been initialized."""
+  try:
+
+    def test_op():
+      return constant_op.constant(1) + constant_op.constant(1)
+
+    session.run(tpu.rewrite(test_op))
+  except errors.FailedPreconditionError as _:
+    session.run(tpu.initialize_system())
+
+
+@contextlib.contextmanager
+def _tpu_session_context():
+  """Initialize the TPU and cleans cache entries for bad sessions."""
+  try:
+    _maybe_initialize_tpu(K.get_session())
+    yield
+  except (errors.FailedPreconditionError, errors.AbortedError) as e:
+    K.clear_session()
+    raise Exception("""
+An error occurred connecting or initializing your TPU.
+
+The session has been reset. re-run keras_to_tpu_model to create a new session.
+""" + e)
+
+
 def setup_tpu_session(cluster_resolver):
   """Construct or return a `tf.Session` connected to the given cluster."""
   master = cluster_resolver.master()
 
   # Use the existing session if we're already connected to this TPU
-  if (K.get_session()._target == master and
-      getattr(K.get_session(), '_tpu_initialized', None)):
-    return
+  # N.B K.get_session() is a non-trivial operation, and may fail if the remote
+  # session has been reset.
+  try:
+    default_session = K.get_session()
+    if (default_session._target == master and
+        getattr(default_session, '_tpu_initialized', None)):
+      return
+  except errors.AbortedError as _:
+    # We lost the remote session and need to re-initialize.
+    logging.warning('Lost remote session: creating a new session.')
 
   cluster_spec = cluster_resolver.cluster_spec()
   config = config_pb2.ConfigProto(isolate_session_state=True)
   if cluster_spec:
     config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
 
-  logging.info('Initialize')
   tpu_session = tf_session.Session(target=master, config=config)
   tpu_session.run(tpu.initialize_system())
   tpu_session._tpu_initialized = True
@@ -1391,97 +1427,74 @@ class KerasTPUModel(models.Model):
       raise EnvironmentError('KerasTPUModel currently does not support eager '
                              'mode.')
 
-    assert not self._numpy_to_infeed_manager_list  # Ensure empty.
-
-    infeed_managers = []  # Managers to clean up at the end of the fit call.
-    if isinstance(x, dataset_ops.Dataset):
-      # TODO(b/111413240): Support taking a tf.data.Dataset directly.
-      raise ValueError(
-          'Taking a Dataset directly is not yet supported. Please '
-          'wrap your dataset construction code in a function and '
-          'pass that to fit instead. For examples, see: '
-          'https://github.com/tensorflow/tpu/tree/master/models/experimental'
-          '/keras')
-    if callable(x):
-      with ops.device('/job:%s/device:CPU:0' %
-                      self._tpu_assignment.worker_name):
-        dataset = x()
-        if steps_per_epoch is None:
-          raise ValueError('When using tf.data as input to a model, you '
-                           'should specify the steps_per_epoch argument.')
-        if y is not None:
-          raise ValueError('When using tf.data as input to a model, y must be '
-                           'None')
-        infeed_manager = TPUDatasetInfeedManager(
-            dataset, self._tpu_assignment, model_fn_lib.ModeKeys.TRAIN)
+    with _tpu_session_context():
+      assert not self._numpy_to_infeed_manager_list  # Ensure empty.
+
+      infeed_managers = []  # Managers to clean up at the end of the fit call.
+      if isinstance(x, dataset_ops.Dataset):
+        # TODO(b/111413240): Support taking a tf.data.Dataset directly.
+        raise ValueError(
+            'Taking a Dataset directly is not yet supported. Please '
+            'wrap your dataset construction code in a function and '
+            'pass that to fit instead. For examples, see: '
+            'https://github.com/tensorflow/tpu/tree/master/models/experimental'
+            '/keras')
+      if callable(x):
+        with ops.device(
+            '/job:%s/device:CPU:0' % self._tpu_assignment.worker_name):
+          dataset = x()
+          if steps_per_epoch is None:
+            raise ValueError('When using tf.data as input to a model, you '
+                             'should specify the steps_per_epoch argument.')
+          if y is not None:
+            raise ValueError('When using tf.data as input to a model, y must '
+                             'be None')
+          infeed_manager = TPUDatasetInfeedManager(
+              dataset, self._tpu_assignment, model_fn_lib.ModeKeys.TRAIN)
+          # Use dummy numpy inputs for the rest of Keras' shape checking. We
+          # intercept them when building the model.
+          x = infeed_manager.dummy_x
+          y = infeed_manager.dummy_y
+          infeed_managers.append((x, infeed_manager))
+
+      if isinstance(validation_data, dataset_ops.Dataset):
+        # TODO(b/111413240): Support taking a tf.data.Dataset directly.
+        raise ValueError(
+            'Taking a Dataset directly is not yet supported. Please '
+            'wrap your dataset construction code in a function and '
+            'pass that to fit instead. For examples, see: '
+            'https://github.com/tensorflow/tpu/tree/master/models/experimental'
+            '/keras')
+      if callable(validation_data):
+        dataset = validation_data()
+        if validation_steps is None:
+          raise ValueError('When using tf.data as validation for a model, you '
+                           'should specify the validation_steps argument.')
+        infeed_manager = TPUDatasetInfeedManager(dataset, self._tpu_assignment,
+                                                 model_fn_lib.ModeKeys.EVAL)
         # Use dummy numpy inputs for the rest of Keras' shape checking. We
         # intercept them when building the model.
-        x = infeed_manager.dummy_x
-        y = infeed_manager.dummy_y
-        infeed_managers.append((x, infeed_manager))
+        val_x = infeed_manager.dummy_x
+        val_y = infeed_manager.dummy_y
+        infeed_managers.append((val_x, infeed_manager))
+        validation_data = (val_x, val_y)
 
-    if isinstance(validation_data, dataset_ops.Dataset):
-      # TODO(b/111413240): Support taking a tf.data.Dataset directly.
-      raise ValueError(
-          'Taking a Dataset directly is not yet supported. Please '
-          'wrap your dataset construction code in a function and '
-          'pass that to fit instead. For examples, see: '
-          'https://github.com/tensorflow/tpu/tree/master/models/experimental'
-          '/keras')
-    if callable(validation_data):
-      dataset = validation_data()
-      if validation_steps is None:
-        raise ValueError('When using tf.data as validation for a model, you '
-                         'should specify the validation_steps argument.')
-      infeed_manager = TPUDatasetInfeedManager(
-          dataset, self._tpu_assignment, model_fn_lib.ModeKeys.EVAL)
-      # Use dummy numpy inputs for the rest of Keras' shape checking. We
-      # intercept them when building the model.
-      val_x = infeed_manager.dummy_x
-      val_y = infeed_manager.dummy_y
-      infeed_managers.append((val_x, infeed_manager))
-      validation_data = (val_x, val_y)
-
-    self._numpy_to_infeed_manager_list = infeed_managers
-    try:
-      if not kwargs.get('_pipeline', True):
-        logging.info('Running non-pipelined training loop (`_pipeline=%s`).',
-                     kwargs['_pipeline'])
-        kwargs.pop('_pipeline')
-        return super(KerasTPUModel, self).fit(
-            x,
-            y,
-            batch_size,
-            epochs,
-            verbose,
-            callbacks,
-            validation_split,
-            validation_data,
-            shuffle,
-            class_weight,
-            sample_weight,
-            initial_epoch,
-            steps_per_epoch,
-            validation_steps,
-            **kwargs)
-      return self._pipeline_fit(
-          x,
-          y,
-          batch_size,
-          epochs,
-          verbose,
-          callbacks,
-          validation_split,
-          validation_data,
-          shuffle,
-          class_weight,
-          sample_weight,
-          initial_epoch,
-          steps_per_epoch,
-          validation_steps,
-          **kwargs)
-    finally:
-      self._numpy_to_infeed_manager_list = []
+      self._numpy_to_infeed_manager_list = infeed_managers
+      try:
+        if not kwargs.get('_pipeline', True):
+          logging.info('Running non-pipelined training loop (`_pipeline=%s`).',
+                       kwargs['_pipeline'])
+          kwargs.pop('_pipeline')
+          return super(KerasTPUModel, self).fit(
+              x, y, batch_size, epochs, verbose, callbacks, validation_split,
+              validation_data, shuffle, class_weight, sample_weight,
+              initial_epoch, steps_per_epoch, validation_steps, **kwargs)
+        return self._pipeline_fit(x, y, batch_size, epochs, verbose, callbacks,
+                                  validation_split, validation_data, shuffle,
+                                  class_weight, sample_weight, initial_epoch,
+                                  steps_per_epoch, validation_steps, **kwargs)
+      finally:
+        self._numpy_to_infeed_manager_list = []
 
   def evaluate(self,
                x=None,
@@ -1492,37 +1505,38 @@ class KerasTPUModel(models.Model):
                steps=None):
     assert not self._numpy_to_infeed_manager_list  # Ensure empty.
 
-    infeed_managers = []  # Managers to clean up at the end of the fit call.
-    if isinstance(x, dataset_ops.Dataset):
-      # TODO(b/111413240): Support taking a tf.data.Dataset directly.
-      raise ValueError(
-          'Taking a Dataset directly is not yet supported. Please '
-          'wrap your dataset construction code in a function and '
-          'pass that to fit instead. For examples, see: '
-          'https://github.com/tensorflow/tpu/tree/master/models/experimental'
-          '/keras')
-    if callable(x):
-      dataset = x()
-      if steps is None:
-        raise ValueError('When using tf.data as input to a model, you '
-                         'should specify the steps argument.')
-      if y is not None:
-        raise ValueError('When using tf.data as input to a model, y must be '
-                         'None')
-      infeed_manager = TPUDatasetInfeedManager(
-          dataset, self._tpu_assignment, model_fn_lib.ModeKeys.EVAL)
-      # Use dummy numpy inputs for the rest of Keras' shape checking. We
-      # intercept them when building the model.
-      x = infeed_manager.dummy_x
-      y = infeed_manager.dummy_y
-      infeed_managers.append((x, infeed_manager))
-
-    self._numpy_to_infeed_manager_list = infeed_managers
-    try:
-      return super(KerasTPUModel, self).evaluate(x, y, batch_size, verbose,
-                                                 sample_weight, steps)
-    finally:
-      self._numpy_to_infeed_manager_list = []
+    with _tpu_session_context():
+      infeed_managers = []  # Managers to clean up at the end of the fit call.
+      if isinstance(x, dataset_ops.Dataset):
+        # TODO(b/111413240): Support taking a tf.data.Dataset directly.
+        raise ValueError(
+            'Taking a Dataset directly is not yet supported. Please '
+            'wrap your dataset construction code in a function and '
+            'pass that to fit instead. For examples, see: '
+            'https://github.com/tensorflow/tpu/tree/master/models/experimental'
+            '/keras')
+      if callable(x):
+        dataset = x()
+        if steps is None:
+          raise ValueError('When using tf.data as input to a model, you '
+                           'should specify the steps argument.')
+        if y is not None:
+          raise ValueError('When using tf.data as input to a model, y must be '
+                           'None')
+        infeed_manager = TPUDatasetInfeedManager(dataset, self._tpu_assignment,
+                                                 model_fn_lib.ModeKeys.EVAL)
+        # Use dummy numpy inputs for the rest of Keras' shape checking. We
+        # intercept them when building the model.
+        x = infeed_manager.dummy_x
+        y = infeed_manager.dummy_y
+        infeed_managers.append((x, infeed_manager))
+
+      self._numpy_to_infeed_manager_list = infeed_managers
+      try:
+        return super(KerasTPUModel, self).evaluate(x, y, batch_size, verbose,
+                                                   sample_weight, steps)
+      finally:
+        self._numpy_to_infeed_manager_list = []
 
   def _pipeline_fit(self, x, y, batch_size, epochs, verbose, callbacks,
                     validation_split, validation_data, shuffle, class_weight,
@@ -1910,6 +1924,24 @@ class KerasTPUModel(models.Model):
 
     return val_x, val_y, val_sample_weights
 
+  def predict(self,
+              x,
+              batch_size=None,
+              verbose=0,
+              steps=None,
+              max_queue_size=10,
+              workers=1,
+              use_multiprocessing=False):
+    with _tpu_session_context():
+      return super(KerasTPUModel, self).predict(
+          x,
+          batch_size=batch_size,
+          verbose=verbose,
+          steps=steps,
+          max_queue_size=max_queue_size,
+          workers=workers,
+          use_multiprocessing=use_multiprocessing)
+
   @property
   def optimizer(self):
     if self._tpu_model:
-- 
GitLab


From 991f06fd50fc73285ce415d57f720994c2b2e861 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Mon, 1 Oct 2018 19:42:12 -0700
Subject: [PATCH 274/570] [XLA] Migrate from gtl::FlatSet to
 absl::flat_hash_set

PiperOrigin-RevId: 215324035
---
 tensorflow/compiler/jit/BUILD                 |  2 +
 tensorflow/compiler/jit/deadness_analysis.cc  | 10 ++--
 .../jit/encapsulate_subgraphs_pass.cc         |  7 +--
 .../jit/encapsulate_xla_computations_pass.cc  | 10 ++--
 .../compiler/jit/mark_for_compilation_pass.cc |  6 +--
 .../compiler/jit/partially_decluster_pass.cc  |  7 +--
 .../jit/resource_operation_safety_analysis.cc |  4 +-
 tensorflow/compiler/tests/BUILD               |  1 +
 tensorflow/compiler/tests/randomized_tests.cc | 14 +++---
 tensorflow/compiler/xla/client/BUILD          |  1 +
 tensorflow/compiler/xla/client/xla_builder.cc |  4 +-
 tensorflow/compiler/xla/client/xla_builder.h  |  4 +-
 tensorflow/compiler/xla/service/BUILD         | 27 +++++++++++
 .../xla/service/bfloat16_propagation.cc       |  9 ++--
 .../xla/service/bfloat16_propagation.h        | 11 +++--
 .../compiler/xla/service/buffer_assignment.cc | 48 ++++++++++---------
 .../compiler/xla/service/buffer_assignment.h  | 22 ++++-----
 .../compiler/xla/service/buffer_liveness.h    |  4 +-
 .../xla/service/buffer_value_containers.h     |  4 +-
 tensorflow/compiler/xla/service/call_graph.cc |  9 ++--
 tensorflow/compiler/xla/service/call_graph.h  | 10 ++--
 .../compiler/xla/service/copy_insertion.cc    |  6 +--
 tensorflow/compiler/xla/service/cpu/BUILD     |  1 +
 .../compiler/xla/service/cpu/ir_emitter.cc    |  8 ++--
 .../xla/service/cpu/tests/cpu_noalias_test.cc |  2 +-
 tensorflow/compiler/xla/service/gpu/BUILD     |  3 ++
 .../xla/service/gpu/gpu_copy_insertion.cc     |  2 +-
 .../xla/service/gpu/instruction_fusion.cc     |  5 +-
 .../xla/service/gpu/multi_output_fusion.cc    |  6 +--
 .../compiler/xla/service/heap_simulator.cc    | 13 ++---
 .../compiler/xla/service/heap_simulator.h     |  6 +--
 .../xla/service/hlo_alias_analysis.cc         |  9 ++--
 tensorflow/compiler/xla/service/hlo_buffer.cc |  2 +-
 .../compiler/xla/service/hlo_computation.cc   | 11 ++---
 .../compiler/xla/service/hlo_computation.h    |  2 +-
 tensorflow/compiler/xla/service/hlo_cse.cc    |  6 +--
 .../xla/service/hlo_dataflow_analysis.cc      |  9 ++--
 .../compiler/xla/service/hlo_domain_map.cc    |  3 +-
 .../compiler/xla/service/hlo_domain_map.h     |  4 +-
 .../xla/service/hlo_domain_metadata.h         |  8 ++--
 .../compiler/xla/service/hlo_instruction.cc   |  4 +-
 .../xla/service/hlo_memory_scheduler.cc       |  7 +--
 tensorflow/compiler/xla/service/hlo_module.cc |  9 ++--
 .../xla/service/hlo_module_group_util.cc      |  6 +--
 .../compiler/xla/service/hlo_pass_pipeline.cc |  6 +--
 .../xla/service/hlo_rematerialization.cc      |  3 +-
 .../xla/service/hlo_rematerialization.h       |  3 +-
 .../compiler/xla/service/hlo_schedule.cc      |  5 +-
 tensorflow/compiler/xla/service/hlo_value.cc  |  4 +-
 .../xla/service/indexed_array_analysis.cc     |  2 +-
 .../compiler/xla/service/layout_assignment.h  |  7 ++-
 tensorflow/compiler/xla/service/llvm_ir/BUILD |  1 +
 .../xla/service/llvm_ir/alias_analysis.cc     |  6 +--
 .../xla/service/llvm_ir/alias_analysis.h      |  1 -
 .../xla/service/multi_output_fusion.cc        |  6 +--
 .../compiler/xla/service/name_uniquer.h       |  4 +-
 .../compiler/xla/service/shape_inference.cc   |  4 +-
 .../compiler/xla/service/shaped_buffer.cc     |  4 +-
 .../xla/service/tuple_points_to_analysis.h    |  1 -
 .../while_loop_invariant_code_motion.cc       |  8 ++--
 .../xla/service/while_loop_simplifier.cc      |  3 +-
 tensorflow/compiler/xla/tests/BUILD           |  2 +-
 .../compiler/xla/tests/test_utils_test.cc     |  5 +-
 63 files changed, 235 insertions(+), 186 deletions(-)

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index f20270931f..661b444a42 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -325,6 +325,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
@@ -407,6 +408,7 @@ cc_library(
         "//tensorflow/core/kernels:bounds_check",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index e63d4b7792..e0b9932d80 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -16,11 +16,11 @@ limitations under the License.
 #include "tensorflow/compiler/jit/deadness_analysis.h"
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/jit/deadness_analysis_internal.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/tensor_id.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
 // ALGORITHM OVERVIEW
@@ -298,7 +298,7 @@ class SymbolPredicate : public Predicate {
 
 template <typename FunctionTy>
 /*static*/ void Predicate::Visit(Predicate* p, const FunctionTy& func) {
-  gtl::FlatSet<Predicate*> visited;
+  absl::flat_hash_set<Predicate*> visited;
   std::vector<Predicate*> stack;
 
   stack.push_back(p);
@@ -467,7 +467,7 @@ Predicate* PredicateFactory::MakeAndOrImpl(
       is_and ? Predicate::Kind::kAnd : Predicate::Kind::kOr;
   Predicate::Kind other_pred_kind =
       is_and ? Predicate::Kind::kOr : Predicate::Kind::kAnd;
-  gtl::FlatSet<Predicate*> simplified_ops_set;
+  absl::flat_hash_set<Predicate*> simplified_ops_set;
   std::vector<Predicate*> simplified_ops;
   for (Predicate* op : operands) {
     // Simplify A&A => A and  A|A => A.
@@ -492,7 +492,7 @@ Predicate* PredicateFactory::MakeAndOrImpl(
   }
 
   // Simplify "A&~A=>False" and "A|~A=>True".
-  gtl::FlatSet<Predicate*> negated_ops;
+  absl::flat_hash_set<Predicate*> negated_ops;
   for (Predicate* op : simplified_ops) {
     if (op->kind() == Predicate::Kind::kNot) {
       negated_ops.insert(dynamic_cast<NotPredicate&>(*op).operand());
@@ -512,7 +512,7 @@ Predicate* PredicateFactory::MakeAndOrImpl(
   //
   // First find any predicates contained in all subops.
   std::vector<Predicate*> common_inner_operands;
-  gtl::FlatSet<Predicate*> common_inner_operands_set;
+  absl::flat_hash_set<Predicate*> common_inner_operands_set;
   for (Predicate* op : simplified_ops) {
     if (op->kind() != other_pred_kind) {
       common_inner_operands.clear();
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index d165341f21..da27f837e8 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
@@ -44,7 +45,6 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/tensor_id.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/public/session_options.h"
@@ -78,7 +78,8 @@ void SortControlInputs(GraphDef* gdef) {
 namespace {
 
 bool AreAllParentsGuaranteedConst(
-    const Node& n, const gtl::FlatSet<const Node*>& runtime_const_nodes) {
+    const Node& n,
+    const absl::flat_hash_set<const Node*>& runtime_const_nodes) {
   if (n.type_string() == "GuaranteeConst") {
     // If the current node is itself a cast-to-const, no need
     // to look at the incoming edges.
@@ -101,7 +102,7 @@ bool AreAllParentsGuaranteedConst(
 void MarkGuaranteedConstants(
     const Graph& graph,
     const std::vector<std::pair<const Node*, Node*>>& src_arg_pairs) {
-  gtl::FlatSet<const Node*> guaranteed_const_nodes;
+  absl::flat_hash_set<const Node*> guaranteed_const_nodes;
   std::vector<const Node*> srcs;
   srcs.reserve(src_arg_pairs.size());
   for (const auto& src_arg : src_arg_pairs) {
diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
index 755c364c62..2ce6fa73fc 100644
--- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
@@ -15,13 +15,13 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/encapsulate_xla_computations_pass.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -62,7 +62,7 @@ DataType EdgeType(const Edge* edge) {
 }
 
 // Adds the control inputs of `node` to `*deps`.
-void AddControlInputs(const Node& node, gtl::FlatSet<Node*>* deps) {
+void AddControlInputs(const Node& node, absl::flat_hash_set<Node*>* deps) {
   for (const Edge* edge : node.in_edges()) {
     if (edge->IsControlEdge()) {
       deps->insert(edge->src());
@@ -71,7 +71,7 @@ void AddControlInputs(const Node& node, gtl::FlatSet<Node*>* deps) {
 }
 
 // Adds the control outputs of `node` to `*deps`.
-void AddControlOutputs(const Node& node, gtl::FlatSet<Node*>* deps) {
+void AddControlOutputs(const Node& node, absl::flat_hash_set<Node*>* deps) {
   for (const Edge* edge : node.out_edges()) {
     if (edge->IsControlEdge()) {
       deps->insert(edge->dst());
@@ -246,7 +246,7 @@ Status RewriteSubgraph(const std::vector<OutputTensor>& arg_source_tensors,
 
     // Data and control inputs to the new XlaLaunch node.
     std::vector<std::pair<Node*, int>> data_inputs(num_inputs);
-    gtl::FlatSet<Node*> control_inputs;
+    absl::flat_hash_set<Node*> control_inputs;
     DataTypeVector arg_types(num_args);
 
     AddControlInputs(*launch, &control_inputs);
@@ -266,7 +266,7 @@ Status RewriteSubgraph(const std::vector<OutputTensor>& arg_source_tensors,
 
     // Outputs.
     const int num_outputs = launch->output_types().size();
-    gtl::FlatSet<Node*> control_outputs;
+    absl::flat_hash_set<Node*> control_outputs;
     std::vector<std::vector<std::pair<Node*, int>>> data_outputs(num_outputs);
     DataTypeVector output_types(num_outputs);
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 133d982360..4f0c370e65 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <unordered_map>
 #include <unordered_set>
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/jit/deadness_analysis.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
@@ -42,7 +43,6 @@ limitations under the License.
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
 
@@ -371,7 +371,7 @@ bool IsXlaFusable(const NodeDef& node) {
 Status FindCompilationCandidates(
     const Graph& graph, FunctionLibraryDefinition* flib_def, Env* env,
     const std::function<bool(const Node*, const DeviceType&)>& is_compilable_fn,
-    OrderedNodeSet* candidates, gtl::FlatSet<Node*>* isolated_nodes) {
+    OrderedNodeSet* candidates, absl::flat_hash_set<Node*>* isolated_nodes) {
   OptimizerOptions opts;
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
       new ProcessFunctionLibraryRuntime(nullptr, env, TF_GRAPH_DEF_VERSION,
@@ -849,7 +849,7 @@ Status MarkForCompilationPass::RunImpl(
   Graph* graph = options.graph->get();
 
   OrderedNodeSet compilation_candidates;
-  gtl::FlatSet<Node*> isolated_nodes;
+  absl::flat_hash_set<Node*> isolated_nodes;
   TF_RETURN_IF_ERROR(FindCompilationCandidates(
       *graph, options.flib_def,
       (options.session_options != nullptr) ? options.session_options->env
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc
index 10fc9e85d9..b1f9e9088f 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass.cc
@@ -15,17 +15,18 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/partially_decluster_pass.h"
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace tensorflow {
 namespace {
-Status FindNodesToDecluster(const Graph& graph, gtl::FlatSet<Node*>* result,
+Status FindNodesToDecluster(const Graph& graph,
+                            absl::flat_hash_set<Node*>* result,
                             absl::Span<Node* const> post_order) {
   // Find nodes that have at least one user outside their cluster that expects
   // hostmem output.  These nodes should be cloned to outside the cluster to
@@ -171,7 +172,7 @@ Status PartiallyDeclusterToRemoveDeviceToHostCopies(Graph* graph) {
   GetPostOrder(*graph, &post_order, /*stable_comparator=*/NodeComparatorName(),
                /*edge_filter=*/NotBackedge);
 
-  gtl::FlatSet<Node*> nodes_to_partially_decluster;
+  absl::flat_hash_set<Node*> nodes_to_partially_decluster;
   TF_RETURN_IF_ERROR(
       FindNodesToDecluster(*graph, &nodes_to_partially_decluster, post_order));
 
diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc
index 657bb409db..e039d46ec8 100644
--- a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc
+++ b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc
@@ -82,6 +82,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/optional.h"
@@ -89,7 +90,6 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/tensor_id.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/util/ptr_util.h"
 
@@ -176,7 +176,7 @@ string ResourceOpToString(const ResourceOp& resource_op) {
 // point.
 class ResourceOpSet {
  private:
-  using Impl = gtl::FlatSet<ResourceOp>;
+  using Impl = absl::flat_hash_set<ResourceOp>;
 
  public:
   ResourceOpSet() = default;
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 3cf74fa788..822fedf121 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -1105,6 +1105,7 @@ cc_library(
         "//tensorflow/core:test",
         "//tensorflow/core:testlib",
         "//tensorflow/core/kernels:ops_util",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index bddda6f302..7a96f4c25c 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -45,6 +45,7 @@ limitations under the License.
 #include <random>
 #include <unordered_map>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/jit/defs.h"
@@ -63,7 +64,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/public/session_options.h"
@@ -457,7 +457,7 @@ Tensor OpTest::RandomTensor(DataType dtype, bool needs_unique_values,
   Tensor tensor(dtype, TensorShape(shape));
   switch (dtype) {
     case DT_FLOAT: {
-      gtl::FlatSet<float> already_generated;
+      absl::flat_hash_set<float> already_generated;
       std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
       test::FillFn<float>(&tensor, [&](int i) -> float {
         float generated;
@@ -470,7 +470,7 @@ Tensor OpTest::RandomTensor(DataType dtype, bool needs_unique_values,
       break;
     }
     case DT_DOUBLE: {
-      gtl::FlatSet<double> already_generated;
+      absl::flat_hash_set<double> already_generated;
       std::uniform_real_distribution<double> distribution(-1.0, 1.0);
       test::FillFn<double>(&tensor, [&](int i) -> double {
         double generated;
@@ -483,7 +483,7 @@ Tensor OpTest::RandomTensor(DataType dtype, bool needs_unique_values,
       break;
     }
     case DT_COMPLEX64: {
-      gtl::FlatSet<std::pair<float, float>> already_generated;
+      absl::flat_hash_set<std::pair<float, float>> already_generated;
       std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
       test::FillFn<complex64>(&tensor, [&](int i) {
         complex64 generated;
@@ -500,7 +500,7 @@ Tensor OpTest::RandomTensor(DataType dtype, bool needs_unique_values,
       break;
     }
     case DT_INT32: {
-      gtl::FlatSet<int32> already_generated;
+      absl::flat_hash_set<int32> already_generated;
       std::uniform_int_distribution<int32> distribution(-(1 << 20), 1 << 20);
       test::FillFn<int32>(&tensor, [&](int i) -> int32 {
         int32 generated;
@@ -513,7 +513,7 @@ Tensor OpTest::RandomTensor(DataType dtype, bool needs_unique_values,
       break;
     }
     case DT_INT64: {
-      gtl::FlatSet<int64> already_generated;
+      absl::flat_hash_set<int64> already_generated;
       std::uniform_int_distribution<int64> distribution(-(1LL << 40),
                                                         1LL << 40);
       test::FillFn<int64>(&tensor, [&](int i) -> int64 {
@@ -527,7 +527,7 @@ Tensor OpTest::RandomTensor(DataType dtype, bool needs_unique_values,
       break;
     }
     case DT_BOOL: {
-      gtl::FlatSet<bool> already_generated;
+      absl::flat_hash_set<bool> already_generated;
       std::bernoulli_distribution distribution;
       test::FillFn<bool>(&tensor, [&](int i) -> bool {
         bool generated;
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 1191cff109..dc097f3696 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -221,6 +221,7 @@ cc_library(
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index 5277de6a85..e0ec91dba1 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
@@ -33,7 +34,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/mutex.h"
 
 namespace xla {
@@ -2290,7 +2290,7 @@ StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
   // also a valid dependency order). The related ops will be added to the
   // subgraph in the same order.
   std::set<int64> related_ops;
-  tensorflow::gtl::FlatSet<int64> related_calls;  // Related computations.
+  absl::flat_hash_set<int64> related_calls;  // Related computations.
   std::queue<int64> worklist;
   worklist.push(root->id());
   related_ops.insert(root->id());
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index b7295e8a53..cd0d5ca5d3 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/padding.h"
@@ -35,7 +36,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stacktrace.h"
 #include "tensorflow/core/platform/types.h"
@@ -1035,7 +1035,7 @@ class XlaBuilder {
   std::map<int64, HloComputationProto> embedded_;
 
   // The unique parameter numbers.
-  tensorflow::gtl::FlatSet<int64> parameter_numbers_;
+  absl::flat_hash_set<int64> parameter_numbers_;
 
   // The metadata to attach to each op. This is structured as a "modal"-like
   // operation, in order to simplify client code (and not sprinkle this metadata
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 8da6364786..13803f5ebe 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -147,6 +147,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -183,6 +184,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
@@ -336,6 +338,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -490,6 +493,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -781,6 +785,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -959,6 +964,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -995,6 +1001,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
     ],
@@ -1043,6 +1050,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -1136,6 +1144,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -1230,6 +1239,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
@@ -1275,6 +1285,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -1348,6 +1359,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -1660,6 +1672,7 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
     ],
@@ -2064,6 +2077,7 @@ cc_library(
         ":logical_buffer",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -2099,6 +2113,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
@@ -2120,6 +2135,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -2203,6 +2219,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -2225,6 +2242,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
@@ -2286,6 +2304,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -2343,6 +2362,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -2370,6 +2390,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -2487,6 +2508,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -2616,6 +2638,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -2655,6 +2678,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
     ],
 )
@@ -2730,6 +2754,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -3300,6 +3325,7 @@ cc_library(
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
     ],
 )
@@ -3387,6 +3413,7 @@ cc_library(
         "//tensorflow/core:ptr_util",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
index 58f78f8e24..002be9c970 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/bfloat16_propagation.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -81,7 +82,7 @@ void BFloat16Propagation::RevertIfFusionInternalBF16Changes(
   };
 
   auto root = fusion->fused_instructions_computation()->root_instruction();
-  tensorflow::gtl::FlatSet<const HloValue*> changed_root_buffers;
+  absl::flat_hash_set<const HloValue*> changed_root_buffers;
 
   auto root_changes_it = changes_to_bf16_.find(root);
   if (root_changes_it != changes_to_bf16_.end()) {
@@ -500,7 +501,7 @@ void BFloat16Propagation::AdjustCalledComputationRoot(HloInstruction* hlo) {
 
 bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper(
     HloComputation* computation,
-    tensorflow::gtl::FlatSet<const HloComputation*>* visited_computations) {
+    absl::flat_hash_set<const HloComputation*>* visited_computations) {
   bool parameter_changed = false;
   auto insts = computation->MakeInstructionPostOrder();
   // Do the adjustment on each instruction in the computation in reverse
@@ -560,7 +561,7 @@ bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper(
       // another input parameter. A fixed point will be reached because the
       // parameters can only be changed from BF16 to F32, not the other way
       // around.
-      tensorflow::gtl::FlatSet<const HloComputation*> visited_in_while;
+      absl::flat_hash_set<const HloComputation*> visited_in_while;
       while (ResolveInconsistencyOfAliasingBuffersHelper(hlo->while_condition(),
                                                          &visited_in_while) ||
              ResolveInconsistencyOfAliasingBuffersHelper(hlo->while_body(),
@@ -587,7 +588,7 @@ void BFloat16Propagation::ResolveInconsistencyOfAliasingBuffers(
     HloModule* module) {
   const auto& computations_topological_order =
       module->MakeComputationPostOrder();
-  tensorflow::gtl::FlatSet<const HloComputation*> resolved;
+  absl::flat_hash_set<const HloComputation*> resolved;
   for (auto comp_it = computations_topological_order.rbegin();
        comp_it != computations_topological_order.rend(); ++comp_it) {
     if (ContainsKey(resolved, *comp_it)) {
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.h b/tensorflow/compiler/xla/service/bfloat16_propagation.h
index c74326f631..5fcaa15c83 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.h
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/bfloat16_support.h"
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -82,7 +83,7 @@ class BFloat16Propagation : public HloModulePass {
 
   // The set of instructions to consider using bfloat16, computed in the forward
   // pass.
-  tensorflow::gtl::FlatSet<const HloInstruction*> consider_using_bfloat16_;
+  absl::flat_hash_set<const HloInstruction*> consider_using_bfloat16_;
 
   // ***************************
   // Functions called and state produced by the backward pass (from root to
@@ -111,12 +112,12 @@ class BFloat16Propagation : public HloModulePass {
 
   // The set of HloInstructions that have been visited in the
   // opportunity-finding pass.
-  tensorflow::gtl::FlatSet<const HloInstruction*>
+  absl::flat_hash_set<const HloInstruction*>
       instructions_visited_in_backward_pass_;
 
   // The set of HloComputations that have been visited in the
   // opportunity-finding pass.
-  tensorflow::gtl::FlatSet<const HloComputation*>
+  absl::flat_hash_set<const HloComputation*>
       computations_visited_in_backward_pass_;
 
   // ***************************
@@ -132,7 +133,7 @@ class BFloat16Propagation : public HloModulePass {
   // point is reached.
   bool ResolveInconsistencyOfAliasingBuffersHelper(
       HloComputation* computation,
-      tensorflow::gtl::FlatSet<const HloComputation*>* visited_computations);
+      absl::flat_hash_set<const HloComputation*>* visited_computations);
 
   // Makes the parameters of called computations match how they are called by
   // the given HLO.
@@ -183,7 +184,7 @@ class BFloat16Propagation : public HloModulePass {
                                       PrimitiveType target_type);
 
   // The set of F32 HLO values that must be kept in F32.
-  tensorflow::gtl::FlatSet<const HloValue*> values_that_must_be_kept_as_f32_;
+  absl::flat_hash_set<const HloValue*> values_that_must_be_kept_as_f32_;
 
   // Mapping from each HloComputation to the number of callers to it in the
   // module. Populated at the beginning of this pass.
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 3efa0b1dad..2c2d1626c2 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <utility>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -43,9 +44,9 @@ namespace xla {
 namespace {
 
 using absl::flat_hash_map;
+using absl::flat_hash_set;
 using absl::StrAppend;
 using absl::StrAppendFormat;
-using ::tensorflow::gtl::FlatSet;
 using ::tensorflow::strings::HumanReadableNumBytes;
 
 template <typename T>
@@ -129,8 +130,8 @@ Status GatherComputationsByAllocationType(
 
   // Sets for quickly checking membership. Computations are returned in vectors
   // for stable iteration.
-  FlatSet<const HloComputation*> thread_local_set;
-  FlatSet<const HloComputation*> global_set;
+  flat_hash_set<const HloComputation*> thread_local_set;
+  flat_hash_set<const HloComputation*> global_set;
 
   while (!worklist.empty()) {
     auto worklist_front = worklist.front();
@@ -445,7 +446,7 @@ bool BufferAssignment::SharesSliceAtIndex(
 bool BufferAssignment::HaveDisjointSlices(const HloInstruction* hlo_a,
                                           const HloInstruction* hlo_b) const {
   using SliceSet =
-      FlatSet<BufferAllocation::Slice, BufferAllocation::Slice::Hasher>;
+      flat_hash_set<BufferAllocation::Slice, BufferAllocation::Slice::Hasher>;
   // Gets the slices all of instr's subshapes.  If any subshape doesn't have an
   // assigned slice, returns the empty set.
   auto collect_slices = [&](const HloInstruction* instr) -> SliceSet {
@@ -815,9 +816,9 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
 
 Status BufferAssigner::AssignBuffersForComputation(
     const HloComputation* computation, bool is_thread_local,
-    const FlatSet<const LogicalBuffer*>& colocated_buffers,
-    const FlatSet<BufferAllocation::Index>& colocated_allocations,
-    flat_hash_map<const HloComputation*, FlatSet<const LogicalBuffer*>>*
+    const flat_hash_set<const LogicalBuffer*>& colocated_buffers,
+    const flat_hash_set<BufferAllocation::Index>& colocated_allocations,
+    flat_hash_map<const HloComputation*, flat_hash_set<const LogicalBuffer*>>*
         buffers_to_assign_sequentially,
     BufferAssignment* assignment) {
   // Buffers are sorted and assigned to BufferAllocations in decreasing order of
@@ -853,8 +854,8 @@ Status BufferAssigner::AssignBuffersForComputation(
     // buffers_to_assign_sequentially map, even if we end up with an empty set
     // of buffers. This ensures we can correctly determine whether to run
     // whole-module heap simulation.
-    buffers_to_assign_sequentially->emplace(computation,
-                                            FlatSet<const LogicalBuffer*>());
+    buffers_to_assign_sequentially->emplace(
+        computation, flat_hash_set<const LogicalBuffer*>());
   }
 
   // Sort the LogicalBuffers first by size. We assign the larger LogicalBuffers
@@ -1046,11 +1047,11 @@ Status BufferAssigner::AssignBuffersForComputation(
   return Status::OK();
 }
 
-flat_hash_map<LogicalBuffer::Color, FlatSet<const LogicalBuffer*>,
+flat_hash_map<LogicalBuffer::Color, flat_hash_set<const LogicalBuffer*>,
               LogicalBuffer::Color::Hasher>
 BufferAssigner::SplitBuffersByColor(
-    const FlatSet<const LogicalBuffer*>& buffers) {
-  flat_hash_map<LogicalBuffer::Color, FlatSet<const LogicalBuffer*>,
+    const flat_hash_set<const LogicalBuffer*>& buffers) {
+  flat_hash_map<LogicalBuffer::Color, flat_hash_set<const LogicalBuffer*>,
                 LogicalBuffer::Color::Hasher>
       color_map;
   for (auto buffer : buffers) {
@@ -1060,7 +1061,8 @@ BufferAssigner::SplitBuffersByColor(
 }
 
 Status BufferAssigner::AssignBuffersWithSequentialOrdering(
-    const flat_hash_map<const HloComputation*, FlatSet<const LogicalBuffer*>>&
+    const flat_hash_map<const HloComputation*,
+                        flat_hash_set<const LogicalBuffer*>>&
         buffers_to_assign_sequentially,
     bool run_whole_module_heap_simulation, BufferAssignment* assignment) {
   // Run the sequence of instructions through the heap simulator.  The heuristic
@@ -1086,10 +1088,11 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
     // only live for the duration of their calling instructions.
     VLOG(1) << "Running whole-module heap simulation";
     HloSchedule schedule(&assignment->module());
-    FlatSet<const LogicalBuffer*> all_buffers_to_assign;
+    flat_hash_set<const LogicalBuffer*> all_buffers_to_assign;
     for (const auto& pair : buffers_to_assign_sequentially) {
       const HloComputation* computation = pair.first;
-      const FlatSet<const LogicalBuffer*>& buffers_to_assign = pair.second;
+      const flat_hash_set<const LogicalBuffer*>& buffers_to_assign =
+          pair.second;
       const std::vector<const HloInstruction*>* instruction_sequence =
           hlo_ordering.SequentialOrder(*computation);
       CHECK(instruction_sequence != nullptr) << computation->name();
@@ -1123,7 +1126,8 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
     VLOG(1) << "Running per-computation heap simulation";
     for (const auto& pair : buffers_to_assign_sequentially) {
       const HloComputation* computation = pair.first;
-      const FlatSet<const LogicalBuffer*>& buffers_to_assign = pair.second;
+      const flat_hash_set<const LogicalBuffer*>& buffers_to_assign =
+          pair.second;
       const std::vector<const HloInstruction*>* instruction_sequence =
           hlo_ordering.SequentialOrder(*computation);
       CHECK(instruction_sequence != nullptr) << computation->name();
@@ -1198,7 +1202,7 @@ std::vector<const LogicalBuffer*> ComputePeakMemoryLogicalBuffers(
 
   // Next gather the set of logical buffers live at the earliest point of
   // maximal live set size.
-  tensorflow::gtl::FlatSet<const LogicalBuffer*> live_buffers;
+  absl::flat_hash_set<const LogicalBuffer*> live_buffers;
   live_size = 0;
   for (const auto& event : heap_trace.events()) {
     const LogicalBuffer* buffer = id_to_buffer.at(event.buffer_id());
@@ -1588,8 +1592,8 @@ void BufferAssigner::BuildColocatedBufferSets(
 void BufferAssigner::AssignColocatedBufferSets(
     const std::vector<ColocatedBufferSet>& colocated_buffer_sets,
     BufferAssignment* assignment,
-    FlatSet<const LogicalBuffer*>* colocated_buffers,
-    FlatSet<BufferAllocation::Index>* colocated_allocations) {
+    flat_hash_set<const LogicalBuffer*>* colocated_buffers,
+    flat_hash_set<BufferAllocation::Index>* colocated_allocations) {
   for (const ColocatedBufferSet& colocated_buffer_set : colocated_buffer_sets) {
     BufferAllocation* allocation = nullptr;
     // Set 'entry_parameter_number' and 'entry_parameter_shape_idx' if entry
@@ -1662,8 +1666,8 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
   // Once b/32491382 enables module-level liveness analysis, we may be able
   // to assign colocated buffers (or at least reuse their allocation for
   // buffers outside of the set) in AssignBuffersForComputation.
-  FlatSet<const LogicalBuffer*> colocated_buffers;
-  FlatSet<BufferAllocation::Index> colocated_allocations;
+  flat_hash_set<const LogicalBuffer*> colocated_buffers;
+  flat_hash_set<BufferAllocation::Index> colocated_allocations;
   std::vector<ColocatedBufferSet> colocated_buffer_sets;
   BuildColocatedBufferSets(module, assignment->liveness(),
                            assignment->buffer_size_, &colocated_buffer_sets);
@@ -1681,7 +1685,7 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
 
   // First assign buffers for global computatations. Temporary buffers for
   // sequential computations are collected in 'buffers_to_assign_sequentially'.
-  flat_hash_map<const HloComputation*, FlatSet<const LogicalBuffer*>>
+  flat_hash_map<const HloComputation*, flat_hash_set<const LogicalBuffer*>>
       buffers_to_assign_sequentially;
   for (auto* computation : global_computations) {
     TF_RETURN_IF_ERROR(AssignBuffersForComputation(
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 9ba40617a3..899cd36e1f 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/heap_simulator.h"
@@ -34,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -554,11 +554,10 @@ class BufferAssigner {
   // true.
   Status AssignBuffersForComputation(
       const HloComputation* computation, bool is_thread_local,
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>& colocated_buffers,
-      const tensorflow::gtl::FlatSet<BufferAllocation::Index>&
-          colocated_allocations,
+      const absl::flat_hash_set<const LogicalBuffer*>& colocated_buffers,
+      const absl::flat_hash_set<BufferAllocation::Index>& colocated_allocations,
       absl::flat_hash_map<const HloComputation*,
-                          tensorflow::gtl::FlatSet<const LogicalBuffer*>>*
+                          absl::flat_hash_set<const LogicalBuffer*>>*
           buffers_to_assign_sequentially,
       BufferAssignment* assignment);
 
@@ -569,7 +568,7 @@ class BufferAssigner {
   // assuming all global computations are sequentially ordered.
   Status AssignBuffersWithSequentialOrdering(
       const absl::flat_hash_map<const HloComputation*,
-                                tensorflow::gtl::FlatSet<const LogicalBuffer*>>&
+                                absl::flat_hash_set<const LogicalBuffer*>>&
           buffers_to_assign_sequentially,
       bool run_whole_module_heap_simulation, BufferAssignment* assignment);
 
@@ -589,7 +588,7 @@ class BufferAssigner {
   // alias. Explicitly handling these colocated buffers is necessary because
   // points-to analysis is computation level scope and does not recognize
   // aliasing across computations (b/32491382).
-  using ColocatedBufferSet = tensorflow::gtl::FlatSet<const LogicalBuffer*>;
+  using ColocatedBufferSet = absl::flat_hash_set<const LogicalBuffer*>;
 
   // Returns a vector of ColocatedBufferSet objects, where each
   // ColocatedBufferSet aggregates a set of related LogicalBuffers from 'module'
@@ -604,8 +603,8 @@ class BufferAssigner {
   void AssignColocatedBufferSets(
       const std::vector<ColocatedBufferSet>& colocated_buffer_sets,
       BufferAssignment* assignment,
-      tensorflow::gtl::FlatSet<const LogicalBuffer*>* colocated_buffers,
-      tensorflow::gtl::FlatSet<BufferAllocation::Index>* colocated_allocations);
+      absl::flat_hash_set<const LogicalBuffer*>* colocated_buffers,
+      absl::flat_hash_set<BufferAllocation::Index>* colocated_allocations);
 
   // Adds the 'colocated_set' of buffers to 'colocated_buffer_sets', maintaining
   // the invariant that all sets in 'colocated_buffer_sets' are disjoint.
@@ -624,10 +623,9 @@ class BufferAssigner {
   // Split a set of buffers into several sets, each of which contains buffers
   // colored with the same color.
   absl::flat_hash_map<LogicalBuffer::Color,
-                      tensorflow::gtl::FlatSet<const LogicalBuffer*>,
+                      absl::flat_hash_set<const LogicalBuffer*>,
                       LogicalBuffer::Color::Hasher>
-  SplitBuffersByColor(
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>& buffers);
+  SplitBuffersByColor(const absl::flat_hash_set<const LogicalBuffer*>& buffers);
 
   // If true, buffer assignments assumes that input parameter buffers and output
   // buffers can be shared if their sizes match.
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.h b/tensorflow/compiler/xla/service/buffer_liveness.h
index 2911bbcfbf..f939a426ea 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.h
+++ b/tensorflow/compiler/xla/service/buffer_liveness.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
@@ -27,7 +28,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
 
@@ -101,7 +101,7 @@ class BufferLiveness {
   // Set of LogicalBuffers which are aliased in the output of other
   // instructions. For example, a LogicalBuffer which is inserted into a tuple
   // is considered to be aliased and will be in this set.
-  tensorflow::gtl::FlatSet<const LogicalBuffer*> aliased_buffers_;
+  absl::flat_hash_set<const LogicalBuffer*> aliased_buffers_;
 
   // LogicalBuffers that may be live out of the entry computation.
   PointsToSet::BufferSet maybe_live_out_buffers_;
diff --git a/tensorflow/compiler/xla/service/buffer_value_containers.h b/tensorflow/compiler/xla/service/buffer_value_containers.h
index 305914fca8..cc46af5eee 100644
--- a/tensorflow/compiler/xla/service/buffer_value_containers.h
+++ b/tensorflow/compiler/xla/service/buffer_value_containers.h
@@ -16,10 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_VALUE_CONTAINERS_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_VALUE_CONTAINERS_H_
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/core/lib/gtl/compactptrset.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
 
@@ -38,7 +38,7 @@ BufferValueCompactPointerSet ToBufferValueCompactPointerSet(
   return output;
 }
 
-using BufferValueFlatSet = tensorflow::gtl::FlatSet<const BufferValue*>;
+using BufferValueFlatSet = absl::flat_hash_set<const BufferValue*>;
 template <class LogicalBufferContainerT>
 BufferValueFlatSet ToBufferValueFlatSet(
     const LogicalBufferContainerT& logical_buffer_container) {
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 23b2a32709..bdd5069632 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <queue>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -138,7 +139,7 @@ CallGraphNode& CallGraph::GetNode(const HloComputation* computation) {
 
 bool CallGraph::DominatesHelper(
     const HloComputation* a, const HloComputation* b,
-    tensorflow::gtl::FlatSet<const HloComputation*>* visited) const {
+    absl::flat_hash_set<const HloComputation*>* visited) const {
   if (a == b || ContainsKey(*visited, b)) {
     // The call graph is guaranteed to be acyclic so any previously visited node
     // we encounter was already determined to be dominated.
@@ -163,7 +164,7 @@ bool CallGraph::DominatesHelper(
 
 bool CallGraph::Dominates(const HloComputation* a,
                           const HloComputation* b) const {
-  tensorflow::gtl::FlatSet<const HloComputation*> visited;
+  absl::flat_hash_set<const HloComputation*> visited;
   return DominatesHelper(a, b, &visited);
 }
 
@@ -277,7 +278,7 @@ std::unique_ptr<CallGraph> CallGraph::Build(const HloModule* module) {
 
 Status CallGraph::VisitNodesInternal(
     const VisitorFunction& visitor_func, const CallGraphNode& node,
-    tensorflow::gtl::FlatSet<const CallGraphNode*>* visited) const {
+    absl::flat_hash_set<const CallGraphNode*>* visited) const {
   auto pair = visited->insert(&node);
   if (!pair.second) {
     // Node was not inserted. Node has already been visited.
@@ -294,7 +295,7 @@ Status CallGraph::VisitNodesInternal(
 
 Status CallGraph::VisitNodes(const VisitorFunction& visitor_func,
                              bool visit_unreachable_nodes) const {
-  tensorflow::gtl::FlatSet<const CallGraphNode*> visited;
+  absl::flat_hash_set<const CallGraphNode*> visited;
   if (visit_unreachable_nodes) {
     // Traverse from all roots in the call graph.
     for (const CallGraphNode& node : nodes()) {
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index 0c2e9b99db..cb56f4789d 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -21,10 +21,10 @@ limitations under the License.
 #include <ostream>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
 
@@ -145,12 +145,12 @@ class CallGraphNode {
   // The computations called by this computation. The vector is used for a
   // stable ordering and the set enables fast membership testing.
   std::vector<HloComputation*> callees_;
-  tensorflow::gtl::FlatSet<HloComputation*> callee_set_;
+  absl::flat_hash_set<HloComputation*> callee_set_;
 
   // The computations which call this computation. The vector is used for a
   // stable ordering and the set enables fast membership testing.
   std::vector<HloComputation*> callers_;
-  tensorflow::gtl::FlatSet<HloComputation*> caller_set_;
+  absl::flat_hash_set<HloComputation*> caller_set_;
 
   // The call sites in this computation
   std::vector<CallSite> callsites_;
@@ -250,14 +250,14 @@ class CallGraph {
   // 'visited'.
   Status VisitNodesInternal(
       const VisitorFunction& visitor_func, const CallGraphNode& node,
-      tensorflow::gtl::FlatSet<const CallGraphNode*>* visited) const;
+      absl::flat_hash_set<const CallGraphNode*>* visited) const;
 
   // Recursive helper for computing whether 'a' dominates 'b' in the call
   // graph. 'b_ancestor' is the currently visited node (which starts at 'b'),
   // and 'visited' is the set of computations which have been visited.
   bool DominatesHelper(
       const HloComputation* a, const HloComputation* b,
-      tensorflow::gtl::FlatSet<const HloComputation*>* visited) const;
+      absl::flat_hash_set<const HloComputation*>* visited) const;
 
   // The HLO module represented by this call graph.
   const HloModule* module_ = nullptr;
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 7f78412924..f35324aa35 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -904,7 +904,7 @@ class CopyRemover {
     // The heads of all the value lists. Each value list represents the HLO
     // values contained in a particular HLO buffer. The values in the list are
     // in dependency order.
-    tensorflow::gtl::FlatSet<const ValueNode*> value_lists_;
+    absl::flat_hash_set<const ValueNode*> value_lists_;
 
     // Copy removal requires fast access to the value list elements
     // corresponding to the source and destination values of the kCopy
@@ -1009,7 +1009,7 @@ Status CopyInsertion::AddSpecialCaseCopies(const CallGraph& call_graph,
     HloInstruction* root = computation->root_instruction();
 
     // Mark nondistinct/ambiguous indices.
-    tensorflow::gtl::FlatSet<const HloBuffer*> seen;
+    absl::flat_hash_set<const HloBuffer*> seen;
     ShapeUtil::ForEachSubshape(
         root->shape(), [&](const Shape& /*subshape*/, const ShapeIndex& index) {
           std::vector<const HloBuffer*> buffers_at_index =
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 6a83909a3b..ae4c6e962d 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -291,6 +291,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 953a75c35f..a70abb117a 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/logging.h"
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
@@ -68,7 +69,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
 
@@ -1400,8 +1400,8 @@ static bool ReductionPreservesLayout(const HloInstruction& reduce) {
   // [0->0, 3->1].
   absl::flat_hash_map<int64, int64> unreduced_dim_map;
 
-  gtl::FlatSet<int64> reduced_dims(reduce.dimensions().begin(),
-                                   reduce.dimensions().end());
+  absl::flat_hash_set<int64> reduced_dims(reduce.dimensions().begin(),
+                                          reduce.dimensions().end());
 
   const Shape& operand_shape = reduce.operand(0)->shape();
   const Shape& result_shape = reduce.shape();
@@ -1977,7 +1977,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
   //
   // * Implement the memcpy within the innermost loop.
 
-  gtl::FlatSet<int64> inner_dims;
+  absl::flat_hash_set<int64> inner_dims;
   for (int64 dim : LayoutUtil::MinorToMajor(layout)) {
     if (operand->shape().dimensions(dim) != slice->shape().dimensions(dim)) {
       break;
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
index 7af51db55a..b35fd9dad8 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
@@ -121,7 +121,7 @@ TEST_F(CpuNoAliasTest, Concat) {
     CHECK: %read_concat2_array = load {{.*}} !alias.scope [[concat1_noalias]], !noalias [[concat1_scope]]
     CHECK-DAG: [[buf_size32:![0-9]+]] = !{!"buffer:{{.*}} size:32
     CHECK-DAG: [[buf_size48:![0-9]+]] = !{!"buffer:{{.*}} size:48
-    CHECK-DAG: [[param_x_noalias]] = !{[[buf_size32]], [[buf_size48]]}
+    CHECK-DAG: [[param_x_noalias]] = !{[[buf_size48]], [[buf_size32]]}
     CHECK-DAG: [[concat1_scope]] = !{[[buf_size32]]}
     CHECK-DAG: [[concat1_noalias]] = !{[[buf_size48]]}
   )";
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index e65d3fa332..a838464cae 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -476,6 +476,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:instruction_fusion",
         "//tensorflow/compiler/xla/service:pattern_matcher",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -508,6 +509,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:multi_output_fusion",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -541,6 +543,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_dataflow_analysis",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
index 79c74e7e8b..e2ab00ce41 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <set>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
@@ -27,7 +28,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 4d5d8e99f8..b61f038739 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -125,8 +126,8 @@ bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) {
   }
 
   // Compute the precise number of operands to the new fusion.
-  tensorflow::gtl::FlatSet<const HloInstruction*> operands(
-      a->operands().begin(), a->operands().end());
+  absl::flat_hash_set<const HloInstruction*> operands(a->operands().begin(),
+                                                      a->operands().end());
   operands.insert(b->operands().begin(), b->operands().end());
   // If there's an edge between `a` and `b`, don't count it: We're fusing that
   // producer -> consumer relationship.
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index c21f76f6eb..835924024b 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <utility>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
@@ -31,7 +32,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -101,7 +101,7 @@ bool GpuMultiOutputFusion::IsFusible(HloInstruction* instr) {
 
 int64 GpuMultiOutputFusion::GetProfit(HloInstruction* instr1,
                                       HloInstruction* instr2) {
-  tensorflow::gtl::FlatSet<HloInstruction*> in_list;
+  absl::flat_hash_set<HloInstruction*> in_list;
   for (auto instr : instr1->operands()) {
     if (!IsProfitableOperand(instr)) {
       continue;
@@ -148,7 +148,7 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
   bool changed = false;
   RecomputeReachability();
 
-  tensorflow::gtl::FlatSet<HloInstruction*> to_fuse;
+  absl::flat_hash_set<HloInstruction*> to_fuse;
   // Keep a list of the instructions to fuse after making all the fusion
   // decisions. We first aggressively add instructions to potential_fusion_list,
   // then filter out instructions that will be no longer fusible because of
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 147776c8c4..b343305554 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -26,7 +27,7 @@ limitations under the License.
 namespace xla {
 
 using absl::flat_hash_map;
-using tensorflow::gtl::FlatSet;
+using absl::flat_hash_set;
 
 /*static*/
 StatusOr<int64> HeapSimulator::MinimumMemoryForModule(
@@ -116,9 +117,9 @@ Status HeapSimulator::RunComputation(
   // 'used_buffers' is the reverse map - it tracks which buffers were used by an
   // instruction, so that we can remove the instructions from a buffer's live
   // set after they are visited.
-  flat_hash_map<const BufferValue*, FlatSet<const HloInstruction*>>
+  flat_hash_map<const BufferValue*, flat_hash_set<const HloInstruction*>>
       live_buffers;
-  flat_hash_map<const HloInstruction*, FlatSet<const BufferValue*>>
+  flat_hash_map<const HloInstruction*, flat_hash_set<const BufferValue*>>
       used_buffers;
   auto add_user_to_buffer = [this, &live_buffers, &used_buffers](
                                 const HloInstruction* user,
@@ -216,7 +217,7 @@ Status HeapSimulator::RunComputation(
       VLOG(4) << "  Removing user " << instruction->name() << " from buffer "
               << operand_buffer->ToString();
       auto it = live_buffers.find(operand_buffer);
-      FlatSet<const HloInstruction*>* live_set = &it->second;
+      flat_hash_set<const HloInstruction*>* live_set = &it->second;
       live_set->erase(instruction);
       if (live_set->empty()) {
         live_buffers.erase(it);
@@ -238,7 +239,7 @@ Status HeapSimulator::RunComputation(
     // that we should assign.
 
     // Make sure each buffer get reused at most once.
-    FlatSet<const BufferValue*> reused_buffers;
+    flat_hash_set<const BufferValue*> reused_buffers;
     for (const BufferValue* buffer : buffers_defined_by_instruction) {
       if (IgnoreBuffer(buffer)) {
         continue;
@@ -326,7 +327,7 @@ Status HeapSimulator::RunComputation(
   to_free.reserve(live_buffers.size());
   for (const auto& buffer_pending : live_buffers) {
     const BufferValue* buffer = buffer_pending.first;
-    const FlatSet<const HloInstruction*>& pending = buffer_pending.second;
+    const flat_hash_set<const HloInstruction*>& pending = buffer_pending.second;
     CHECK_EQ(pending.size(), 1) << *buffer;
     CHECK(*pending.begin() == nullptr) << *buffer;
     to_free.push_back(buffer);
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index a5bb3f81f7..b0295a6163 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
 #include "tensorflow/compiler/xla/service/buffer_value_containers.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
@@ -31,7 +32,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
 
@@ -197,8 +197,8 @@ class HeapSimulator {
       shared_buffers_;
 
   // Hold some sets for error-checking the sequence of Alloc and Free calls.
-  tensorflow::gtl::FlatSet<const BufferValue*> allocated_buffers_;
-  tensorflow::gtl::FlatSet<const BufferValue*> freed_buffers_;
+  absl::flat_hash_set<const BufferValue*> allocated_buffers_;
+  absl::flat_hash_set<const BufferValue*> freed_buffers_;
 
   // Debugging information filled in while the heap simulator runs.
   HeapSimulatorTrace debug_trace_;
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index b6e1f52cf5..c3da12e273 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -120,7 +121,7 @@ class BufferValueMap {
   }
 
   // Return a set of all the values in the given buffer.
-  const tensorflow::gtl::FlatSet<const HloValue*>& GetValuesInBuffer(
+  const absl::flat_hash_set<const HloValue*>& GetValuesInBuffer(
       BufferNumber buffer_number) const {
     return buffers_.at(buffer_number);
   }
@@ -143,7 +144,7 @@ class BufferValueMap {
   // Move the given value into the given buffer.
   void MoveValueToBuffer(const HloValue& value, BufferNumber buffer_number) {
     BufferNumber old_buffer_number = value_to_buffer_number_.at(&value);
-    tensorflow::gtl::FlatSet<const HloValue*>& old_value_set =
+    absl::flat_hash_set<const HloValue*>& old_value_set =
         buffers_.at(old_buffer_number);
     old_value_set.erase(&value);
     if (old_value_set.empty()) {
@@ -291,7 +292,7 @@ class BufferValueMap {
   const HloDataflowAnalysis& dataflow_;
 
   // A map containing the set of values contained in each buffer.
-  absl::flat_hash_map<BufferNumber, tensorflow::gtl::FlatSet<const HloValue*>>
+  absl::flat_hash_map<BufferNumber, absl::flat_hash_set<const HloValue*>>
       buffers_;
 
   // A map indicating which buffer each value is contained in.
@@ -351,7 +352,7 @@ bool HloAliasAnalysis::InstructionBuffersAreAmbiguous(
 
 bool HloAliasAnalysis::InstructionBuffersAreDistinct(
     const HloInstruction* instruction) const {
-  tensorflow::gtl::FlatSet<const HloBuffer*> buffers_seen;
+  absl::flat_hash_set<const HloBuffer*> buffers_seen;
   for (const auto& pair :
        dataflow_analysis_->GetInstructionValueSet(instruction)) {
     const HloValueSet& value_set = pair.second;
diff --git a/tensorflow/compiler/xla/service/hlo_buffer.cc b/tensorflow/compiler/xla/service/hlo_buffer.cc
index 6c11a073b7..9c3aa0e64d 100644
--- a/tensorflow/compiler/xla/service/hlo_buffer.cc
+++ b/tensorflow/compiler/xla/service/hlo_buffer.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 257dd5876f..6ef67ab0a8 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -25,6 +25,7 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
@@ -40,7 +41,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -278,10 +278,9 @@ void HloComputation::set_root_instruction(HloInstruction* new_root_instruction,
 namespace {
 
 // Helper which builds a post order of the HLO call graph.
-void ComputeComputationPostOrder(
-    HloComputation* computation,
-    tensorflow::gtl::FlatSet<HloComputation*>* visited,
-    std::vector<HloComputation*>* post_order) {
+void ComputeComputationPostOrder(HloComputation* computation,
+                                 absl::flat_hash_set<HloComputation*>* visited,
+                                 std::vector<HloComputation*>* post_order) {
   if (visited->insert(computation).second) {
     for (auto* instruction : computation->instructions()) {
       for (HloComputation* called_computation :
@@ -416,7 +415,7 @@ std::vector<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
 
 std::vector<HloComputation*> HloComputation::MakeEmbeddedComputationsList()
     const {
-  tensorflow::gtl::FlatSet<HloComputation*> visited;
+  absl::flat_hash_set<HloComputation*> visited;
   std::vector<HloComputation*> post_order;
 
   // To avoid special handling of this computation, cast away const of
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index af929ac009..d87ab4bda1 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/iterator_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -41,7 +42,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index b59c9ba3ed..e602107cbe 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
@@ -34,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
 namespace xla {
@@ -137,8 +137,8 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
     // HLO instructions are grouped into equivalency classes by using the
     // cse_equal predicate defined above. This set holds a representative
     // instruction for each class.
-    tensorflow::gtl::FlatSet<HloInstruction*, decltype(&CseHash),
-                             decltype(cse_equal)>
+    absl::flat_hash_set<HloInstruction*, decltype(&CseHash),
+                        decltype(cse_equal)>
         representatives(/*N=*/computation->instruction_count() + 1, &CseHash,
                         cse_equal);
     for (auto instruction : computation->MakeInstructionPostOrder()) {
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 6a63681996..44cde4a3d2 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <queue>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
@@ -91,7 +92,7 @@ HloDataflowAnalysis::HloDataflowAnalysis(
 
 bool HloDataflowAnalysis::AreTransitiveUsesElementwiseOrTuple(
     const HloInstruction* inst) {
-  tensorflow::gtl::FlatSet<const HloInstruction*> visited;
+  absl::flat_hash_set<const HloInstruction*> visited;
   absl::InlinedVector<const HloInstruction*, 4> stack;
   stack.push_back(inst);
   while (!stack.empty()) {
@@ -159,8 +160,8 @@ void HloDataflowAnalysis::MarkValueForDeletion(HloValue::Id value_id) {
 void HloDataflowAnalysis::DeleteMarkedValues() {
 #ifndef NDEBUG
   // Verify that no marked-for-deletion values are in any of the value sets.
-  tensorflow::gtl::FlatSet<HloValue::Id> id_set(value_ids_to_delete_.begin(),
-                                                value_ids_to_delete_.end());
+  absl::flat_hash_set<HloValue::Id> id_set(value_ids_to_delete_.begin(),
+                                           value_ids_to_delete_.end());
   for (const auto& pair : value_sets_) {
     const HloInstruction* instruction = pair.first;
     const InstructionValueSet& instruction_value_set = pair.second;
@@ -673,7 +674,7 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet(
 
 void HloDataflowAnalysis::Propagate() {
   std::queue<HloInstruction*> worklist;
-  tensorflow::gtl::FlatSet<HloInstruction*> workset;
+  absl::flat_hash_set<HloInstruction*> workset;
   auto add_to_worklist = [&worklist, &workset](HloInstruction* instruction) {
     if (workset.insert(instruction).second) {
       worklist.push(instruction);
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc
index 159c39d557..6ca1255ede 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -217,7 +218,7 @@ bool HloDomainMap::IsDomainInstruction(HloInstruction* instruction) const {
 
 /* static */ std::vector<HloInstruction*>
 HloDomainMap::MakeNonDomainInstructions(
-    const tensorflow::gtl::FlatSet<HloInstruction*>& instruction_set,
+    const absl::flat_hash_set<HloInstruction*>& instruction_set,
     const InstructionOrderMap& instructions_order) {
   std::vector<HloInstruction*> instructions;
   instructions.reserve(instruction_set.size());
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.h b/tensorflow/compiler/xla/service/hlo_domain_map.h
index 8584bc021d..c8d581b746 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.h
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.h
@@ -20,13 +20,13 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
 
@@ -110,7 +110,7 @@ class HloDomainMap {
   // Out of an instruction set, returns a vector of all the ones which are not
   // a kDomain kind.
   static std::vector<HloInstruction*> MakeNonDomainInstructions(
-      const tensorflow::gtl::FlatSet<HloInstruction*>& instruction_set,
+      const absl::flat_hash_set<HloInstruction*>& instruction_set,
       const InstructionOrderMap& instructions_order);
 
   // Populates domain_metadata_id_ that maps each HloInstruction to the unique
diff --git a/tensorflow/compiler/xla/service/hlo_domain_metadata.h b/tensorflow/compiler/xla/service/hlo_domain_metadata.h
index 302807f816..d3c83c15ae 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_domain_metadata.h
@@ -20,11 +20,11 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
 
@@ -42,7 +42,7 @@ class DomainMetadata {
     // operand/user pathways, without crossing a kDomain instruction of a given
     // kind. The reach_set can contain kDomain instructions of other kinds, if
     // two domains of different kind intersect each other.
-    tensorflow::gtl::FlatSet<HloInstruction*> reach_set;
+    absl::flat_hash_set<HloInstruction*> reach_set;
 
     // The same instructions in reach_set, but purged from kDomain instructions
     // and ordered according to their computation graph post-order, i.e.
@@ -55,8 +55,8 @@ class DomainMetadata {
     // whose dataflow enters the reach set (domain), while the exit_domains
     // contains the set of kDomain instructions whose dataflow exit the reach
     // set.
-    tensorflow::gtl::FlatSet<HloInstruction*> enter_domains;
-    tensorflow::gtl::FlatSet<HloInstruction*> exit_domains;
+    absl::flat_hash_set<HloInstruction*> enter_domains;
+    absl::flat_hash_set<HloInstruction*> exit_domains;
   };
 
   virtual ~DomainMetadata() = default;
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 5d5c9c7e58..0207f9ae3f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/ascii.h"
@@ -44,7 +45,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/human_readable_json.h"
 #include "tensorflow/core/platform/logging.h"
@@ -1433,7 +1433,7 @@ int64 HloInstruction::operand_index(const HloInstruction* target) const {
 
 HloInstruction::InstructionVector HloInstruction::unique_operands() const {
   InstructionVector unique;
-  tensorflow::gtl::FlatSet<const HloInstruction*> seen;
+  absl::flat_hash_set<const HloInstruction*> seen;
   for (HloInstruction* operand : operands()) {
     if (seen.insert(operand).second) {
       unique.push_back(operand);
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
index 1c2b2868fd..55314d0ae9 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/heap_simulator.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
@@ -111,7 +112,7 @@ class ListScheduler {
     // LogicalBuffer is in an operand of the instruction as indicated by
     // points-to analysis.
     for (auto* instruction : computation.instructions()) {
-      tensorflow::gtl::FlatSet<const LogicalBuffer*> instr_uses;
+      absl::flat_hash_set<const LogicalBuffer*> instr_uses;
       for (auto* operand : instruction->operands()) {
         points_to_analysis.GetPointsToSet(operand).ForEachElement(
             [&](const ShapeIndex& /*index*/,
@@ -360,7 +361,7 @@ class ListScheduler {
   std::unordered_map<const LogicalBuffer*, int64> unscheduled_use_count_;
 
   // Set of instructions which have been scheduled.
-  tensorflow::gtl::FlatSet<const HloInstruction*> scheduled_instructions_;
+  absl::flat_hash_set<const HloInstruction*> scheduled_instructions_;
 };
 
 int64 SumLogicalBufferSizes(
@@ -418,7 +419,7 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
         points_to_analysis.GetBuffersDefinedByInstruction(hlo), size_function);
     total_sizes[hlo] = logical_buffer_size;
     cumulative_total_size += logical_buffer_size;
-    tensorflow::gtl::FlatSet<const HloInstruction*> unique_operands(
+    absl::flat_hash_set<const HloInstruction*> unique_operands(
         hlo->operands().begin(), hlo->operands().end());
     for (const HloInstruction* operand : unique_operands) {
       extra_users[hlo] += extra_users[operand];
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 9359e9a8be..7527e35c95 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -24,6 +24,7 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -328,10 +329,10 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
 
   // Because we didn't uniquify the names or the ids, double-check that the
   // instruction and computation names and ids are unique from the proto.
-  tensorflow::gtl::FlatSet<string> computation_names;
-  tensorflow::gtl::FlatSet<string> instruction_names;
-  tensorflow::gtl::FlatSet<int> computation_ids;
-  tensorflow::gtl::FlatSet<int> instruction_ids;
+  absl::flat_hash_set<string> computation_names;
+  absl::flat_hash_set<string> instruction_names;
+  absl::flat_hash_set<int> computation_ids;
+  absl::flat_hash_set<int> instruction_ids;
   for (HloComputation* computation : module->computations()) {
     TF_RET_CHECK(!ContainsKey(computation_names, computation->name()))
         << "Computation name is not unique: " << computation->name();
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.cc b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
index d83ee71490..fddeb5f0a2 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -42,7 +42,7 @@ std::vector<HloInstruction*> HloModuleGroupUtil::GlobalPredecessors(
     HloInstruction* instruction) {
   std::vector<HloInstruction*>
       predecessors;  // Use a vector to avoid non-determinism.
-  tensorflow::gtl::FlatSet<HloInstruction*> unique;
+  absl::flat_hash_set<HloInstruction*> unique;
 
   // Adds to the unique predecessors list; if the predecessors is a companion
   // instruction, also add companion instructions; if the predecessors is a
@@ -119,7 +119,7 @@ std::vector<HloInstruction*> HloModuleGroupUtil::GlobalSuccessors(
     HloInstruction* instruction) {
   std::vector<HloInstruction*>
       successors;  // Use a vector to avoid non-determinism.
-  tensorflow::gtl::FlatSet<HloInstruction*> unique;
+  absl::flat_hash_set<HloInstruction*> unique;
 
   // Adds to the unique successors list; if the successor is a companion
   // instruction, also add companion instructions; if the successor is a
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 59fd01cb58..5e004ce78a 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <functional>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -75,8 +75,8 @@ StatusOr<bool> HloPassPipeline::RunPassesInternal(
 std::vector<HloPassInterface*> HloPassPipeline::GetEnabledPasses(
     const DebugOptions& debug_options) {
   auto repeated_field = debug_options.xla_disable_hlo_passes();
-  tensorflow::gtl::FlatSet<string> disabled_pass_names(repeated_field.begin(),
-                                                       repeated_field.end());
+  absl::flat_hash_set<string> disabled_pass_names(repeated_field.begin(),
+                                                  repeated_field.end());
   if (!disabled_pass_names.empty()) {
     VLOG(1) << "Passes disabled by --xla_disable_hlo_passes: "
             << absl::StrJoin(disabled_pass_names, ", ");
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index abdd9a9212..5ac43808ee 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <string>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -981,7 +982,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
   // rematerialization is essentially a move). If the next rematerialization of
   // the instruction is also a move then the rematerialization is added to the
   // blacklist.
-  tensorflow::gtl::FlatSet<const HloInstruction*> remat_move_instructions;
+  absl::flat_hash_set<const HloInstruction*> remat_move_instructions;
 
   // The map from instructions to their rematerializable status.
   absl::flat_hash_map<const HloInstruction*, bool> remat_able;
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 5a02e3a8bb..70d83c04f0 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -16,6 +16,7 @@
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REMATERIALIZATION_H_
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -122,7 +123,7 @@ class HloRematerialization : public HloModulePass {
 
   // Set of computations which have had rematerialization
   // applied. Rematerialization is only applied once per computation.
-  tensorflow::gtl::FlatSet<const HloComputation*> rematerialized_computations_;
+  absl::flat_hash_set<const HloComputation*> rematerialized_computations_;
 
   // Count of the total instructions rematerialized.
   int64 instructions_rematerialized_ = 0;
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.cc b/tensorflow/compiler/xla/service/hlo_schedule.cc
index 7c5c98f04e..9972eb2077 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/hlo_schedule.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -119,7 +120,7 @@ Status HloSchedule::UpdateComputationSchedule(
   }
 
   // Set of all HloInstructions in the schedule.
-  tensorflow::gtl::FlatSet<int> ids_in_schedule;
+  absl::flat_hash_set<int> ids_in_schedule;
   for (int id : sequences_.at(computation->unique_id()).ids()) {
     InsertOrDie(&ids_in_schedule, id);
   }
@@ -210,7 +211,7 @@ Status HloSchedule::Update() {
   if (sequences_.size() > nonfusion_computations.size()) {
     // Schedule contains some computations which have been removed from the
     // HloModule. Remove them from the schedule as well.
-    tensorflow::gtl::FlatSet<int64> nonfusion_computations_ids;
+    absl::flat_hash_set<int64> nonfusion_computations_ids;
     for (const HloComputation* computation : nonfusion_computations) {
       nonfusion_computations_ids.insert(computation->unique_id());
     }
diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc
index 8549487702..59594ab2f0 100644
--- a/tensorflow/compiler/xla/service/hlo_value.cc
+++ b/tensorflow/compiler/xla/service/hlo_value.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <utility>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
@@ -31,7 +32,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -167,7 +167,7 @@ void HloValue::SetPositionsAndComputeUses(
   positions_.insert(positions_.end(), positions.begin(), positions.end());
 
   // Gather the computation roots at which this value appears.
-  tensorflow::gtl::FlatSet<HloInstruction*> root_positions;
+  absl::flat_hash_set<HloInstruction*> root_positions;
   for (const HloPosition& position : positions_) {
     if (position.instruction ==
         position.instruction->parent()->root_instruction()) {
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
index 7ee789276d..1ebb331977 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
@@ -24,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
 namespace gtl = ::tensorflow::gtl;
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 1591256fad..15f0adcaaf 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -39,7 +40,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -504,7 +504,7 @@ class LayoutAssignment : public HloModulePass {
 
   // Every copy added to the module by the layout assignment pass is registered
   // here.
-  tensorflow::gtl::FlatSet<HloInstruction*> added_copies_;
+  absl::flat_hash_set<HloInstruction*> added_copies_;
 
   // The pointer to the channel layout constraints passed in with the
   // constructor. If not nullptr, this is an input/output argument.
@@ -521,8 +521,7 @@ class LayoutAssignment : public HloModulePass {
 
   // The set of HLO instructions which lacked any layout constraint, thus
   // receiving propagated default layouts.
-  tensorflow::gtl::FlatSet<const HloInstruction*>
-      unconstrained_layout_instructions_;
+  absl::flat_hash_set<const HloInstruction*> unconstrained_layout_instructions_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 3934d2e493..6223a34b12 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -39,6 +39,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:logical_buffer",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@llvm//:core",
     ],
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
index e5370eca56..643ecd0fba 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h"
 
-#include <unordered_set>
+#include <map>
 
 #include "llvm/IR/MDBuilder.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
@@ -164,9 +164,7 @@ llvm::MDNode* AliasAnalysis::GetNoaliasMetadataForBuffer(
     add_buffers_to_worklist(operand);
   }
 
-  tensorflow::gtl::FlatSet<BufferAllocation::Slice,
-                           BufferAllocation::Slice::Hasher>
-      buffers;
+  std::set<BufferAllocation::Slice> buffers;
   for (const LogicalBuffer* buffer : worklist) {
     // Skip buffers which cannot be added to the noalias set.
     if (!assignment.HasAllocation(*buffer) ||
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
index 88cde2d3d9..2b46b3c396 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
 namespace llvm_ir {
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc
index 95b1c20663..2ca527bc4c 100644
--- a/tensorflow/compiler/xla/service/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/multi_output_fusion.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -50,7 +50,7 @@ StatusOr<bool> MultiOutputFusion::Run(HloModule* module) {
       all_fusion_candidates_.push_back(instruction);
 
       std::vector<HloInstruction*> candidates;
-      tensorflow::gtl::FlatSet<HloInstruction*> candidates_set;
+      absl::flat_hash_set<HloInstruction*> candidates_set;
       VLOG(10) << "Looking at instruction: " << instruction->name();
       for (auto operand : instruction->operands()) {
         // Filter out the non-interesting instructions -- they
@@ -172,7 +172,7 @@ void MultiOutputFusion::Update(HloInstruction* instr1, HloInstruction* instr2) {
   // Update the fusible list for fusion. Variable new_fusibles keeps
   // track of the new or changed entries.
   std::vector<std::pair<HloInstruction*, int64>> new_fusibles;
-  tensorflow::gtl::FlatSet<HloInstruction*> in_list;
+  absl::flat_hash_set<HloInstruction*> in_list;
   auto it = fusion_node.fusibles.begin();
   while (it != fusion_node.fusibles.end()) {
     HloInstruction* instr = it->first;
diff --git a/tensorflow/compiler/xla/service/name_uniquer.h b/tensorflow/compiler/xla/service/name_uniquer.h
index 1ac60f1cf4..8909d0f4fe 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.h
+++ b/tensorflow/compiler/xla/service/name_uniquer.h
@@ -19,9 +19,9 @@ limitations under the License.
 #include <string>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace xla {
@@ -69,7 +69,7 @@ class NameUniquer {
     int64 next_ = 0;
 
     // Set of all the identifiers which has been used.
-    tensorflow::gtl::FlatSet<int64> used_;
+    absl::flat_hash_set<int64> used_;
   };
 
   // The string to use to separate the prefix of the name from the uniquing
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 6ccea9d2b5..e379911462 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
@@ -33,7 +34,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -577,7 +577,7 @@ Status ValidateDotDimensionNumbers(
   // Check that dimension numbers are unique.
   auto dims_unique = [](absl::Span<const int64> contracting_dims,
                         absl::Span<const int64> batch_dims) -> bool {
-    tensorflow::gtl::FlatSet<int64> dim_set;
+    absl::flat_hash_set<int64> dim_set;
     auto is_unique = [&dim_set](int64 i) -> bool {
       return dim_set.insert(i).second;
     };
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 921a984589..56952e3ada 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -26,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -147,7 +147,7 @@ void ScopedShapedBuffer::Deallocate() {
   // Deallocate all non-null buffers. A buffer may appear in more than one spot
   // in the shape (eg, a tuple with a repeated element) so keep track of what
   // has been deallocated.
-  tensorflow::gtl::FlatSet<void*> deallocated_ptrs;
+  absl::flat_hash_set<void*> deallocated_ptrs;
   for (auto& pair : buffers_) {
     se::DeviceMemoryBase& memory_base = pair.second;
     if (!memory_base.is_null() &&
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index 78392d3bb2..64ad1dc80e 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -36,7 +36,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/compactptrset.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
index 2590473c77..9795b2830b 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
@@ -16,17 +16,17 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h"
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/service/tuple_util.h"
 #include "tensorflow/compiler/xla/service/while_util.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
 
 using absl::flat_hash_map;
+using absl::flat_hash_set;
 using absl::InlinedVector;
-using tensorflow::gtl::FlatSet;
 
 // Copies `to_hoist` to the computation containing `while_instr`, hoisting its
 // operands as needed.  All of its transitive operands are expected to be either
@@ -35,7 +35,7 @@ using tensorflow::gtl::FlatSet;
 // them into `hoisted_instructions`.
 static void CreateLoopInvariantCopy(
     flat_hash_map<HloInstruction*, HloInstruction*>* hoisted_instructions,
-    FlatSet<HloInstruction*>* unhoisted_invariant_instructions,
+    flat_hash_set<HloInstruction*>* unhoisted_invariant_instructions,
     HloInstruction* while_instr, HloInstruction* to_hoist) {
   HloComputation* parent_of_while = while_instr->parent();
   HloComputation* while_body = while_instr->while_body();
@@ -153,7 +153,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
   // unprofitable to be hoisted alone by NotWorthHoistingIndividually.  When we
   // hoist an instruction in this set, we move it from
   // unhoisted_invariant_instructions to hoisted_instructions.
-  FlatSet<HloInstruction*> unhoisted_invariant_instructions;
+  flat_hash_set<HloInstruction*> unhoisted_invariant_instructions;
 
   // Invariant GTE's axiomatically satisfy the constraints for
   // unhoisted_invariant_instructions -- they can be legally hoisted, but there
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index 07de8492ba..630d71e5ca 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/optional.h"
@@ -114,7 +115,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
     return false;
   }
 
-  tensorflow::gtl::FlatSet<int64> used_tuple_indices;
+  absl::flat_hash_set<int64> used_tuple_indices;
   for (HloComputation* comp : {while_body, while_cond}) {
     // The HLO verifier ensures that while_input's shape matches while_init's
     // shape, which we verified above is a tuple.
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 06b6330321..8a0ae33042 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -2146,11 +2146,11 @@ xla_test(
         ":test_utils",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index 181e5cbe29..bc433eac8f 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -145,7 +146,7 @@ ENTRY %sort.148.1589 (parameter.0: f32[1048576], parameter.1: s32[1048576]) -> (
   ASSERT_EQ(args.size(), 2);
   const Literal& key_arg = args[0];
 
-  tensorflow::gtl::FlatSet<uint32> key_set;
+  absl::flat_hash_set<uint32> key_set;
   for (const float& value : key_arg.data<float>()) {
     EXPECT_TRUE(key_set.insert(tensorflow::bit_cast<uint32>(value)).second);
   }
@@ -168,7 +169,7 @@ ENTRY %sort.148.1589 (parameter.0: s32[1048576], parameter.1: s32[1048576]) -> (
   ASSERT_EQ(args.size(), 2);
   const Literal& key_arg = args[0];
 
-  tensorflow::gtl::FlatSet<int32> key_set;
+  absl::flat_hash_set<int32> key_set;
   for (const int32& value : key_arg.data<int32>()) {
     EXPECT_TRUE(key_set.insert(tensorflow::bit_cast<uint32>(value)).second);
   }
-- 
GitLab


From 350388fca9cb9509962ff393a9d21fb2879c9179 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 1 Oct 2018 19:56:47 -0700
Subject: [PATCH 275/570] Add mode_override to the TPU embedding enqueue ops.
 This allows the mode to be overridden at runtime allowing dynamic switching
 between inference and training modes. Not fully implemented yet.

PiperOrigin-RevId: 215325071
---
 tensorflow/contrib/tpu/BUILD                  |   3 +
 .../contrib/tpu/ops/tpu_embedding_ops.cc      |  52 ++++--
 tensorflow/contrib/tpu/python/ops/tpu_ops.py  | 148 ++++++++++++++++++
 3 files changed, 186 insertions(+), 17 deletions(-)

diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 0c4bdab191..10ed1c2891 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -135,6 +135,9 @@ tf_gen_op_wrapper_py(
     name = "tpu_ops",
     hidden = [
         "SendTPUEmbeddingGradients",
+        "EnqueueTPUEmbeddingIntegerBatch",
+        "EnqueueTPUEmbeddingSparseBatch",
+        "EnqueueTPUEmbeddingSparseTensorBatch",
     ],
     deps = [
         ":cross_replica_ops_op_lib",
diff --git a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
index ef2f8dd36d..0ef29bdf73 100644
--- a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
+++ b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
@@ -335,7 +335,6 @@ void RegisterPerTableLoadAndRetrieveOps() {
     tpu::GradientAccumulationSupport grad_accum_support;
     TF_CHECK_OK(GetGradientAccumulationSupport(alg, &grad_accum_support));
     if (grad_accum_support == tpu::GradientAccumulationSupport::kSupported) {
-      // TODO(gkurian): Condition this on being used internally within Google.
       OpRegistry::Global()->Register(
           [alg](OpRegistrationData* op_reg_data) -> Status {
             return RegisterPerTableLoadOpsForAlgorithmBody(alg, true,
@@ -353,7 +352,6 @@ void RegisterPerTableLoadAndRetrieveOps() {
     tpu::GradientAccumulationSupport grad_accum_support;
     TF_CHECK_OK(GetGradientAccumulationSupport(alg, &grad_accum_support));
     if (grad_accum_support == tpu::GradientAccumulationSupport::kSupported) {
-      // TODO(gkurian): Condition this on being used internally within Google.
       OpRegistry::Global()->Register(
           [alg](OpRegistrationData* op_reg_data) -> Status {
             return RegisterPerTableRetrieveOpsForAlgorithmBody(alg, true,
@@ -366,7 +364,7 @@ void RegisterPerTableLoadAndRetrieveOps() {
 }  // namespace
 
 REGISTER_OP("RecvTPUEmbeddingActivations")
-    .Output("outputs: num_outputs * float")
+    .Output("outputs: num_outputs * float32")
     .Attr("num_outputs: int >= 1")
     .Attr("config: string")
     .SetIsStateful()
@@ -476,7 +474,8 @@ config: Serialized TPUEmbeddingConfiguration proto.
 
 REGISTER_OP("EnqueueTPUEmbeddingIntegerBatch")
     .Input("batch: N * int32")
-    .Attr("N: int")
+    .Input("mode_override: string")
+    .Attr("N: int >= 1")
     .Attr("device_ordinal: int = -1")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
@@ -485,6 +484,10 @@ An op that enqueues a list of input batch tensors to TPUEmbedding.
 
 batch: A list of 1D tensors, one for each embedding table, containing the
     indices into the tables.
+mode_override: A string input that overrides the mode specified in the
+    TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+    'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+    in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
 device_ordinal: The TPU device to use. Should be >= 0 and less than the number
     of TPU cores in the task on which the node is placed.
 )doc");
@@ -493,7 +496,8 @@ REGISTER_OP("EnqueueTPUEmbeddingSparseBatch")
     .Input("sample_indices: N * int32")
     .Input("embedding_indices: N * int32")
     .Input("aggregation_weights: N * float32")
-    .Attr("N: int")
+    .Input("mode_override: string")
+    .Attr("N: int >= 1")
     .Attr("device_ordinal: int = -1")
     .Attr("combiners: list(string) = []")
     .SetIsStateful()
@@ -523,14 +527,18 @@ The tensors at corresponding positions in the three input lists
 must have the same shape, i.e. rank 1 with dim_size() equal to the total
 number of lookups into the table described by the corresponding table_id.
 
-sample_indices: A list of Rank 1 Tensors specifying the training example and
+sample_indices: A list of rank 1 Tensors specifying the training example and
     feature to which the corresponding embedding_indices and aggregation_weights
     values belong. sample_indices[i] must equal b * nf + f, where nf is the
     number of features from the corresponding table, f is in [0, nf), and
     b is in [0, batch size).
-embedding_indices: A list of Rank 1 Tensors, indices into the embedding tables.
-aggregation_weights: A list of Rank 1 Tensors containing per sample -- i.e. per
+embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
+aggregation_weights: A list of rank 1 Tensors containing per sample -- i.e. per
     (training example, feature) -- aggregation weights.
+mode_override: A string input that overrides the mode specified in the
+    TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+    'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+    in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
 device_ordinal: The TPU device to use. Should be >= 0 and less than the number
     of TPU cores in the task on which the node is placed.
 combiners: A list of string scalars, one for each embedding table that specify
@@ -545,7 +553,8 @@ REGISTER_OP("EnqueueTPUEmbeddingSparseTensorBatch")
     .Input("sample_indices: N * int32")
     .Input("embedding_indices: N * int32")
     .Input("aggregation_weights: N * float32")
-    .Attr("N: int")
+    .Input("mode_override: string")
+    .Attr("N: int >= 1")
     .Attr("device_ordinal: int = -1")
     .Attr("combiners: list(string) = []")
     .Attr("table_ids: list(int)")
@@ -555,7 +564,7 @@ REGISTER_OP("EnqueueTPUEmbeddingSparseTensorBatch")
 This Op eases the porting of code that uses tf.nn.embedding_lookup_sparse().
 
 sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond
-to ith feature. table_ids[i] indicates which embedding table to look up ith
+to the ith feature. table_ids[i] indicates which embedding table to look up ith
 feature.
 
 The tensors at corresponding positions in the three input lists (sample_indices,
@@ -563,12 +572,18 @@ embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1
 with dim_size() equal to the total number of lookups into the table described by
 the corresponding feature.
 
-sample_indices: A list of Rank 1 Tensors, corresponds to sp_ids.indices[:,0] in
+sample_indices: A list of rank 1 Tensors specifying the training example to
+    which the corresponding embedding_indices and aggregation_weights values
+    belong. It corresponds to sp_ids.indices[:,0] in  embedding_lookup_sparse().
+embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
+    It corresponds to sp_ids.values in embedding_lookup_sparse().
+aggregation_weights: A list of rank 1 Tensors containing per training example
+    aggregation weights. It corresponds to sp_weights.values in
     embedding_lookup_sparse().
-embedding_indices: A list of Rank 1 Tensors, corresponds to sp_ids.values
-    in embedding_lookup_sparse().
-aggregation_weights: A list of Rank 1 Tensors, corresponds to sp_weights.values
-    in embedding_lookup_sparse().
+mode_override: A string input that overrides the mode specified in the
+    TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+    'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+    in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
 device_ordinal: The TPU device to use. Should be >= 0 and less than the number
     of TPU cores in the task on which the node is placed.
 combiners: A list of string scalars, one for each embedding table that specify
@@ -577,8 +592,11 @@ combiners: A list of string scalars, one for each embedding table that specify
     the sum of the weights be 0 for 'mean' or the sum of the squared weights be
     0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
     all tables.
-table_ids: A list of int. table_ids[i] indicates which embedding table to look
-    up ith feature in the list.
+table_ids: A list of integers specifying the identifier of the embedding table
+    (offset of TableDescriptor in the TPUEmbeddingConfiguration) to lookup the
+    corresponding input. The ith input is looked up using table_ids[i]. The size
+    of the table_ids list must be equal to that of sample_indices,
+    embedding_indices and aggregation_weights.
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index e2e4acadab..968adccf2b 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -227,6 +227,154 @@ if platform.system() != "Windows":
         inputs=inputs, learning_rates=learning_rates, config=config, name=name)
 
 
+  send_tpu_embedding_gradients.__doc__ = (
+      gen_tpu_ops._send_tpu_embedding_gradients.__doc__)
+
+  # pylint: disable=protected-access
+  def enqueue_tpu_embedding_integer_batch(batch,
+                                          device_ordinal,
+                                          mode_override=None,
+                                          name=None):
+    """A placeholder op for enqueueing embedding IDs to the TPU.
+
+    Args:
+      batch: A list of 1D tensors, one for each embedding table, containing the
+        indices into the tables.
+      device_ordinal: The TPU device to use. Should be >= 0 and less than the
+        number of TPU cores in the task on which the node is placed.
+      mode_override: A string input that overrides the mode specified in the
+        TPUEmbeddingConfiguration. Supported values are {'unspecified',
+        'inference', 'training', 'backward_pass_only'}. When set to
+        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
+        otherwise mode_override is used (optional).
+      name: A name for the operation (optional).
+
+    Returns:
+      An EnqueueTPUEmbeddingIntegerBatch operation.
+    """
+    if mode_override is None:
+      mode_override = "unspecified"
+    return gen_tpu_ops._enqueue_tpu_embedding_integer_batch(
+        batch=batch,
+        device_ordinal=device_ordinal,
+        mode_override=mode_override,
+        name=name)
+
+  enqueue_tpu_embedding_integer_batch.__doc__ = (
+      gen_tpu_ops._enqueue_tpu_embedding_integer_batch.__doc__)
+
+  # pylint: disable=protected-access
+  def enqueue_tpu_embedding_sparse_batch(sample_indices,
+                                         embedding_indices,
+                                         aggregation_weights,
+                                         device_ordinal,
+                                         combiners=None,
+                                         mode_override=None,
+                                         name=None):
+    """A placeholder op for enqueueing embedding IDs to the TPU.
+
+    Args:
+      sample_indices: A list of rank 1 Tensors specifying the training example
+        and feature to which the corresponding embedding_indices and
+        aggregation_weights values belong. sample_indices[i] must equal b * nf +
+        f, where nf is the number of features from the corresponding table, f is
+        in [0, nf), and b is in [0, batch size).
+      embedding_indices: A list of rank 1 Tensors, indices into the embedding
+        tables.
+      aggregation_weights: A list of rank 1 Tensors containing per sample --
+        i.e. per (training example, feature) -- aggregation weights.
+      device_ordinal: The TPU device to use. Should be >= 0 and less than the
+        number of TPU cores in the task on which the node is placed.
+      combiners: A list of string scalars, one for each embedding table that
+        specify how to normalize the embedding activations after weighted
+        summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
+        invalid to have the sum of the weights be 0 for 'mean' or the sum of the
+        squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
+        is to use 'sum' for all tables (optional).
+      mode_override: A string input that overrides the mode specified in the
+        TPUEmbeddingConfiguration. Supported values are {'unspecified',
+        'inference', 'training', 'backward_pass_only'}. When set to
+        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
+        otherwise mode_override is used (optional).
+      name: A name for the operation (optional).
+
+    Returns:
+      An EnqueueTPUEmbeddingSparseBatch operation.
+    """
+    if mode_override is None:
+      mode_override = "unspecified"
+    return gen_tpu_ops._enqueue_tpu_embedding_sparse_batch(
+        sample_indices=sample_indices,
+        embedding_indices=embedding_indices,
+        aggregation_weights=aggregation_weights,
+        device_ordinal=device_ordinal,
+        combiners=combiners,
+        mode_override=mode_override,
+        name=name)
+
+  enqueue_tpu_embedding_sparse_batch.__doc__ = (
+      gen_tpu_ops._enqueue_tpu_embedding_sparse_batch.__doc__)
+
+  # pylint: disable=protected-access
+  def enqueue_tpu_embedding_sparse_tensor_batch(sample_indices,
+                                                embedding_indices,
+                                                aggregation_weights,
+                                                table_ids,
+                                                device_ordinal,
+                                                combiners=None,
+                                                mode_override=None,
+                                                name=None):
+    """A placeholder op for enqueueing embedding IDs to the TPU.
+
+    Args:
+      sample_indices: A list of rank 1 Tensors specifying the training example
+        to which the corresponding embedding_indices and aggregation_weights
+        values
+        belong. It corresponds to sp_ids.indices[:,0] in
+          embedding_lookup_sparse().
+      embedding_indices: A list of rank 1 Tensors, indices into the embedding
+        tables. It corresponds to sp_ids.values in embedding_lookup_sparse().
+      aggregation_weights: A list of rank 1 Tensors containing per training
+        example aggregation weights. It corresponds to sp_weights.values in
+        embedding_lookup_sparse().
+      table_ids: A list of integers specifying the identifier of the embedding
+        table (offset of TableDescriptor in the TPUEmbeddingConfiguration) to
+        lookup the corresponding input. The ith input is looked up using
+        table_ids[i]. The size of the table_ids list must be equal to that of
+        sample_indices, embedding_indices and aggregation_weights.
+      device_ordinal: The TPU device to use. Should be >= 0 and less than the
+        number of TPU cores in the task on which the node is placed.
+      combiners: A list of string scalars, one for each embedding table that
+        specify how to normalize the embedding activations after weighted
+        summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
+        invalid to have the sum of the weights be 0 for 'mean' or the sum of the
+        squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
+        is to use 'sum' for all tables (optional).
+      mode_override: A string input that overrides the mode specified in the
+        TPUEmbeddingConfiguration. Supported values are {'unspecified',
+        'inference', 'training', 'backward_pass_only'}. When set to
+        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
+        otherwise mode_override is used (optional).
+      name: A name for the operation (optional).
+
+    Returns:
+      An EnqueueTPUEmbeddingSparseTensorBatch operation.
+    """
+    if mode_override is None:
+      mode_override = "unspecified"
+    return gen_tpu_ops._enqueue_tpu_embedding_sparse_tensor_batch(
+        sample_indices=sample_indices,
+        embedding_indices=embedding_indices,
+        aggregation_weights=aggregation_weights,
+        table_ids=table_ids,
+        device_ordinal=device_ordinal,
+        combiners=combiners,
+        mode_override=mode_override,
+        name=name)
+
+  enqueue_tpu_embedding_sparse_tensor_batch.__doc__ = (
+      gen_tpu_ops._enqueue_tpu_embedding_sparse_tensor_batch.__doc__)
+
 else:
   # We have already built the appropriate libraries into the binary via CMake
   # if we have built contrib, so we don't need this
-- 
GitLab


From 721ab82745a113fb8cca4ce2b1f22d1d5ab5d546 Mon Sep 17 00:00:00 2001
From: Revan Sopher <rsopher@google.com>
Date: Mon, 1 Oct 2018 23:03:16 -0700
Subject: [PATCH 276/570] Loosen test bounds.

PiperOrigin-RevId: 215338403
---
 tensorflow/python/kernel_tests/depthwise_conv_op_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 6d1ead20be..9c02b69180 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -131,8 +131,8 @@ class DepthwiseConv2DTest(test.TestCase):
     with self.session(graph=graph, use_gpu=use_gpu) as sess:
       tolerance = {
           dtypes.float16: 4e-2,
-          dtypes.float32: 1e-8,
-          dtypes.float64: 1e-13,
+          dtypes.float32: 1e-7,
+          dtypes.float64: 1e-12,
       }[data_type]
 
       t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=data_type)
-- 
GitLab


From 9884cb36290664593682d235ce0d5e1925e3fa23 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Mon, 1 Oct 2018 23:06:12 -0700
Subject: [PATCH 277/570] Check that IsValid{Input|Output}Tensor is only given
 non-control edges

PiperOrigin-RevId: 215338658
---
 tensorflow/core/graph/graph.cc | 4 ++--
 tensorflow/core/graph/graph.h  | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 1630ab7a15..4c0cd14ff1 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -643,7 +643,7 @@ Status Graph::IsValidNode(const Node* node) const {
 
 Status Graph::IsValidOutputTensor(const Node* node, int idx) const {
   TF_RETURN_IF_ERROR(IsValidNode(node));
-  if (idx >= node->num_outputs()) {
+  if (idx >= node->num_outputs() || idx < 0) {
     return errors::OutOfRange("Node '", node->name(), "' (type: '",
                               node->op_def().name(),
                               "', num of outputs: ", node->num_outputs(),
@@ -654,7 +654,7 @@ Status Graph::IsValidOutputTensor(const Node* node, int idx) const {
 
 Status Graph::IsValidInputTensor(const Node* node, int idx) const {
   TF_RETURN_IF_ERROR(IsValidNode(node));
-  if (idx >= node->num_inputs()) {
+  if (idx >= node->num_inputs() || idx < 0) {
     return errors::OutOfRange("Node '", node->name(), "' (type: '",
                               node->op_def().name(),
                               "', num of inputs: ", node->num_inputs(),
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 52e9f23a76..72cef07072 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -590,12 +590,12 @@ class Graph {
   // Returns OK if `node` is non-null and belongs to this graph
   Status IsValidNode(const Node* node) const;
 
-  // Returns OK if IsValidNode(`node`) and `idx` is less than
-  // node->num_outputs()
+  // Returns OK if IsValidNode(`node`) and `idx` is a valid output.  Does not
+  // accept control outputs.
   Status IsValidOutputTensor(const Node* node, int idx) const;
 
-  // Returns OK if IsValidNode(`node`) and `idx` is less than
-  // node->num_inputs()
+  // Returns OK if IsValidNode(`node`) and `idx` a valid input.  Does not accept
+  // control inputs.
   Status IsValidInputTensor(const Node* node, int idx) const;
 
   // Create and return a new WhileContext owned by this graph. This is called
-- 
GitLab


From 38808119e9d5f8ad24bb414aab281e0fa3fde6dc Mon Sep 17 00:00:00 2001
From: Gautam <gautamrbharadwaj@gmail.com>
Date: Tue, 2 Oct 2018 11:56:06 +0530
Subject: [PATCH 278/570] Update backend.py

Adding missing import files in the commented examples. When trying out that particular example in commented section the TensorFlow and bumpy imports are missing
---
 tensorflow/python/keras/backend.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 584facc859..79ca4beb73 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -773,6 +773,8 @@ def is_keras_tensor(x):
 
   Examples:
   ```python
+      >>> import tensorflow as tf
+      >>> import numpy
       >>> from keras import backend as K
       >>> from keras.layers import Input, Dense
       >>> np_var = numpy.array([1, 2])
-- 
GitLab


From 7830912c03fe3939120651574d33cec01bc73fcf Mon Sep 17 00:00:00 2001
From: Gautam <gautamrbharadwaj@gmail.com>
Date: Tue, 2 Oct 2018 12:00:14 +0530
Subject: [PATCH 279/570] Update backend.py

adding missing import numpy
---
 tensorflow/python/keras/backend.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 584facc859..9c1581eef9 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -653,6 +653,7 @@ def variable(value, dtype=None, name=None, constraint=None):
 
   Examples:
   ```python
+      >>> import numpy as np
       >>> from keras import backend as K
       >>> val = np.array([[1, 2], [3, 4]])
       >>> kvar = K.variable(value=val, dtype='float64', name='example_var')
-- 
GitLab


From edea1be5dd98775399dbd12728e86039a14fb967 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 02:13:06 -0700
Subject: [PATCH 280/570] compat: Update forward compatibility horizon to
 2018-10-02

PiperOrigin-RevId: 215354927
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index bea5aa990f..3bb95b56c2 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 1)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 2)
 
 
 @tf_export("compat.forward_compatible")
-- 
GitLab


From 44da41e4900c3fd481f12c9aa4c49679c9f32fa4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 03:01:09 -0700
Subject: [PATCH 281/570] Fix layout assignment for cross module all reduce

Previously we could have ended up with the different HLOs being assigned
different layouts what made lowering impossible. This change enforces a
consistent layout between the communicating nodes the same way it is
done for send&recv pairs.

PiperOrigin-RevId: 215359420
---
 .../compiler/xla/service/layout_assignment.cc | 65 +++++++++++++++----
 .../xla/service/layout_assignment_test.cc     | 44 +++++++++++++
 2 files changed, 96 insertions(+), 13 deletions(-)

diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 082bf8bffe..25d5327561 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -498,6 +498,22 @@ Status LayoutAssignment::AddMandatoryConstraints(
         TF_RETURN_IF_ERROR(
             constraints->SetBufferLayout(new_shape.layout(), *buffer));
       }
+    } else if (instruction->IsCrossModuleAllReduce()) {
+      CHECK(get_channel_constraints(instruction))
+          << "Multi-module layout assignment requires ChannelLayoutConstraints";
+      int64 all_reduce_id = instruction->all_reduce_id().value();
+      if (!get_channel_constraints(instruction)
+               ->IsChannelConstrained(all_reduce_id)) {
+        continue;
+      }
+      // TODO(b/68493863): Change to use SetOperandLayout().
+      const Shape& buffer_shape = instruction->operand(0)->shape();
+      TF_RET_CHECK(ShapeUtil::IsArray(buffer_shape));
+      Shape new_buffer_shape =
+          get_channel_constraints(instruction)
+              ->LayoutShapeForChannel(buffer_shape, all_reduce_id);
+      TF_RETURN_IF_ERROR(
+          constraints->SetInstructionLayout(new_buffer_shape, instruction));
     }
   }
 
@@ -1512,19 +1528,6 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
     // Verify all layouts in the shape have been set.
     TF_RET_CHECK(LayoutUtil::HasLayout(instruction->shape()));
   }
-
-  // Copy the root instruction's result if its layout does not match the result
-  // layout constraint.
-  if (constraints.ResultLayout() != nullptr &&
-      !constraints.ResultLayout()->MatchesLayoutInShape(
-          computation->root_instruction()->shape())) {
-    TF_ASSIGN_OR_RETURN(
-        HloInstruction * new_root,
-        CreateCopyWithNewLayout(constraints.ResultLayout()->shape(),
-                                computation->root_instruction()));
-    computation->set_root_instruction(new_root);
-  }
-
   return Status::OK();
 }
 
@@ -1654,6 +1657,18 @@ Status LayoutAssignment::RunOnComputation(
     TF_RETURN_IF_ERROR(
         ConstrainChannelLayouts(computation, channel_constraints));
   }
+
+  // Copy the root instruction's result if its layout does not match the result
+  // layout constraint.
+  if (constraints.ResultLayout() != nullptr &&
+      !constraints.ResultLayout()->MatchesLayoutInShape(
+          computation->root_instruction()->shape())) {
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_root,
+        CreateCopyWithNewLayout(constraints.ResultLayout()->shape(),
+                                computation->root_instruction()));
+    computation->set_root_instruction(new_root);
+  }
   return Status::OK();
 }
 
@@ -1709,6 +1724,30 @@ Status LayoutAssignment::ConstrainChannelLayouts(
             ShapeUtil::GetMutableSubshape(instruction->mutable_shape(), {0});
         *send_shape = shape;
       }
+    } else if (instruction->IsCrossModuleAllReduce()) {
+      const Layout* layout =
+          get_channel_constraints(instruction)
+              ->ConstrainChannel(instruction->all_reduce_id().value(),
+                                 instruction->shape().layout());
+      if (layout != nullptr) {
+        // We found an already constrained layout which does not match the one
+        // the channel wants to impose. Either add a new kCopy, or use the
+        // existing one to marshal the correct shape.
+        HloInstruction* operand = instruction->mutable_operand(0);
+        Shape shape = operand->shape();
+        *shape.mutable_layout() = *layout;
+        if (operand->opcode() != HloOpcode::kCopy) {
+          HloInstruction* copy = operand->parent()->AddInstruction(
+              HloInstruction::CreateUnary(shape, HloOpcode::kCopy, operand));
+          RegisterAddedCopy(copy);
+          SetupCopiedInstruction(*operand, copy, {});
+          TF_RETURN_IF_ERROR(instruction->ReplaceOperandWith(0, copy));
+          operand = copy;
+        } else {
+          *operand->mutable_shape() = shape;
+        }
+        *instruction->mutable_shape() = shape;
+      }
     }
   }
   return Status::OK();
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 752a61476d..10f9a95121 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -860,6 +860,50 @@ TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) {
       ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0})));
 }
 
+TEST_F(LayoutAssignmentTest, AllReduceLayoutMissmatch) {
+  // Pin non matching layouts to parameter and root.
+  const char* module_str = R"(
+    HloModule test_module
+
+    add {
+      lhs = f32[] parameter(0)
+      rhs = f32[] parameter(1)
+      ROOT add = f32[] add(lhs, rhs)
+    }
+
+    ENTRY entry_computation {
+      param = (f32[2,2]) parameter(0)
+      gte = f32[2,2] get-tuple-element(param), index=0
+      ar.0 = f32[2,2] cross-replica-sum(gte),
+        all_reduce_id=0, replica_groups={{0}}, to_apply=add,
+        sharding={maximal device=0}
+      const = f32[2,2] constant(f32[2,2]{{0,1},{2,3}})
+      ROOT ar.1 = f32[2,2] cross-replica-sum(const),
+        all_reduce_id=0, replica_groups={{0}}, to_apply=add,
+        sharding={maximal device=1}
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  ComputationLayout computation_layout(
+      module->entry_computation()->ComputeProgramShape());
+  Shape param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1})});
+  TF_ASSERT_OK(
+      computation_layout.mutable_parameter_layout(0)->CopyLayoutFromShape(
+          param_shape));
+  computation_layout.mutable_result_layout()->ResetLayout(
+      LayoutUtil::MakeLayout({1, 0}));
+
+  ChannelLayoutConstraints channel_constraints;
+  AssignLayouts(module.get(), &computation_layout, &channel_constraints);
+
+  EXPECT_THAT(LayoutOf(module.get(), "gte"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(module.get(), "ar.0"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(module.get(), "ar.1"), ElementsAre(0, 1));
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root->shape().layout().minor_to_major(), ElementsAre(1, 0));
+}
+
 TEST_F(LayoutAssignmentTest, CopySliceOperandToAvoidImplicitLayoutChange) {
   const char* module_str = R"(
     HloModule CopySliceOperandToAvoidImplicitLayoutChange
-- 
GitLab


From f22037abf5a6f4581f5fb6013f72f91747f22965 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 03:36:14 -0700
Subject: [PATCH 282/570] Add a hint parameter to TransferLiteralToDeviceAsync
 that the implementation can use to accelerate transfers.

PiperOrigin-RevId: 215362667
---
 tensorflow/compiler/jit/xla_device_context.cc    | 15 +++++++++++----
 tensorflow/compiler/jit/xla_device_context.h     |  3 ++-
 .../xla/service/generic_transfer_manager.cc      |  2 +-
 .../xla/service/generic_transfer_manager.h       |  7 ++++---
 .../compiler/xla/service/transfer_manager.h      | 16 +++++++++++++++-
 5 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index af83c792e5..e083652978 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -75,8 +75,9 @@ XlaTransferManager::XlaTransferManager(
   }
 }
 
-Status XlaTransferManager::TransferLiteralToDevice(
-    const Tensor& host_tensor, Tensor* device_tensor) const {
+Status XlaTransferManager::TransferLiteralToDevice(const Tensor& host_tensor,
+                                                   Tensor* device_tensor,
+                                                   bool buffer_is_fresh) const {
   xla::Shape xla_shape;
   TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor.dtype(),
                                            host_tensor.shape(), &xla_shape));
@@ -97,8 +98,11 @@ Status XlaTransferManager::TransferLiteralToDevice(
     // synchronized.
     host_to_device_stream_->ThenWaitFor(stream_.get());
   }
+  xla::TransferManager::TransferToDeviceHint hint =
+      buffer_is_fresh ? xla::TransferManager::kBufferUndefined
+                      : xla::TransferManager::kNoHint;
   TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync(
-      host_to_device_stream_.get(), *literal, shaped_buffer));
+      host_to_device_stream_.get(), *literal, shaped_buffer, hint));
   if (UseMultipleStreams()) {
     auto event = std::make_shared<se::Event>(stream_->parent());
     TF_RET_CHECK(event->Init()) << "Event failed to initialize!";
@@ -165,6 +169,7 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
     return;
   }
   TensorShape shape = shape_or_status.ValueOrDie();
+  bool buffer_is_fresh = false;
   if (!xla_tensor->has_shaped_buffer()) {
     Status s =
         xla_tensor->AllocateShapedBuffer(device_tensor->dtype(), shape, client_,
@@ -173,6 +178,7 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
       done(s);
       return;
     }
+    buffer_is_fresh = true;
   }
 
   Status status;
@@ -183,7 +189,8 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
           "Tensor::CopyFrom failed when copying from CPU to XLA device"));
       return;
     }
-    status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
+    status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor,
+                                     buffer_is_fresh);
   } else {
     se::DeviceMemoryBase dev_dst_ptr =
         XlaTensor::DeviceMemoryFromTensor(*device_tensor);
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index df82421294..a4c0c296fc 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -67,7 +67,8 @@ class XlaTransferManager {
 
  private:
   Status TransferLiteralToDevice(const Tensor& host_tensor,
-                                 Tensor* device_tensor) const;
+                                 Tensor* device_tensor,
+                                 bool buffer_is_fresh) const;
   void TransferLiteralFromDevice(Tensor* host_tensor,
                                  const Tensor& device_tensor,
                                  const StatusCallback& done) const;
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index bec02e14f9..f92fde7f46 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -98,7 +98,7 @@ Status GenericTransferManager::TransferLiteralFromDeviceInternal(
 
 Status GenericTransferManager::TransferLiteralToDeviceAsync(
     se::Stream* stream, const LiteralSlice& literal,
-    const ShapedBuffer& device_buffer) {
+    const ShapedBuffer& device_buffer, TransferToDeviceHint /*hint*/) {
   const Shape& shape = literal.shape();
   VLOG(2) << "transferring literal shape to device: "
           << ShapeUtil::HumanString(shape)
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index 86c8b1c145..b1cba82b9f 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -45,9 +45,10 @@ class GenericTransferManager : public TransferManager {
                                  MutableBorrowingLiteral literal,
                                  std::function<void(Status)> done) override;
 
-  Status TransferLiteralToDeviceAsync(
-      se::Stream* stream, const LiteralSlice& literal,
-      const ShapedBuffer& device_buffer) override;
+  Status TransferLiteralToDeviceAsync(se::Stream* stream,
+                                      const LiteralSlice& literal,
+                                      const ShapedBuffer& device_buffer,
+                                      TransferToDeviceHint hint) override;
 
   Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const LiteralSlice& literal) override;
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index f952e64af2..9199e32d0f 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -89,6 +89,16 @@ class TransferManager {
                                          const LiteralSlice& literal,
                                          const ShapedBuffer& device_buffer);
 
+  // Hint type given to TransferLiteralToDeviceAsync.
+  enum TransferToDeviceHint {
+    // No hint available.
+    kNoHint,
+
+    // The destination buffer is undefined on the device, meaning it can be
+    // transferred to eagerly rather than waiting for Stream ordering.
+    kBufferUndefined,
+  };
+
   // Transfers the given literal into the previously allocated device memory
   // represented by the given ShapedBuffer using the given executor. The shape
   // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible,
@@ -96,9 +106,13 @@ class TransferManager {
   //
   // This operation is performed asynchronously on the given stream. It returns
   // once the transfer is enqueued.
+  //
+  // The optional hint can allow implementations to optimize transfers. It is
+  // not mandatory for an implementation to obey the hint.
   virtual Status TransferLiteralToDeviceAsync(
       se::Stream* stream, const LiteralSlice& literal,
-      const ShapedBuffer& device_buffer) = 0;
+      const ShapedBuffer& device_buffer,
+      TransferToDeviceHint hint = kNoHint) = 0;
 
   // Convenience methods for transferring an array to or from the device at a
   // known address. This avoids having to construct a ShapedBuffer just to
-- 
GitLab


From 35f3046a326daea0179d024044636f2fcbb45f4a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 05:18:28 -0700
Subject: [PATCH 283/570] Export endpoint for the version of the
 `regex_replace` function that calls StaticRegexReplace.

PiperOrigin-RevId: 215371291
---
 .../python_api/api_def_RegexReplace.pbtxt     |  8 +-----
 tensorflow/python/ops/string_ops.py           | 25 +++++++++++--------
 2 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/tensorflow/core/api_def/python_api/api_def_RegexReplace.pbtxt b/tensorflow/core/api_def/python_api/api_def_RegexReplace.pbtxt
index b17806b338..5020844204 100644
--- a/tensorflow/core/api_def/python_api/api_def_RegexReplace.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_RegexReplace.pbtxt
@@ -1,10 +1,4 @@
 op {
   graph_op_name: "RegexReplace"
-  endpoint {
-    name: "strings.regex_replace"
-  }
-  endpoint {
-    name: "regex_replace"
-    deprecated: true
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index e83c08f643..0812f901a2 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -46,6 +46,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=redefined-builtin
+@tf_export("strings.regex_full_match")
 def regex_full_match(input, pattern, name=None):
   r"""Match elements of `input` with regex `pattern`.
 
@@ -73,15 +74,14 @@ def regex_full_match(input, pattern, name=None):
 
 regex_full_match.__doc__ = gen_string_ops.regex_full_match.__doc__
 
-# Expose regex_full_match in strings namespace
-tf_export("strings.regex_full_match")(regex_full_match)
 
-
-def regex_replace(source, pattern, rewrite, replace_global=True):
-  r"""Replace elements of `source` matching regex `pattern` with `rewrite`.
+@tf_export("strings.regex_replace", "regex_replace")
+@deprecation.deprecated_endpoints("regex_replace")
+def regex_replace(input, pattern, rewrite, replace_global=True, name=None):
+  r"""Replace elements of `input` matching regex `pattern` with `rewrite`.
 
   Args:
-    source: string `Tensor`, the source strings to process.
+    input: string `Tensor`, the source strings to process.
     pattern: string or scalar string `Tensor`, regular expression to use,
       see more details at https://github.com/google/re2/wiki/Syntax
     rewrite: string or scalar string `Tensor`, value to use in match
@@ -89,9 +89,10 @@ def regex_replace(source, pattern, rewrite, replace_global=True):
       text matching corresponding parenthesized group.
     replace_global: `bool`, if `True` replace all non-overlapping matches,
       else replace only the first match.
+    name: A name for the operation (optional).
 
   Returns:
-    string `Tensor` of the same shape as `source` with specified replacements.
+    string `Tensor` of the same shape as `input` with specified replacements.
   """
   if (isinstance(pattern, util_compat.bytes_or_text_types) and
       isinstance(rewrite, util_compat.bytes_or_text_types)):
@@ -99,11 +100,13 @@ def regex_replace(source, pattern, rewrite, replace_global=True):
     # use a version which performs the expensive regex compilation once at
     # creation time.
     return gen_string_ops.static_regex_replace(
-        input=source, pattern=pattern,
-        rewrite=rewrite, replace_global=replace_global)
+        input=input, pattern=pattern,
+        rewrite=rewrite, replace_global=replace_global,
+        name=name)
   return gen_string_ops.regex_replace(
-      input=source, pattern=pattern,
-      rewrite=rewrite, replace_global=replace_global)
+      input=input, pattern=pattern,
+      rewrite=rewrite, replace_global=replace_global,
+      name=name)
 
 
 @tf_export("strings.format")
-- 
GitLab


From 97d515273a1e86a861cdfb338671a42b3b1126a7 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Tue, 2 Oct 2018 07:34:40 -0700
Subject: [PATCH 284/570] Make StatelessRandomOpsTest.testRandomNormalIsFinite
 actually test stateless_random_normal.

Fixes #22611

PiperOrigin-RevId: 215385610
---
 tensorflow/compiler/tests/stateless_random_ops_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py
index f3861043b2..e8741bc468 100644
--- a/tensorflow/compiler/tests/stateless_random_ops_test.py
+++ b/tensorflow/compiler/tests/stateless_random_ops_test.py
@@ -91,7 +91,7 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
     with self.cached_session() as sess, self.test_scope():
       for dtype in self._random_types():
         seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
-        x = stateless.stateless_random_uniform(
+        x = stateless.stateless_random_normal(
             shape=[10000], seed=seed_t, dtype=dtype)
         y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
         self.assertTrue(np.all(np.isfinite(y)))
-- 
GitLab


From 1a56a3299e904d5a3352a3a15e4cf7401f72bbc3 Mon Sep 17 00:00:00 2001
From: joe yearsley <joe@kheironmed.com>
Date: Tue, 2 Oct 2018 16:33:37 +0100
Subject: [PATCH 285/570] Updated ordering for kwargs

---
 tensorflow/python/layers/core.py                       | 6 +++---
 tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt | 2 +-
 tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 5919fa543e..e06e9aba4a 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -292,17 +292,17 @@ class Flatten(keras_layers.Flatten, base.Layer):
 
 
 @tf_export('layers.flatten')
-def flatten(inputs, data_format='channels_last', name=None):
+def flatten(inputs, name=None, data_format='channels_last'):
   """Flattens an input tensor while preserving the batch axis (axis 0).
 
   Arguments:
     inputs: Tensor input.
+    name: The name of the layer (string).
     data_format: A string, one of `channels_last` (default) or `channels_first`.
       The ordering of the dimensions in the inputs.
       `channels_last` corresponds to inputs with shape
       `(batch, height, width, channels)` while `channels_first` corresponds to
       inputs with shape `(batch, channels, height, width)`.
-    name: The name of the layer (string).
 
   Returns:
     Reshaped tensor.
@@ -319,7 +319,7 @@ def flatten(inputs, data_format='channels_last', name=None):
     # now `y` has shape `(None, None)`
   ```
   """
-  layer = Flatten(data_format=data_format, name=name)
+  layer = Flatten(name=name, data_format=data_format)
   return layer.apply(inputs)
 
 
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt
index 5d9ea2e5a3..0c24e9c7dd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt
@@ -122,7 +122,7 @@ tf_module {
   }
   member_method {
     name: "flatten"
-    argspec: "args=[\'inputs\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'channels_last\', \'None\'], "
+    argspec: "args=[\'inputs\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'channels_last\'], "
   }
   member_method {
     name: "max_pooling1d"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
index 5d9ea2e5a3..0c24e9c7dd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
@@ -122,7 +122,7 @@ tf_module {
   }
   member_method {
     name: "flatten"
-    argspec: "args=[\'inputs\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'channels_last\', \'None\'], "
+    argspec: "args=[\'inputs\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'channels_last\'], "
   }
   member_method {
     name: "max_pooling1d"
-- 
GitLab


From 28757ad658243526d84fd16d53b9eefbf809c6ff Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 08:30:36 -0700
Subject: [PATCH 286/570] Use xlogy in a few places in TFP to avoid NaN's for
 certain special cases.

PiperOrigin-RevId: 215392621
---
 .../kernel_tests/distributions/beta_test.py     |  5 +++++
 .../distributions/dirichlet_test.py             | 17 +++++++++++++++++
 .../distributions/exponential_test.py           |  7 +++++++
 .../kernel_tests/distributions/gamma_test.py    |  8 ++++++++
 tensorflow/python/ops/distributions/beta.py     |  4 ++--
 .../python/ops/distributions/dirichlet.py       |  2 +-
 tensorflow/python/ops/distributions/gamma.py    |  2 +-
 7 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/kernel_tests/distributions/beta_test.py b/tensorflow/python/kernel_tests/distributions/beta_test.py
index d580a415dd..42e81bd658 100644
--- a/tensorflow/python/kernel_tests/distributions/beta_test.py
+++ b/tensorflow/python/kernel_tests/distributions/beta_test.py
@@ -167,6 +167,11 @@ class BetaTest(test.TestCase):
     self.assertAllClose([[1., 3. / 2], [3. / 2, 15. / 8]], self.evaluate(pdf))
     self.assertEqual((2, 2), pdf.get_shape())
 
+  def testLogPdfOnBoundaryIsFiniteWhenAlphaIsOne(self):
+    b = [[0.01, 0.1, 1., 2], [5., 10., 2., 3]]
+    pdf = self.evaluate(beta_lib.Beta(1., b).prob(0.))
+    self.assertAllEqual(np.ones_like(pdf, dtype=np.bool), np.isfinite(pdf))
+
   def testBetaMean(self):
     a = [1., 2, 3]
     b = [2., 4, 1.2]
diff --git a/tensorflow/python/kernel_tests/distributions/dirichlet_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
index cace5b3ba2..0f96382453 100644
--- a/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
+++ b/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
@@ -83,6 +83,23 @@ class DirichletTest(test.TestCase):
     with self.assertRaisesOpError("sample last-dimension must sum to `1`"):
       self.evaluate(dist.prob([.1, .2, .8]))
 
+  def testLogPdfOnBoundaryIsFiniteWhenAlphaIsOne(self):
+    # Test concentration = 1. for each dimension.
+    concentration = 3 * np.ones((10, 10)).astype(np.float32)
+    concentration[range(10), range(10)] = 1.
+    x = 1 / 9. * np.ones((10, 10)).astype(np.float32)
+    x[range(10), range(10)] = 0.
+    dist = dirichlet_lib.Dirichlet(concentration)
+    log_prob = self.evaluate(dist.log_prob(x))
+    self.assertAllEqual(
+        np.ones_like(log_prob, dtype=np.bool), np.isfinite(log_prob))
+
+    # Test when concentration[k] = 1., and x is zero at various dimensions.
+    dist = dirichlet_lib.Dirichlet(10 * [1.])
+    log_prob = self.evaluate(dist.log_prob(x))
+    self.assertAllEqual(
+        np.ones_like(log_prob, dtype=np.bool), np.isfinite(log_prob))
+
   def testPdfZeroBatches(self):
     alpha = [1., 2]
     x = [.5, .5]
diff --git a/tensorflow/python/kernel_tests/distributions/exponential_test.py b/tensorflow/python/kernel_tests/distributions/exponential_test.py
index 367f8bb0f1..1600387585 100644
--- a/tensorflow/python/kernel_tests/distributions/exponential_test.py
+++ b/tensorflow/python/kernel_tests/distributions/exponential_test.py
@@ -65,6 +65,13 @@ class ExponentialTest(test.TestCase):
     self.assertAllClose(self.evaluate(log_pdf), expected_log_pdf)
     self.assertAllClose(self.evaluate(pdf), np.exp(expected_log_pdf))
 
+  def testExponentialLogPDFBoundary(self):
+    # Check that Log PDF is finite at 0.
+    rate = np.array([0.1, 0.5, 1., 2., 5., 10.], dtype=np.float32)
+    exponential = exponential_lib.Exponential(rate=rate)
+    log_pdf = exponential.log_prob(0.)
+    self.assertAllClose(np.log(rate), self.evaluate(log_pdf))
+
   def testExponentialCDF(self):
     batch_size = 6
     lam = constant_op.constant([2.0] * batch_size)
diff --git a/tensorflow/python/kernel_tests/distributions/gamma_test.py b/tensorflow/python/kernel_tests/distributions/gamma_test.py
index 4eff40b029..4c5b9c3ea3 100644
--- a/tensorflow/python/kernel_tests/distributions/gamma_test.py
+++ b/tensorflow/python/kernel_tests/distributions/gamma_test.py
@@ -77,6 +77,14 @@ class GammaTest(test.TestCase):
     self.assertAllClose(self.evaluate(log_pdf), expected_log_pdf)
     self.assertAllClose(self.evaluate(pdf), np.exp(expected_log_pdf))
 
+  def testGammaLogPDFBoundary(self):
+    # When concentration = 1, we have an exponential distribution. Check that at
+    # 0 we have finite log prob.
+    rate = np.array([0.1, 0.5, 1., 2., 5., 10.], dtype=np.float32)
+    gamma = gamma_lib.Gamma(concentration=1., rate=rate)
+    log_pdf = gamma.log_prob(0.)
+    self.assertAllClose(np.log(rate), self.evaluate(log_pdf))
+
   def testGammaLogPDFMultidimensional(self):
     batch_size = 6
     alpha = constant_op.constant([[2.0, 4.0]] * batch_size)
diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py
index 2ba1ea6744..d6f89a3517 100644
--- a/tensorflow/python/ops/distributions/beta.py
+++ b/tensorflow/python/ops/distributions/beta.py
@@ -267,8 +267,8 @@ class Beta(distribution.Distribution):
 
   def _log_unnormalized_prob(self, x):
     x = self._maybe_assert_valid_sample(x)
-    return ((self.concentration1 - 1.) * math_ops.log(x)
-            + (self.concentration0 - 1.) * math_ops.log1p(-x))
+    return (math_ops.xlogy(self.concentration1 - 1., x) +
+            (self.concentration0 - 1.) * math_ops.log1p(-x))
 
   def _log_normalization(self):
     return (math_ops.lgamma(self.concentration1)
diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
index 415249a958..997b1d392d 100644
--- a/tensorflow/python/ops/distributions/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -236,7 +236,7 @@ class Dirichlet(distribution.Distribution):
 
   def _log_unnormalized_prob(self, x):
     x = self._maybe_assert_valid_sample(x)
-    return math_ops.reduce_sum((self.concentration - 1.) * math_ops.log(x), -1)
+    return math_ops.reduce_sum(math_ops.xlogy(self.concentration - 1., x), -1)
 
   def _log_normalization(self):
     return special_math_ops.lbeta(self.concentration)
diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py
index 3293cda874..bbc64da7bc 100644
--- a/tensorflow/python/ops/distributions/gamma.py
+++ b/tensorflow/python/ops/distributions/gamma.py
@@ -225,7 +225,7 @@ class Gamma(distribution.Distribution):
 
   def _log_unnormalized_prob(self, x):
     x = self._maybe_assert_valid_sample(x)
-    return (self.concentration - 1.) * math_ops.log(x) - self.rate * x
+    return math_ops.xlogy(self.concentration - 1., x) - self.rate * x
 
   def _log_normalization(self):
     return (math_ops.lgamma(self.concentration)
-- 
GitLab


From 13643287a535581c133de529e3b02942ef7dd730 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Tue, 2 Oct 2018 18:46:11 +0300
Subject: [PATCH 287/570] Fix merge artifacts: replace Dataset by DatasetSource
 in Ignite Dataset.

---
 tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
index cfe59b6b23..288d485320 100644
--- a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
+++ b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
@@ -688,7 +688,7 @@ class IgniteClient(TcpClient):
         "Unknown binary type when expected string [type_id=%d]" % header)
 
 
-class IgniteDataset(dataset_ops.Dataset):
+class IgniteDataset(dataset_ops.DatasetSource):
   """Apache Ignite is a memory-centric distributed database, caching, and
 
      processing platform for transactional, analytical, and streaming workloads,
-- 
GitLab


From 7d66a720acb756291adc99ebe444c2c00bd37d84 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev <dmitrievanthony@gmail.com>
Date: Tue, 2 Oct 2018 18:57:07 +0300
Subject: [PATCH 288/570] Remove Ignite Dataset SSL tests by internal policy.

---
 .../python/tests/ignite_dataset_test.py       | 36 -------------------
 1 file changed, 36 deletions(-)

diff --git a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
index 1856a4fba8..ef29b5f14a 100644
--- a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
+++ b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
@@ -46,42 +46,6 @@ class IgniteDatasetTest(test.TestCase):
     ds = IgniteDataset(cache_name="SQL_PUBLIC_TEST_CACHE", port=42300)
     self._check_dataset(ds)
 
-  def test_ignite_dataset_with_ssl_client(self):
-    """Test Ignite Dataset with ssl client.
-
-    """
-    self._clear_env()
-    os.environ["IGNITE_DATASET_CERTFILE"] = os.path.dirname(
-        os.path.realpath(__file__)) + "/keystore/client.pem"
-    os.environ["IGNITE_DATASET_CERT_PASSWORD"] = "123456"
-
-    ds = IgniteDataset(
-        cache_name="SQL_PUBLIC_TEST_CACHE",
-        port=42301,
-        certfile=os.environ["IGNITE_DATASET_CERTFILE"],
-        cert_password=os.environ["IGNITE_DATASET_CERT_PASSWORD"])
-    self._check_dataset(ds)
-
-  def test_ignite_dataset_with_ssl_client_and_auth(self):
-    """Test Ignite Dataset with ssl client and authentication.
-
-    """
-    self._clear_env()
-    os.environ["IGNITE_DATASET_USERNAME"] = "ignite"
-    os.environ["IGNITE_DATASET_PASSWORD"] = "ignite"
-    os.environ["IGNITE_DATASET_CERTFILE"] = os.path.dirname(
-        os.path.realpath(__file__)) + "/keystore/client.pem"
-    os.environ["IGNITE_DATASET_CERT_PASSWORD"] = "123456"
-
-    ds = IgniteDataset(
-        cache_name="SQL_PUBLIC_TEST_CACHE",
-        port=42302,
-        certfile=os.environ["IGNITE_DATASET_CERTFILE"],
-        cert_password=os.environ["IGNITE_DATASET_CERT_PASSWORD"],
-        username=os.environ["IGNITE_DATASET_USERNAME"],
-        password=os.environ["IGNITE_DATASET_PASSWORD"])
-    self._check_dataset(ds)
-
   def _clear_env(self):
     """Clears environment variables used by Ignite Dataset.
 
-- 
GitLab


From ce41d2f95e1e5883f1808030c94fd9aaa57d9f10 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 09:32:20 -0700
Subject: [PATCH 289/570] Generate an error when --rnn_states refers to array
 names that aren't produced/consumed by any op.

PiperOrigin-RevId: 215402308
---
 .../resolve_multiply_by_zero.cc               | 14 ++++-----
 .../contrib/lite/toco/model_cmdline_flags.cc  | 18 ++++++++----
 tensorflow/contrib/lite/toco/tooling_util.cc  | 29 +++++++++++++++----
 3 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
index 4bb1217828..b2b2ea151b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
@@ -60,6 +60,10 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) {
   const auto& output_array_name = mul_op->outputs[0];
   auto& output_array = model->GetArray(output_array_name);
 
+  if (!IsDiscardableArray(*model, output_array_name)) {
+    return false;
+  }
+
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
     return false;
@@ -139,14 +143,8 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) {
   }
 
   // Erase input arrays to the multiply if no longer used
-  if (IsDiscardableArray(*model, mul_op->inputs[0]) &&
-      CountOpsWithInput(*model, mul_op->inputs[0]) == 1) {
-    model->EraseArray(mul_op->inputs[0]);
-  }
-  if (IsDiscardableArray(*model, mul_op->inputs[1]) &&
-      CountOpsWithInput(*model, mul_op->inputs[1]) == 1) {
-    model->EraseArray(mul_op->inputs[1]);
-  }
+  DeleteArrayIfUsedOnce(mul_op->inputs[0], model);
+  DeleteArrayIfUsedOnce(mul_op->inputs[1], model);
 
   // Erase the multiply operator.
   model->operators.erase(mul_it);
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index d34da63e43..b6a401aaf2 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -394,12 +394,18 @@ void ReadModelFlagsFromCommandLineFlags(
     }
   }
 
-  model_flags->set_allow_nonascii_arrays(
-      parsed_model_flags.allow_nonascii_arrays.value());
-  model_flags->set_allow_nonexistent_arrays(
-      parsed_model_flags.allow_nonexistent_arrays.value());
-  model_flags->set_change_concat_input_ranges(
-      parsed_model_flags.change_concat_input_ranges.value());
+  if (!model_flags->has_allow_nonascii_arrays()) {
+    model_flags->set_allow_nonascii_arrays(
+        parsed_model_flags.allow_nonascii_arrays.value());
+  }
+  if (!model_flags->has_allow_nonexistent_arrays()) {
+    model_flags->set_allow_nonexistent_arrays(
+        parsed_model_flags.allow_nonexistent_arrays.value());
+  }
+  if (!model_flags->has_change_concat_input_ranges()) {
+    model_flags->set_change_concat_input_ranges(
+        parsed_model_flags.change_concat_input_ranges.value());
+  }
 
   if (parsed_model_flags.arrays_extra_info_file.specified()) {
     string arrays_extra_info_file_contents;
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 4a1ae35cb5..b87e01fbf0 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -843,24 +843,40 @@ void CheckNonAsciiIOArrays(const ModelFlags& model_flags) {
 }
 
 void CheckNonExistentIOArrays(const Model& model) {
+  // "non-existent" is interpreted in the stronger sense of
+  // "not actually produced/consumed by an op".
+  // Rationale: we have to artificially fix up TensorFlow graphs by creating
+  // any array that it refers to, so just checking that arrays exist isn't
+  // sufficient. The real invariant here is whether arrays are produced/consumed
+  // by something.
   if (model.flags.allow_nonexistent_arrays()) {
     return;
   }
   for (const auto& input_array : model.flags.input_arrays()) {
-    CHECK(model.HasArray(input_array.name()))
-        << "Input array not found: " << input_array.name();
+    QCHECK(GetOpWithInput(model, input_array.name()))
+        << "Specified input array " << input_array.name()
+        << " is not consumed by any op in this graph. Is it a typo?";
   }
   for (const string& output_array : model.flags.output_arrays()) {
-    CHECK(model.HasArray(output_array))
-        << "Output array not found: " << output_array;
+    QCHECK(GetOpWithOutput(model, output_array))
+        << "Specified output array " << output_array
+        << " is not produced by any op in this graph. Is it a typo?";
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
     if (!rnn_state.discardable()) {
-      CHECK(model.HasArray(rnn_state.state_array()));
-      CHECK(model.HasArray(rnn_state.back_edge_source_array()));
+      // Check that all RNN states are consumed
+      QCHECK(GetOpWithInput(model, rnn_state.state_array()))
+          << "Specified RNN state " << rnn_state.state_array()
+          << " is not consumed by any op in this graph. Is it a typo?";
+      // Check that all RNN back-edge source arrays are produced
+      QCHECK(GetOpWithOutput(model, rnn_state.back_edge_source_array()))
+          << "Specified RNN back-edge source array "
+          << rnn_state.back_edge_source_array()
+          << " is not produced by any op in this graph. Is it a typo?";
     }
   }
 }
+
 }  // namespace
 
 void CheckNoMissingArray(const Model& model) {
@@ -1597,6 +1613,7 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
       input_array.GetOrCreateMinMax() = input_minmax;
     }
   }
+
   // Creation of the RNN state arrays
   for (const auto& rnn_state : model->flags.rnn_states()) {
     CreateOrCheckRnnStateArray(rnn_state.state_array(), rnn_state.size(),
-- 
GitLab


From dd66b78b38b457c7d37527472c4e92a7a07f4b09 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Tue, 2 Oct 2018 10:15:11 -0700
Subject: [PATCH 290/570] [XLA] Fix some outdated comments referring to FlatMap

Also convert unordered_map to flat/node_hash_map where the comments allow.

PiperOrigin-RevId: 215410566
---
 tensorflow/compiler/xla/service/BUILD                | 2 +-
 tensorflow/compiler/xla/service/allocation_tracker.h | 5 +----
 tensorflow/compiler/xla/service/gpu/BUILD            | 1 +
 tensorflow/compiler/xla/service/gpu/nvptx_compiler.h | 9 +++++----
 tensorflow/compiler/xla/service/hlo_evaluator.h      | 5 +++--
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 13803f5ebe..3f8b734afb 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -253,8 +253,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index af227fe4da..43feccee3c 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -124,10 +124,7 @@ class AllocationTracker {
   int64 next_handle_ GUARDED_BY(mutex_);
 
   // A map from device ordinal to AllocationMap.
-  //
-  // This is not a TF FlatMap because (currently) FlatMap (and therefore
-  // AllocationMap) is not movable.
-  std::unordered_map<int, AllocationMap> opaque_to_allocation_map_
+  absl::flat_hash_map<int, AllocationMap> opaque_to_allocation_map_
       GUARDED_BY(mutex_);
 
   // A map from data handle to a vector of shaped buffers that represent the
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index a838464cae..522e9f5948 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -718,6 +718,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
index 8e97774750..c4a0b727cd 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/container/node_hash_map.h"
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/executable.h"
@@ -140,10 +141,10 @@ class NVPTXCompiler : public LLVMCompiler {
     tensorflow::condition_variable compilation_done_cv_;
   };
 
-  // Don't even think about switching this to FlatMap; iterator stability is
-  // critical here.
-  std::unordered_map<CompilationCacheKey, CompilationCacheValue,
-                     CompilationCacheHash, CompilationCacheEq>
+  // Don't even think about switching this to flat_hash_map; iterator stability
+  // is critical here.
+  absl::node_hash_map<CompilationCacheKey, CompilationCacheValue,
+                      CompilationCacheHash, CompilationCacheEq>
       compilation_cache_ GUARDED_BY(mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(NVPTXCompiler);
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 6c2662ebae..2b0792616e 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/container/node_hash_map.h"
 #include "absl/memory/memory.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
@@ -210,8 +211,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // post-orderring.
   // Must be cleared for each evaluation.
   // Storing Literal in place require the container to have pointer stability so
-  // we cannot use FlatMap any more.
-  std::unordered_map<const HloInstruction*, Literal> evaluated_;
+  // we cannot use flat_hash_map any more.
+  absl::node_hash_map<const HloInstruction*, Literal> evaluated_;
 
  private:
   template <typename ReturnT, typename NativeT>
-- 
GitLab


From feb0dc87078698fd335b528c661c54226a58efa9 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Tue, 2 Oct 2018 11:30:04 -0700
Subject: [PATCH 291/570] Remove dependency on contrib model_variable.

Also remove add_arg_scope.

PiperOrigin-RevId: 215426187
---
 tensorflow/contrib/quantize/BUILD             |  1 -
 .../contrib/quantize/python/quant_ops.py      | 28 +++++++++++++------
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index 23e3a25d71..94a2d9672d 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -138,7 +138,6 @@ py_library(
     srcs = ["python/quant_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index 27069444a4..d9dc7fa62e 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.framework.python.ops import add_arg_scope
-from tensorflow.contrib.framework.python.ops import model_variable
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -29,7 +27,6 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import moving_averages
 
 
-@add_arg_scope
 def FixedQuantize(inputs, init_min=-6.0, init_max=6.0, scope=None):
   """Adds a fake quantize layer with fixed quantization interval.
 
@@ -46,7 +43,21 @@ def FixedQuantize(inputs, init_min=-6.0, init_max=6.0, scope=None):
         inputs, min=init_min, max=init_max)
 
 
-@add_arg_scope
+def _ModelVariable(name,
+                   shape=None,
+                   initializer=None,
+                   collections=None,
+                   trainable=None):
+  collections = list(collections or [])
+  collections += [ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES]
+  return variable_scope.get_variable(
+      name,
+      shape=shape,
+      initializer=initializer,
+      collections=collections,
+      trainable=trainable)
+
+
 def LastValueQuantize(inputs,
                       per_channel=False,
                       init_min=-6.0,
@@ -93,13 +104,13 @@ def LastValueQuantize(inputs,
     else:
       min_max_shape = []
 
-    min_var = model_variable(
+    min_var = _ModelVariable(
         'min',
         shape=min_max_shape,
         initializer=init_ops.constant_initializer(init_min),
         collections=[vars_collection],
         trainable=False)
-    max_var = model_variable(
+    max_var = _ModelVariable(
         'max',
         shape=min_max_shape,
         initializer=init_ops.constant_initializer(init_max),
@@ -153,7 +164,6 @@ def LastValueQuantize(inputs,
         narrow_range=narrow_range)
 
 
-@add_arg_scope
 def MovingAvgQuantize(inputs,
                       per_channel=False,
                       init_min=-6.0,
@@ -202,13 +212,13 @@ def MovingAvgQuantize(inputs,
     else:
       min_max_shape = []
 
-    min_var = model_variable(
+    min_var = _ModelVariable(
         'min',
         shape=min_max_shape,
         initializer=init_ops.constant_initializer(init_min),
         collections=[vars_collection],
         trainable=False)
-    max_var = model_variable(
+    max_var = _ModelVariable(
         'max',
         shape=min_max_shape,
         initializer=init_ops.constant_initializer(init_max),
-- 
GitLab


From b4c23d661228b549186dc82c16ecb22d261becf6 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Tue, 2 Oct 2018 11:40:08 -0700
Subject: [PATCH 292/570] [XLA] Replace the last FlatMap in XLA with a simple
 array.

A hash map for 18 pointers is just a waste of space.

PiperOrigin-RevId: 215428176
---
 tensorflow/compiler/xla/service/hlo_evaluator.cc |  2 +-
 tensorflow/compiler/xla/service/hlo_evaluator.h  | 10 ++--------
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index d7c39b2778..eec8d242fa 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -1378,7 +1378,7 @@ Status HloEvaluator::HandleReduce(HloInstruction* reduce) {
             "unsupported");
       }
     }
-    return reduce->Visit(typed_visitors_.at(first_element_type).get());
+    return reduce->Visit(typed_visitors_[first_element_type].get());
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 2b0792616e..07f8d0aad4 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace xla {
@@ -135,7 +134,7 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // Wraps around instruction handling to infer types before dispatching to
   // the corresponding typed Visitor.
   Status DefaultAction(HloInstruction* hlo) override {
-    return hlo->Visit(typed_visitors_.at(hlo->shape().element_type()).get());
+    return hlo->Visit(typed_visitors_[hlo->shape().element_type()].get());
   }
 
   Status Preprocess(HloInstruction* hlo) override;
@@ -242,12 +241,7 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   }
 
   // Map from a primitive type to its associated (templated) DfsHloVisitor.
-  // Note: the hash function here is only needed because current gcc std::hash
-  // does not specialize for enum types. This should however be fixed in the
-  // future: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60970#c5
-  tensorflow::gtl::FlatMap<PrimitiveType, std::unique_ptr<DfsHloVisitor>,
-                           std::hash<int>>
-      typed_visitors_;
+  std::unique_ptr<DfsHloVisitor> typed_visitors_[PrimitiveType_ARRAYSIZE];
 
   // Caches pointers to input literals, assuming they are in post-order.
   // Literals are not owned by this class, and they must outlive the lifetime of
-- 
GitLab


From 16b44d48d485dbb62b9922e172df4cc460174046 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Tue, 2 Oct 2018 12:14:58 -0700
Subject: [PATCH 293/570] Fix the case when an object may have multiple
 directives with the same annotation.

PiperOrigin-RevId: 215435613
---
 tensorflow/python/autograph/core/BUILD        |  47 ++++---
 tensorflow/python/autograph/core/converter.py |  53 ++++----
 .../python/autograph/core/converter_test.py   | 124 ++++++++++++++++++
 3 files changed, 184 insertions(+), 40 deletions(-)
 create mode 100644 tensorflow/python/autograph/core/converter_test.py

diff --git a/tensorflow/python/autograph/core/BUILD b/tensorflow/python/autograph/core/BUILD
index 843e381f31..3ab2e7b1bc 100644
--- a/tensorflow/python/autograph/core/BUILD
+++ b/tensorflow/python/autograph/core/BUILD
@@ -33,6 +33,35 @@ py_library(
     ],
 )
 
+py_library(
+    name = "test_lib",
+    srcs = [
+        "converter_testing.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        ":core",
+        "//tensorflow/python/autograph/operators",
+        "//tensorflow/python/autograph/pyct",
+        "//tensorflow/python/autograph/pyct/static_analysis",
+        "//tensorflow/python/autograph/utils",
+        "@gast_archive//:gast",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "converter_test",
+    srcs = ["converter_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":core",
+        ":test_lib",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "errors_test",
     srcs = ["errors_test.py"],
@@ -67,21 +96,3 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
-
-py_library(
-    name = "test_lib",
-    srcs = [
-        "converter_testing.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = [
-        ":core",
-        "//tensorflow/python/autograph/operators",
-        "//tensorflow/python/autograph/pyct",
-        "//tensorflow/python/autograph/pyct/static_analysis",
-        "//tensorflow/python/autograph/utils",
-        "@gast_archive//:gast",
-        "@six_archive//:six",
-    ],
-)
diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py
index 80928ae7f4..408a573ad0 100644
--- a/tensorflow/python/autograph/core/converter.py
+++ b/tensorflow/python/autograph/core/converter.py
@@ -210,14 +210,22 @@ class Base(transformer.Base):
     self._ast_depth = 0
 
   def get_definition_directive(self, node, directive, arg, default):
-    """Returns the unique directive for a symbol, or a default if none exist.
+    """Returns the unique directive argument for a symbol.
 
     See lang/directives.py for details on directives.
 
+    Example:
+       # Given a directive in the code:
+       ag.foo_directive(bar, baz=1)
+
+       # One can write for an AST node Name(id='bar'):
+       get_definition_directive(node, ag.foo_directive, 'baz')
+
     Args:
-      node: ast.AST
-      directive: Callable[..., Any]
-      arg: str
+      node: ast.AST, the node representing the symbol for which the directive
+        argument is needed.
+      directive: Callable[..., Any], the directive to search.
+      arg: str, the directive argument to return.
       default: Any
 
     Raises:
@@ -227,27 +235,28 @@ class Base(transformer.Base):
     if not defs:
       return default
 
-    # TODO(mdan): Simplify this.
-    arg_values = []
+    arg_values_found = []
     for def_ in defs:
-      if (directive not in def_.directives or
-          arg not in def_.directives[directive]):
-        continue
-      arg_value = def_.directives[directive][arg]
-      for prev_value in arg_values:
-        if not ast_util.matches(arg_value, prev_value):
-          qn = anno.getanno(node, anno.Basic.QN)
-          raise ValueError('%s has ambiguous annotations for %s(%s): %s, %s' %
-                           (qn, directive.__name__, arg,
-                            compiler.ast_to_source(arg_value).strip(),
-                            compiler.ast_to_source(prev_value).strip()))
-      arg_values.append(arg_value)
-
-    if not arg_values:
+      if (directive in def_.directives and arg in def_.directives[directive]):
+        arg_values_found.append(def_.directives[directive][arg])
+
+    if not arg_values_found:
       return default
 
-    arg_value, = arg_values
-    return arg_value
+    if len(arg_values_found) == 1:
+      return arg_values_found[0]
+
+    # If multiple annotations reach the symbol, they must all match. If they do,
+    # return any of them.
+    first_value = arg_values_found[0]
+    for other_value in arg_values_found[1:]:
+      if not ast_util.matches(first_value, other_value):
+        qn = anno.getanno(node, anno.Basic.QN)
+        raise ValueError('%s has ambiguous annotations for %s(%s): %s, %s' %
+                         (qn, directive.__name__, arg,
+                          compiler.ast_to_source(other_value).strip(),
+                          compiler.ast_to_source(first_value).strip()))
+    return first_value
 
   def visit(self, node):
     if not self._ast_depth:
diff --git a/tensorflow/python/autograph/core/converter_test.py b/tensorflow/python/autograph/core/converter_test.py
new file mode 100644
index 0000000000..b73c67e337
--- /dev/null
+++ b/tensorflow/python/autograph/core/converter_test.py
@@ -0,0 +1,124 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for lists module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.autograph.core import converter
+from tensorflow.python.autograph.core import converter_testing
+from tensorflow.python.autograph.pyct import anno
+from tensorflow.python.autograph.pyct import parser
+from tensorflow.python.platform import test
+
+
+class TestConverter(converter.Base):
+  pass
+
+
+class ConverterBaseTest(converter_testing.TestCase):
+
+  def test_get_definition_directive_basic(self):
+
+    directive_key = object
+
+    def test_fn():
+      a = 1
+      return a
+
+    ns = {}
+    node, ctx = self.prepare(test_fn, ns)
+    symbol_a = node.body[1].value
+    defs, = anno.getanno(symbol_a, anno.Static.ORIG_DEFINITIONS)
+    defs.directives[directive_key] = {
+        'test_arg': parser.parse_expression('foo'),
+        'other_arg': parser.parse_expression('bar'),
+    }
+    c = TestConverter(ctx)
+    value = c.get_definition_directive(symbol_a, directive_key, 'test_arg',
+                                       None)
+    self.assertEqual(value.id, 'foo')
+
+  def test_get_definition_directive_default(self):
+
+    directive_key = object
+
+    def test_fn():
+      a = 1
+      return a
+
+    ns = {}
+    node, ctx = self.prepare(test_fn, ns)
+    symbol_a = node.body[1].value
+    c = TestConverter(ctx)
+    value = c.get_definition_directive(symbol_a, directive_key, 'test_arg',
+                                       parser.parse_expression('default'))
+    self.assertEqual(value.id, 'default')
+
+  def test_get_definition_directive_multiple_consistent(self):
+
+    directive_key = object
+
+    def test_fn():
+      a = 1
+      if a:
+        a = 2
+      return a
+
+    ns = {}
+    node, ctx = self.prepare(test_fn, ns)
+    symbol_a = node.body[2].value
+    defs = anno.getanno(symbol_a, anno.Static.ORIG_DEFINITIONS)
+    defs[0].directives[directive_key] = {
+        'test_arg': parser.parse_expression('foo'),
+        'other_arg': parser.parse_expression('bar'),
+    }
+    defs[1].directives[directive_key] = {
+        'test_arg': parser.parse_expression('foo'),
+        'other_arg': parser.parse_expression('baz'),
+    }
+    c = TestConverter(ctx)
+    value = c.get_definition_directive(symbol_a, directive_key, 'test_arg',
+                                       None)
+    self.assertEqual(value.id, 'foo')
+
+  def test_get_definition_directive_multiple_inconsistent(self):
+
+    directive_key = object
+
+    def test_fn():
+      a = 1
+      if a:
+        a = 2
+      return a
+
+    ns = {}
+    node, ctx = self.prepare(test_fn, ns)
+    symbol_a = node.body[2].value
+    defs = anno.getanno(symbol_a, anno.Static.ORIG_DEFINITIONS)
+    defs[0].directives[directive_key] = {
+        'test_arg': parser.parse_expression('foo'),
+    }
+    defs[1].directives[directive_key] = {
+        'test_arg': parser.parse_expression('bar'),
+    }
+    c = TestConverter(ctx)
+    with self.assertRaises(ValueError):
+      c.get_definition_directive(symbol_a, directive_key, 'test_arg', None)
+
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From 8d4ef71f06a06a093419bf0f80562a1941059029 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Tue, 2 Oct 2018 12:15:36 -0700
Subject: [PATCH 294/570] Allow creating a list from a tensor. Fix a few
 inconsistencies in the tensor list constructors.

PiperOrigin-RevId: 215435720
---
 .../autograph/lang/special_functions.py       | 24 ++++++++++--
 .../autograph/lang/special_functions_test.py  | 37 ++++++++++++++++++-
 .../autograph/operators/data_structures.py    | 17 ++++++++-
 .../operators/data_structures_test.py         | 31 ++++++++++++++--
 4 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/autograph/lang/special_functions.py b/tensorflow/python/autograph/lang/special_functions.py
index e4838d1b6d..62ac018ac4 100644
--- a/tensorflow/python/autograph/lang/special_functions.py
+++ b/tensorflow/python/autograph/lang/special_functions.py
@@ -24,6 +24,26 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.operators import data_structures
+from tensorflow.python.framework import tensor_util
+
+
+def _validate_list_constructor(elements, element_dtype, element_shape):
+  """Validates the inputs of tensor_list."""
+  if element_dtype is not None and element_shape is not None:
+    return
+  if tensor_util.is_tensor(elements):
+    return
+  if isinstance(elements, (list, tuple)):
+    if elements:
+      return
+    else:
+      raise ValueError(
+          'element_dtype and element_shape are required when elements are'
+          ' empty')
+
+  raise ValueError(
+      'unknown type for elements: {}; only Tensor, list and tuple are'
+      ' allowed'.format(type(elements)))
 
 
 def tensor_list(elements,
@@ -52,9 +72,7 @@ def tensor_list(elements,
   Raises:
     ValueError: for invalid arguments
   """
-  if not (elements or (element_dtype and element_shape)):
-    raise ValueError(
-        'element_dtype and element_shape are required for empty lists')
+  _validate_list_constructor(elements, element_dtype, element_shape)
   if use_tensor_array:
     return data_structures.tf_tensor_array_new(elements, element_dtype,
                                                element_shape)
diff --git a/tensorflow/python/autograph/lang/special_functions_test.py b/tensorflow/python/autograph/lang/special_functions_test.py
index 545dd11729..206a32d07c 100644
--- a/tensorflow/python/autograph/lang/special_functions_test.py
+++ b/tensorflow/python/autograph/lang/special_functions_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -28,12 +30,43 @@ from tensorflow.python.platform import test
 
 class SpecialFunctionsTest(test.TestCase):
 
+  def test_tensor_list_empty_list(self):
+    l = special_functions.tensor_list([],
+                                      element_dtype=dtypes.int32,
+                                      element_shape=())
+    sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(sl), [])
+
+    l = special_functions.tensor_list((),
+                                      element_dtype=dtypes.int32,
+                                      element_shape=())
+    sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(sl), [])
+
+  def test_tensor_list_tensor(self):
+    l = special_functions.tensor_list(
+        constant_op.constant([], dtype=dtypes.int32))
+    sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(sl), [])
+
+  def test_tensor_list_unsupported_initializer(self):
+    with self.assertRaisesRegexp(ValueError, 'unknown type'):
+      special_functions.tensor_list(np.array([1, 2, 3]))
+
+  def test_tensor_list_empty_list_no_type(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'element_dtype and element_shape are required'):
+      special_functions.tensor_list([])
+
   def test_tensor_list_from_elements(self):
     elements = [constant_op.constant([1, 2]), constant_op.constant([3, 4])]
 
     l = special_functions.tensor_list(elements)
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.cached_session() as sess:
+    with self.test_session() as sess:
       self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]])
 
   def test_tensor_list_array_from_elements(self):
@@ -41,7 +74,7 @@ class SpecialFunctionsTest(test.TestCase):
 
     l = special_functions.tensor_list(elements, use_tensor_array=True)
     sl = l.stack()
-    with self.cached_session() as sess:
+    with self.test_session() as sess:
       self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]])
 
   def test_stack(self):
diff --git a/tensorflow/python/autograph/operators/data_structures.py b/tensorflow/python/autograph/operators/data_structures.py
index cc0a3c3544..b3a3851333 100644
--- a/tensorflow/python/autograph/operators/data_structures.py
+++ b/tensorflow/python/autograph/operators/data_structures.py
@@ -106,6 +106,14 @@ def tf_tensor_array_new(elements, element_dtype=None, element_shape=None):
 
 def tf_tensor_list_new(elements, element_dtype=None, element_shape=None):
   """Overload of new_list that stages a Tensor list creation."""
+  if tensor_util.is_tensor(elements):
+    if element_shape is not None:
+      raise ValueError(
+          'element shape may not be specified when creating list from tensor')
+    element_shape = array_ops.shape(elements)[1:]
+    l = list_ops.tensor_list_from_tensor(elements, element_shape=element_shape)
+    return l
+
   elements = tuple(ops.convert_to_tensor(el) for el in elements)
 
   all_dtypes = set(el.dtype for el in elements)
@@ -115,13 +123,15 @@ def tf_tensor_list_new(elements, element_dtype=None, element_shape=None):
       raise ValueError(
           'incompatible dtype; specified: {}, inferred from {}: {}'.format(
               element_dtype, elements, inferred_dtype))
-  else:
+  elif all_dtypes:
     # Heterogeneous lists are ok.
     if element_dtype is not None:
       raise ValueError(
           'specified dtype {} is inconsistent with that of elements {}'.format(
               element_dtype, elements))
     inferred_dtype = dtypes.variant
+  else:
+    inferred_dtype = dtypes.variant
 
   all_shapes = set(tuple(el.shape.as_list()) for el in elements)
   if len(all_shapes) == 1:
@@ -130,19 +140,22 @@ def tf_tensor_list_new(elements, element_dtype=None, element_shape=None):
       raise ValueError(
           'incompatible shape; specified: {}, inferred from {}: {}'.format(
               element_shape, elements, inferred_shape))
-  else:
+  elif all_shapes:
     # Heterogeneous lists are ok.
     if element_shape is not None:
       raise ValueError(
           'specified shape {} is inconsistent with that of elements {}'.format(
               element_shape, elements))
     inferred_shape = constant_op.constant(-1)  # unknown shape, by convention
+  else:
+    inferred_shape = constant_op.constant(-1)  # unknown shape, by convention
 
   if element_dtype is None:
     element_dtype = inferred_dtype
   if element_shape is None:
     element_shape = inferred_shape
 
+  element_shape = ops.convert_to_tensor(element_shape, dtype=dtypes.int32)
   l = list_ops.empty_tensor_list(
       element_shape=element_shape, element_dtype=element_dtype)
   for el in elements:
diff --git a/tensorflow/python/autograph/operators/data_structures_test.py b/tensorflow/python/autograph/operators/data_structures_test.py
index 8532dbe466..6039b07982 100644
--- a/tensorflow/python/autograph/operators/data_structures_test.py
+++ b/tensorflow/python/autograph/operators/data_structures_test.py
@@ -45,6 +45,20 @@ class ListTest(test.TestCase):
     with self.cached_session() as sess:
       self.assertAllEqual(sess.run(t), [3, 4, 5])
 
+  def test_tf_tensor_list_new_empty(self):
+    l = data_structures.tf_tensor_list_new([],
+                                           element_dtype=dtypes.int32,
+                                           element_shape=())
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
+    with self.cached_session() as sess:
+      self.assertAllEqual(sess.run(t), [])
+
+  def test_tf_tensor_list_new_from_tensor(self):
+    l = data_structures.tf_tensor_list_new(constant_op.constant([3, 4, 5]))
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
+    with self.cached_session() as sess:
+      self.assertAllEqual(sess.run(t), [3, 4, 5])
+
   def test_tf_tensor_list_new_illegal_input(self):
     with self.assertRaises(ValueError):
       data_structures.tf_tensor_list_new([3, 4.0])
@@ -56,9 +70,8 @@ class ListTest(test.TestCase):
     with self.assertRaises(ValueError):
       data_structures.tf_tensor_list_new([3, 4], element_shape=(2,))
     with self.assertRaises(ValueError):
-      data_structures.tf_tensor_list_new([], element_shape=(2,))
-    with self.assertRaises(ValueError):
-      data_structures.tf_tensor_list_new([], element_dtype=dtypes.float32)
+      data_structures.tf_tensor_list_new(
+          constant_op.constant([1, 2, 3]), element_shape=[1])
 
   def test_tf_tensor_array_new(self):
     l = data_structures.tf_tensor_array_new([3, 4, 5])
@@ -141,6 +154,18 @@ class ListTest(test.TestCase):
       t = data_structures.list_stack(l, opts)
       self.assertAllEqual(sess.run(t), sess.run(initial_list))
 
+  def test_stack_tensor_list_empty(self):
+    l = list_ops.empty_tensor_list(
+        element_shape=-1,
+        element_dtype=dtypes.variant)
+
+    opts = data_structures.ListStackOpts(
+        element_dtype=dtypes.int32, original_call=None)
+
+    # TODO(mdan): Allow stacking empty lists if the dtype and shape are known.
+    with self.assertRaises(ValueError):
+      data_structures.list_stack(l, opts)
+
   def test_stack_fallback(self):
 
     def dummy_function(l):
-- 
GitLab


From d3e830e608211bc81cfb111abe3c0357bd92a12e Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Tue, 2 Oct 2018 12:38:53 -0700
Subject: [PATCH 295/570] Disable fused_conv tests that don't build in
 open-source.

PiperOrigin-RevId: 215440356
---
 tensorflow/contrib/fused_conv/BUILD | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD
index 490da9b33b..57a5bfbf43 100644
--- a/tensorflow/contrib/fused_conv/BUILD
+++ b/tensorflow/contrib/fused_conv/BUILD
@@ -145,6 +145,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     tags = [
+        "manual",  # TODO(b/117128481): re-enable after fixing OSS build
         "no_pip",
         "requires-gpu-sm70",
     ],
@@ -169,6 +170,7 @@ cuda_py_test(
     ],
     main = "python/ops/fused_conv2d_bias_activation_benchmark.py",
     tags = [
+        "manual",  # TODO(b/117128481): re-enable after fixing OSS build
         "requires-gpu-sm70",
     ],
 )
-- 
GitLab


From 508dd179b6b6dd78aa3e24212648789e8fc018a0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 12:41:31 -0700
Subject: [PATCH 296/570] Allow passing --allow_nonexistent_arrays via
 toco_convert

PiperOrigin-RevId: 215440829
---
 tensorflow/contrib/lite/python/convert.py    |  8 +++++++-
 tensorflow/contrib/lite/toco/tooling_util.cc | 19 +++++++++++--------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index 613a1530f7..1bf42d7551 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -155,7 +155,8 @@ def build_toco_convert_protos(input_tensors,
                               post_training_quantize=False,
                               dump_graphviz_dir=None,
                               dump_graphviz_video=False,
-                              converter_mode=ConverterMode.DEFAULT):
+                              converter_mode=ConverterMode.DEFAULT,
+                              allow_nonexistent_arrays=False):
   """Builds protocol buffers describing a conversion of a model using TOCO.
 
   Typically this is to convert from TensorFlow GraphDef to TFLite, in which
@@ -212,6 +213,8 @@ def build_toco_convert_protos(input_tensors,
       every graph transformation. (default False)
     converter_mode: Experimental flag, subject to change. ConverterMode
       indicating which converter to use. (default ConverterMode.DEFAULT)
+    allow_nonexistent_arrays: Allow specifying array names that don't exist
+      or are unused in the final graph.  (default False)
 
   Returns:
     model_flags, toco_flags: two protocol buffers describing the conversion
@@ -261,6 +264,9 @@ def build_toco_convert_protos(input_tensors,
 
   for output_tensor in output_tensors:
     model.output_arrays.append(tensor_name(output_tensor))
+
+  model.allow_nonexistent_arrays = allow_nonexistent_arrays
+
   return model, toco
 
 
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index b87e01fbf0..e3f27e9e2a 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -852,27 +852,30 @@ void CheckNonExistentIOArrays(const Model& model) {
   if (model.flags.allow_nonexistent_arrays()) {
     return;
   }
+  static constexpr char general_comment[] =
+      "Is it a typo? To silence this message, pass this flag:  "
+      "allow_nonexistent_arrays";
   for (const auto& input_array : model.flags.input_arrays()) {
     QCHECK(GetOpWithInput(model, input_array.name()))
-        << "Specified input array " << input_array.name()
-        << " is not consumed by any op in this graph. Is it a typo?";
+        << "Specified input array \"" << input_array.name()
+        << "\" is not consumed by any op in this graph. " << general_comment;
   }
   for (const string& output_array : model.flags.output_arrays()) {
     QCHECK(GetOpWithOutput(model, output_array))
-        << "Specified output array " << output_array
-        << " is not produced by any op in this graph. Is it a typo?";
+        << "Specified output array \"" << output_array
+        << "\" is not produced by any op in this graph. " << general_comment;
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
     if (!rnn_state.discardable()) {
       // Check that all RNN states are consumed
       QCHECK(GetOpWithInput(model, rnn_state.state_array()))
-          << "Specified RNN state " << rnn_state.state_array()
-          << " is not consumed by any op in this graph. Is it a typo?";
+          << "Specified RNN state \"" << rnn_state.state_array()
+          << "\" is not consumed by any op in this graph. " << general_comment;
       // Check that all RNN back-edge source arrays are produced
       QCHECK(GetOpWithOutput(model, rnn_state.back_edge_source_array()))
-          << "Specified RNN back-edge source array "
+          << "Specified RNN back-edge source array \""
           << rnn_state.back_edge_source_array()
-          << " is not produced by any op in this graph. Is it a typo?";
+          << "\" is not produced by any op in this graph. " << general_comment;
     }
   }
 }
-- 
GitLab


From 0a201955b47d484c6bfa149364c264a5b5f91be7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 12:47:05 -0700
Subject: [PATCH 297/570] Copy tf.distributions to tfp.distributions, and
 deprecate the tf.distributions API.

PiperOrigin-RevId: 215441733
---
 .../python/debug/examples/examples_test.sh    |  2 +-
 tensorflow/python/ops/distributions/BUILD     |  7 ++++++
 .../python/ops/distributions/bernoulli.py     |  9 +++++++
 tensorflow/python/ops/distributions/beta.py   | 14 +++++++++++
 .../python/ops/distributions/categorical.py   |  9 +++++++
 .../python/ops/distributions/dirichlet.py     |  9 +++++++
 .../distributions/dirichlet_multinomial.py    |  9 +++++++
 .../python/ops/distributions/distribution.py  | 17 +++++++++++++
 .../python/ops/distributions/exponential.py   | 13 ++++++++++
 tensorflow/python/ops/distributions/gamma.py  | 14 +++++++++++
 .../ops/distributions/identity_bijector.py    |  9 +++++++
 .../ops/distributions/kullback_leibler.py     | 25 +++++++++++++++++++
 .../python/ops/distributions/laplace.py       | 14 +++++++++++
 .../python/ops/distributions/multinomial.py   |  9 +++++++
 tensorflow/python/ops/distributions/normal.py | 14 +++++++++++
 .../python/ops/distributions/student_t.py     | 14 +++++++++++
 .../distributions/transformed_distribution.py |  9 +++++++
 .../python/ops/distributions/uniform.py       |  9 +++++++
 18 files changed, 205 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/debug/examples/examples_test.sh b/tensorflow/python/debug/examples/examples_test.sh
index f7d597c8c0..89dc918616 100755
--- a/tensorflow/python/debug/examples/examples_test.sh
+++ b/tensorflow/python/debug/examples/examples_test.sh
@@ -115,7 +115,7 @@ OUTPUT=$(${OFFLINE_ANALYZER_BIN} 2>&1)
 set -e
 
 EXPECTED_OUTPUT="ERROR: dump_dir flag is empty."
-if [[ "${OUTPUT}" != "${EXPECTED_OUTPUT}" ]]; then
+if ! echo "${OUTPUT}" | grep -q "${EXPECTED_OUTPUT}"; then
   echo "ERROR: offline_analyzer output didn't match expectation: ${OUTPUT}" 1>&2
   echo "Expected output: ${EXPECTED_OUTPUT}"
   exit 1
diff --git a/tensorflow/python/ops/distributions/BUILD b/tensorflow/python/ops/distributions/BUILD
index e7ad028376..59ba9aee59 100644
--- a/tensorflow/python/ops/distributions/BUILD
+++ b/tensorflow/python/ops/distributions/BUILD
@@ -12,6 +12,13 @@ py_library(
         ["*.py"],
         exclude = ["util.py"],
     ),
+    deprecation = ("TensorFlow Distributions has migrated to " +
+                   "TensorFlow Probability " +
+                   "(https://github.com/tensorflow/probability). " +
+                   "Deprecated copies remaining in tf.distributions " +
+                   "will not receive new features, and will be removed by " +
+                   "early 2019. You should update all usage of " +
+                   "`tf.distributions` to `tfp.distributions`."),
     srcs_version = "PY2AND3",
     deps = [
         ":util",
diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
index 84d9d40a35..baecc321d3 100644
--- a/tensorflow/python/ops/distributions/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -39,6 +40,14 @@ class Bernoulli(distribution.Distribution):
   `1` outcome (vs a `0` outcome).
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                logits=None,
                probs=None,
diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py
index d6f89a3517..51c4f6eb3d 100644
--- a/tensorflow/python/ops/distributions/beta.py
+++ b/tensorflow/python/ops/distributions/beta.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -150,6 +151,14 @@ class Beta(distribution.Distribution):
 
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                concentration1=None,
                concentration0=None,
@@ -341,6 +350,11 @@ class Beta(distribution.Distribution):
 class BetaWithSoftplusConcentration(Beta):
   """Beta with softplus transform of `concentration1` and `concentration0`."""
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "Use `tfd.Beta(tf.nn.softplus(concentration1), "
+      "tf.nn.softplus(concentration2))` instead.",
+      warn_once=True)
   def __init__(self,
                concentration1,
                concentration0,
diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index fbbacf2521..26a3da2fb6 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -149,6 +150,14 @@ class Categorical(distribution.Distribution):
 
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(
       self,
       logits=None,
diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
index 997b1d392d..675c30b383 100644
--- a/tensorflow/python/ops/distributions/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -156,6 +157,14 @@ class Dirichlet(distribution.Distribution):
 
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                concentration,
                validate_args=False,
diff --git a/tensorflow/python/ops/distributions/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
index 5350c82847..2e3151a5ab 100644
--- a/tensorflow/python/ops/distributions/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -163,6 +164,14 @@ class DirichletMultinomial(distribution.Distribution):
 
   # TODO(b/27419586) Change docstring for dtype of concentration once int
   # allowed.
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                total_count,
                concentration,
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index 12fd039392..4741370cd8 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
@@ -229,6 +230,14 @@ class ReparameterizationType(object):
     gradients / surrogate loss instead.
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self, rep_type):
     self._rep_type = rep_type
 
@@ -405,6 +414,14 @@ class Distribution(_BaseDistribution):
 
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                dtype,
                reparameterization_type,
diff --git a/tensorflow/python/ops/distributions/exponential.py b/tensorflow/python/ops/distributions/exponential.py
index 02129b5e2a..6a52af8c33 100644
--- a/tensorflow/python/ops/distributions/exponential.py
+++ b/tensorflow/python/ops/distributions/exponential.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import gamma
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -70,6 +71,14 @@ class Exponential(gamma.Gamma):
 
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                rate,
                validate_args=False,
@@ -138,6 +147,10 @@ class Exponential(gamma.Gamma):
 class ExponentialWithSoftplusRate(Exponential):
   """Exponential with softplus transform on `rate`."""
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "Use `tfd.Exponential(tf.nn.softplus(rate)).",
+      warn_once=True)
   def __init__(self,
                rate,
                validate_args=False,
diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py
index bbc64da7bc..4a2db208d4 100644
--- a/tensorflow/python/ops/distributions/gamma.py
+++ b/tensorflow/python/ops/distributions/gamma.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -121,6 +122,14 @@ class Gamma(distribution.Distribution):
 
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                concentration,
                rate,
@@ -279,6 +288,11 @@ class Gamma(distribution.Distribution):
 class GammaWithSoftplusConcentrationRate(Gamma):
   """`Gamma` with softplus of `concentration` and `rate`."""
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "Use `tfd.Gamma(tf.nn.softplus(concentration), "
+      "tf.nn.softplus(rate))` instead.",
+      warn_once=True)
   def __init__(self,
                concentration,
                rate,
diff --git a/tensorflow/python/ops/distributions/identity_bijector.py b/tensorflow/python/ops/distributions/identity_bijector.py
index 8628e68f96..eded96f5bc 100644
--- a/tensorflow/python/ops/distributions/identity_bijector.py
+++ b/tensorflow/python/ops/distributions/identity_bijector.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.util import deprecation
 
 
 __all__ = [
@@ -43,6 +44,14 @@ class Identity(bijector.Bijector):
 
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self, validate_args=False, name="identity"):
     super(Identity, self).__init__(
         forward_min_event_ndims=0,
diff --git a/tensorflow/python/ops/distributions/kullback_leibler.py b/tensorflow/python/ops/distributions/kullback_leibler.py
index fdeb97bf64..12743fa23d 100644
--- a/tensorflow/python/ops/distributions/kullback_leibler.py
+++ b/tensorflow/python/ops/distributions/kullback_leibler.py
@@ -22,6 +22,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
@@ -51,6 +52,14 @@ def _registered_kl(type_a, type_b):
   return kl_fn
 
 
+@deprecation.deprecated(
+    "2019-01-01",
+    "The TensorFlow Distributions library has moved to "
+    "TensorFlow Probability "
+    "(https://github.com/tensorflow/probability). You "
+    "should update all references to use `tfp.distributions` "
+    "instead of `tf.distributions`.",
+    warn_once=True)
 @tf_export("distributions.kl_divergence")
 def kl_divergence(distribution_a, distribution_b,
                   allow_nan_stats=True, name=None):
@@ -112,6 +121,14 @@ def kl_divergence(distribution_a, distribution_b,
       return array_ops.identity(kl_t, name="checked_kl")
 
 
+@deprecation.deprecated(
+    "2019-01-01",
+    "The TensorFlow Distributions library has moved to "
+    "TensorFlow Probability "
+    "(https://github.com/tensorflow/probability). You "
+    "should update all references to use `tfp.distributions` "
+    "instead of `tf.distributions`.",
+    warn_once=True)
 def cross_entropy(ref, other,
                   allow_nan_stats=True, name=None):
   """Computes the (Shannon) cross entropy.
@@ -155,6 +172,14 @@ class RegisterKL(object):
     # Return KL(norm_a || norm_b)
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self, dist_cls_a, dist_cls_b):
     """Initialize the KL registrar.
 
diff --git a/tensorflow/python/ops/distributions/laplace.py b/tensorflow/python/ops/distributions/laplace.py
index be17cf2527..4f6a8f587d 100644
--- a/tensorflow/python/ops/distributions/laplace.py
+++ b/tensorflow/python/ops/distributions/laplace.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import special_math
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -71,6 +72,14 @@ class Laplace(distribution.Distribution):
 
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                loc,
                scale,
@@ -211,6 +220,11 @@ class Laplace(distribution.Distribution):
 class LaplaceWithSoftplusScale(Laplace):
   """Laplace with softplus applied to `scale`."""
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "Use `tfd.Laplace(loc, tf.nn.softplus(scale)) "
+      "instead.",
+      warn_once=True)
   def __init__(self,
                loc,
                scale,
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index d0943e8eee..8397353cd5 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -148,6 +149,14 @@ class Multinomial(distribution.Distribution):
   ```
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                total_count,
                logits=None,
diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py
index 2feaf806c0..9f511709b9 100644
--- a/tensorflow/python/ops/distributions/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import special_math
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -106,6 +107,14 @@ class Normal(distribution.Distribution):
 
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                loc,
                scale,
@@ -240,6 +249,11 @@ class Normal(distribution.Distribution):
 class NormalWithSoftplusScale(Normal):
   """Normal with softplus applied to `scale`."""
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "Use `tfd.Normal(loc, tf.nn.softplus(scale)) "
+      "instead.",
+      warn_once=True)
   def __init__(self,
                loc,
                scale,
diff --git a/tensorflow/python/ops/distributions/student_t.py b/tensorflow/python/ops/distributions/student_t.py
index e8d214bbe0..b69e61925c 100644
--- a/tensorflow/python/ops/distributions/student_t.py
+++ b/tensorflow/python/ops/distributions/student_t.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -140,6 +141,14 @@ class StudentT(distribution.Distribution):
 
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                df,
                loc,
@@ -361,6 +370,11 @@ class StudentT(distribution.Distribution):
 class StudentTWithAbsDfSoftplusScale(StudentT):
   """StudentT with `df = floor(abs(df))` and `scale = softplus(scale)`."""
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "Use `tfd.StudentT(tf.floor(tf.abs(df)), loc, "
+      "tf.nn.softplus(scale)) instead.",
+      warn_once=True)
   def __init__(self,
                df,
                loc,
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index e80bf9ee42..1becfc1877 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import distribution as distribution_lib
 from tensorflow.python.ops.distributions import identity_bijector
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util import deprecation
 
 __all__ = [
     "TransformedDistribution",
@@ -227,6 +228,14 @@ class TransformedDistribution(distribution_lib.Distribution):
 
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                distribution,
                bijector=None,
diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py
index e66c4a37e7..b6b24187cc 100644
--- a/tensorflow/python/ops/distributions/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -76,6 +77,14 @@ class Uniform(distribution.Distribution):
 
   """
 
+  @deprecation.deprecated(
+      "2019-01-01",
+      "The TensorFlow Distributions library has moved to "
+      "TensorFlow Probability "
+      "(https://github.com/tensorflow/probability). You "
+      "should update all references to use `tfp.distributions` "
+      "instead of `tf.distributions`.",
+      warn_once=True)
   def __init__(self,
                low=0.,
                high=1.,
-- 
GitLab


From 78e4ce52aeda5a10ddaf5e64ea8958f439a2f9f2 Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Tue, 2 Oct 2018 13:08:39 -0700
Subject: [PATCH 298/570] Add proto serialization/deserialization testing to
 the HLO parser tests. Many of the HLO parser tests verify that an text form
 of an HLO module preserves all information when running through ToString then
 parsing. It makes sense to also use these tests to exercise proto
 serialization/deserialization. This is done by adding additional
 instantiations of the parameterized parsing tests. This caught several bugs
 which are fixed in this CL:

(1) Domain instructions were not being serialized properly.
(2) Host send/recv instructions did not preserve the is_host_transfer bit.
(3) Sparse literals could not be serialized or deserialized.

PiperOrigin-RevId: 215445200
---
 tensorflow/compiler/xla/literal.cc            | 18 ++++
 tensorflow/compiler/xla/literal_test.cc       | 10 +++
 tensorflow/compiler/xla/service/BUILD         | 20 +----
 tensorflow/compiler/xla/service/hlo.proto     |  6 +-
 .../compiler/xla/service/hlo_instruction.cc   | 33 +++++--
 .../compiler/xla/service/hlo_instructions.cc  | 21 +++++
 .../compiler/xla/service/hlo_instructions.h   |  3 +
 .../compiler/xla/service/hlo_parser_test.cc   | 85 +++++++++++++------
 8 files changed, 141 insertions(+), 55 deletions(-)

diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 5035f41988..d1dad0d45f 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -1850,6 +1850,24 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) {
   TF_RET_CHECK(LayoutUtil::HasLayout(proto.shape()));
   TF_RET_CHECK(ShapeUtil::Equal(proto.shape(), subshape()));
 
+  if (LayoutUtil::IsSparseArray(subshape())) {
+    // Compute the number of elements (indices) in the sparse shape and reserve
+    // the necessary space in spare_indices.
+    TF_RET_CHECK(ShapeUtil::Rank(subshape()) != 0)
+        << "Scalar shapes cannot be sparse";
+    TF_RET_CHECK(proto.sparse_indices_size() % ShapeUtil::Rank(subshape()) == 0)
+        << "Unexpected number of indices in proto ("
+        << proto.sparse_indices_size() << ") for shape of rank "
+        << ShapeUtil::Rank(subshape());
+    const int64 index_count =
+        proto.sparse_indices_size() / ShapeUtil::Rank(subshape());
+    sparse_indices()->Resize(index_count);
+
+    // Copy the indices from the proto into the SparseIndexArray object.
+    TF_RETURN_IF_ERROR(CopyFromRepeatedField(sparse_indices()->mutable_data(),
+                                             proto.sparse_indices()));
+  }
+
   switch (subshape().element_type()) {
     case PRED:
       TF_RETURN_IF_ERROR(CopyFromRepeatedField(data<bool>(), proto.preds()));
diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc
index 7ad287c897..dd5b54e4c9 100644
--- a/tensorflow/compiler/xla/literal_test.cc
+++ b/tensorflow/compiler/xla/literal_test.cc
@@ -224,6 +224,16 @@ TEST_F(LiteralUtilTest, CreateSparse) {
             absl::Span<const int64>(expected_indices.data(),
                                     expected_indices.num_elements()));
   EXPECT_EQ(literal.data<int64>(), absl::Span<const int64>(expected_values));
+
+  // Serialize then deserialize and verify the resulting literal.
+  TF_ASSERT_OK_AND_ASSIGN(Literal literal_from_proto,
+                          Literal::CreateFromProto(literal.ToProto()));
+
+  EXPECT_EQ(literal_from_proto.sparse_indices()->data(),
+            absl::Span<const int64>(expected_indices.data(),
+                                    expected_indices.num_elements()));
+  EXPECT_EQ(literal_from_proto.data<int64>(),
+            absl::Span<const int64>(expected_values));
 }
 
 TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 3f8b734afb..f329a27e14 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -300,6 +300,7 @@ cc_library(
         "hlo_opcode.cc",
         "hlo_schedule.cc",
         "hlo_sharding.cc",
+        "hlo_sharding_metadata.cc",
     ],
     hdrs = [
         "dfs_hlo_visitor.h",
@@ -313,6 +314,7 @@ cc_library(
         "hlo_opcode.h",
         "hlo_schedule.h",
         "hlo_sharding.h",
+        "hlo_sharding_metadata.h",
     ],
     deps = [
         ":hlo_casting_utils",
@@ -2759,22 +2761,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "hlo_sharding_metadata",
-    srcs = ["hlo_sharding_metadata.cc"],
-    hdrs = [
-        "hlo_sharding_metadata.h",
-    ],
-    deps = [
-        ":hlo",
-        "//tensorflow/compiler/xla:shape_tree",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
 cc_library(
     name = "hlo_domain_verifier",
     srcs = ["hlo_domain_verifier.cc"],
@@ -2825,7 +2811,6 @@ tf_cc_test(
         ":hlo_domain_isolator",
         ":hlo_domain_remover",
         ":hlo_parser",
-        ":hlo_sharding_metadata",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -3441,7 +3426,6 @@ cc_library(
     deps = [
         ":hlo",
         ":hlo_lexer",
-        ":hlo_sharding_metadata",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index caaca16f71..1ea26ddd5b 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -34,7 +34,7 @@ import "tensorflow/compiler/xla/xla_data.proto";
 option cc_enable_arenas = true;
 
 // Serialization of HloInstruction.
-// Next ID: 54
+// Next ID: 56
 message HloInstructionProto {
   reserved 10;
   reserved "parameter_name";
@@ -180,6 +180,10 @@ message HloInstructionProto {
 
   // Collective permute field.
   repeated SourceTarget source_target_pairs = 52;
+
+  // Sharding for kDomain instructions.
+  xla.OpSharding domain_entry_sharding = 54;
+  xla.OpSharding domain_exit_sharding = 55;
 }
 
 // Serialization of HloComputation.
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 0207f9ae3f..de22b2d3a5 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -467,14 +468,27 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           proto.dot_dimension_numbers(), precision_config);
       break;
     }
-    case HloOpcode::kDomain:
+    case HloOpcode::kDomain: {
       TF_RET_CHECK(proto.operand_ids_size() == 1)
           << "Domain instruction should have 1 operands but sees "
           << proto.operand_ids_size();
+      TF_RET_CHECK(proto.has_domain_entry_sharding())
+          << "Domain instruction must domain_entry_sharding";
+      TF_RET_CHECK(proto.has_domain_exit_sharding())
+          << "Domain instruction must domain_exit_sharding";
+      TF_ASSIGN_OR_RETURN(
+          HloSharding entry_hlo_sharding,
+          HloSharding::FromProto(proto.domain_entry_sharding()));
+      TF_ASSIGN_OR_RETURN(HloSharding exit_hlo_sharding,
+                          HloSharding::FromProto(proto.domain_exit_sharding()));
       instruction = absl::make_unique<HloDomainInstruction>(
-          proto.shape(), operands(0), /*operand_side_metadata=*/nullptr,
-          /*user_side_metadata=*/nullptr);
+          proto.shape(), operands(0),
+          absl::make_unique<ShardingMetadata>(
+              std::make_shared<const HloSharding>(entry_hlo_sharding)),
+          absl::make_unique<ShardingMetadata>(
+              std::make_shared<const HloSharding>(exit_hlo_sharding)));
       break;
+    }
     default: {
       instruction = absl::WrapUnique(new HloInstruction(opcode, proto.shape()));
       for (const int64 operand_id : proto.operand_ids()) {
@@ -482,12 +496,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
             << "No instruction with id " << operand_id;
         instruction->AppendOperand(instruction_map.at(operand_id));
       }
-      for (const int64 predecessor_id : proto.control_predecessor_ids()) {
-        TF_RET_CHECK(ContainsKey(instruction_map, predecessor_id))
-            << "No instruction with id " << predecessor_id;
-        TF_RETURN_IF_ERROR(instruction_map.at(predecessor_id)
-                               ->AddControlDependencyTo(instruction.get()));
-      }
       if (instruction->opcode() != HloOpcode::kFusion) {
         for (const int64 computation_id : proto.called_computation_ids()) {
           TF_RET_CHECK(ContainsKey(computation_map, computation_id))
@@ -503,6 +511,13 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     }
   }
 
+  for (const int64 predecessor_id : proto.control_predecessor_ids()) {
+    TF_RET_CHECK(ContainsKey(instruction_map, predecessor_id))
+        << "No instruction with id " << predecessor_id;
+    TF_RETURN_IF_ERROR(instruction_map.at(predecessor_id)
+                           ->AddControlDependencyTo(instruction.get()));
+  }
+
   TF_RET_CHECK(!proto.name().empty());
   instruction->SetAndSanitizeName(proto.name());
   instruction->metadata_ = proto.metadata();
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 1bc168c8b7..68d0979f5c 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h"
 #include "tensorflow/compiler/xla/window_util.h"
 
 namespace xla {
@@ -213,6 +214,7 @@ HloSendRecvInstruction::HloSendRecvInstruction(HloOpcode opcode,
 HloInstructionProto HloSendRecvInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   proto.set_channel_id(channel_id_);
+  proto.set_is_host_transfer(is_host_transfer_);
   return proto;
 }
 
@@ -2310,4 +2312,23 @@ std::unique_ptr<HloInstruction> HloDomainInstruction::CloneWithNewOperandsImpl(
       shape, new_operands[0], operand_side_metadata_->Clone(),
       user_side_metadata_->Clone());
 }
+
+HloInstructionProto HloDomainInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  auto operand_side_sharding =
+      dynamic_cast<const ShardingMetadata*>(operand_side_metadata_.get());
+  if (operand_side_sharding) {
+    *proto.mutable_domain_entry_sharding() =
+        operand_side_sharding->sharding()->ToProto();
+  }
+
+  auto user_side_sharding =
+      dynamic_cast<const ShardingMetadata*>(user_side_metadata_.get());
+  if (user_side_sharding) {
+    *proto.mutable_domain_exit_sharding() =
+        user_side_sharding->sharding()->ToProto();
+  }
+
+  return proto;
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index 9c22f5db7e..c929867bb9 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -1341,6 +1341,9 @@ class HloDomainInstruction : public HloInstruction {
       std::unique_ptr<DomainMetadata> operand_side_metadata,
       std::unique_ptr<DomainMetadata> user_side_metadata);
 
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
   // Retrieves the operand side metadata of a kDomain instruction.
   const DomainMetadata& operand_side_metadata() const {
     return *operand_side_metadata_;
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index 96db96bdb9..dd4ee780f0 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -1163,49 +1163,80 @@ ENTRY Sort {
   // clang-format on
 }
 
-class HloParserTest : public ::testing::Test,
-                      public ::testing::WithParamInterface<TestData> {
+// The test class for those tests defined above which round-trip through the
+// parser and ToString is templatized on two bool parameters:
+//
+//  short_form : used for the "short" test cases which use the ShortParsable
+//    output form.
+//  proto_round_trip : whether the module should also be round-tripped through
+//    HloProto form. This provides much better coverage for the proto
+//    serialization/deserialization.
+//
+// The proto_round_trip=true case also technically covers the Parser->ToString
+// roundtrip as well, but separating out the Parser->ToString roundtrip as its
+// own test provides better isolation and could conceivably catch weirdo bugs
+// which are hidden by interaction between the textual and proto roundtripping.
+template <bool short_form, bool proto_round_trip>
+class HloParameterizedParserTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<TestData> {
  protected:
-  static void ExpectHasSubstr(string_view s, string_view expected) {
-    EXPECT_TRUE(absl::StrContains(s, expected))
-        << "'" << s << "' does not contain '" << expected << "'";
-  }
-
   // Expects "ToString(ParseHloString(string)) == string", that is, parses the
   // string, asserts that it succeeded, stringifies the parsed module, and
   // checks that the it equals the original string.
   void ExpectEqual() {
     const string& original = GetParam().module_string;
-    auto result = ParseHloString(original);
-    TF_ASSERT_OK(result.status());
-    EXPECT_EQ(original, result.ValueOrDie()->ToString(
-                            HloPrintOptions().set_print_large_constants(true)));
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                            ParseHloString(original));
+    if (proto_round_trip) {
+      TF_ASSERT_OK_AND_ASSIGN(module, HloModule::CreateFromProto(
+                                          module->ToProto(), module->config()));
+    }
+    if (short_form) {
+      EXPECT_EQ(original, module->ToString(HloPrintOptions::ShortParsable()));
+    } else {
+      EXPECT_EQ(
+          original,
+          module->ToString(HloPrintOptions().set_print_large_constants(true)));
+    }
   }
 };
 
-class HloParserShortTest : public HloParserTest {
- protected:
-  void ExpectEqualShort() {
-    const string& original = GetParam().module_string;
-    auto result = ParseHloString(original);
-    TF_ASSERT_OK(result.status());
-    EXPECT_EQ(original,
-              result.ValueOrDie()->ToString(HloPrintOptions::ShortParsable()));
-  }
-};
+// These using shenanigans are required because the TEST_P macro doesn't like
+// template instantiations which contain commas.
+using HloParserTestLong = HloParameterizedParserTest<false, false>;
+using HloParserTestLongProto = HloParameterizedParserTest<false, true>;
+using HloParserTestShort = HloParameterizedParserTest<true, false>;
+using HloParserTestShortProto = HloParameterizedParserTest<true, true>;
 
-TEST_P(HloParserTest, Run) { ExpectEqual(); }
+TEST_P(HloParserTestLong, Run) { ExpectEqual(); }
+TEST_P(HloParserTestLongProto, Run) { ExpectEqual(); }
+TEST_P(HloParserTestShort, Run) { ExpectEqual(); }
+TEST_P(HloParserTestShortProto, Run) { ExpectEqual(); }
 
-TEST_P(HloParserShortTest, Run) { ExpectEqualShort(); }
-
-INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTest,
+INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTestLong,
                         ::testing::ValuesIn(CreateTestCases()),
                         TestDataToString);
-
-INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserShortTest,
+INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation,
+                        HloParserTestLongProto,
+                        ::testing::ValuesIn(CreateTestCases()),
+                        TestDataToString);
+INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTestShort,
+                        ::testing::ValuesIn(CreateShortTestCases()),
+                        TestDataToString);
+INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation,
+                        HloParserTestShortProto,
                         ::testing::ValuesIn(CreateShortTestCases()),
                         TestDataToString);
 
+class HloParserTest : public ::testing::Test {
+ protected:
+  static void ExpectHasSubstr(string_view s, string_view expected) {
+    EXPECT_TRUE(absl::StrContains(s, expected))
+        << "'" << s << "' does not contain '" << expected << "'";
+  }
+};
+
 TEST_F(HloParserTest, Empty) {
   const string original = "";
   auto result = ParseHloString(original);
-- 
GitLab


From 8d12c635cc48e896da0bcac1cd568bd6381ca64e Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Tue, 2 Oct 2018 13:18:27 -0700
Subject: [PATCH 299/570] Support shape_invariants in while_v2. Note that this
 arg is temporary and may be replaced by automatic shape inference in TF 2.0
 (or before). Add a output_shapes attr to While op to allow output shapes to
 be different from the incoming loop_vars.

PiperOrigin-RevId: 215446737
---
 .../function_functional_while.pbtxt           |  7 +++
 tensorflow/core/ops/functional_ops.cc         | 23 +++++++-
 .../kernel_tests/control_flow_ops_py_test.py  | 11 ++--
 tensorflow/python/ops/control_flow_ops.py     |  3 +-
 tensorflow/python/ops/while_v2.py             | 59 ++++++++++++++++---
 5 files changed, 86 insertions(+), 17 deletions(-)

diff --git a/tensorflow/core/grappler/costs/graph_properties_testdata/function_functional_while.pbtxt b/tensorflow/core/grappler/costs/graph_properties_testdata/function_functional_while.pbtxt
index c94ee2f227..0ec95dd684 100644
--- a/tensorflow/core/grappler/costs/graph_properties_testdata/function_functional_while.pbtxt
+++ b/tensorflow/core/grappler/costs/graph_properties_testdata/function_functional_while.pbtxt
@@ -88,6 +88,13 @@ library {
           }
         }
       }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+          }
+        }
+      }
     }
     ret {
       key: "while"
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index bda4a75c5d..fed3fa22ed 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -150,10 +150,29 @@ REGISTER_OP("While")
     .Attr("T: list(type) >= 0")
     .Attr("cond: func")
     .Attr("body: func")
+    .Attr("output_shapes: list(shape) = []")
     .SetIsStateful()
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      for (int i = 0; i < c->num_outputs(); ++i) {
-        c->set_output(i, c->input(i));
+      std::vector<PartialTensorShape> output_shapes;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+      // If `output_shapes` attr is set use that as the shapes of the outputs
+      // else use the input shapes.
+      if (!output_shapes.empty()) {
+        if (output_shapes.size() != c->num_outputs()) {
+          return errors::InvalidArgument(
+              "`output_shapes` must be the same length as num outputs (",
+              output_shapes.size(), " vs. ", c->num_outputs());
+        }
+        for (size_t i = 0; i < output_shapes.size(); ++i) {
+          shape_inference::ShapeHandle output_shape_handle;
+          TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+              output_shapes[i], &output_shape_handle));
+          c->set_output(static_cast<int>(i), output_shape_handle);
+        }
+      } else {
+        for (int i = 0; i < c->num_outputs(); ++i) {
+          c->set_output(i, c->input(i));
+        }
       }
       return Status::OK();
     });
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index ae61be614e..655fece5ff 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -1040,7 +1040,6 @@ class ControlFlowTest(test.TestCase):
       result = r[3].eval()
     self.assertAllEqual(42, result)
 
-  @test_util.disable_control_flow_v2("b/116283162 (shape_invariants)")
   def testWhile_5(self):
     with self.cached_session():
 
@@ -1116,7 +1115,6 @@ class ControlFlowTest(test.TestCase):
     self._testWhile_Gpu_1(use_gpu=False)
     self._testWhile_Gpu_1(use_gpu=True)
 
-  @test_util.disable_control_flow_v2("b/116283162 (shape_invariants)")
   def testWhileShape(self):
     with self.cached_session():
       i = constant_op.constant(0)
@@ -1152,7 +1150,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
       self.assertEqual([10000], r.eval())
 
-  @test_util.disable_control_flow_v2("b/116283162 (shape_invariants)")
   def testWhileShapeInference(self):
     with self.cached_session():
       i = constant_op.constant(0)
@@ -1366,6 +1363,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(lambda x: x < 10, body, [x0])
       self.assertEqual(10, sess.run(r, {b: True}))
 
+  @test_util.disable_control_flow_v2("b/116134862 (cond output shape)")
   def testWhileCondWithControl(self):
     # Ensure that no control edges by an outer control dependency context are
     # added to nodes inside cond/while contexts.
@@ -1477,6 +1475,7 @@ class ControlFlowTest(test.TestCase):
     self._testCondWhile_3(use_gpu=False)
     self._testCondWhile_3(use_gpu=True)
 
+  @test_util.disable_control_flow_v2("b/116134862 (cond output shape)")
   def testWhileCond_1(self):
 
     with self.cached_session():
@@ -1493,6 +1492,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [i])
       self.assertAllEqual(10, r.eval())
 
+  @test_util.disable_control_flow_v2("b/116134862 (cond output shape)")
   def testWhileCond_2(self):
 
     with self.cached_session():
@@ -1502,6 +1502,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [n])
       self.assertAllEqual(10, r.eval())
 
+  @test_util.disable_control_flow_v2("b/116134862 (cond output shape)")
   def testWhileCond_3(self):
 
     with self.cached_session():
@@ -1696,7 +1697,7 @@ class ControlFlowTest(test.TestCase):
       for i in xrange(10):
         self.assertEqual([i], q.dequeue().eval())
 
-  @test_util.disable_control_flow_v2("b/116283162 (shape_invariants)")
+  @test_util.disable_control_flow_v2("b/117119329 (stack)")
   def testWhileStack_1(self):
     with self.cached_session():
       s = gen_data_flow_ops.stack_v2(-1, dtypes.int32, stack_name="foo")
@@ -1781,7 +1782,6 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllClose(1024.0, r.eval())
 
-  @test_util.disable_control_flow_v2("b/116283162 (shape_invariants)")
   def testWhileGrad_Shape(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32, shape=[None])
@@ -2291,7 +2291,6 @@ class ControlFlowTest(test.TestCase):
       r = sess.run(r, feed_dict={v: 2.0})
       self.assertAllClose(1024.0, r)
 
-  @test_util.disable_control_flow_v2("b/116283162 (shape_invariants)")
   def testWhileGrad_Concat(self):
     with self.cached_session() as sess:
       x = variable_scope.get_variable("x", initializer=[[1., 2.]])
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 8ad71fe00c..f779c3d273 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -3225,7 +3225,8 @@ def while_loop(cond,
       raise ValueError("The while_v2 module is not set. Did you forget to "
                        "import tensorflow.python.ops."
                        "while_v2?")
-    return _while_v2.while_loop(cond, body, loop_vars, name)
+    return _while_v2.while_loop(
+        cond, body, loop_vars, shape_invariants=shape_invariants, name=name)
 
   with ops.name_scope(name, "while", loop_vars):
     if not loop_vars:
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index 6791e1cd61..8e88a84d60 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import function_def_to_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import cond_v2_impl as cond_v2
 from tensorflow.python.ops import control_flow_ops
@@ -52,8 +53,17 @@ control_flow_ops._while_v2 = sys.modules[__name__]
 # handled in the CapturingGraph itself.
 
 
-def while_loop(cond, body, loop_vars, name=None):
+def while_loop(cond, body, loop_vars, shape_invariants=None, name=None):
   """Like tf.while_loop, except emits a single While op."""
+  flattened_loop_vars = nest.flatten(loop_vars)
+  if shape_invariants is not None:
+    nest.assert_same_structure(loop_vars, shape_invariants)
+    flattened_shapes = nest.flatten(shape_invariants)
+  else:
+    flattened_shapes = [t.shape for t in flattened_loop_vars]
+
+  del shape_invariants
+
   if not name:
     name = "while"
 
@@ -62,25 +72,33 @@ def while_loop(cond, body, loop_vars, name=None):
       cond_name = _get_unique_name(("%scond" % scope).replace("/", "_"))
       body_name = _get_unique_name(("%sbody" % scope).replace("/", "_"))
 
-    flattened_loop_vars = nest.flatten(loop_vars)
     num_outputs = len(flattened_loop_vars)
 
     # Add loop counter needed for computing gradients.
     flattened_loop_vars = [constant_op.constant(0., name="loop_counter")
                           ] + flattened_loop_vars
 
+    flattened_shapes = [tensor_shape.scalar()] + flattened_shapes
+
     # Build a `cond` wrapper that can handle the extra counter loop_var.
     def wrapped_cond(unused_loop_counter, *loop_vars):
       return cond(*loop_vars)
 
-    cond_graph = function.func_graph_from_py_func(cond_name, wrapped_cond,
-                                                  flattened_loop_vars, {})
+    signature = [
+        tensor_spec.TensorSpec(shape, t.dtype)
+        for shape, t in zip(flattened_shapes, flattened_loop_vars)
+    ]
+    cond_graph = function.func_graph_from_py_func(
+        cond_name, wrapped_cond, flattened_loop_vars, {}, signature=signature)
 
     # Add external_captures of cond to the list of loop vars.
     # Note that external tensors will be treated as loop invariants, i.e.,
     # the value of that tensor in each iteration is the same as it was at the
     # beginning of the loop execution.
     flattened_loop_vars = flattened_loop_vars + cond_graph.external_captures
+    flattened_shapes = flattened_shapes + [
+        t.shape for t in cond_graph.external_captures
+    ]
 
     def wrapped_body(loop_counter, *args):
       """Loop body augmented with counter update.
@@ -105,8 +123,12 @@ def while_loop(cond, body, loop_vars, name=None):
       # is_constant=True for inputs that are directly passed to outputs.
       return [loop_counter + 1] + list(outputs) + list(args[num_outputs:])
 
-    body_graph = function.func_graph_from_py_func(body_name, wrapped_body,
-                                                  flattened_loop_vars, {})
+    signature = [
+        tensor_spec.TensorSpec(shape, t.dtype)
+        for shape, t in zip(flattened_shapes, flattened_loop_vars)
+    ]
+    body_graph = function.func_graph_from_py_func(
+        body_name, wrapped_body, flattened_loop_vars, {}, signature=signature)
     # Add external captures of body to the list of loop vars.
     # Note that external tensors will be treated as loop invariants, i.e.,
     # the value of that tensor in each iteration is the same as it was at the
@@ -149,10 +171,17 @@ def while_loop(cond, body, loop_vars, name=None):
         # Add this modified tensor list to the list of outputs.
         body_graph.outputs.append(appended_tensor_list)
 
+    # Make sure that the shapes of the loop outputs are compatible with the
+    # shape invariants, or the shapes of the loop vars if the invariants are not
+    # specified.
+    _check_shapes_compat(body_graph.outputs[1:1 + num_outputs],
+                         flattened_shapes[1:1 + num_outputs],
+                         flattened_loop_vars[1:1 + num_outputs])
     outputs = gen_functional_ops._while(
         flattened_loop_vars,
         cond_v2._create_new_tf_function(cond_graph),
         cond_v2._create_new_tf_function(body_graph),
+        output_shapes=[t.shape for t in body_graph.outputs],
         name=scope)
 
     _copy_handle_data(body_graph.outputs, outputs)
@@ -216,6 +245,7 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
       loop_vars,
       cond_v2._create_new_tf_function(cond_grad_graph),
       cond_v2._create_new_tf_function(body_grad_graph),
+      output_shapes=[t.shape for t in body_grad_graph.outputs],
       name=_get_unique_name("%s_grad" % op.name))
 
   _copy_handle_data(body_grad_graph.outputs, outputs)
@@ -236,8 +266,10 @@ def _get_body_graph(while_op):
   Returns:
     `FuncGraph` for the while body.
   """
-  extra_inputs = list(while_op.inputs)
-  input_shapes = [t.shape for t in extra_inputs]
+  # TODO(srbs): Handle TensorShapeProto in function_def_to_graph.input_shapes.
+  input_shapes = [
+      tensor_shape.TensorShape(s) for s in while_op.get_attr("output_shapes")
+  ]
   func_name = while_op.get_attr("body").name
   fdef = while_op.graph._get_function(func_name).definition
   func_graph = function_def_to_graph.function_def_to_graph(fdef, input_shapes)
@@ -535,6 +567,17 @@ class _WhileBodyGradFuncGraph(function.FuncGraph):
     return captured_tensor
 
 
+def _check_shapes_compat(output_tensors, shape_invariants, input_tensors):
+  for (t, shape, input_t) in zip(output_tensors, shape_invariants,
+                                 input_tensors):
+    if not control_flow_ops._ShapeLessThanOrEqual(t.shape, shape):
+      raise ValueError(
+          "Input tensor '%s' enters the loop with shape %s, but has "
+          "shape %s after one iteration. To allow the shape to vary across "
+          "iterations, use the `shape_invariants` argument of tf.while_loop to "
+          "specify a less-specific shape." % (input_t.name, shape, t.shape))
+
+
 def _copy_handle_data(src_tensors, tgt_tensors):
   for src_t, tgt_t in zip(src_tensors, tgt_tensors):
     function._copy_handle_data(src_t, tgt_t)
-- 
GitLab


From a2599d1f89e3d6fe0a3f0436b5053fcbf4ae0265 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 13:28:51 -0700
Subject: [PATCH 300/570] Update ops-related pbtxt files.

PiperOrigin-RevId: 215448397
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 33 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 |  8 +++++
 2 files changed, 41 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 43c14d83b5..e46cbc863d 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -76797,6 +76797,39 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "WholeFileReader"
   output_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index abee803889..0e9f939ab4 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -36935,6 +36935,14 @@ op {
     name: "body"
     type: "func"
   }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
   is_stateful: true
 }
 op {
-- 
GitLab


From a12b8c4afdca3ac2945d62b3b83ca2599ab360f9 Mon Sep 17 00:00:00 2001
From: Keno Fischer <keno@juliacomputing.com>
Date: Sun, 16 Sep 2018 18:39:50 -0400
Subject: [PATCH 301/570] [xla] Improve validation of Broadcast shape

If one misreads the semantics of this instruction, it's easy to cause
an out of bounds access into the dimensions here. Add an extra check
to return a proper error to the user rather than crashing in that
case.

Ref #22130
---
 tensorflow/compiler/xla/service/hlo_verifier.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 50f39cbcb5..0f6ecd42f6 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -313,8 +313,9 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
        operand_dimension < ShapeUtil::Rank(operand_shape);
        ++operand_dimension) {
     int64 output_dimension = broadcast->dimensions()[operand_dimension];
-    TF_RET_CHECK(broadcast->shape().dimensions(output_dimension) ==
-                 operand_shape.dimensions(operand_dimension))
+    TF_RET_CHECK((output_dimension < ShapeUtil::Rank(broadcast->shape())) &&
+                 (broadcast->shape().dimensions(output_dimension) ==
+                 operand_shape.dimensions(operand_dimension)))
         << broadcast->ToString() << " operand shape " << operand_shape;
   }
   return Status::OK();
-- 
GitLab


From e45c90f0e4d17ac22048a73f1e81bd9c7a7a5145 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 14:03:40 -0700
Subject: [PATCH 302/570] Upgrade cloud tpu profiler to 1.12.0.

PiperOrigin-RevId: 215454323
---
 tensorflow/contrib/tpu/profiler/pip_package/setup.py | 2 +-
 tensorflow/contrib/tpu/profiler/version.h            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index 2415c46718..f27ae38e04 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from setuptools import setup
 
-_VERSION = '1.11.0'
+_VERSION = '1.12.0'
 
 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h
index 90d34b5ef1..4b6d1b2b07 100644
--- a/tensorflow/contrib/tpu/profiler/version.h
+++ b/tensorflow/contrib/tpu/profiler/version.h
@@ -16,6 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 #define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 
-#define TPU_PROFILER_VERSION "1.11.0"
+#define TPU_PROFILER_VERSION "1.12.0"
 
 #endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
-- 
GitLab


From c921e45bccac86ce0becc71cedc3da2c702d5c38 Mon Sep 17 00:00:00 2001
From: Anjali Sridhar <anjalisridhar@google.com>
Date: Tue, 2 Oct 2018 14:30:22 -0700
Subject: [PATCH 303/570] Add support for multiple input/output numpy arrays
 when using Keras APIs.

PiperOrigin-RevId: 215459075
---
 tensorflow/contrib/distribute/python/BUILD    |   1 +
 .../contrib/distribute/python/keras_test.py   |  88 ++++++++++--
 .../engine/distributed_training_utils.py      | 134 +++++++++++++++---
 tensorflow/python/keras/engine/training.py    |  48 ++++---
 .../keras/engine/training_distributed.py      |  30 ++--
 tensorflow/python/keras/models.py             |   5 +
 6 files changed, 237 insertions(+), 69 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index cfb9d42a6f..defa82f98a 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -728,6 +728,7 @@ cuda_py_test(
     additional_deps = [
         ":keras_test_lib",
     ],
+    shard_count = 16,
     tags = [
         "multi_and_single_gpu",
         "no_pip",
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index 3aab2c521f..993cb2bac3 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -189,6 +189,14 @@ def get_dataset(distribution):
   return dataset
 
 
+def get_predict_dataset(distribution):
+  inputs = np.zeros((10, 3), dtype=np.float32)
+  dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
+  dataset = dataset.repeat(100)
+  dataset = batch_wrapper(dataset, 10, distribution)
+  return dataset
+
+
 strategies = [combinations.default_strategy,
               combinations.one_device_strategy,
               combinations.mirrored_strategy_with_gpu_and_cpu,
@@ -387,16 +395,26 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
           distributed_training_utils.validate_distributed_dataset_inputs(
               strategy, x, y)
 
-  def test_calling_model_with_numpy_arrays(self):
+  # TODO(anjalisridhar): Move this test along with other numpy related tests to
+  # its own class.
+  @combinations.generate(strategy_combinations())
+  def test_creating_var_with_numpy_arrays(self, distribution):
+    with self.cached_session():
+      x = np.asarray(np.random.random((64, 3)), dtype=np.float32)
+      var_x = distributed_training_utils.get_var_for_numpy(distribution, x)
+      val = self.evaluate(var_x.value())
+      # Verify that the numpy value is copied to the variable.
+      self.assertAllEqual(x, val)
+
+  @combinations.generate(strategy_combinations())
+  def test_calling_model_with_numpy_arrays(self, distribution):
     with self.cached_session():
       model = get_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
-      metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
       inputs = np.zeros((64, 3), dtype=np.float32)
       targets = np.zeros((64, 4), dtype=np.float32)
@@ -419,6 +437,48 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
       # with batch_size
       model.predict(inputs, batch_size=8)
 
+  @combinations.generate(strategy_combinations())
+  def test_calling_model_with_nested_numpy_arrays(self, distribution):
+    with self.cached_session():
+      a = keras.layers.Input(shape=(3,), name='input_a')
+      b = keras.layers.Input(shape=(3,), name='input_b')
+
+      dense = keras.layers.Dense(4, name='dense')
+      c = dense(a)
+      d = dense(b)
+      e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+      model = keras.models.Model([a, b], [d, e])
+
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      input_a_np = np.asarray(np.random.random((64, 3)), dtype=np.float32)
+      input_b_np = np.asarray(np.random.random((64, 3)), dtype=np.float32)
+      inputs = [input_a_np, input_b_np]
+
+      output_d_np = np.asarray(np.random.random((64, 4)), dtype=np.float32)
+      output_e_np = np.asarray(np.random.random((64, 4)), dtype=np.float32)
+      targets = [output_d_np, output_e_np]
+
+      # Call fit with validation data
+      model.fit(inputs, targets, epochs=1, batch_size=8, verbose=0)
+
+      # TODO(anjalisridhar): We need tests for when the batch size and steps are
+      # smaller and results in a 0 batch_size and steps value.
+      model.evaluate(inputs, targets)
+      # with steps
+      model.evaluate(inputs, targets, steps=2)
+      # with batch_size
+      model.evaluate(inputs, targets, batch_size=8)
+
+      model.predict(inputs)
+      # with steps
+      model.predict(inputs, steps=2)
+      # with batch_size
+      model.predict(inputs, batch_size=8)
+
   @combinations.generate(strategy_combinations())
   def test_calling_model_on_same_dataset(self, distribution):
     with self.cached_session():
@@ -436,7 +496,7 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
                 validation_data=dataset, validation_steps=2)
       model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                 validation_data=dataset, validation_steps=2)
-      model.predict(dataset, steps=2)
+      model.predict(get_predict_dataset(distribution), steps=2)
 
   # TODO(priyag): Enable this test for TPU. Currently tuples/dict don't work
   # as clone_model's input_tensors argument only seems to accept list and not
@@ -496,10 +556,7 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
 
       model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
       model.evaluate(dataset, steps=2, verbose=1)
-      model.predict(dataset, steps=2)
-      # Test with validation data
-      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                validation_data=dataset, validation_steps=2)
+      model.predict(get_predict_dataset(distribution), steps=2)
 
   @combinations.generate(strategy_and_optimizer_combinations())
   def test_fit_eval_and_predict_with_optimizer(self, distribution, optimizer):
@@ -513,7 +570,7 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
 
       model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
       model.evaluate(dataset, steps=2, verbose=1)
-      model.predict(dataset, steps=2)
+      model.predict(get_predict_dataset(distribution), steps=2)
 
   def test_unsupported_features(self):
     with self.cached_session():
@@ -726,8 +783,12 @@ class NormalizationLayerWithDistributionStrategyTest(
       dataset = dataset.repeat(100)
       dataset = batch_wrapper(dataset, 32, distribution)
 
+      predict_dataset = dataset_ops.Dataset.from_tensor_slices(x)
+      predict_dataset = predict_dataset.repeat(100)
+      predict_dataset = batch_wrapper(predict_dataset, 32, distribution)
+
       model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10)
-      out = model.predict(dataset, steps=2)
+      out = model.predict(predict_dataset, steps=2)
       out -= keras.backend.eval(norm.beta)
       out /= keras.backend.eval(norm.gamma)
       np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
@@ -811,8 +872,7 @@ class CorrectnessWithDistributionStrategyTest(test.TestCase,
         predict_batch_size = 4
         if with_distribution:
           predict_batch_size //= with_distribution.num_towers
-        predict_dataset = dataset_ops.Dataset.from_tensor_slices((x_predict,
-                                                                  x_predict))
+        predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
         predict_dataset = batch_wrapper(predict_dataset,
                                         predict_batch_size, distribution)
         predict_result = model.predict(predict_dataset, steps=1)
diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py
index 39341a931b..050602868a 100644
--- a/tensorflow/python/keras/engine/distributed_training_utils.py
+++ b/tensorflow/python/keras/engine/distributed_training_utils.py
@@ -17,12 +17,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.client import session as session_module
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
@@ -304,23 +310,19 @@ def validate_inputs(x, y, distribution_strategy):
       compiled.
 
   Raises:
-    ValueError: if input is not a Dataset or a numpy array.
+    ValueError: if input is not a Dataset or a numpy array(when we use
+      MirroredStrategy).
   """
-  if isinstance(x, list) or isinstance(y, list):
-    raise ValueError('DistributionStrategy does not support lists of numpy'
-                     'arrays. You must pass a Dataset object or a numpy array '
-                     'as input.')
-
   if isinstance(x, dict) or isinstance(y, dict):
-    raise ValueError('DistributionStrategy does not support inputs of type '
-                     'dict. You must pass a Dataset object or a numpy array as '
-                     'input.')
+    raise ValueError('`DistributionStrategy` does not support inputs of type '
+                     'dict. You must pass a `tf.data.Dataset` object or a '
+                     'numpy array as input.')
 
-  if isinstance(x, iterator_ops.Iterator) or \
-      isinstance(y, iterator_ops.Iterator):
-    raise ValueError('DistributionStrategy does not support inputs of type '
-                     'Iterator. You must pass a Dataset object or a numpy '
-                     'array as input.')
+  if (isinstance(x, iterator_ops.Iterator) or
+      isinstance(y, iterator_ops.Iterator)):
+    raise ValueError('`DistributionStrategy` does not support inputs of type '
+                     'Iterator. You must pass a `tf.data.Dataset` object or a '
+                     'numpy array as input.')
 
   if distribution_strategy.__class__.__name__ == 'TPUStrategy':
     for i in [x, y]:
@@ -334,14 +336,14 @@ def validate_inputs(x, y, distribution_strategy):
               'Found unknown shape {} in input {}.'.format(s, i))
 
 
-def get_input_batch_params(first_x_value, batch_size, current_strategy):
+def get_input_batch_params(first_x_value, batch_size, distribution_strategy):
   """Calculate the number of batches and steps/steps_per_epoch.
 
   Args:
     first_x_value: This is the first input numpy array that is passed in as the
       model input.
     batch_size: The specified batch_size or the default batch_size of 32.
-    current_strategy: The current DistributionStrategy used to compile the
+    distribution_strategy: The current DistributionStrategy used to compile the
       model.
 
   Returns:
@@ -359,14 +361,14 @@ def get_input_batch_params(first_x_value, batch_size, current_strategy):
   # TODO(anjalisridhar): TPU currently supports using the num_towers property.
   # We might want to look into implementing worker_devices. In multi worker
   # strategy, perhaps num_towers works better?
-  steps = num_batches // current_strategy.num_towers
+  steps = num_batches // distribution_strategy.num_towers
   if not steps:
     # TODO(anjalisridhar): Number of towers in the error message may not convey
     # what we want to the user. Is there another terminology that we can use
     # that is consistent across different strategies.
     raise ValueError('The number of batches %d is smaller than the number '
                      'of towers %d used for DistributionStrategy. ' %
-                     num_batches, current_strategy.num_towers)
+                     (num_batches, distribution_strategy.num_towers))
   return steps
 
 
@@ -376,3 +378,99 @@ def get_batch_dimension(iterator):
   # all.
   dims = shapes[0].dims
   return dims[0] if dims else None
+
+
+def get_cpu_device(distribution_strategy):
+  """Returns the CPU device of the TPU host or the default CPU device string.
+
+  Args:
+    distribution_strategy: The DistributionStrategy used to compile the model.
+
+  Returns:
+    A device string which is the TPU host's CPU device in case of
+    TPUDistributionStrategy or the default CPU device string in all other
+    cases.
+
+  Raises:
+    NotImplementedError: We currently don't support copying numpy data to
+    multiple hosts in the case of Cloud TPU pods.
+  """
+  if distribution_strategy.__class__.__name__ == 'TPUStrategy':
+    if distribution_strategy.num_hosts > 1:
+      raise NotImplementedError('TPUDistributionStrategy does not '
+                                'support numpy inputs when running on Cloud'
+                                'TPU pods.')
+    return distribution_strategy.get_host_cpu_device(0)
+  else:
+    # For all strategies except TPUDistributionStrategy
+    # TODO(anjalisridhar): We may need to modify this when we add support for
+    # multi-worker strategy.
+    return '/CPU:0'
+
+
+def get_var_for_numpy(distribution_strategy, x):
+  if isinstance(x, list):
+    var_x = tuple([_get_var_for_numpy(distribution_strategy, single_input)
+                   for single_input in x])
+  else:
+    var_x = _get_var_for_numpy(distribution_strategy, x)
+  return var_x
+
+
+def _get_var_for_numpy(distribution_strategy, input_array):
+  """Creates a variable and assigns the value of the numpy array to it.
+
+  Args:
+    distribution_strategy: The DistributionStrategy used to compile the model.
+    input_array: The input numpy array whose value will be assigned to the
+      variable we create.
+
+  Returns:
+    The variable to which we will copy the value of the input numpy array.
+
+  """
+  with ops.device(get_cpu_device(distribution_strategy)):
+    # Create and initialize a variable on the CPU device. This is the CPU
+    # device of the host in the case of TPUDistributionStrategy.
+    input_var = variables.VariableV1(array_ops.zeros(input_array.shape,
+                                                     input_array.dtype),
+                                     trainable=False, use_resource=True)
+  K.get_session().run(input_var.initializer)
+
+  # Create a placeholder for the numpy array input slices. We copy the value
+  # of the input numpy array to the variable in slices of size 64 MB to avoid
+  # running into memory issues or RPC message limits.
+  start_placeholder = array_ops.placeholder(dtypes.int64, ())
+  end_placeholder = array_ops.placeholder(dtypes.int64, ())
+  slice_placeholder = array_ops.placeholder(input_var.dtype)
+  assign_slice_op = input_var[start_placeholder:end_placeholder].assign(
+      slice_placeholder)
+
+  # If each batch element is > 64 MB, then we copy each batch element
+  # individually. Otherwise, the slices will be < 128 MB. There might be padding
+  # which might mean that the slices are 128 MB even if the size of the
+  # tensor allocated is less than 128 MB.
+  # This formula gives slices with size:
+  # ceil(64 MB / byte size per batch element) bytes.
+  # Using ceil() guarantees we get a number >= 1.
+
+  # Calculate the size of each batch element.
+  byte_size_per_batch_element = np.prod(input_array.shape[1:]) * \
+                                input_var.dtype.size
+
+  # Calculate number of elements we want to copy per slice.
+  batch_size_per_slice = np.ceil((64 << 20) / byte_size_per_batch_element)
+
+  # Copy slices of the above size starting at 0, except the last slice will be
+  # smaller.
+  start = 0
+  limit = input_array.shape[0]
+  while start < limit:
+    end = min(start + batch_size_per_slice, limit)
+    K.get_session().run(assign_slice_op, feed_dict={
+        start_placeholder: start,
+        end_placeholder: end,
+        slice_placeholder: input_array[start:end]})
+    start = end
+
+  return input_var
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 5091cac836..c842b8192e 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -20,11 +20,9 @@ from __future__ import print_function
 
 import weakref
 import numpy as np
-import six
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.ops.dataset_ops import Dataset
 from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -814,19 +812,21 @@ class Model(Network):
     first_x_value = nest.flatten(x)[0]
     if isinstance(first_x_value, np.ndarray):
       x_shape = first_x_value.shape
-      x_dtype = first_x_value.dtype
       if batch_size is None:
         batch_size = x_shape[0] // steps
       if y is not None:
-        first_y_value = nest.flatten(y)[0]
-        x = Dataset.from_generator(lambda x=x, y=y: six.moves.zip(x, y),
-                                   output_types=(x_dtype, first_y_value.dtype),
-                                   output_shapes=(x_shape[1:],
-                                                  first_y_value.shape[1:]))
+        var_x = distributed_training_utils.get_var_for_numpy(
+            self._distribution_strategy, x)
+        var_y = distributed_training_utils.get_var_for_numpy(
+            self._distribution_strategy, y)
+
+        x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
         # TODO(anjalisridhar): What should the buffer size be?
         x = x.shuffle(10000)
         x = x.repeat()
-        x = x.batch(batch_size)
+        # We need to use the drop_remainder argument to allow for a static
+        # input shape which is required for TPUs.
+        x = x.batch(batch_size, drop_remainder=True)
         y = None
       else:
         # This case is for the predict call where the dataset only contains
@@ -834,11 +834,13 @@ class Model(Network):
         # TODO(anjalisridhar): Raise an error if we are not able to process
         # all the predict samples. This can happen if the number of batches is
         # not evenly divisible by the number of worker devices.
-        x = Dataset.from_generator(lambda x=x: x,
-                                   output_types=x_dtype,
-                                   output_shapes=x_shape[1:])
+        var_x = distributed_training_utils.get_var_for_numpy(
+            self._distribution_strategy, x)
+        x = dataset_ops.Dataset.from_tensor_slices(var_x)
         x = x.repeat()
-        x = x.batch(batch_size)
+        # We need to use the drop_remainder argument to allow for a static
+        # input shape which is required for TPUs.
+        x = x.batch(batch_size, drop_remainder=True)
 
     # TODO(anjalisridhar): Can we use the iterator and getnext op cache?
     # We require users to pass Datasets since we distribute the dataset across
@@ -978,16 +980,18 @@ class Model(Network):
                            'Make sure that your dataset can generate '
                            'required number of samples.')
 
-      if (not isinstance(next_element, (list, tuple)) or
-          len(next_element) not in [2, 3]):
-        raise ValueError(
-            'Please provide model inputs as a list or tuple of 2  or 3'
-            'elements: (input, target) or (input, target, sample_weights)'
-            'Received %s' % next_element)
-      if len(next_element) == 2:
-        x, y = next_element
+      if isinstance(next_element, (list, tuple)):
+        if len(next_element) not in [2, 3]:
+          raise ValueError(
+              'Please provide model inputs as a list or tuple of 2  or 3'
+              'elements: (input, target) or (input, target, sample_weights)'
+              'Received %s' % next_element)
+        if len(next_element) == 2:
+          x, y = next_element
+        else:
+          x, y, sample_weight = next_element
       else:
-        x, y, sample_weight = next_element
+        x = next_element
     x, y, sample_weights = self._standardize_weights(x, y, sample_weight,
                                                      class_weight, batch_size)
     return x, y, sample_weights
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index a6470458d2..04e8d079c0 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.util import nest
 
 
 # TODO(priyag, sourabhbajaj): Refactor this file to address code duplication.
@@ -296,15 +297,16 @@ def _experimental_fit_loop(
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
   if steps_per_epoch is None:
-    raise ValueError('steps_per_epoch should be specified in the fit call.')
-  steps_per_run_var = K.variable(
+    raise ValueError('`steps_per_epoch` should be specified when calling '
+                     '`fit` on the model.')
+  steps_per_run = K.variable(
       value=min(steps_per_epoch, current_strategy.steps_per_run),
       dtype='int32',
-      name='steps_per_run_var')
+      name='steps_per_run')
 
   with current_strategy.scope():
     ctx = current_strategy.run_steps_on_dataset(
-        step_fn, iterator, iterations=steps_per_run_var,
+        step_fn, iterator, iterations=steps_per_run,
         initial_loop_values=initial_loop_values)
 
   train_op = ctx.run_op
@@ -344,7 +346,7 @@ def _experimental_fit_loop(
       batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
       callbacks.on_batch_begin(step_index, batch_logs)
       if prev_step_count is None or step_count != prev_step_count:
-        steps_per_run_var.load(step_count, K.get_session())
+        steps_per_run.load(step_count, K.get_session())
         prev_step_count = step_count
       try:
         _, outputs = K.get_session().run([train_op, output_tensors])
@@ -720,13 +722,9 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
             model.predict_function.updates_op,
             model.predict_function.session_kwargs)
 
-  def step_fn(ctx, inputs, targets):
+  def step_fn(ctx, *inputs):
     """Clones the model and calls make_predict_function."""
 
-    # TODO(anjalisridhar): Support predict input correctly as it will not
-    # contain targets, only inputs.
-    del targets
-
     # TODO(priyag, sourabhbajaj): The model gets cloned every time
     # fit/test/predict is called. We should look into caching this keyed on
     # input shapes.
@@ -824,9 +822,10 @@ def _clone_and_build_model(model, inputs=None, targets=None):
 
   # TODO(priyag): Is there a cleaner way to do this? The API doc suggests a
   # single tensor should be OK but it throws an error in that case.
-  if (targets is not None and not isinstance(targets, list) and
-      not isinstance(targets, dict)):
+  if targets is not None and not isinstance(targets, (list, dict, tuple)):
     targets = [targets]
+  if isinstance(targets, tuple):
+    targets = nest.flatten(targets)
   cloned_model.compile(
       optimizer,
       model.loss,
@@ -891,11 +890,12 @@ def _get_input_from_iterator(iterator, model):
   """Get elements from the iterator and verify the input shape and type."""
   next_element = iterator.get_next()
 
-  if isinstance(next_element, tuple):
-    x, y = next_element
-  else:
+  if len(nest.flatten(next_element)) == len(model.inputs):
     x = next_element
     y = None
+  else:
+    x, y = next_element
+
   # Validate that all the elements in x and y are of the same type and shape.
   # We can then pass the first element of x and y to `_standardize_weights`
   # below and be confident of the output.
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index b04b4df257..2883c9ad74 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -96,6 +96,8 @@ def _clone_functional_model(model, input_tensors=None):
   else:
     # Make sure that all input tensors come from a Keras layer.
     # If tensor comes from an input layer: cache the input layer.
+    if isinstance(input_tensors, tuple):
+      input_tensors = list(input_tensors)
     input_tensors = generic_utils.to_list(input_tensors)
     input_tensors_ = []
     for i, x in enumerate(input_tensors):
@@ -212,6 +214,9 @@ def _clone_sequential_model(model, input_tensors=None):
       raise ValueError('To clone a `Sequential` model, we expect '
                        ' at most one tensor '
                        'as part of `input_tensors`.')
+
+    if isinstance(input_tensors, tuple):
+      input_tensors = list(input_tensors)
     x = generic_utils.to_list(input_tensors)[0]
     if K.is_keras_tensor(x):
       origin_layer = x._keras_history[0]
-- 
GitLab


From 05812d761031b108b43560c90867b96dc4f030eb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 14:35:49 -0700
Subject: [PATCH 304/570] Fixes for few issues in HloModule::CreateFromProto()

PiperOrigin-RevId: 215460064
---
 tensorflow/compiler/xla/literal.cc            |  2 ++
 .../compiler/xla/service/hlo_computation.cc   | 22 +++++++++++++++++++
 .../compiler/xla/service/hlo_instruction.cc   | 20 +++++++++++++----
 .../compiler/xla/service/hlo_sharding.cc      |  8 +++++--
 tensorflow/compiler/xla/shape_util.cc         |  3 ++-
 5 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index d1dad0d45f..deeb140b8f 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -287,6 +287,8 @@ Status MutableLiteralBase::CopyElementFrom(const LiteralSlice& src_literal,
     return InvalidArgument("LiteralProto has no layout");
   }
 
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(proto.shape()));
+
   Literal literal(proto.shape());
 
   TF_RETURN_IF_ERROR(literal.root_piece_->ForEachMutableSubpieceWithStatus(
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 6ef67ab0a8..c2041c4667 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -535,6 +535,28 @@ HloComputation::CreateFromProto(
               return to_proto_id[a.get()] < to_proto_id[b.get()];
             });
 
+  TF_RETURN_IF_ERROR([&]() -> Status {
+    std::vector<bool> parameters_seen(parameter_count);
+    int parameters_seen_count = 0;
+    for (auto& instruction : instructions) {
+      if (instruction->opcode() == HloOpcode::kParameter) {
+        int64 param_no = instruction->parameter_number();
+        TF_RET_CHECK(param_no >= 0 && param_no < parameter_count)
+            << "Invalid parameter number.  Expected [0, " << parameter_count
+            << "), got " << param_no;
+        TF_RET_CHECK(!parameters_seen[param_no])
+            << "Parameter number " << param_no
+            << " already allocated in this computation";
+        parameters_seen[param_no] = true;
+        parameters_seen_count++;
+      }
+    }
+    TF_RET_CHECK(parameters_seen_count == parameter_count)
+        << "Not all parameters in range [0, " << parameter_count
+        << ") were referenced";
+    return Status::OK();
+  }());
+
   auto computation = absl::WrapUnique(
       new HloComputation(proto.name(), parameter_count, &instructions, root,
                          /*fusion_instruction=*/nullptr));
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index de22b2d3a5..5c16d6bb5e 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -81,6 +81,20 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   const auto computations = [&computation_map, &proto](int index) {
     return computation_map.at(proto.called_computation_ids(index));
   };
+
+  TF_RET_CHECK(std::all_of(
+      proto.operand_ids().begin(), proto.operand_ids().end(),
+      [&instruction_map](int64 id) { return instruction_map.contains(id); }))
+      << proto.name() << " instruction contains invalid operand id(s)";
+
+  TF_RET_CHECK(std::all_of(
+      proto.called_computation_ids().begin(),
+      proto.called_computation_ids().end(),
+      [&computation_map](int64 id) { return computation_map.contains(id); }))
+      << proto.name() << " instruction references invalid computation id(s)";
+
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(proto.shape()));
+
   switch (opcode) {
     // Ops migrated to subclasses.
     case HloOpcode::kBatchNormTraining:
@@ -304,6 +318,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     } break;
     case HloOpcode::kOutfeed:
       TF_RET_CHECK(proto.operand_ids_size() == 2);
+      TF_RETURN_IF_ERROR(
+          ShapeUtil::ValidateShapeWithOptionalLayout(proto.outfeed_shape()));
       instruction = CreateOutfeed(proto.outfeed_shape(), operands(0),
                                   operands(1), proto.outfeed_config());
       break;
@@ -492,14 +508,10 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     default: {
       instruction = absl::WrapUnique(new HloInstruction(opcode, proto.shape()));
       for (const int64 operand_id : proto.operand_ids()) {
-        TF_RET_CHECK(ContainsKey(instruction_map, operand_id))
-            << "No instruction with id " << operand_id;
         instruction->AppendOperand(instruction_map.at(operand_id));
       }
       if (instruction->opcode() != HloOpcode::kFusion) {
         for (const int64 computation_id : proto.called_computation_ids()) {
-          TF_RET_CHECK(ContainsKey(computation_map, computation_id))
-              << "No computation with id " << computation_id;
           instruction->called_computations_.push_back(
               computation_map.at(computation_id));
         }
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index de7e6b53d4..94c7bafd3b 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -369,10 +369,14 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
     return HloSharding(tuple_shardings);
   } else if (proto.type() == OpSharding::Type::OpSharding_Type_REPLICATED) {
     return Replicate();
-  } else if (proto.type() == OpSharding::Type::OpSharding_Type_MAXIMAL ||
-             proto.tile_assignment_devices().size() == 1) {
+  } else if (proto.tile_assignment_devices().size() == 1) {
     return HloSharding(proto.tile_assignment_devices(0));
   }
+
+  TF_RET_CHECK(proto.type() != OpSharding::Type::OpSharding_Type_MAXIMAL)
+      << "Maximal sharding is expected to have single device assignment, but "
+      << proto.tile_assignment_devices().size() << " has provided.";
+
   // Some versions of gcc cannot infer the TileAssignment constructor from a
   // braced initializer-list, so create one manually.
   std::vector<int64> devices(proto.tile_assignment_devices().begin(),
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 020c167ee9..476a9fe868 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -831,7 +831,8 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
 
 /* static */ Status ShapeUtil::ValidateShapeWithOptionalLayoutInternal(
     const Shape& shape) {
-  if (shape.element_type() == PRIMITIVE_TYPE_INVALID) {
+  if (shape.element_type() == PRIMITIVE_TYPE_INVALID ||
+      !PrimitiveType_IsValid(shape.element_type())) {
     return InvalidArgument("shape has invalid element type: %s",
                            shape.ShortDebugString());
   }
-- 
GitLab


From 891e49f57b8229f58315cfeb743e38c235918083 Mon Sep 17 00:00:00 2001
From: Suyog Gupta <suyoggupta@google.com>
Date: Tue, 2 Oct 2018 14:46:13 -0700
Subject: [PATCH 305/570] Add missing documentation for use_tpu hparam

PiperOrigin-RevId: 215462000
---
 tensorflow/contrib/model_pruning/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md
index 15d95896d9..b313024e28 100644
--- a/tensorflow/contrib/model_pruning/README.md
+++ b/tensorflow/contrib/model_pruning/README.md
@@ -62,6 +62,7 @@ The pruning library allows for specification of the following hyper parameters:
 | sparsity_function_begin_step | integer | 0 | The global step at this which the gradual sparsity function begins to take effect |
 | sparsity_function_end_step | integer | 100 | The global step used as the end point for the gradual sparsity function |
 | sparsity_function_exponent | float | 3.0 | exponent = 1 is linearly varying sparsity between initial and final. exponent > 1 varies more slowly towards the end than the beginning |
+| use_tpu | bool | False | Training using TPUs? |
 
 The sparsity $$s_t$$ at global step $$t$$ is given by:
 
-- 
GitLab


From 664f3dde67bfa436e5216ae54ee256761c7c6962 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 14:52:16 -0700
Subject: [PATCH 306/570] Do not warn about loss of accuracy in trivial cases
 when all array elements are equal to either the min or the max value, so that
 they are trivially exactly quantized. This case does not normally occur for
 true learned weights, which is what this warning is intended for.

PiperOrigin-RevId: 215463096
---
 .../toco/graph_transformations/quantize.cc    | 30 +++++++++++++------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 1bc366f555..fb299c31b7 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -97,15 +97,6 @@ const MinMax& GetOrComputeMinMax(Model* model, const string& array_name) {
   // to allow easily trying out quantization even if the graph
   // lacks some minmax information.
   if (array.buffer != nullptr) {
-    LOG(WARNING)
-        << "Constant array " << array_name
-        << " lacks MinMax information. To make up for that, we will now compute"
-        << " the MinMax from actual array elements. That will result in"
-        << " quantization parameters that probably do not match whichever "
-           "arithmetic"
-        << " was used during training, and thus will probably be a cause of "
-           "poor"
-        << " inference accuracy.";
     CHECK(array.buffer->type == ArrayDataType::kFloat);
     const auto& data = array.GetBuffer<ArrayDataType::kFloat>().data;
     // We always want [min, max] to contain 0.
@@ -120,6 +111,27 @@ const MinMax& GetOrComputeMinMax(Model* model, const string& array_name) {
       // to not be equal.
       max = 1.f;
     }
+    // No need to warn about accuracy if all array values are equal to either
+    // min or max:
+    // in that case, quantization is exact, and such arrays are not learned
+    // weights arrays for which fake-quantization would make sense, rather
+    // they tend to be hardcoded arrays of zeros or ones used in some graphs.
+    bool is_quantization_trivially_exact = true;
+    for (auto val : data) {
+      is_quantization_trivially_exact &= (val == min || val == max);
+    }
+    if (!is_quantization_trivially_exact) {
+      LOG(WARNING)
+          << "Constant array " << array_name
+          << " lacks MinMax information. To make up for that, we will now "
+             "compute"
+          << " the MinMax from actual array elements. That will result in"
+          << " quantization parameters that probably do not match whichever "
+             "arithmetic"
+          << " was used during training, and thus will probably be a cause of "
+             "poor"
+          << " inference accuracy.";
+    }
     auto& minmax = array.GetOrCreateMinMax();
     minmax.min = min;
     minmax.max = max;
-- 
GitLab


From 44f273e853360042ee14def03eba85d1e04a7272 Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Tue, 2 Oct 2018 14:54:08 -0700
Subject: [PATCH 307/570] [XLA] A test that disables layout assignment should
 only contain layout consistent HLO instructions.

Fix a dot test that disables layout assignment pass to not generate layout
inconsistent HLO instructions. This includes only adding the dot result to an
addend with the same layout, and disabling algebraic simplification which may
transform a dot to a multiplication with inconsistent layouts.

PiperOrigin-RevId: 215463477
---
 .../compiler/xla/tests/dot_operation_test.cc  | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 0171f51583..6c0847a875 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -394,6 +394,10 @@ class ParametricDotTestWithoutLayoutAssignment : public ParametricDotTest {
   ParametricDotTestWithoutLayoutAssignment() {
     execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
         "layout-assignment");
+    // Disable algebraic simplification because the pass may replace a dot
+    // instruction with a layout-changing multiplication instruction.
+    execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
+        "algsimp");
   }
 };
 
@@ -404,31 +408,18 @@ std::vector<DotTestParam> CreateNoLayoutAssignmentDotTestParameters() {
     for (bool lhs_row_major : {true, false}) {
       for (bool rhs_row_major : {true, false}) {
         for (bool has_addend : {true, false}) {
+          // The addend needs to be row major to match the result of the dot.
           params.push_back({/*m=*/1, /*k=*/k, /*n=*/n,
                             /*dot_lhs_row_major=*/lhs_row_major,
                             /*dot_rhs_row_major=*/rhs_row_major,
                             /*has_addend=*/has_addend,
                             /*addend_row_major=*/true});
-          if (has_addend) {
-            params.push_back({/*m=*/1, /*k=*/k, /*n=*/n,
-                              /*dot_lhs_row_major=*/lhs_row_major,
-                              /*dot_rhs_row_major=*/rhs_row_major,
-                              /*has_addend=*/has_addend,
-                              /*addend_row_major=*/false});
-          }
           if (n != 1) {
             params.push_back({/*m=*/n, /*k=*/k, /*n=*/1,
                               /*dot_lhs_row_major=*/lhs_row_major,
                               /*dot_rhs_row_major=*/rhs_row_major,
                               /*has_addend=*/has_addend,
                               /*addend_row_major=*/true});
-            if (has_addend) {
-              params.push_back({/*m=*/n, /*k=*/k, /*n=*/1,
-                                /*dot_lhs_row_major=*/lhs_row_major,
-                                /*dot_rhs_row_major=*/rhs_row_major,
-                                /*has_addend=*/has_addend,
-                                /*addend_row_major=*/false});
-            }
           }
         }
       }
-- 
GitLab


From 08e5ad2839ca2c6749544ace354f78d00f5243d9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 15:06:38 -0700
Subject: [PATCH 308/570] Fix a bug: the use of sequence-point boolean
 operators here had the unintended effect of causing the second line not to
 run at all depending on the result from the first line.

PiperOrigin-RevId: 215466006
---
 .../read_array_minmax_and_narrow_range_from_fake_quant.cc   | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/read_array_minmax_and_narrow_range_from_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/read_array_minmax_and_narrow_range_from_fake_quant.cc
index 5b41c49bfa..eaa9d3bcda 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/read_array_minmax_and_narrow_range_from_fake_quant.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/read_array_minmax_and_narrow_range_from_fake_quant.cc
@@ -71,8 +71,10 @@ bool ReadArrayMinmaxAndNarrowRangeFromFakeQuant::Run(Model* model,
   CHECK(fq_op->minmax);
   CHECK_EQ(1, fq_op->inputs.size());
 
-  return ApplyAttrsToArray(this, model, *fq_op, fq_op->inputs[0]) ||
-         ApplyAttrsToArray(this, model, *fq_op, fq_op->outputs[0]);
+  bool changed = false;
+  changed |= ApplyAttrsToArray(this, model, *fq_op, fq_op->inputs[0]);
+  changed |= ApplyAttrsToArray(this, model, *fq_op, fq_op->outputs[0]);
+  return changed;
 }
 
 }  // namespace toco
-- 
GitLab


From cfec3aa38db1d2b70045e7b89d82fae87c3fec02 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 15:07:36 -0700
Subject: [PATCH 309/570] Update code to use
 convert_to_tensor_or_indexed_slices, since features may be SparseTensors as
 well.

PiperOrigin-RevId: 215466199
---
 .../estimator/python/estimator/dnn_with_layer_annotations.py    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations.py b/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations.py
index 5faf0aacfe..6ca7aaf989 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations.py
@@ -151,7 +151,7 @@ def make_input_layer_with_layer_annotations(original_input_layer):
     # spec and looking at the keys.
     spec = feature_column_lib.make_parse_example_spec(feature_columns)
     for key in spec.keys():
-      tensor = ops.convert_to_tensor(features[key])
+      tensor = ops.convert_to_tensor_or_indexed_slices(features[key])
       ops.add_to_collection(
           LayerAnnotationsCollectionNames.keys(
               LayerAnnotationsCollectionNames.UNPROCESSED_FEATURES), key)
-- 
GitLab


From bb84d5d5e309204110315f7d0ff8ca0dbb022dd2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 15:08:52 -0700
Subject: [PATCH 310/570] [XLA] Support parsing the canonical format of HLO
 text.

Also stop truncating operands in the canonical format.

PiperOrigin-RevId: 215466465
---
 .../xla/service/hlo_execution_profile.cc      |   5 +-
 .../compiler/xla/service/hlo_instruction.cc   |   2 +-
 .../compiler/xla/service/hlo_instruction.h    |  14 +-
 tensorflow/compiler/xla/service/hlo_parser.cc | 276 ++++++++++++------
 tensorflow/compiler/xla/service/hlo_parser.h  |   5 +-
 .../compiler/xla/service/hlo_parser_test.cc   | 142 ++++++++-
 6 files changed, 338 insertions(+), 106 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index de3d7a1677..ce4cad4235 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -90,8 +90,9 @@ std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
       HloInstructionInfo* instruction_info =
           computation_info->add_instruction_infos();
       instruction_info->set_long_name(hlo->ToString());
-      instruction_info->set_short_name(
-          hlo->ToString(HloPrintOptions().set_compact_operands(true)));
+      instruction_info->set_short_name(hlo->ToString(
+          HloPrintOptions().set_compact_operands(true).set_print_operand_names(
+              false)));
       instruction_info->set_category(hlo->ToCategory());
       instruction_info->set_flop_count(cost_analysis.flop_count(*hlo));
       instruction_info->set_transcendental_count(
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 5c16d6bb5e..8bddaa8c96 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -2034,7 +2034,7 @@ string HloInstruction::OperandsToStringWithCanonicalNameMap(
         options.is_in_nested_computation()) {
       str.push_back(PrintName(
           canonical_name_map->LookupOrInsert(operand->name()), options));
-    } else if (!options.compact_operands()) {
+    } else if (options.print_operand_names()) {
       str.push_back(PrintName(operand->name(), options));
     }
     StrAppend(out, StrJoin(str, " "));
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 1bfdc88abc..9deed20e5d 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -80,6 +80,7 @@ class HloPrintOptions {
         print_backend_config_(true),
         compact_operands_(false),
         print_operand_shape_(true),
+        print_operand_names_(true),
         print_program_shape_(true),
         print_percent_(true),
         print_control_dependencies_(true),
@@ -107,6 +108,7 @@ class HloPrintOptions {
         .set_print_metadata(false)
         .set_print_backend_config(false)
         .set_compact_operands(true)
+        .set_print_operand_names(false)
         .set_print_operand_shape(true)
         .set_print_program_shape(false)
         .set_print_percent(false)
@@ -144,6 +146,12 @@ class HloPrintOptions {
     return *this;
   }
 
+  // If true, the operand names will be printed.
+  HloPrintOptions& set_print_operand_names(bool value) {
+    print_operand_names_ = value;
+    return *this;
+  }
+
   // If true, program shape of hlo computations will be printed.
   HloPrintOptions& set_print_program_shape(bool value) {
     print_program_shape_ = value;
@@ -162,8 +170,8 @@ class HloPrintOptions {
     return *this;
   }
 
-  // If true, only a part of operands will be printed out, and their names will
-  // be omitted (note that in this case the text will not be parsable).
+  // If true, only a part of operands will be printed out (note that in this
+  // case the text will not be parsable).
   HloPrintOptions& set_compact_operands(bool value) {
     compact_operands_ = value;
     return *this;
@@ -197,6 +205,7 @@ class HloPrintOptions {
   bool print_backend_config() const { return print_backend_config_; }
   bool compact_operands() const { return compact_operands_; }
   bool print_operand_shape() const { return print_operand_shape_; }
+  bool print_operand_names() const { return print_operand_names_; }
   bool print_program_shape() const { return print_program_shape_; }
   bool print_percent() const { return print_percent_; }
   bool print_control_dependencies() const {
@@ -215,6 +224,7 @@ class HloPrintOptions {
   bool print_backend_config_;
   bool compact_operands_;
   bool print_operand_shape_;
+  bool print_operand_names_;
   bool print_program_shape_;
   bool print_percent_;
   bool print_control_dependencies_;
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 25b70740e3..5a125b4c08 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -80,17 +80,23 @@ class HloParser {
   StatusOr<PaddingConfig> ParsePaddingConfigOnly();
 
   // Stand-alone parsing utility for a single instruction worth of text.
-  Status ParseSingleInstruction(HloComputation::Builder* builder,
-                                string* root_name);
+  Status ParseSingleInstruction(HloModule* module);
 
  private:
-  // Locates an instruction with the given name in the instruction_pool_ or
+  using InstrNameTable =
+      std::unordered_map<string, std::pair<HloInstruction*, LocTy>>;
+
+  // Returns the map from the instruction name to the instruction itself and its
+  // location in the current scope.
+  InstrNameTable& current_name_table() { return scoped_name_tables_.back(); }
+
+  // Locates an instruction with the given name in the current_name_table() or
   // returns nullptr.
   //
-  // If the missing_instruction_hook_ is registered and a "shape" is provided,
-  // the hook will be called and may satisfy the request for the given
-  // instruction. This is useful when we reify parameters as they're resolved;
-  // i.e. for ParseSingleInstruction.
+  // When the name is not found or name is empty, if create_missing_instruction_
+  // hook is registered and a "shape" is provided, the hook will be called to
+  // create an instruction. This is useful when we reify parameters as they're
+  // resolved; i.e. for ParseSingleInstruction.
   std::pair<HloInstruction*, LocTy>* FindInstruction(
       const string& name, const optional<Shape>& shape = nullopt);
 
@@ -98,9 +104,11 @@ class HloParser {
   bool ParseHloModule(HloModule* module);
   bool ParseComputations(HloModule* module);
   bool ParseComputation(HloComputation** entry_computation);
-  bool ParseInstructionList(HloComputation::Builder* builder,
-                            string* root_name);
+  bool ParseInstructionList(HloComputation** computation,
+                            const string& computation_name);
   bool ParseInstruction(HloComputation::Builder* builder, string* root_name);
+  bool ParseInstruciontRhs(HloComputation::Builder* builder, const string& name,
+                           LocTy name_loc);
   bool ParseControlPredecessors(HloInstruction* instruction);
   bool ParseLiteral(Literal* literal, const Shape& shape);
   bool ParseTupleLiteral(Literal* literal, const Shape& shape);
@@ -281,23 +289,47 @@ class HloParser {
   bool AddComputation(const string& name, HloComputation* computation,
                       LocTy name_loc);
 
-  // The map from the instruction/computation name to the
-  // instruction/computation itself and it's location. This does not own the
-  // pointers.
-  std::unordered_map<string, std::pair<HloInstruction*, LocTy>>
-      instruction_pool_;
+  HloLexer lexer_;
+
+  // A stack for the instruction names. The top of the stack stores the
+  // instruction name table for the current scope.
+  //
+  // A instruction's name is unique among its scope (i.e. its parent
+  // computation), but it's not necessarily unique among all computations in the
+  // module. When there are multiple levels of nested computations, the same
+  // name could appear in both an outer computation and an inner computation. So
+  // we need a stack to make sure a name is only visible within its scope,
+  std::vector<InstrNameTable> scoped_name_tables_;
+
+  // A helper class which pushes and pops to an InstrNameTable stack via RAII.
+  class Scope {
+   public:
+    explicit Scope(std::vector<InstrNameTable>* scoped_name_tables)
+        : scoped_name_tables_(scoped_name_tables) {
+      scoped_name_tables_->emplace_back();
+    }
+    ~Scope() { scoped_name_tables_->pop_back(); }
+
+   private:
+    std::vector<InstrNameTable>* scoped_name_tables_;
+  };
+
+  // Map from the computation name to the computation itself and its location.
   std::unordered_map<string, std::pair<HloComputation*, LocTy>>
       computation_pool_;
 
-  HloLexer lexer_;
   std::vector<std::unique_ptr<HloComputation>> computations_;
   std::vector<string> error_;
 
-  // Function that gets invoked when we try to resolve an instruction
-  // instruction_pool_ but fail to do so.
-  std::function<std::pair<HloInstruction*, LocTy>*(string,
-                                                   const optional<Shape>&)>
-      missing_instruction_hook_;
+  // When an operand name cannot be resolved, this function is called to create
+  // a parameter instruction with the given name and shape. It registers the
+  // name, instruction, and a placeholder location in the name table. It returns
+  // the newly-created instruction and the placeholder location. If `name` is
+  // empty, this should create the parameter with a generated name. This is
+  // supposed to be set and used only in ParseSingleInstruction.
+  std::function<std::pair<HloInstruction*, LocTy>*(const string& name,
+                                                   const Shape& shape)>
+      create_missing_instruction_;
 };
 
 bool SplitToInt64s(absl::string_view s, char delim, std::vector<int64>* out) {
@@ -351,11 +383,21 @@ bool HloParser::Run(HloModule* module) {
 
 std::pair<HloInstruction*, HloParser::LocTy>* HloParser::FindInstruction(
     const string& name, const optional<Shape>& shape) {
-  std::pair<HloInstruction*, LocTy>* instr =
-      tensorflow::gtl::FindOrNull(instruction_pool_, name);
+  std::pair<HloInstruction*, LocTy>* instr = nullptr;
+  if (!name.empty()) {
+    instr = tensorflow::gtl::FindOrNull(current_name_table(), name);
+  }
+
   // Potentially call the missing instruction hook.
-  if (instr == nullptr && missing_instruction_hook_ != nullptr) {
-    return missing_instruction_hook_(name, shape);
+  if (instr == nullptr && create_missing_instruction_ != nullptr &&
+      scoped_name_tables_.size() == 1) {
+    if (!shape.has_value()) {
+      Error(lexer_.GetLoc(),
+            "Operand had no shape in HLO text; cannot create parameter for "
+            "single-instruction module.");
+      return nullptr;
+    }
+    return create_missing_instruction_(name, *shape);
   }
   return instr;
 }
@@ -439,7 +481,6 @@ bool HloParser::ParseComputation(HloComputation** entry_computation) {
   if (!ParseName(&name)) {
     return false;
   }
-  auto builder = absl::make_unique<HloComputation::Builder>(name);
 
   LocTy shape_loc = nullptr;
   Shape shape;
@@ -447,40 +488,21 @@ bool HloParser::ParseComputation(HloComputation** entry_computation) {
     return false;
   }
 
-  string root_name;
-  if (!ParseInstructionList(builder.get(), &root_name)) {
+  HloComputation* computation = nullptr;
+  if (!ParseInstructionList(&computation, name)) {
     return false;
   }
 
-  std::pair<HloInstruction*, LocTy>* root_node = FindInstruction(root_name);
-  // This means some instruction was marked as ROOT but we didn't find it in the
-  // pool, which should not happen.
-  if (!root_name.empty() && root_node == nullptr) {
-    LOG(FATAL) << "instruction " << root_name
-               << " was marked as ROOT but the parser has not seen it before";
-  }
-
-  HloInstruction* root = root_node == nullptr ? nullptr : root_node->first;
-  // Now root can be either an existing instruction or a nullptr. If it's a
-  // nullptr, the implementation of Builder will set the last instruction as
-  // root instruction.
-  computations_.emplace_back(builder->Build(root));
-  HloComputation* computation = computations_.back().get();
-
-  if (!root) {
-    root = computation->root_instruction();
-  } else {
-    CHECK_EQ(root, computation->root_instruction());
-  }
-
   // If param_list_to_shape was present, check compatibility.
-  if (shape_loc != nullptr && !ShapeUtil::Compatible(root->shape(), shape)) {
+  if (shape_loc != nullptr &&
+      !ShapeUtil::Compatible(computation->root_instruction()->shape(), shape)) {
     return Error(
         shape_loc,
-        StrCat("Shape of computation ", name, ", ",
-               ShapeUtil::HumanString(shape),
-               ", is not compatible with that of its root instruction ",
-               root_name, ", ", ShapeUtil::HumanString(root->shape())));
+        StrCat(
+            "Shape of computation ", name, ", ", ShapeUtil::HumanString(shape),
+            ", is not compatible with that of its root instruction ",
+            computation->root_instruction()->name(), ", ",
+            ShapeUtil::HumanString(computation->root_instruction()->shape())));
   }
 
   if (is_entry_computation) {
@@ -489,43 +511,62 @@ bool HloParser::ParseComputation(HloComputation** entry_computation) {
     }
     *entry_computation = computation;
   }
-  instruction_pool_.clear();
 
   return AddComputation(name, computation, name_loc);
 }
 
 // instruction_list ::= '{' instruction_list1 '}'
 // instruction_list1 ::= (instruction)+
-bool HloParser::ParseInstructionList(HloComputation::Builder* builder,
-                                     string* root_name) {
+bool HloParser::ParseInstructionList(HloComputation** computation,
+                                     const string& computation_name) {
+  Scope scope(&scoped_name_tables_);
+  HloComputation::Builder builder(computation_name);
   if (!ParseToken(TokKind::kLbrace,
                   "expects '{' at the beginning of instruction list.")) {
     return false;
   }
+  string root_name;
   do {
-    if (!ParseInstruction(builder, root_name)) {
+    if (!ParseInstruction(&builder, &root_name)) {
       return false;
     }
   } while (lexer_.GetKind() != TokKind::kRbrace);
-  return ParseToken(TokKind::kRbrace,
-                    "expects '}' at the end of instruction list.");
+  if (!ParseToken(TokKind::kRbrace,
+                  "expects '}' at the end of instruction list.")) {
+    return false;
+  }
+  HloInstruction* root = nullptr;
+  if (!root_name.empty()) {
+    std::pair<HloInstruction*, LocTy>* root_node =
+        tensorflow::gtl::FindOrNull(current_name_table(), root_name);
+
+    // This means some instruction was marked as ROOT but we didn't find it in
+    // the pool, which should not happen.
+    if (root_node == nullptr) {
+      LOG(FATAL) << "instruction " << root_name
+                 << " was marked as ROOT but the parser has not seen it before";
+    }
+    root = root_node->first;
+  }
+
+  // Now root can be either an existing instruction or a nullptr. If it's a
+  // nullptr, the implementation of Builder will set the last instruction as
+  // the root instruction.
+  computations_.emplace_back(builder.Build(root));
+  *computation = computations_.back().get();
+  return true;
 }
 
 // instruction ::= ('ROOT')? name '=' shape opcode operands (attribute)*
 bool HloParser::ParseInstruction(HloComputation::Builder* builder,
                                  string* root_name) {
   string name;
-  Shape shape;
-  HloOpcode opcode;
-  std::vector<HloInstruction*> operands;
-
   LocTy maybe_root_loc = lexer_.GetLoc();
   bool is_root = EatIfPresent(TokKind::kw_ROOT);
 
   const LocTy name_loc = lexer_.GetLoc();
   if (!ParseName(&name) ||
-      !ParseToken(TokKind::kEqual, "expects '=' in instruction") ||
-      !ParseShape(&shape) || !ParseOpcode(&opcode)) {
+      !ParseToken(TokKind::kEqual, "expects '=' in instruction")) {
     return false;
   }
 
@@ -536,6 +577,19 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     *root_name = name;
   }
 
+  return ParseInstruciontRhs(builder, name, name_loc);
+}
+
+bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder,
+                                    const string& name, LocTy name_loc) {
+  Shape shape;
+  HloOpcode opcode;
+  std::vector<HloInstruction*> operands;
+
+  if (!ParseShape(&shape) || !ParseOpcode(&opcode)) {
+    return false;
+  }
+
   // Add optional attributes.
   std::unordered_map<string, AttrConfig> attrs;
   optional<OpSharding> sharding;
@@ -2146,7 +2200,20 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
         }
       }
       if (!ParseName(&name)) {
-        return false;
+        // When parsing a single instruction (as opposed to a whole module), an
+        // HLO may have one or more operands with a shape but no name:
+        //
+        //  foo = add(f32[10], f32[10])
+        //
+        // create_missing_instruction_ is always non-null when parsing a single
+        // instruction, and is responsible for creating kParameter instructions
+        // for these operands.
+        if (shape.has_value() && create_missing_instruction_ != nullptr &&
+            scoped_name_tables_.size() == 1) {
+          name = "";
+        } else {
+          return false;
+        }
       }
       std::pair<HloInstruction*, LocTy>* instruction =
           FindInstruction(name, shape);
@@ -2299,9 +2366,17 @@ bool HloParser::ParseAttributeHelper(
         return true;
       }
       case AttrTy::kHloComputation: {
-        HloComputation* result;
-        if (!ParseComputationName(&result)) {
-          return false;
+        HloComputation* result = nullptr;
+        if (lexer_.GetKind() == TokKind::kLbrace) {
+          // This means it is a nested computation.
+          if (!ParseInstructionList(&result, /*computation_name=*/"_")) {
+            return false;
+          }
+        } else {
+          // This means it is a computation name.
+          if (!ParseComputationName(&result)) {
+            return false;
+          }
         }
         static_cast<optional<HloComputation*>*>(attr_out_ptr)->emplace(result);
         return true;
@@ -3134,7 +3209,7 @@ bool HloParser::EatIfPresent(TokKind kind) {
 
 bool HloParser::AddInstruction(const string& name, HloInstruction* instruction,
                                LocTy name_loc) {
-  auto result = instruction_pool_.insert({name, {instruction, name_loc}});
+  auto result = current_name_table().insert({name, {instruction, name_loc}});
   if (!result.second) {
     Error(name_loc, StrCat("instruction already exists: ", name));
     return Error(/*loc=*/result.first->second.second,
@@ -3204,36 +3279,51 @@ StatusOr<PaddingConfig> HloParser::ParsePaddingConfigOnly() {
   return padding_config;
 }
 
-Status HloParser::ParseSingleInstruction(HloComputation::Builder* builder,
-                                         string* root_name) {
-  TF_RET_CHECK(missing_instruction_hook_ == nullptr);
+Status HloParser::ParseSingleInstruction(HloModule* module) {
+  TF_RET_CHECK(create_missing_instruction_ == nullptr);
+  TF_RET_CHECK(scoped_name_tables_.empty());
+  HloComputation::Builder builder(module->name());
 
   // The missing instruction hook we register creates the shaped instruction on
   // the fly as a parameter and returns it.
   int64 parameter_count = 0;
-  missing_instruction_hook_ =
-      [this, builder, &parameter_count](
-          string name,
-          const optional<Shape>& shape) -> std::pair<HloInstruction*, LocTy>* {
-    if (!shape.has_value()) {
-      Error(lexer_.GetLoc(),
-            StrCat("Operand ", name,
-                   " had no shape in HLO text; cannot create parameter for "
-                   "single-instruction module."));
-      return nullptr;
-    }
-    HloInstruction* parameter = builder->AddInstruction(
-        HloInstruction::CreateParameter(parameter_count++, *shape, name));
-    instruction_pool_[name] = {parameter, lexer_.GetLoc()};
-    return tensorflow::gtl::FindOrNull(instruction_pool_, name);
+  create_missing_instruction_ =
+      [this, &builder, &parameter_count](
+          const string& name,
+          const Shape& shape) -> std::pair<HloInstruction*, LocTy>* {
+    string new_name = name.empty() ? StrCat("_", parameter_count) : name;
+    HloInstruction* parameter = builder.AddInstruction(
+        HloInstruction::CreateParameter(parameter_count++, shape, new_name));
+    current_name_table()[new_name] = {parameter, lexer_.GetLoc()};
+    return tensorflow::gtl::FindOrNull(current_name_table(), new_name);
   };
 
   // Prime the lexer.
   lexer_.Lex();
 
   // Parse the instruction with the registered hook.
-  if (!ParseInstruction(builder, root_name)) {
-    return InvalidArgument("Syntax error:\n%s", GetError());
+  Scope scope(&scoped_name_tables_);
+  if (CanBeShape()) {
+    // This means that the instruction's left-hand side is probably omitted,
+    // e.g.
+    //
+    //  f32[10] fusion(...), calls={...}
+    if (!ParseInstruciontRhs(&builder, module->name(), lexer_.GetLoc())) {
+      return InvalidArgument("Syntax error:\n%s", GetError());
+    }
+  } else {
+    // This means that the instruction's left-hand side might exist, e.g.
+    //
+    //  foo = f32[10] fusion(...), calls={...}
+    string root_name;
+    if (!ParseInstruction(&builder, &root_name)) {
+      return InvalidArgument("Syntax error:\n%s", GetError());
+    }
+  }
+
+  module->AddEntryComputation(builder.Build());
+  for (auto& comp : computations_) {
+    module->AddEmbeddedComputation(std::move(comp));
   }
   return Status::OK();
 }
@@ -3271,12 +3361,8 @@ Status ParseHloString(absl::string_view str, HloModule* module) {
 StatusOr<std::unique_ptr<HloModule>> ParseHloOpToModule(
     absl::string_view str, absl::string_view name) {
   HloParser parser(str);
-  auto builder = absl::make_unique<HloComputation::Builder>(string(name));
-  string root_name;
-  TF_RETURN_IF_ERROR(parser.ParseSingleInstruction(builder.get(), &root_name));
-  std::unique_ptr<HloComputation> computation = builder->Build();
   auto module = absl::make_unique<HloModule>(string(name), HloModuleConfig());
-  module->AddEntryComputation(std::move(computation));
+  TF_RETURN_IF_ERROR(parser.ParseSingleInstruction(module.get()));
   return std::move(module);
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h
index 3696035514..97d6f0117e 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.h
+++ b/tensorflow/compiler/xla/service/hlo_parser.h
@@ -40,8 +40,9 @@ StatusOr<std::unique_ptr<HloModule>> ParseHloString(
 // point to an empty module (no computations).
 Status ParseHloString(absl::string_view str, HloModule* module);
 
-// Parses the text for a single HLO operation into an HLO module with a function
-// that runs that operation (with the same parameters) as its entry computation.
+// Parses the text for a single HLO instruction into an HLO module with an
+// entry computation that runs that instruction (with the same parameters) as
+// its root instruction.
 StatusOr<std::unique_ptr<HloModule>> ParseHloOpToModule(
     absl::string_view str, absl::string_view name = "single_op");
 
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index dd4ee780f0..d10acf3814 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -1763,6 +1763,25 @@ ENTRY entry {
       "was parsing 8:39: error: instruction does not exist: aparam");
 }
 
+TEST_F(HloParserTest, SameNameDiffComputations) {
+  const string original = R"(HloModule same_names:
+add {
+  p0 = f32[] parameter(0)
+  p1 = f32[] parameter(1)
+  ROOT result = f32[] add(p0, p1)
+}
+
+ENTRY ReduceR3ToR2 {
+  p0 = f32[8,16,256]{2,1,0} parameter(0)
+  p1 = f32[] constant(0)
+  ROOT result = f32[8,16]{1,0} reduce(p0, p1), dimensions={2}, to_apply=add
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(original));
+  ASSERT_NE(module->entry_computation(), nullptr);
+  EXPECT_THAT(module->entry_computation()->root_instruction(), op::Reduce());
+}
+
 TEST_F(HloParserTest, ParseSharding) {
   const string original = "{maximal device=42}";
   TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
@@ -1823,14 +1842,129 @@ TEST(HloParserSingleOpTest, SingleOp) {
               op::Multiply(op::Parameter(0), op::Parameter(1)));
 }
 
-TEST(HloParserSingleOpTest, SingleOpNoShapesProducesError) {
+TEST(HloParserSingleOpTest, SingleOpNoShapeProducesError) {
+  const string text = "multiply(f32[2,4]{1,0} %broadcast, f32[2,4]{1,0} %x)";
+  StatusOr<std::unique_ptr<HloModule>> module = ParseHloOpToModule(text);
+  ASSERT_TRUE(!module.status().ok());
+  LOG(INFO) << "Status: " << module.status();
+  EXPECT_THAT(module.status().ToString(),
+              ::testing::HasSubstr("expects '=' in instruction"));
+}
+
+TEST(HloParserSingleOpTest, SingleOpNoOperandShapesProducesError) {
   const string text = "%multiply = f32[2,4]{1,0} multiply(%broadcast, %x)";
   StatusOr<std::unique_ptr<HloModule>> module = ParseHloOpToModule(text);
   ASSERT_TRUE(!module.status().ok());
   LOG(INFO) << "Status: " << module.status();
-  EXPECT_THAT(
-      module.status().ToString(),
-      ::testing::HasSubstr("Operand broadcast had no shape in HLO text"));
+  EXPECT_THAT(module.status().ToString(),
+              ::testing::HasSubstr("Operand had no shape in HLO text"));
+}
+
+TEST(HloParserSingleOpTest, SingleOpNoNames) {
+  const string text =
+      "%multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0}, f32[2,4]{1,0})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloOpToModule(text));
+  const HloComputation* computation = module->entry_computation();
+  ASSERT_NE(computation, nullptr);
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(op::Parameter(0), op::Parameter(1)));
+}
+
+TEST(HloParserSingleOpTest, CanonicalOp) {
+  const string text = "f32[2,4]{1,0} multiply(f32[2,4]{1,0}, f32[2,4]{1,0})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloOpToModule(text));
+  const HloComputation* computation = module->entry_computation();
+  ASSERT_NE(computation, nullptr);
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(op::Parameter(0), op::Parameter(1)));
+  EXPECT_EQ(
+      computation->root_instruction()->ToString(HloPrintOptions::Canonical()),
+      text);
+}
+
+TEST(HloParserSingleOpTest, CanonicalOpWithNested) {
+  const string text =
+      R"(f32[5,20]{1,0} while(f32[5,10]{1,0}), condition=
+{
+  tmp_0 = f32[5,10]{1,0} parameter(0)
+  tmp_1 = f32[20,10]{1,0} parameter(1)
+  ROOT tmp_2 = f32[5,20]{1,0} fusion(f32[5,10]{1,0} tmp_0, f32[20,10]{1,0} tmp_1), kind=kLoop, calls=
+  {
+    tmp_0 = f32[5,10]{1,0} parameter(0)
+    tmp_1 = f32[20,10]{1,0} parameter(1)
+    tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0}
+    ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  }
+}, body=
+{
+  tmp_0 = f32[5,10]{1,0} parameter(0)
+  tmp_1 = f32[20,10]{1,0} parameter(1)
+  ROOT tmp_2 = f32[5,20]{1,0} fusion(f32[5,10]{1,0} tmp_0, f32[20,10]{1,0} tmp_1), kind=kLoop, calls=
+  {
+    tmp_0 = f32[5,10]{1,0} parameter(0)
+    tmp_1 = f32[20,10]{1,0} parameter(1)
+    tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0}
+    ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  }
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloOpToModule(text));
+  const HloComputation* computation = module->entry_computation();
+  ASSERT_NE(computation, nullptr);
+  EXPECT_EQ(
+      computation->root_instruction()->ToString(HloPrintOptions::Canonical()),
+      text);
+}
+
+TEST(HloParserSingleOpTest, SingleOpWithNested) {
+  const string text =
+      R"(%fusion = f32[3,2,1,1]{3,2,1,0} fusion(f32[3,2,1,1]{3,2,1,0} %p0, f32[2]{0} %p1), kind=kLoop, calls=
+{
+  %param_0 = f32[3,2,1,1]{3,2,1,0} parameter(0)
+  %param_1 = f32[2]{0} parameter(1)
+  %broadcast = f32[3,2,1,1]{3,2,1,0} broadcast(f32[2]{0} %param_1), dimensions={1}
+  ROOT %subtract = f32[3,2,1,1]{3,2,1,0} subtract(f32[3,2,1,1]{3,2,1,0} %param_0, f32[3,2,1,1]{3,2,1,0} %broadcast)
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloOpToModule(text));
+  const HloComputation* computation = module->entry_computation();
+  ASSERT_NE(computation, nullptr);
+  EXPECT_THAT(computation->root_instruction(),
+              op::Fusion(op::Parameter(0), op::Parameter(1)));
+}
+
+TEST(HloParserSingleOpTest, SingleOpWithNested_DoesNotExist) {
+  const string text =
+      R"(reduce = f32[] reduce(f32[10], f32[]), dimensions={1}, to_apply=
+{
+  result = f32[] add(f32[] x, f32[] y)
+})";
+  auto status = ParseHloOpToModule(text).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("does not exist: x"));
+}
+
+TEST(HloParserSingleOpTest, SingleOpWithNested_NoLhs) {
+  const string text =
+      R"(reduce = f32[] reduce(f32[10], f32[]), dimensions={1}, to_apply=
+{
+  f32[] add(f32[] x, f32[] y)
+})";
+  auto status = ParseHloOpToModule(text).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(), ::testing::HasSubstr("expects name"));
+}
+
+TEST(HloParserSingleOpTest, SingleOpWithNested_NoOperandName) {
+  const string text =
+      R"(reduce = f32[] reduce(f32[10], f32[]), dimensions={1}, to_apply=
+{
+  result = f32[] add(f32[], f32[])
+})";
+  auto status = ParseHloOpToModule(text).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(), ::testing::HasSubstr("expects name"));
 }
 
 TEST(HloParserSingleOpTest, ConvolutionTrivialFeatureGroupCount) {
-- 
GitLab


From 00000cbfdf0efac737f3bfff94950a49d48659fb Mon Sep 17 00:00:00 2001
From: Christopher Olston <olston@google.com>
Date: Tue, 2 Oct 2018 15:48:17 -0700
Subject: [PATCH 311/570] Delete the shims in tensorflow/contrib/batching/.

PiperOrigin-RevId: 215473319
---
 tensorflow/contrib/batching/BUILD             | 58 -------------------
 .../adaptive_shared_batch_scheduler.h         | 21 -------
 .../contrib/batching/basic_batch_scheduler.h  | 21 -------
 tensorflow/contrib/batching/batch_scheduler.h | 21 -------
 .../batching/serial_device_batch_scheduler.h  | 21 -------
 .../contrib/batching/shared_batch_scheduler.h | 21 -------
 tensorflow/contrib/batching/test_util/BUILD   | 19 ------
 .../batching/test_util/fake_clock_env.h       | 21 -------
 tensorflow/contrib/batching/util/BUILD        | 28 ---------
 .../contrib/batching/util/periodic_function.h | 20 -------
 10 files changed, 251 deletions(-)
 delete mode 100644 tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
 delete mode 100644 tensorflow/contrib/batching/basic_batch_scheduler.h
 delete mode 100644 tensorflow/contrib/batching/batch_scheduler.h
 delete mode 100644 tensorflow/contrib/batching/serial_device_batch_scheduler.h
 delete mode 100644 tensorflow/contrib/batching/shared_batch_scheduler.h
 delete mode 100644 tensorflow/contrib/batching/test_util/BUILD
 delete mode 100644 tensorflow/contrib/batching/test_util/fake_clock_env.h
 delete mode 100644 tensorflow/contrib/batching/util/BUILD
 delete mode 100644 tensorflow/contrib/batching/util/periodic_function.h

diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index b27a19b16c..648f3ebb05 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -7,64 +7,6 @@ package(
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-
-cc_library(
-    name = "batch_scheduler_hdrs",
-    hdrs = ["batch_scheduler.h"],
-    deps = [
-        "//tensorflow/core/kernels/batching_util:batch_scheduler_hdrs",
-    ],
-)
-
-cc_library(
-    name = "batch_scheduler",
-    hdrs = ["batch_scheduler.h"],
-    deps = [
-        "//tensorflow/core/kernels/batching_util:batch_scheduler",
-    ],
-)
-
-cc_library(
-    name = "shared_batch_scheduler_hdrs",
-    hdrs = ["shared_batch_scheduler.h"],
-    deps = [
-        "//tensorflow/core/kernels/batching_util:shared_batch_scheduler_hdrs",
-    ],
-)
-
-cc_library(
-    name = "shared_batch_scheduler",
-    hdrs = ["shared_batch_scheduler.h"],
-    deps = [
-        "//tensorflow/core/kernels/batching_util:shared_batch_scheduler",
-    ],
-    alwayslink = 1,
-)
-
-cc_library(
-    name = "adaptive_shared_batch_scheduler",
-    hdrs = ["adaptive_shared_batch_scheduler.h"],
-    deps = [
-        "//tensorflow/core/kernels/batching_util:adaptive_shared_batch_scheduler",
-    ],
-)
-
-cc_library(
-    name = "serial_device_batch_scheduler",
-    hdrs = ["serial_device_batch_scheduler.h"],
-    deps = [
-        "//tensorflow/core/kernels/batching_util:serial_device_batch_scheduler",
-    ],
-)
-
-cc_library(
-    name = "basic_batch_scheduler",
-    hdrs = ["basic_batch_scheduler.h"],
-    deps = [
-        "//tensorflow/core/kernels/batching_util:basic_batch_scheduler",
-    ],
-)
-
 load(
     "//tensorflow:tensorflow.bzl",
     "py_test",
diff --git a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
deleted file mode 100644
index 86250e6692..0000000000
--- a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
-#define TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
-
-#include "tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h"
-
-#endif  // TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/basic_batch_scheduler.h b/tensorflow/contrib/batching/basic_batch_scheduler.h
deleted file mode 100644
index d9b37da693..0000000000
--- a/tensorflow/contrib/batching/basic_batch_scheduler.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
-#define TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
-
-#include "tensorflow/core/kernels/batching_util/basic_batch_scheduler.h"
-
-#endif  // TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/batch_scheduler.h b/tensorflow/contrib/batching/batch_scheduler.h
deleted file mode 100644
index 8e94e1fd8b..0000000000
--- a/tensorflow/contrib/batching/batch_scheduler.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
-#define TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
-
-#include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
-
-#endif  // TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/serial_device_batch_scheduler.h b/tensorflow/contrib/batching/serial_device_batch_scheduler.h
deleted file mode 100644
index bf6b708361..0000000000
--- a/tensorflow/contrib/batching/serial_device_batch_scheduler.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_BATCHING_SERIAL_DEVICE_BATCH_SCHEDULER_H_
-#define TENSORFLOW_CONTRIB_BATCHING_SERIAL_DEVICE_BATCH_SCHEDULER_H_
-
-#include "tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h"
-
-#endif  // TENSORFLOW_CONTRIB_BATCHING_SERIAL_DEVICE_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/shared_batch_scheduler.h b/tensorflow/contrib/batching/shared_batch_scheduler.h
deleted file mode 100644
index 83a59695d7..0000000000
--- a/tensorflow/contrib/batching/shared_batch_scheduler.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
-#define TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
-
-#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
-
-#endif  // TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/test_util/BUILD b/tensorflow/contrib/batching/test_util/BUILD
deleted file mode 100644
index 7cb2d8079b..0000000000
--- a/tensorflow/contrib/batching/test_util/BUILD
+++ /dev/null
@@ -1,19 +0,0 @@
-# Description: Utilities to aid testing.
-
-package(
-    default_visibility = ["//tensorflow:internal"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-cc_library(
-    name = "fake_clock_env",
-    testonly = 1,
-    hdrs = ["fake_clock_env.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core/kernels/batching_util:fake_clock_env",
-    ],
-)
diff --git a/tensorflow/contrib/batching/test_util/fake_clock_env.h b/tensorflow/contrib/batching/test_util/fake_clock_env.h
deleted file mode 100644
index 40a39a5569..0000000000
--- a/tensorflow/contrib/batching/test_util/fake_clock_env.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
-#define TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
-
-#include "tensorflow/core/kernels/batching_util/fake_clock_env.h"
-
-#endif  // TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
diff --git a/tensorflow/contrib/batching/util/BUILD b/tensorflow/contrib/batching/util/BUILD
deleted file mode 100644
index 8f81b6702f..0000000000
--- a/tensorflow/contrib/batching/util/BUILD
+++ /dev/null
@@ -1,28 +0,0 @@
-# Description: Utilities.
-
-package(
-    default_visibility = ["//tensorflow:internal"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-
-cc_library(
-    name = "periodic_function_dynamic",
-    hdrs = ["periodic_function.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core/kernels/batching_util:periodic_function_dynamic",
-        "//third_party/eigen3",
-    ],
-)
-
-cc_library(
-    name = "periodic_function",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":periodic_function_dynamic",
-        "//tensorflow/core/kernels/batching_util:periodic_function",
-    ],
-)
diff --git a/tensorflow/contrib/batching/util/periodic_function.h b/tensorflow/contrib/batching/util/periodic_function.h
deleted file mode 100644
index aa2ed0a385..0000000000
--- a/tensorflow/contrib/batching/util/periodic_function.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
-#define TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
-
-#include "tensorflow/core/kernels/batching_util/periodic_function.h"
-
-#endif  // TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
-- 
GitLab


From 6c487cddd3503ef72c015c5c283fff81328282e5 Mon Sep 17 00:00:00 2001
From: Revan Sopher <rsopher@google.com>
Date: Tue, 2 Oct 2018 15:48:27 -0700
Subject: [PATCH 312/570] Internal change.

PiperOrigin-RevId: 215473351
---
 .../data/experimental/kernel_tests/BUILD      | 113 +++++++++-
 .../kernel_tests/optimization/BUILD           |  43 ++++
 .../kernel_tests/serialization/BUILD          | 196 +++++++++++++++---
 3 files changed, 316 insertions(+), 36 deletions(-)

diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index a46c30ed2e..f56127f3ef 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -15,6 +15,7 @@ py_test(
     tags = [
         "no_oss",  # (b/79552534)
         "no_pip",
+        "no_windows",
     ],
     deps = [
         "//tensorflow/python:array_ops",
@@ -43,6 +44,11 @@ py_test(
     size = "medium",
     srcs = ["bucketing_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -66,7 +72,11 @@ py_test(
     size = "medium",
     srcs = ["csv_dataset_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -93,6 +103,9 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "manual",
+        "no_oss",
+        "no_pip",
+        "no_windows",
         "nomac",  # b/62040583
     ],
     deps = [
@@ -111,6 +124,11 @@ py_test(
     size = "medium",
     srcs = ["directed_interleave_dataset_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
@@ -126,6 +144,11 @@ py_test(
     name = "get_single_element_test",
     size = "small",
     srcs = ["get_single_element_test.py"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -144,6 +167,11 @@ py_test(
 py_test(
     name = "indexed_dataset_ops_test",
     srcs = ["indexed_dataset_ops_test.py"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -164,6 +192,7 @@ py_test(
     tags = [
         "no_oss",
         "no_pip",
+        "no_windows",
         "notap",
     ],
     deps = [
@@ -187,7 +216,11 @@ py_test(
     size = "small",
     srcs = ["iterator_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -208,7 +241,9 @@ py_test(
     srcs = ["map_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
         "no_pip",
+        "no_windows",
         "noasan",  # times out
         "optonly",
     ],
@@ -234,6 +269,11 @@ py_test(
     size = "medium",
     srcs = ["filter_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -254,7 +294,11 @@ py_test(
     size = "small",
     srcs = ["map_defun_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
@@ -277,6 +321,11 @@ py_test(
     size = "small",
     srcs = ["parsing_ops_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
@@ -313,7 +362,12 @@ cuda_py_test(
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
     ],
-    tags = ["no_windows_gpu"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+        "no_windows_gpu",
+    ],
 )
 
 py_test(
@@ -321,6 +375,11 @@ py_test(
     size = "small",
     srcs = ["range_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -366,7 +425,11 @@ py_test(
     size = "medium",
     srcs = ["reader_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":reader_dataset_ops_test_base",
         "//tensorflow/python:client_testlib",
@@ -390,6 +453,9 @@ py_test(
     shard_count = 2,
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
         "noasan",
         "optonly",
     ],
@@ -415,7 +481,11 @@ py_test(
     size = "small",
     srcs = ["scan_dataset_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -438,7 +508,9 @@ py_test(
     srcs = ["shuffle_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
         "no_pip",
+        "no_windows",
         "optonly",
     ],
     deps = [
@@ -475,7 +547,11 @@ py_test(
     size = "small",
     srcs = ["sql_dataset_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":sql_dataset_op_test_base",
         "//tensorflow/python:client_testlib",
@@ -489,7 +565,11 @@ py_test(
     size = "medium",
     srcs = ["stats_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":reader_dataset_ops_test_base",
         ":stats_dataset_test_base",
@@ -519,7 +599,11 @@ py_test(
     size = "small",
     srcs = ["threadpool_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
@@ -539,7 +623,11 @@ py_test(
     size = "small",
     srcs = ["unique_dataset_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
@@ -555,6 +643,11 @@ py_test(
     name = "writer_ops_test",
     size = "small",
     srcs = ["writer_ops_test.py"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
index 68f73bddb5..c92bb8b9bc 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
@@ -11,6 +11,11 @@ py_test(
     size = "medium",
     srcs = ["assert_next_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
@@ -25,6 +30,11 @@ py_test(
     size = "small",
     srcs = ["hoist_random_uniform_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -43,6 +53,11 @@ py_test(
     size = "small",
     srcs = ["latency_all_edges_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
@@ -58,6 +73,11 @@ py_test(
     size = "small",
     srcs = ["map_vectorization_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:check_ops",
         "//tensorflow/python:client_testlib",
@@ -80,6 +100,11 @@ py_test(
     size = "medium",
     srcs = ["map_and_filter_fusion_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -98,6 +123,11 @@ py_test(
     size = "small",
     srcs = ["map_parallelization_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -117,6 +147,9 @@ py_test(
     srcs = ["model_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
         "optonly",
     ],
     deps = [
@@ -136,6 +169,11 @@ py_test(
     size = "small",
     srcs = ["noop_elimination_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
@@ -153,6 +191,11 @@ py_test(
     size = "small",
     srcs = ["optimize_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
index 20c02a5366..58a335ae4f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
@@ -34,7 +34,11 @@ py_test(
     size = "medium",
     srcs = ["batch_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:array_ops",
@@ -51,6 +55,11 @@ py_test(
     size = "small",
     srcs = ["cache_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -65,6 +74,11 @@ py_test(
     size = "small",
     srcs = ["concatenate_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -78,7 +92,11 @@ py_test(
     size = "small",
     srcs = ["csv_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -92,6 +110,11 @@ py_test(
     size = "medium",
     srcs = ["dataset_constructor_serialization_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -106,7 +129,11 @@ py_test(
     size = "medium",
     srcs = ["filter_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -123,7 +150,11 @@ py_test(
     srcs = ["fixed_length_record_dataset_serialization_test.py"],
     shard_count = 4,
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -136,7 +167,11 @@ py_test(
     name = "flat_map_dataset_serialization_test",
     size = "medium",
     srcs = ["flat_map_dataset_serialization_test.py"],
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -158,6 +193,11 @@ py_test(
     size = "medium",
     srcs = ["group_by_reducer_serialization_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -172,6 +212,11 @@ py_test(
     size = "medium",
     srcs = ["group_by_window_serialization_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -186,7 +231,11 @@ py_test(
     size = "small",
     srcs = ["ignore_errors_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:array_ops",
@@ -202,7 +251,11 @@ py_test(
     size = "medium",
     srcs = ["interleave_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -219,7 +272,11 @@ py_test(
     size = "medium",
     srcs = ["map_and_batch_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -234,7 +291,11 @@ py_test(
     size = "medium",
     srcs = ["map_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -256,6 +317,11 @@ py_test(
     size = "small",
     srcs = ["optimize_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -269,7 +335,11 @@ py_test(
     size = "medium",
     srcs = ["padded_batch_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:array_ops",
@@ -285,7 +355,11 @@ py_test(
     size = "medium",
     srcs = ["parallel_interleave_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -302,7 +376,11 @@ py_test(
     size = "medium",
     srcs = ["parallel_map_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -323,7 +401,11 @@ py_test(
     size = "medium",
     srcs = ["parse_example_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -336,7 +418,11 @@ py_test(
     size = "small",
     srcs = ["prefetch_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -349,6 +435,11 @@ py_test(
     size = "small",
     srcs = ["range_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -368,6 +459,11 @@ py_test(
     size = "medium",
     srcs = ["sample_from_datasets_serialization_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -381,7 +477,11 @@ py_test(
     size = "small",
     srcs = ["scan_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -395,7 +495,11 @@ py_test(
     size = "medium",
     srcs = ["sequence_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -409,7 +513,11 @@ py_test(
     size = "small",
     srcs = ["serialization_integration_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
@@ -424,7 +532,11 @@ py_test(
     size = "medium",
     srcs = ["shuffle_and_repeat_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -438,7 +550,11 @@ py_test(
     size = "medium",
     srcs = ["shuffle_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -454,7 +570,11 @@ py_test(
     size = "small",
     srcs = ["sql_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:array_ops",
@@ -470,7 +590,11 @@ py_test(
     size = "medium",
     srcs = ["stats_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:array_ops",
@@ -487,7 +611,11 @@ py_test(
     srcs = ["textline_dataset_serialization_test.py"],
     shard_count = 4,
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -502,7 +630,11 @@ py_test(
     srcs = ["tf_record_dataset_serialization_test.py"],
     shard_count = 4,
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -516,7 +648,11 @@ py_test(
     size = "medium",
     srcs = ["unbatch_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -531,7 +667,11 @@ py_test(
     size = "small",
     srcs = ["unique_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
@@ -545,7 +685,11 @@ py_test(
     size = "small",
     srcs = ["zip_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
-- 
GitLab


From 7c0c0abab5b07528bae982d69257ebf4a8c077cb Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Tue, 2 Oct 2018 16:14:32 -0700
Subject: [PATCH 313/570] Internal change.

PiperOrigin-RevId: 215477724
---
 tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
index cd7206baf8..9c6390070c 100755
--- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
+++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
@@ -29,7 +29,7 @@ TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU:-8}
 # p100 has minimum 12G memory. Therefore, we should limit each test to 1.5G.
 # To leave some room in case we want to run more tests in parallel in the
 # future and to use a rounder number, we set it to 1G.
-export TF_PER_DEVICE_MEMORY_LIMIT_MB=1024
+export TF_PER_DEVICE_MEMORY_LIMIT_MB=${TF_PER_DEVICE_MEMORY_LIMIT_MB:-1024}
 
 # *******************************************************************
 #         This section of the script is needed to
-- 
GitLab


From 6663959a8a2dd93a4dab9b049767d64761a00adc Mon Sep 17 00:00:00 2001
From: Scott Zhu <scottzhu@google.com>
Date: Tue, 2 Oct 2018 16:27:57 -0700
Subject: [PATCH 314/570] Update Keras RNN layer to support time major input.

PiperOrigin-RevId: 215479788
---
 tensorflow/python/keras/backend.py            | 25 ++++--
 .../python/keras/layers/cudnn_recurrent.py    | 24 +++--
 .../keras/layers/cudnn_recurrent_test.py      | 27 ++++++
 tensorflow/python/keras/layers/recurrent.py   | 65 ++++++++++----
 .../python/keras/layers/recurrent_test.py     | 90 +++++++++++++++++++
 .../golden/v1/tensorflow.keras.backend.pbtxt  |  2 +-
 .../v1/tensorflow.keras.layers.-r-n-n.pbtxt   |  2 +-
 .../golden/v2/tensorflow.keras.backend.pbtxt  |  2 +-
 .../v2/tensorflow.keras.layers.-r-n-n.pbtxt   |  2 +-
 9 files changed, 207 insertions(+), 32 deletions(-)

diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 584facc859..0d6877e4a1 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -3058,7 +3058,8 @@ def rnn(step_function,
         mask=None,
         constants=None,
         unroll=False,
-        input_length=None):
+        input_length=None,
+        time_major=False):
   """Iterates over the time dimension of a tensor.
 
   Arguments:
@@ -3087,6 +3088,13 @@ def rnn(step_function,
       constants: List of constant values passed at each step.
       unroll: Whether to unroll the RNN or to use a symbolic `while_loop`.
       input_length: If specified, assume time dimension is of this length.
+      time_major: Boolean. If true, the inputs and outputs will be in shape
+          `(timesteps, batch, ...)`, whereas in the False case, it will be
+          `(batch, timesteps, ...)`. Using `time_major = True` is a bit more
+          efficient because it avoids transposes at the beginning and end of the
+          RNN calculation. However, most TensorFlow data is batch-major, so by
+          default this function accepts input and emits output in batch-major
+          form.
 
   Returns:
       A tuple, `(last_output, outputs, new_states)`.
@@ -3108,15 +3116,17 @@ def rnn(step_function,
   if ndim < 3:
     raise ValueError('Input should be at least 3D.')
   inputs_shape = inputs.shape
-  axes = [1, 0] + list(range(2, ndim))
-  inputs = array_ops.transpose(inputs, (axes))
+  if not time_major:
+    axes = [1, 0] + list(range(2, ndim))
+    inputs = array_ops.transpose(inputs, axes)
 
   if mask is not None:
     if mask.dtype != dtypes_module.bool:
       mask = math_ops.cast(mask, dtypes_module.bool)
     if len(mask.shape) == ndim - 1:
       mask = expand_dims(mask)
-    mask = array_ops.transpose(mask, axes)
+    if not time_major:
+      mask = array_ops.transpose(mask, axes)
 
   if constants is None:
     constants = []
@@ -3297,10 +3307,11 @@ def rnn(step_function,
     outputs = output_ta.stack()
     last_output = output_ta.read(last_time - 1)
 
-  axes = [1, 0] + list(range(2, len(outputs.shape)))
-  outputs = array_ops.transpose(outputs, axes)
+  if not time_major:
+    axes = [1, 0] + list(range(2, len(outputs.shape)))
+    outputs = array_ops.transpose(outputs, axes)
 
-  # Static shape inference: (samples, time, ...)
+  # Static shape inference: (samples, time, ...) or (time, sample, ...)
   outputs_shape = outputs.shape.as_list()
   outputs_shape[0] = inputs_shape[0]
   outputs_shape[1] = inputs_shape[1]
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/layers/cudnn_recurrent.py
index cf2b0c476c..29a09a3d71 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent.py
@@ -47,6 +47,9 @@ class _CuDNNRNN(RNN):
     stateful: Boolean (default False). If True, the last state
         for each sample at index i in a batch will be used as initial
         state for the sample of index i in the following batch.
+    time_major: Boolean (default False). If true, the inputs and outputs will be
+        in shape `(timesteps, batch, ...)`, whereas in the False case, it will
+        be `(batch, timesteps, ...)`.
   """
 
   def __init__(self,
@@ -54,6 +57,7 @@ class _CuDNNRNN(RNN):
                return_state=False,
                go_backwards=False,
                stateful=False,
+               time_major=False,
                **kwargs):
     # We invoke the base layer's initializer directly here because we do not
     # want to create RNN cell instance.
@@ -62,6 +66,7 @@ class _CuDNNRNN(RNN):
     self.return_state = return_state
     self.go_backwards = go_backwards
     self.stateful = stateful
+    self.time_major = time_major
     self.supports_masking = False
     self.input_spec = [InputSpec(ndim=3)]
     if hasattr(self.cell.state_size, '__len__'):
@@ -124,7 +129,8 @@ class _CuDNNRNN(RNN):
         'return_sequences': self.return_sequences,
         'return_state': self.return_state,
         'go_backwards': self.go_backwards,
-        'stateful': self.stateful
+        'stateful': self.stateful,
+        'time_major': self.time_major,
     }
     base_config = super(  # pylint: disable=bad-super-call
         RNN, self).get_config()
@@ -267,7 +273,8 @@ class CuDNNGRU(_CuDNNRNN):
     self.built = True
 
   def _process_batch(self, inputs, initial_state):
-    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+    if not self.time_major:
+      inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
     input_h = initial_state[0]
     input_h = array_ops.expand_dims(input_h, axis=0)
 
@@ -301,7 +308,10 @@ class CuDNNGRU(_CuDNNRNN):
     if self.stateful or self.return_state:
       h = h[0]
     if self.return_sequences:
-      output = array_ops.transpose(outputs, perm=(1, 0, 2))
+      if self.time_major:
+        output = outputs
+      else:
+        output = array_ops.transpose(outputs, perm=(1, 0, 2))
     else:
       output = outputs[-1]
     return output, [h]
@@ -456,7 +466,8 @@ class CuDNNLSTM(_CuDNNRNN):
     self.built = True
 
   def _process_batch(self, inputs, initial_state):
-    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+    if not self.time_major:
+      inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
     input_h = initial_state[0]
     input_c = initial_state[1]
     input_h = array_ops.expand_dims(input_h, axis=0)
@@ -496,7 +507,10 @@ class CuDNNLSTM(_CuDNNRNN):
       h = h[0]
       c = c[0]
     if self.return_sequences:
-      output = array_ops.transpose(outputs, perm=(1, 0, 2))
+      if self.time_major:
+        output = outputs
+      else:
+        output = array_ops.transpose(outputs, perm=(1, 0, 2))
     else:
       output = outputs[-1]
     return output, [h, c]
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent_test.py b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
index 2ed0aa8f26..7becbfede1 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent_test.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
@@ -26,6 +26,7 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
@@ -138,6 +139,32 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
         np.testing.assert_allclose(
             keras.backend.eval(layer.states[0]), state, atol=1e-4)
 
+  @parameterized.named_parameters(
+      ('cudnngru', keras.layers.CuDNNGRU),
+      ('cudnnlstm', keras.layers.CuDNNLSTM),
+  )
+  def test_time_major_input(self, layer_class):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        input_size = 10
+        timesteps = 6
+        units = 2
+        num_samples = 32
+
+        model = keras.models.Sequential()
+        model.add(
+            keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2])))
+        layer = layer_class(units, time_major=True, return_sequences=True)
+        model.add(layer)
+        model.add(
+            keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2])))
+        model.compile(loss='categorical_crossentropy', optimizer='adam')
+        model.fit(
+            np.ones((num_samples, timesteps, input_size)),
+            np.ones((num_samples, timesteps, units)))
+        out = model.predict(np.ones((num_samples, timesteps, input_size)))
+        self.assertEqual(out.shape, (num_samples, timesteps, units))
+
   @parameterized.named_parameters(
       ('cudnngru', keras.layers.CuDNNGRU),
       ('cudnnlstm', keras.layers.CuDNNLSTM),
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index ba7498e7e6..b07ec71178 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -336,9 +336,18 @@ class RNN(Layer):
           in your model, you would need to specify the input length
           at the level of the first layer
           (e.g. via the `input_shape` argument)
+      time_major: The shape format of the `inputs` and `outputs` tensors.
+          If True, the inputs and outputs will be in shape
+          `(timesteps, batch, ...)`, whereas in the False case, it will be
+          `(batch, timesteps, ...)`. Using `time_major = True` is a bit more
+          efficient because it avoids transposes at the beginning and end of the
+          RNN calculation. However, most TensorFlow data is batch-major, so by
+          default this function accepts input and emits output in batch-major
+          form.
 
   Input shape:
-      N-D tensor with shape `(batch_size, timesteps, ...)`.
+      N-D tensor with shape `(batch_size, timesteps, ...)` or
+      `(timesteps, batch_size, ...)` when time_major is True.
 
   Output shape:
       - if `return_state`: a list of tensors. The first tensor is
@@ -347,7 +356,8 @@ class RNN(Layer):
           be a high dimension tensor shape.
       - if `return_sequences`: N-D tensor with shape
           `(batch_size, timesteps, output_size)`, where `output_size` could
-          be a high dimension tensor shape.
+          be a high dimension tensor shape, or
+          `(timesteps, batch_size, output_size)` when `time_major` is True.
       - else, N-D tensor with shape `(batch_size, output_size)`, where
           `output_size` could be a high dimension tensor shape.
 
@@ -448,6 +458,7 @@ class RNN(Layer):
                go_backwards=False,
                stateful=False,
                unroll=False,
+               time_major=False,
                **kwargs):
     if isinstance(cell, (list, tuple)):
       cell = StackedRNNCells(cell)
@@ -468,6 +479,7 @@ class RNN(Layer):
     self.go_backwards = go_backwards
     self.stateful = stateful
     self.unroll = unroll
+    self.time_major = time_major
 
     self.supports_masking = True
     self.input_spec = [None]  # The input shape is unknown yet, at least rank 3.
@@ -503,14 +515,21 @@ class RNN(Layer):
       # Note that state_size[0] could be a tensor_shape or int.
       output_dim = tensor_shape.as_shape(state_size[0]).as_list()
 
+    batch = input_shape[0]
+    time_step = input_shape[1]
+    if self.time_major:
+      batch, time_step = time_step, batch
     if self.return_sequences:
-      output_shape = tuple([input_shape[0], input_shape[1]] + output_dim)
+      if self.time_major:
+        output_shape = tuple([time_step, batch] + output_dim)
+      else:
+        output_shape = tuple([batch, time_step] + output_dim)
     else:
-      output_shape = tuple([input_shape[0]] + output_dim)
+      output_shape = tuple([batch] + output_dim)
 
     if self.return_state:
       state_shape = [
-          tuple([input_shape[0]] + tensor_shape.as_shape(dim).as_list())
+          tuple([batch] + tensor_shape.as_shape(dim).as_list())
           for dim in state_size
       ]
       return [output_shape] + state_shape
@@ -539,13 +558,18 @@ class RNN(Layer):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
 
-    batch_size = input_shape[0] if self.stateful else None
-    input_dim = input_shape[2:]
-    self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_dim)
+    input_spec_shape = list(input_shape)
+    batch_index, time_step_index = (1, 0) if self.time_major else (0, 1)
+    if not self.stateful:
+      input_spec_shape[batch_index] = None
+    input_spec_shape[time_step_index] = None
+    self.input_spec[0] = InputSpec(shape=tuple(input_spec_shape))
 
+    batch = input_shape[batch_index]
+    input_dim = input_shape[2:]
+    step_input_shape = (batch,) + input_dim
     # allow cell (if layer) to build before we set or validate state_spec
     if isinstance(self.cell, Layer):
-      step_input_shape = (input_shape[0],) + input_dim
       if constants_shape is not None:
         self.cell.build([step_input_shape] + constants_shape)
       else:
@@ -598,12 +622,16 @@ class RNN(Layer):
 
   def get_initial_state(self, inputs):
     get_initial_state_fn = getattr(self.cell, 'get_initial_state', None)
+
+    input_shape = array_ops.shape(inputs)
+    batch_size = input_shape[1] if self.time_major else input_shape[0]
+    dtype = inputs.dtype
     if get_initial_state_fn:
       init_state = get_initial_state_fn(
-          inputs=inputs, batch_size=None, dtype=None)
+          inputs=None, batch_size=batch_size, dtype=dtype)
     else:
-      init_state = _generate_zero_filled_state(
-          array_ops.shape(inputs)[0], self.cell.state_size, inputs.dtype)
+      init_state = _generate_zero_filled_state(batch_size, self.cell.state_size,
+                                               dtype)
     # Keras RNN expect the states in a list, even if it's a single state tensor.
     if not nest.is_sequence(init_state):
       init_state = [init_state]
@@ -696,7 +724,7 @@ class RNN(Layer):
           'Layer has ' + str(len(self.states)) + ' states but was passed ' +
           str(len(initial_state)) + ' initial states.')
     input_shape = K.int_shape(inputs)
-    timesteps = input_shape[1]
+    timesteps = input_shape[0] if self.time_major else input_shape[1]
     if self.unroll and timesteps in [None, 1]:
       raise ValueError('Cannot unroll a RNN if the '
                        'time dimension is undefined or equal to 1. \n'
@@ -747,7 +775,8 @@ class RNN(Layer):
         go_backwards=self.go_backwards,
         mask=mask,
         unroll=self.unroll,
-        input_length=timesteps)
+        input_length=timesteps,
+        time_major=self.time_major)
     if self.stateful:
       updates = []
       for i in range(len(states)):
@@ -777,7 +806,10 @@ class RNN(Layer):
   def reset_states(self, states=None):
     if not self.stateful:
       raise AttributeError('Layer must be stateful.')
-    batch_size = self.input_spec[0].shape[0]
+    if self.time_major:
+      batch_size = self.input_spec[0].shape[1]
+    else:
+      batch_size = self.input_spec[0].shape[0]
     if not batch_size:
       raise ValueError('If a RNN is stateful, it needs to know '
                        'its batch size. Specify the batch size '
@@ -839,7 +871,8 @@ class RNN(Layer):
         'return_state': self.return_state,
         'go_backwards': self.go_backwards,
         'stateful': self.stateful,
-        'unroll': self.unroll
+        'unroll': self.unroll,
+        'time_major': self.time_major
     }
     if self._num_constants is not None:
       config['num_constants'] = self._num_constants
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index b9e90095e4..d246be6b45 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -186,6 +186,96 @@ class RNNTest(test.TestCase):
       y_np_2 = model.predict(x_np)
       self.assertAllClose(y_np, y_np_2, atol=1e-4)
 
+  def test_rnn_with_time_major(self):
+    batch = 10
+    time_step = 5
+    embedding_dim = 4
+    units = 3
+
+    with self.cached_session():
+      # Test basic case.
+      x = keras.Input((time_step, embedding_dim))
+      time_major_x = keras.layers.Lambda(
+          lambda t: array_ops.transpose(t, [1, 0, 2]))(x)
+      layer = keras.layers.SimpleRNN(
+          units, time_major=True, return_sequences=True)
+      self.assertEqual(
+          layer.compute_output_shape((time_step, None,
+                                      embedding_dim)).as_list(),
+          [time_step, None, units])
+      y = layer(time_major_x)
+      self.assertEqual(layer.output_shape, (time_step, None, units))
+
+      y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(y)
+
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          np.zeros((batch, time_step, embedding_dim)),
+          np.zeros((batch, time_step, units)))
+
+    with self.cached_session():
+      # Test stacking.
+      x = keras.Input((time_step, embedding_dim))
+      time_major_x = keras.layers.Lambda(
+          lambda t: array_ops.transpose(t, [1, 0, 2]))(x)
+      cell_units = [10, 8, 6]
+      cells = [keras.layers.SimpleRNNCell(cell_units[i]) for i in range(3)]
+      layer = keras.layers.RNN(cells, time_major=True, return_sequences=True)
+      y = layer(time_major_x)
+      self.assertEqual(layer.output_shape, (time_step, None, cell_units[-1]))
+
+      y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(y)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          np.zeros((batch, time_step, embedding_dim)),
+          np.zeros((batch, time_step, cell_units[-1])))
+
+    with self.cached_session():
+      # Test masking.
+      x = keras.Input((time_step, embedding_dim))
+      time_major = keras.layers.Lambda(
+          lambda t: array_ops.transpose(t, [1, 0, 2]))(x)
+      mask = keras.layers.Masking()(time_major)
+      rnn = keras.layers.SimpleRNN(
+          units, time_major=True, return_sequences=True)(mask)
+      y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(rnn)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          np.zeros((batch, time_step, embedding_dim)),
+          np.zeros((batch, time_step, units)))
+
+    with self.cached_session():
+      # Test layer output
+      x = keras.Input((time_step, embedding_dim))
+      rnn_1 = keras.layers.SimpleRNN(units, return_sequences=True)
+      y = rnn_1(x)
+
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          np.zeros((batch, time_step, embedding_dim)),
+          np.zeros((batch, time_step, units)))
+
+      x_np = np.random.random((batch, time_step, embedding_dim))
+      y_np_1 = model.predict(x_np)
+
+      time_major = keras.layers.Lambda(
+          lambda t: array_ops.transpose(t, [1, 0, 2]))(x)
+      rnn_2 = keras.layers.SimpleRNN(
+          units, time_major=True, return_sequences=True)
+      y_2 = rnn_2(time_major)
+      y_2 = keras.layers.Lambda(
+          lambda t: array_ops.transpose(t, [1, 0, 2]))(y_2)
+
+      model_2 = keras.models.Model(x, y_2)
+      rnn_2.set_weights(rnn_1.get_weights())
+
+      y_np_2 = model_2.predict(x_np)
+      self.assertAllClose(y_np_1, y_np_2, atol=1e-4)
+
   def test_rnn_cell_with_constants_layer(self):
 
     class RNNCellWithConstants(keras.layers.Layer):
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
index 126ce8db6a..a71a59e269 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
@@ -398,7 +398,7 @@ tf_module {
   }
   member_method {
     name: "rnn"
-    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\'], "
+    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "round"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
index 2b6e8af11d..68b6678d48 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -86,7 +86,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cell\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'cell\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'time_major\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
index 126ce8db6a..a71a59e269 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
@@ -398,7 +398,7 @@ tf_module {
   }
   member_method {
     name: "rnn"
-    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\'], "
+    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "round"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
index 2b6e8af11d..68b6678d48 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -86,7 +86,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cell\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'cell\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'time_major\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
-- 
GitLab


From 41e97007638ef41764b1da86fb2de772f35762e5 Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Tue, 2 Oct 2018 17:00:46 -0700
Subject: [PATCH 315/570] Disable XLA from raspberry pi builds.

There is no known conceptual reason we can't use XLA, but in practice
we have some build issues that will need to be fixed.

PiperOrigin-RevId: 215484942
---
 tensorflow/tools/ci_build/pi/build_raspberry_pi.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index 3d27e84b81..864278c647 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -34,6 +34,8 @@ set -e
 #
 # Make sure you have an up to date version of the Bazel build tool installed too.
 
+export TF_ENABLE_XLA=0
+
 yes '' | ./configure
 
 # Fix for curl build problem in 32-bit, see https://stackoverflow.com/questions/35181744/size-of-array-curl-rule-01-is-negative
-- 
GitLab


From e4188461aee1d614a14f17fe2abaf2a9a94886d9 Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Tue, 2 Oct 2018 17:02:30 -0700
Subject: [PATCH 316/570] Add missing `import unittest` to
 control_flow_ops_py_test.py

PiperOrigin-RevId: 215485333
---
 tensorflow/python/kernel_tests/control_flow_ops_py_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 655fece5ff..07ec859766 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 import collections
 import math
 import time
+import unittest
 
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
-- 
GitLab


From 22919770355b1b7d8f4c5a20327898e881aa11cb Mon Sep 17 00:00:00 2001
From: Jeremy Lau <lauj@google.com>
Date: Tue, 2 Oct 2018 17:09:45 -0700
Subject: [PATCH 317/570] Pin wheel=0.31.1 to work around issue
 https://github.com/pypa/auditwheel/issues/102

PiperOrigin-RevId: 215486669
---
 tensorflow/tools/ci_build/install/install_pip_packages.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index b90f3f3b97..7f293e8604 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -24,8 +24,10 @@ easy_install3 -U pip==9.0.3
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
 
-pip2 install wheel
-pip3 install wheel
+# Pin wheel==0.31.1 to work around issue
+# https://github.com/pypa/auditwheel/issues/102
+pip2 install wheel==0.31.1
+pip3 install wheel==0.31.1
 
 pip2 install virtualenv
 pip3 install virtualenv
-- 
GitLab


From 80821abd6410f47130fc031b15e9ac220de5b1b9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 17:16:05 -0700
Subject: [PATCH 318/570] Make RemoveTrivialPassthrough preserve minmax-related
 info

PiperOrigin-RevId: 215487633
---
 .../remove_trivial_passthrough.cc             | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
index fc49fbda59..d5983a1f12 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
@@ -29,20 +29,34 @@ namespace {
 // array instead. from_array is assumed to be discardable, and consequently
 // this only updates operator edges (since discardable arrays only
 // appear there, and not e.g. in model flags).
-void RerouteEdges(const string& from_array, const string& to_array,
-                  Model* model) {
+void Reroute(const string& from, const string& to, Model* model) {
   for (const auto& op : model->operators) {
     for (auto& output : op->outputs) {
-      if (output == from_array) {
-        output = to_array;
+      if (output == from) {
+        output = to;
       }
     }
     for (auto& input : op->inputs) {
-      if (input == from_array) {
-        input = to_array;
+      if (input == from) {
+        input = to;
       }
     }
   }
+  const Array& from_array = model->GetArray(from);
+  Array& to_array = model->GetOrCreateArray(to);
+  // Preserve minmax information if to_array didn't already have any.
+  if (from_array.minmax && !to_array.minmax) {
+    to_array.GetOrCreateMinMax() = from_array.GetMinMax();
+    // If we're copying minmax info, then we should also be copying
+    // narrow_range, which affects how minmax info is to be interpreted.
+    to_array.narrow_range = from_array.narrow_range;
+  }
+  // Separately, also preserve final_data_type if to_array didn't already
+  // have any.
+  if (from_array.final_data_type != ArrayDataType::kNone &&
+      to_array.final_data_type == ArrayDataType::kNone) {
+    to_array.final_data_type = from_array.final_data_type;
+  }
 }
 
 }  // namespace
@@ -90,14 +104,14 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
     transformation->AddMessageF(
         "Removing %s, keeping its non-constant input array %s and removing %s",
         LogName(*passthru_op), main_input_name, output_name);
-    RerouteEdges(output_name, main_input_name, model);
+    Reroute(output_name, main_input_name, model);
   } else if (IsDiscardableArray(*model, main_input_name) &&
              !IsConstantParameterArray(*model, main_input_name)) {
     transformation->AddMessageF(
         "Removing %s, keeping its output array %s and removing non-constant "
         "input %s",
         LogName(*passthru_op), output_name, main_input_name);
-    RerouteEdges(main_input_name, output_name, model);
+    Reroute(main_input_name, output_name, model);
   } else {
     transformation->AddMessageF(
         "Cannot remove %s, neither its main input nor its output may be "
-- 
GitLab


From b7e9cbab27c893283acc4a6154d7a59dffb23758 Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Tue, 2 Oct 2018 17:48:25 -0700
Subject: [PATCH 319/570] Use `defun` instead of `Defun` for `tf.data`, except
 for `make_one_shot_iterator` which is to be deprecated in future.

PiperOrigin-RevId: 215491729
---
 .../contrib/distribute/python/input_ops.py    |  2 +-
 tensorflow/python/data/ops/dataset_ops.py     | 60 ++++++++-----------
 tensorflow/python/eager/function.py           | 14 +++++
 tensorflow/python/eager/function_test.py      |  9 ++-
 4 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/input_ops.py b/tensorflow/contrib/distribute/python/input_ops.py
index f07ec8234d..423952c9e2 100644
--- a/tensorflow/contrib/distribute/python/input_ops.py
+++ b/tensorflow/contrib/distribute/python/input_ops.py
@@ -78,7 +78,7 @@ def auto_shard_dataset(dataset, num_shards, index):
       elif hasattr(dataset, "_map_func"):
         # TODO(priyag): Make this check more robust by enforcing some common
         # property on all map/flatmap/interleave datasets.
-        map_func_def = dataset._map_func.definition
+        map_func_def = dataset._map_func.function_def
         for node in map_func_def.node_def:
           if node.op in _READER_DATASET_OPS:
             found_reader_op = True
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 46ce191f7b..d90da5908d 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -30,6 +30,7 @@ from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import random_seed
 from tensorflow.python.data.util import sparse
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
@@ -37,6 +38,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -1713,7 +1715,8 @@ class _VariantDataset(Dataset):
 
 
 class StructuredFunctionWrapper(object):
-  """A wrapper for `Defun` that supports structured arguments and return values.
+  """A wrapper for `defun` that supports structured arguments and return values.
+
   """
 
   def __init__(self, func, transformation_name, dataset=None,
@@ -1765,7 +1768,7 @@ class StructuredFunctionWrapper(object):
     # TODO(b/110122868): Enable this support for all `tf.data` functions.
     self._nested_dataset_support = experimental_nested_dataset_support
 
-    @function.Defun(*self._defun_args())
+    @eager_function.defun(input_signature=self._defun_args())
     def tf_data_structured_function_wrapper(*args):
       """Wrapper for passing nested structures to and from tf.data functions."""
       flat_args = []
@@ -1850,36 +1853,43 @@ class StructuredFunctionWrapper(object):
       self._output_shapes = nest.pack_sequence_as(ret, flat_shapes)
       self._output_types = nest.pack_sequence_as(ret, flat_types)
 
-      _warn_if_collections(transformation_name)
-
       return flat_ret
 
-    self._function = tf_data_structured_function_wrapper
+    table_initializers_len = len(ops.get_default_graph().get_collection(
+        ops.GraphKeys.TABLE_INITIALIZERS))
+
+    self._function = tf_data_structured_function_wrapper.get_concrete_function()
     if add_to_graph:
       self._function.add_to_graph(ops.get_default_graph())
-    else:
-      # Use the private method that will execute
-      # `tf_data_structured_function_wrapper` but delay adding it to the graph
-      # in case (e.g.) we need to rerun the function.
-      self._function._create_definition_if_needed()  # pylint: disable=protected-access
+    if len(
+        self._function.graph.get_collection(
+            ops.GraphKeys.TABLE_INITIALIZERS)) != table_initializers_len:
+      warnings.warn(
+          "Creating lookup tables inside a function passed to %s is not"
+          " supported. Create each table outside the function, and "
+          "capture it inside the function to use it." % transformation_name)
 
   def _defun_args(self):
-    """Returns a flat list of `tf.DType` for the input element structure."""
+    """Returns a list of `tf.TensorSpec` for the input element structure."""
     ret = []
-    for input_type, input_class in zip(nest.flatten(self._input_types),
-                                       nest.flatten(self._input_classes)):
+    for input_type, input_shape, input_class in zip(
+        nest.flatten(self._input_types), nest.flatten(self._input_shapes),
+        nest.flatten(self._input_classes)):
       # TODO(b/110122868): Add a registration mechanism for new component types.
       if input_class is sparse_tensor_lib.SparseTensor:
-        ret.append(dtypes.variant)
+        ret.append(
+            tensor_spec.TensorSpec(
+                tensor_shape.TensorShape(None), dtypes.variant))
       elif isinstance(input_class, _NestedDatasetComponent):
         if not self._nested_dataset_support:
           raise NotImplementedError(
               "The %s transformation does not currently support nested "
               "datasets as inputs." % self._transformation_name)
-        ret.append(dtypes.variant)
+        ret.append(
+            tensor_spec.TensorSpec(tensor_shape.scalar(), dtypes.variant))
       else:
         assert isinstance(input_type, dtypes.DType)
-        ret.append(input_type)
+        ret.append(tensor_spec.TensorSpec(input_shape, input_type))
     return ret
 
   @property
@@ -2579,24 +2589,6 @@ def _should_unpack_args(args):
   return type(args) is tuple  # pylint: disable=unidiomatic-typecheck
 
 
-def _warn_if_collections(transformation_name):
-  """Prints warning message if the current graph uses common graph collections.
-
-  NOTE(mrry): Currently a warning is only generated for lookup tables. Any
-  variables created will be automatically hoisted out to the outermost scope
-  using `init_scope()`. Some collections (such as for control-flow contexts)
-  are benign and should not generate a warning.
-
-  Args:
-    transformation_name: A human-readable name for the transformation.
-  """
-  if ops.get_default_graph().get_collection(ops.GraphKeys.TABLE_INITIALIZERS):
-    warnings.warn("Creating lookup tables inside a function passed to %s is not"
-                  " supported. Create each table outside the function, and "
-                  "capture it inside the function to use it."
-                  % transformation_name)
-
-
 class MapDataset(UnaryDataset):
   """A `Dataset` that maps a function over elements in its input."""
 
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index f261d92d64..aeb1cac3e9 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -662,6 +662,11 @@ class Function(object):
     outputs = self._inference_function.call(ctx, args)
     return self._build_call_outputs(outputs)
 
+  @property
+  def name(self):
+    """Function name."""
+    return self._inference_function.name
+
   @property
   def graph(self):
     """Returns the graph from which this function was constructed."""
@@ -719,6 +724,10 @@ class Function(object):
     return nest.map_structure(lambda x: x.dtype if x is not None else None,
                               self._func_graph.structured_outputs)
 
+  def add_to_graph(self, g):
+    """Adds this function into the graph g."""
+    return self._inference_function.add_to_graph(g)
+
   def _construct_backprop_function(self):
     """Constructs the backprop function object for this function."""
     backwards_graph = FuncGraph(_backward_name(self._func_graph.name))
@@ -1122,6 +1131,8 @@ class PolymorphicFunction(object):
       *args: inputs to specialize on.
       **kwargs: inputs to specialize on.
     """
+    if self._input_signature:
+      args, kwargs = None, None
     graph_function, _ = self._maybe_define_function(args, kwargs)
     return graph_function
 
@@ -1304,6 +1315,9 @@ def register(func, *args, **kwargs):
   function definition into graph. Register function with different input param
   will result into multiple version of functions registered in graph.
 
+  Also, `args` and `kwargs` are ignored if this `PolymorphicFunction` was
+  created with an `input_signature`.
+
   Args:
     func: the PolymorphicFunction instance that generated by a @defun
     *args: input arguments for the Python function.
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 9ce367a837..ac45606eb0 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -1750,11 +1750,10 @@ class FunctionTest(test.TestCase):
         # pylint: disable=protected-access
         self.assertEqual(len(graph._functions), 3)
 
-        # Test input param shape mismatch
-        t2 = constant_op.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
-        with self.assertRaisesRegexp(
-            ValueError, 'Python inputs incompatible with input_signature'):
-          function.register(defun_matmul, t2, t2)
+        # Test register function with cache, note inputs are ignored.
+        function.register(defun_matmul)
+        graph = ops.get_default_graph()
+        self.assertEqual(len(graph._functions), 3)
 
   def testRegisterFunctionWithCache(self):
     def matmul(x, y):
-- 
GitLab


From 9f7a138640408cea58698a432fd1596cf436b484 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Tue, 2 Oct 2018 17:57:49 -0700
Subject: [PATCH 320/570] Set shape for output tensors of cond_v2.

PiperOrigin-RevId: 215492782
---
 tensorflow/core/ops/functional_ops.cc         | 21 ++++++++++++++++++-
 .../kernel_tests/control_flow_ops_py_test.py  |  7 +++++++
 tensorflow/python/ops/cond_v2_impl.py         | 20 +++++++++++++++---
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index fed3fa22ed..22b4b07eff 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -110,8 +110,27 @@ REGISTER_OP("If")
     .Attr("Tout: list(type) >= 0")
     .Attr("then_branch: func")
     .Attr("else_branch: func")
+    .Attr("output_shapes: list(shape) = []")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      std::vector<PartialTensorShape> output_shapes;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+      // If `output_shapes` attr is set use that as the shapes of the outputs
+      // else return unknown shapes.
+      if (output_shapes.empty()) return shape_inference::UnknownShape(c);
+      if (output_shapes.size() != c->num_outputs()) {
+        return errors::InvalidArgument(
+            "`output_shapes` must be the same length as num outputs (",
+            output_shapes.size(), " vs. ", c->num_outputs());
+      }
+      for (size_t i = 0; i < output_shapes.size(); ++i) {
+        shape_inference::ShapeHandle output_shape_handle;
+        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+            output_shapes[i], &output_shape_handle));
+        c->set_output(static_cast<int>(i), output_shape_handle);
+      }
+      return Status::OK();
+    });
 
 // TODO(drpng): remove this.
 REGISTER_OP("_While")
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 07ec859766..a1be77601c 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -351,6 +351,13 @@ class ControlFlowTest(test.TestCase):
     grad = gradients_impl.gradients(y, [v])
     self.assertAllEqual([None], grad)
 
+  def testCondOutputShape(self):
+    x = constant_op.constant(1.0)
+    b = control_flow_ops.cond(
+        constant_op.constant(True), lambda: math_ops.square(x),
+        lambda: math_ops.subtract(x, 1.))
+    self.assertEqual(b.shape, tensor_shape.scalar())
+
   def testFetchable(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
diff --git a/tensorflow/python/ops/cond_v2_impl.py b/tensorflow/python/ops/cond_v2_impl.py
index f8b1ddb140..195ad11c71 100644
--- a/tensorflow/python/ops/cond_v2_impl.py
+++ b/tensorflow/python/ops/cond_v2_impl.py
@@ -96,9 +96,12 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
 
     # Create the If op.
     tensors = gen_functional_ops._if(  # pylint: disable=protected-access
-        pred, cond_inputs, [t.dtype for t in true_graph.outputs],
+        pred,
+        cond_inputs, [t.dtype for t in true_graph.outputs],
         _create_new_tf_function(true_graph),
         _create_new_tf_function(false_graph),
+        output_shapes=_get_output_shapes(true_graph.outputs,
+                                         false_graph.outputs),
         name=scope)
 
     # Set the flag to enable lowering on the `if` op if necessary
@@ -175,9 +178,12 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
 
   # Create the gradient If op.
   tensors = gen_functional_ops._if(
-      op.inputs[0], grad_inputs, [t.dtype for t in true_grad_graph.outputs],
+      op.inputs[0],
+      grad_inputs, [t.dtype for t in true_grad_graph.outputs],
       _create_new_tf_function(true_grad_graph),
-      _create_new_tf_function(false_grad_graph))
+      _create_new_tf_function(false_grad_graph),
+      output_shapes=_get_output_shapes(true_grad_graph.outputs,
+                                       false_grad_graph.outputs))
 
   # The predicate has no gradient.
   return [None] + tensors[:num_grad_outputs]
@@ -480,6 +486,14 @@ def _check_same_outputs(true_graph, false_graph):
         "  false_fn: %s" % (true_output_types, false_output_types))
 
 
+def _get_output_shapes(true_graph_outputs, false_graph_outputs):
+  output_shapes = [
+      t_out.shape.most_specific_compatible_shape(f_out.shape)
+      for t_out, f_out in zip(true_graph_outputs, false_graph_outputs)
+  ]
+  return output_shapes
+
+
 def _is_ancestor(graph, maybe_ancestor):
   if maybe_ancestor == graph:
     return True
-- 
GitLab


From 05bc6c6762d5a58bacd585e9243133bf0378515f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 18:10:46 -0700
Subject: [PATCH 321/570] Remove initial accumulator (and other auxiliary
 parameter) values from optimization parameter protos and removed uses of that
 functionality in tests.

PiperOrigin-RevId: 215494433
---
 .../tpu/proto/optimization_parameters.proto     | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
index 8529b48c15..b9e0747fa4 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
@@ -28,7 +28,6 @@ message LearningRate {
 // https://www.tensorflow.org/api_docs/python/tf/train/AdagradOptimizer
 // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L151
 message AdagradParameters {
-  float initial_accumulator = 1;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer
@@ -42,8 +41,6 @@ message FtrlParameters {
   float l1 = 1;
   float l2 = 2;
   float lr_power = 3;
-  float initial_accum = 4;
-  float initial_linear = 5;
 }
 
 // The Adam optimizer does not implement hyper-parameter update; use the dynamic
@@ -70,8 +67,6 @@ message AdamParameters {
   float beta1 = 3;
   float beta2 = 4;
   float epsilon = 5;
-  float initial_m = 6;
-  float initial_v = 7;
   bool use_non_lazy_adam = 8;
   bool use_max_with_epsilon = 9;
 }
@@ -81,7 +76,6 @@ message AdamParameters {
 message MomentumParameters {
   float momentum = 1;
   bool use_nesterov = 2;
-  float initial_accum = 3;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
@@ -90,8 +84,6 @@ message RmsPropParameters {
   float rho = 1;
   float momentum = 2;
   float epsilon = 3;
-  float initial_ms = 4;
-  float initial_mom = 5;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
@@ -100,9 +92,6 @@ message CenteredRmsPropParameters {
   float rho = 1;
   float momentum = 2;
   float epsilon = 3;
-  float initial_ms = 4;
-  float initial_mom = 5;
-  float initial_mg = 6;
 }
 
 // Variant of algorithm in http://proceedings.mlr.press/v44/shamir15.pdf
@@ -119,9 +108,6 @@ message MdlAdagradLightParameters {
   float mdl_hard_limit = 10;
   bool hard_limit_min_benefit = 11;
   bool mdl_regularize = 12;
-  float initial_accumulator = 13;
-  float initial_weight = 14;
-  float initial_benefit = 15;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
@@ -129,8 +115,6 @@ message MdlAdagradLightParameters {
 message AdadeltaParameters {
   float rho = 1;
   float epsilon = 2;
-  float initial_accumulator = 3;
-  float initial_update = 4;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
@@ -138,7 +122,6 @@ message AdadeltaParameters {
 message ProximalAdagradParameters {
   float l1 = 1;
   float l2 = 2;
-  float initial_accumulator = 3;
 }
 
 message OptimizationParameters {
-- 
GitLab


From f8ba42b0ab0bb19af0e4a930b95e7e7b3d2f557e Mon Sep 17 00:00:00 2001
From: Tim Shen <timshen@google.com>
Date: Tue, 2 Oct 2018 18:38:24 -0700
Subject: [PATCH 322/570] Disable the cuDNN workarounds if the version number
 is new enough to get the corresponding bugs fixed. The bugs that were
 work-arounded were fixed and verified.

PiperOrigin-RevId: 215497418
---
 tensorflow/stream_executor/cuda/cuda_dnn.cc | 104 ++++++++++----------
 1 file changed, 54 insertions(+), 50 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index ca90c383f9..df8538a4b8 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -2487,30 +2487,32 @@ port::Status CudnnSupport::DoConvolveImpl(
 
   // Report an error if we might be hitting a cuDNN bug that accesses illegal
   // memory. See nvbugs/2138754, b/80018418.
-  SE_RETURN_IF_ERROR([&] {
-    if (algo_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) {
-      return port::Status::OK();
-    }
-    if (input_descriptor.ndims() < 3) {
-      return port::Status::OK();
-    }
-    // Checks that a*b is within the valid range (as provided by NVIDIA).
-    auto check_sizes = [](size_t a, size_t b) {
-      if ((a * b * 4608 - 1) >> 31 == 0) {
+  if (CUDNN_VERSION < 7300) {
+    SE_RETURN_IF_ERROR([&] {
+      if (algo_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) {
         return port::Status::OK();
       }
-      return port::Status(
-          port::error::FAILED_PRECONDITION,
-          "This configuration potentially accesses illegal memory.");
-    };
-    SE_RETURN_IF_ERROR(check_sizes(input_descriptor.feature_map_count(),
-                                   output_descriptor.feature_map_count()));
-    SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(),
-                                   input_descriptor.feature_map_count()));
-    SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(),
-                                   output_descriptor.feature_map_count()));
-    return port::Status::OK();
-  }());
+      if (input_descriptor.ndims() < 3) {
+        return port::Status::OK();
+      }
+      // Checks that a*b is within the valid range (as provided by NVIDIA).
+      auto check_sizes = [](size_t a, size_t b) {
+        if ((a * b * 4608 - 1) >> 31 == 0) {
+          return port::Status::OK();
+        }
+        return port::Status(
+            port::error::FAILED_PRECONDITION,
+            "This configuration potentially accesses illegal memory.");
+      };
+      SE_RETURN_IF_ERROR(check_sizes(input_descriptor.feature_map_count(),
+                                     output_descriptor.feature_map_count()));
+      SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(),
+                                     input_descriptor.feature_map_count()));
+      SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(),
+                                     output_descriptor.feature_map_count()));
+      return port::Status::OK();
+    }());
+  }
 
   if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
       !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
@@ -3166,7 +3168,7 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
 
   // Cudnn 7.1.4 has a bug if the workspace of the following convolution is not
   // zero-initialized, nvbugs/2254619.
-  if (CUDNN_VERSION >= 7000 &&
+  if (CUDNN_VERSION >= 7000 && CUDNN_VERSION < 7300 &&
       algorithm_config.algorithm().algo_id() ==
           CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 &&
       cudnn_type == CUDNN_DATA_HALF &&
@@ -3317,31 +3319,33 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
 
   // Report an error if we might be hitting a cuDNN bug that produces incorrect
   // results. See nvbugs/2072856
-  SE_RETURN_IF_ERROR([&] {
-    if (algo_desc.algo_id() != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) {
-      return port::Status::OK();
-    }
-    if (output_descriptor.height() > 1 && output_descriptor.width() > 1) {
-      return port::Status::OK();
-    }
-    int convolution_size = output_descriptor.height() > 1
-                               ? filter_descriptor.input_filter_height()
-                               : filter_descriptor.input_filter_width();
-    if (convolution_size <= 32) {
-      return port::Status::OK();
-    }
-    cudnnConvolutionMode_t convolution_mode;
-    cudnnDataType_t compute_type;
-    RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor(
-        conv.handle(), 0, nullptr, nullptr, nullptr, nullptr, &convolution_mode,
-        &compute_type));
-    if (convolution_mode != CUDNN_CONVOLUTION) {
-      return port::Status::OK();
-    }
-    return port::Status(
-        port::error::FAILED_PRECONDITION,
-        "This configuration potentially produces incorrect results.");
-  }());
+  if (CUDNN_VERSION < 7300) {
+    SE_RETURN_IF_ERROR([&] {
+      if (algo_desc.algo_id() != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) {
+        return port::Status::OK();
+      }
+      if (output_descriptor.height() > 1 && output_descriptor.width() > 1) {
+        return port::Status::OK();
+      }
+      int convolution_size = output_descriptor.height() > 1
+                                 ? filter_descriptor.input_filter_height()
+                                 : filter_descriptor.input_filter_width();
+      if (convolution_size <= 32) {
+        return port::Status::OK();
+      }
+      cudnnConvolutionMode_t convolution_mode;
+      cudnnDataType_t compute_type;
+      RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor(
+          conv.handle(), 0, nullptr, nullptr, nullptr, nullptr,
+          &convolution_mode, &compute_type));
+      if (convolution_mode != CUDNN_CONVOLUTION) {
+        return port::Status::OK();
+      }
+      return port::Status(
+          port::error::FAILED_PRECONDITION,
+          "This configuration potentially produces incorrect results.");
+    }());
+  }
 
   if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
       !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
@@ -3357,8 +3361,8 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
   // This wrong result caused by the bug is very flaky. It needs to be run for
   // up to 20 times to produce a mismatch.
   //
-  // TODO(timshen): add a nvbugs link.
-  if (CUDNN_VERSION >= 7100 &&
+  // See nvbugs/2379553.
+  if (CUDNN_VERSION >= 7100 && CUDNN_VERSION < 7300 &&
       algorithm_config.algorithm().algo_id() ==
           CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 &&
       cudnn_type == CUDNN_DATA_HALF &&
-- 
GitLab


From 8dc7bc7764150253c03a666eee84fc48f867d6a2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 19:13:14 -0700
Subject: [PATCH 323/570] In all constant-propagation transformations, check
 that the array we'd be turning into a constant is a discardable array. If
 it's not discardable, it means that the user wants this array to keep
 existing in a way that is observable to them, i.e. not as weights.

Typical example: a Fill op outputs an array that is passed as a RNN state array (non-discardable).
It seems that so far we have been relying on accidental ordering of graph transformations for such state
arrays not to be accidentally turned into constants. Instead, the desired graph transformation here is
RemoveUnusedOp noticing that such a Fill can be discarded since its output is a RNN state array.

So I don't have a test for this, but this seems to be tightening existing behavior, and should be good
to have as long as it does not regress anything.

PiperOrigin-RevId: 215500760
---
 .../toco/graph_transformations/resolve_constant_binary.cc | 8 ++++++++
 .../resolve_constant_concatenation.cc                     | 7 +++++++
 .../graph_transformations/resolve_constant_fake_quant.cc  | 7 +++++++
 .../toco/graph_transformations/resolve_constant_fill.cc   | 7 +++++++
 .../toco/graph_transformations/resolve_constant_gather.cc | 8 ++++++++
 .../toco/graph_transformations/resolve_constant_pack.cc   | 8 ++++++++
 .../resolve_constant_random_uniform.cc                    | 7 +++++++
 .../toco/graph_transformations/resolve_constant_range.cc  | 8 ++++++++
 .../graph_transformations/resolve_constant_reshape.cc     | 7 +++++++
 .../toco/graph_transformations/resolve_constant_select.cc | 8 ++++++++
 .../resolve_constant_shape_or_rank.cc                     | 8 ++++++++
 .../toco/graph_transformations/resolve_constant_slice.cc  | 8 ++++++++
 .../resolve_constant_strided_slice.cc                     | 8 ++++++++
 .../toco/graph_transformations/resolve_constant_tile.cc   | 7 +++++++
 .../graph_transformations/resolve_constant_transpose.cc   | 8 ++++++++
 .../toco/graph_transformations/resolve_constant_unary.cc  | 8 ++++++++
 16 files changed, 122 insertions(+)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
index f7e5aa6609..3e57d3f467 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
@@ -191,6 +191,14 @@ void EvaluateBinaryOperatorOnConstantInputs(Model* model,
 bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
   const auto binary_it = model->operators.begin() + op_index;
   const auto* binary_op = binary_it->get();
+
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, binary_op->outputs[0])) {
+    return false;
+  }
+
   // Test for binary ops of types that we know how to resolve
   if (binary_op->type != OperatorType::kAdd &&
       binary_op->type != OperatorType::kMul &&
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index d916ae0ddf..c6c5035a51 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -144,6 +144,13 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
   const auto* concat_op =
       static_cast<const ConcatenationOperator*>(concat_base_op);
 
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, concat_op->outputs[0])) {
+    return false;
+  }
+
   for (const string& input_name : concat_op->inputs) {
     // We only expect constant unquantized arrays as input, otherwise we return.
     // We  also make sure the shapes of the input arrays are known and they are
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
index f5f2f77460..3d797533c9 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
@@ -69,6 +69,13 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
   const auto* fakequant_op =
       static_cast<const FakeQuantOperator*>(fakequant_base_op);
 
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, fakequant_op->outputs[0])) {
+    return false;
+  }
+
   // Yield until the fakequant MinMax has been resolved.
   if (!fakequant_op->minmax) {
     return false;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
index f6f95481b5..2cb1e64f3a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
@@ -52,6 +52,13 @@ bool ResolveConstantFill::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
 
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    return false;
+  }
+
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
index 36d7dad0ce..4dfe203a25 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
@@ -71,6 +71,14 @@ bool ResolveConstantGather::Run(Model* model, std::size_t op_index) {
 
   CHECK_GE(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
+
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    return false;
+  }
+
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_pack.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_pack.cc
index e86616574d..6f44025dd4 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_pack.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_pack.cc
@@ -59,6 +59,14 @@ bool ResolveConstantPack::Run(Model* model, std::size_t op_index) {
 
   CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
+
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    return false;
+  }
+
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
index 88d06d7dc7..c9f2b95d09 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
@@ -70,6 +70,13 @@ bool ResolveConstantRandomUniform::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
 
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    return false;
+  }
+
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
index 1a0ba9e2bc..e347286dd4 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
@@ -28,6 +28,14 @@ bool ResolveConstantRange::Run(Model* model, std::size_t op_index) {
   auto* op = static_cast<RangeOperator*>(base_op);
 
   CHECK_EQ(op->inputs.size(), 3);
+
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    return false;
+  }
+
   const auto& start_array = model->GetArray(op->inputs[0]);
   if (!start_array.has_shape()) {
     // Yield until all input dims have been resolved.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
index a6f665b5f0..bfdaa8aafd 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
@@ -33,6 +33,13 @@ bool ResolveConstantReshape::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
 
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    return false;
+  }
+
   // We require constant inputs.
   if (!IsConstantParameterArray(*model, op->inputs[0]) ||
       !IsConstantParameterArray(*model, op->inputs[1])) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc
index e880a3f44d..3a95d39cd4 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc
@@ -37,6 +37,14 @@ bool ResolveConstantSelect::Run(Model* model, std::size_t op_index) {
 
   CHECK_GE(op->inputs.size(), 3);
   CHECK_EQ(op->outputs.size(), 1);
+
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    return false;
+  }
+
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
index 8a0e3e8995..452bef1f16 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
@@ -27,6 +27,14 @@ bool ResolveConstantShapeOrRank::Run(Model* model, std::size_t op_index) {
   }
 
   CHECK_EQ(op->outputs.size(), 1);
+
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    return false;
+  }
+
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been resolved
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc
index b35c3e19c4..58d6797e1c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc
@@ -96,6 +96,14 @@ bool ResolveConstantSlice::Run(Model* model, std::size_t op_index) {
   const SliceOperator* op = static_cast<const SliceOperator*>(base_op);
 
   CHECK_EQ(op->outputs.size(), 1);
+
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    return false;
+  }
+
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
index 8853ed87e6..e275447a0c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
@@ -114,6 +114,14 @@ bool ResolveConstantStridedSlice::Run(Model* model, std::size_t op_index) {
       static_cast<const StridedSliceOperator*>(base_op);
 
   CHECK_EQ(op->outputs.size(), 1);
+
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    return false;
+  }
+
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
index 5cfa1a5582..378a38f14b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
@@ -105,6 +105,13 @@ bool ResolveConstantTile::Run(Model* model, std::size_t op_index) {
   }
   const auto* op = static_cast<const TensorFlowTileOperator*>(base_op);
 
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    return false;
+  }
+
   CHECK_GE(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
   auto& output_array = model->GetArray(op->outputs[0]);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
index fe15dfa06f..5d3f4a6240 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
@@ -111,6 +111,14 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
 
   CHECK_EQ(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
+
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    return false;
+  }
+
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index c698a9567a..e35ed0898b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -48,6 +48,14 @@ bool CopyMinMaxFromFirstInput(const Operator& op, Model* model) {
 bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
   const auto unary_it = model->operators.begin() + op_index;
   const auto* unary_op = unary_it->get();
+
+  // If the output of this op is a non-discardable array such as an input_array
+  // or a state array of the model, then this is a job for RemoveUnusedOp, not
+  // for constants-propagation.
+  if (!IsDiscardableArray(*model, unary_op->outputs[0])) {
+    return false;
+  }
+
   // Test for unary ops of types that we know how to resolve.
   switch (unary_op->type) {
     case OperatorType::kCast:
-- 
GitLab


From fa61b939bec50d731b86f40c79054503d629e29b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 19:28:27 -0700
Subject: [PATCH 324/570] [XLA] Merge the single instruction parsing and the
 full module parsing in one function.

PiperOrigin-RevId: 215501702
---
 tensorflow/compiler/xla/service/hlo_parser.cc | 66 ++++++++++---------
 tensorflow/compiler/xla/service/hlo_parser.h  |  6 --
 .../compiler/xla/service/hlo_parser_test.cc   | 22 +++----
 3 files changed, 45 insertions(+), 49 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 5a125b4c08..0440f1b54f 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -68,7 +68,7 @@ class HloParser {
 
   // Runs the parser and constructs the resulting HLO in the given (empty)
   // HloModule. Returns false if an error occurred.
-  bool Run(HloModule* module);
+  Status Run(HloModule* module);
 
   // Returns the error information.
   string GetError() const { return StrJoin(error_, "\n"); }
@@ -79,9 +79,6 @@ class HloParser {
   StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbersOnly();
   StatusOr<PaddingConfig> ParsePaddingConfigOnly();
 
-  // Stand-alone parsing utility for a single instruction worth of text.
-  Status ParseSingleInstruction(HloModule* module);
-
  private:
   using InstrNameTable =
       std::unordered_map<string, std::pair<HloInstruction*, LocTy>>;
@@ -100,8 +97,12 @@ class HloParser {
   std::pair<HloInstruction*, LocTy>* FindInstruction(
       const string& name, const optional<Shape>& shape = nullopt);
 
+  // Parse a single instruction worth of text.
+  bool ParseSingleInstruction(HloModule* module);
+
   // ParseXXX returns false if an error occurred.
   bool ParseHloModule(HloModule* module);
+
   bool ParseComputations(HloModule* module);
   bool ParseComputation(HloComputation** entry_computation);
   bool ParseInstructionList(HloComputation** computation,
@@ -376,9 +377,25 @@ bool HloParser::TokenError(absl::string_view msg) {
   return Error(lexer_.GetLoc(), msg);
 }
 
-bool HloParser::Run(HloModule* module) {
+Status HloParser::Run(HloModule* module) {
   lexer_.Lex();
-  return ParseHloModule(module);
+  if (lexer_.GetKind() == TokKind::kw_HloModule) {
+    // This means that the text contains a full HLO module.
+    if (!ParseHloModule(module)) {
+      return InvalidArgument(
+          "Syntax error when trying to parse the text as a HloModule:\n%s",
+          GetError());
+    }
+    return Status::OK();
+  }
+  // This means that the text is a single HLO instruction.
+  if (!ParseSingleInstruction(module)) {
+    return InvalidArgument(
+        "Syntax error when trying to parse the text as single "
+        "HloInstruction:\n%s",
+        GetError());
+  }
+  return Status::OK();
 }
 
 std::pair<HloInstruction*, HloParser::LocTy>* HloParser::FindInstruction(
@@ -3279,9 +3296,11 @@ StatusOr<PaddingConfig> HloParser::ParsePaddingConfigOnly() {
   return padding_config;
 }
 
-Status HloParser::ParseSingleInstruction(HloModule* module) {
-  TF_RET_CHECK(create_missing_instruction_ == nullptr);
-  TF_RET_CHECK(scoped_name_tables_.empty());
+bool HloParser::ParseSingleInstruction(HloModule* module) {
+  if (create_missing_instruction_ != nullptr || !scoped_name_tables_.empty()) {
+    LOG(FATAL) << "Parser state is not clean. Please do not call any other "
+                  "methods before calling ParseSingleInstruction.";
+  }
   HloComputation::Builder builder(module->name());
 
   // The missing instruction hook we register creates the shaped instruction on
@@ -3298,9 +3317,6 @@ Status HloParser::ParseSingleInstruction(HloModule* module) {
     return tensorflow::gtl::FindOrNull(current_name_table(), new_name);
   };
 
-  // Prime the lexer.
-  lexer_.Lex();
-
   // Parse the instruction with the registered hook.
   Scope scope(&scoped_name_tables_);
   if (CanBeShape()) {
@@ -3309,7 +3325,7 @@ Status HloParser::ParseSingleInstruction(HloModule* module) {
     //
     //  f32[10] fusion(...), calls={...}
     if (!ParseInstruciontRhs(&builder, module->name(), lexer_.GetLoc())) {
-      return InvalidArgument("Syntax error:\n%s", GetError());
+      return false;
     }
   } else {
     // This means that the instruction's left-hand side might exist, e.g.
@@ -3317,7 +3333,7 @@ Status HloParser::ParseSingleInstruction(HloModule* module) {
     //  foo = f32[10] fusion(...), calls={...}
     string root_name;
     if (!ParseInstruction(&builder, &root_name)) {
-      return InvalidArgument("Syntax error:\n%s", GetError());
+      return false;
     }
   }
 
@@ -3325,7 +3341,7 @@ Status HloParser::ParseSingleInstruction(HloModule* module) {
   for (auto& comp : computations_) {
     module->AddEmbeddedComputation(std::move(comp));
   }
-  return Status::OK();
+  return true;
 }
 
 }  // namespace
@@ -3334,38 +3350,24 @@ StatusOr<std::unique_ptr<HloModule>> ParseHloString(
     absl::string_view str, const HloModuleConfig& config) {
   auto module = absl::make_unique<HloModule>(/*name=*/"", config);
   HloParser parser(str);
-  if (!parser.Run(module.get())) {
-    return InvalidArgument("Syntax error:\n%s", parser.GetError());
-  }
+  TF_RETURN_IF_ERROR(parser.Run(module.get()));
   return std::move(module);
 }
 
 StatusOr<std::unique_ptr<HloModule>> ParseHloString(absl::string_view str) {
   auto module = absl::make_unique<HloModule>(/*name=*/"", HloModuleConfig());
   HloParser parser(str);
-  if (!parser.Run(module.get())) {
-    return InvalidArgument("Syntax error:\n%s", parser.GetError());
-  }
+  TF_RETURN_IF_ERROR(parser.Run(module.get()));
   return std::move(module);
 }
 
 Status ParseHloString(absl::string_view str, HloModule* module) {
   TF_RET_CHECK(module->computation_count() == 0);
   HloParser parser(str);
-  if (!parser.Run(module)) {
-    return InvalidArgument("Syntax error:\n%s", parser.GetError());
-  }
+  TF_RETURN_IF_ERROR(parser.Run(module));
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<HloModule>> ParseHloOpToModule(
-    absl::string_view str, absl::string_view name) {
-  HloParser parser(str);
-  auto module = absl::make_unique<HloModule>(string(name), HloModuleConfig());
-  TF_RETURN_IF_ERROR(parser.ParseSingleInstruction(module.get()));
-  return std::move(module);
-}
-
 StatusOr<HloSharding> ParseSharding(absl::string_view str) {
   HloParser parser(str);
   return parser.ParseShardingOnly();
diff --git a/tensorflow/compiler/xla/service/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h
index 97d6f0117e..81eeb9f13b 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.h
+++ b/tensorflow/compiler/xla/service/hlo_parser.h
@@ -40,12 +40,6 @@ StatusOr<std::unique_ptr<HloModule>> ParseHloString(
 // point to an empty module (no computations).
 Status ParseHloString(absl::string_view str, HloModule* module);
 
-// Parses the text for a single HLO instruction into an HLO module with an
-// entry computation that runs that instruction (with the same parameters) as
-// its root instruction.
-StatusOr<std::unique_ptr<HloModule>> ParseHloOpToModule(
-    absl::string_view str, absl::string_view name = "single_op");
-
 // Given a string in the HloModule::ToString() format, parses the string and
 // creates a HloModule with default config.
 StatusOr<std::unique_ptr<HloModule>> ParseHloString(absl::string_view str);
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index d10acf3814..b618510640 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -1835,7 +1835,7 @@ TEST(HloParserSingleOpTest, SingleOp) {
   const string text =
       "%multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %broadcast, "
       "f32[2,4]{1,0} %x)";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloOpToModule(text));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
@@ -1844,7 +1844,7 @@ TEST(HloParserSingleOpTest, SingleOp) {
 
 TEST(HloParserSingleOpTest, SingleOpNoShapeProducesError) {
   const string text = "multiply(f32[2,4]{1,0} %broadcast, f32[2,4]{1,0} %x)";
-  StatusOr<std::unique_ptr<HloModule>> module = ParseHloOpToModule(text);
+  StatusOr<std::unique_ptr<HloModule>> module = ParseHloString(text);
   ASSERT_TRUE(!module.status().ok());
   LOG(INFO) << "Status: " << module.status();
   EXPECT_THAT(module.status().ToString(),
@@ -1853,7 +1853,7 @@ TEST(HloParserSingleOpTest, SingleOpNoShapeProducesError) {
 
 TEST(HloParserSingleOpTest, SingleOpNoOperandShapesProducesError) {
   const string text = "%multiply = f32[2,4]{1,0} multiply(%broadcast, %x)";
-  StatusOr<std::unique_ptr<HloModule>> module = ParseHloOpToModule(text);
+  StatusOr<std::unique_ptr<HloModule>> module = ParseHloString(text);
   ASSERT_TRUE(!module.status().ok());
   LOG(INFO) << "Status: " << module.status();
   EXPECT_THAT(module.status().ToString(),
@@ -1863,7 +1863,7 @@ TEST(HloParserSingleOpTest, SingleOpNoOperandShapesProducesError) {
 TEST(HloParserSingleOpTest, SingleOpNoNames) {
   const string text =
       "%multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0}, f32[2,4]{1,0})";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloOpToModule(text));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
@@ -1872,7 +1872,7 @@ TEST(HloParserSingleOpTest, SingleOpNoNames) {
 
 TEST(HloParserSingleOpTest, CanonicalOp) {
   const string text = "f32[2,4]{1,0} multiply(f32[2,4]{1,0}, f32[2,4]{1,0})";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloOpToModule(text));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
@@ -1908,7 +1908,7 @@ TEST(HloParserSingleOpTest, CanonicalOpWithNested) {
   }
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloOpToModule(text));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_EQ(
@@ -1926,7 +1926,7 @@ TEST(HloParserSingleOpTest, SingleOpWithNested) {
   ROOT %subtract = f32[3,2,1,1]{3,2,1,0} subtract(f32[3,2,1,1]{3,2,1,0} %param_0, f32[3,2,1,1]{3,2,1,0} %broadcast)
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloOpToModule(text));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
@@ -1939,7 +1939,7 @@ TEST(HloParserSingleOpTest, SingleOpWithNested_DoesNotExist) {
 {
   result = f32[] add(f32[] x, f32[] y)
 })";
-  auto status = ParseHloOpToModule(text).status();
+  auto status = ParseHloString(text).status();
   ASSERT_FALSE(status.ok());
   EXPECT_THAT(status.error_message(),
               ::testing::HasSubstr("does not exist: x"));
@@ -1951,7 +1951,7 @@ TEST(HloParserSingleOpTest, SingleOpWithNested_NoLhs) {
 {
   f32[] add(f32[] x, f32[] y)
 })";
-  auto status = ParseHloOpToModule(text).status();
+  auto status = ParseHloString(text).status();
   ASSERT_FALSE(status.ok());
   EXPECT_THAT(status.error_message(), ::testing::HasSubstr("expects name"));
 }
@@ -1962,7 +1962,7 @@ TEST(HloParserSingleOpTest, SingleOpWithNested_NoOperandName) {
 {
   result = f32[] add(f32[], f32[])
 })";
-  auto status = ParseHloOpToModule(text).status();
+  auto status = ParseHloString(text).status();
   ASSERT_FALSE(status.ok());
   EXPECT_THAT(status.error_message(), ::testing::HasSubstr("expects name"));
 }
@@ -1970,7 +1970,7 @@ TEST(HloParserSingleOpTest, SingleOpWithNested_NoOperandName) {
 TEST(HloParserSingleOpTest, ConvolutionTrivialFeatureGroupCount) {
   const string text =
       R"(%convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f)";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloOpToModule(text));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
-- 
GitLab


From 4b2d0180ba8c903f098f52eb9a12d26a7626dd34 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 19:28:31 -0700
Subject: [PATCH 325/570] Update ops-related pbtxt files.

PiperOrigin-RevId: 215501709
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 46 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 |  8 ++++
 2 files changed, 54 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index e46cbc863d..4845767405 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -27069,6 +27069,52 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Igamma"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 0e9f939ab4..229022b64c 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -13176,6 +13176,14 @@ op {
     name: "else_branch"
     type: "func"
   }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
   is_stateful: true
 }
 op {
-- 
GitLab


From 2597b883a14749c77fffd7e5f9677107021ff40a Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Tue, 2 Oct 2018 20:00:36 -0700
Subject: [PATCH 326/570] Automated rollback of commit
 b7e9cbab27c893283acc4a6154d7a59dffb23758

PiperOrigin-RevId: 215503549
---
 .../contrib/distribute/python/input_ops.py    |  2 +-
 tensorflow/python/data/ops/dataset_ops.py     | 60 +++++++++++--------
 tensorflow/python/eager/function.py           | 14 -----
 tensorflow/python/eager/function_test.py      |  9 +--
 4 files changed, 40 insertions(+), 45 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/input_ops.py b/tensorflow/contrib/distribute/python/input_ops.py
index 423952c9e2..f07ec8234d 100644
--- a/tensorflow/contrib/distribute/python/input_ops.py
+++ b/tensorflow/contrib/distribute/python/input_ops.py
@@ -78,7 +78,7 @@ def auto_shard_dataset(dataset, num_shards, index):
       elif hasattr(dataset, "_map_func"):
         # TODO(priyag): Make this check more robust by enforcing some common
         # property on all map/flatmap/interleave datasets.
-        map_func_def = dataset._map_func.function_def
+        map_func_def = dataset._map_func.definition
         for node in map_func_def.node_def:
           if node.op in _READER_DATASET_OPS:
             found_reader_op = True
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index d90da5908d..46ce191f7b 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -30,7 +30,6 @@ from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import random_seed
 from tensorflow.python.data.util import sparse
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
@@ -38,7 +37,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -1715,8 +1713,7 @@ class _VariantDataset(Dataset):
 
 
 class StructuredFunctionWrapper(object):
-  """A wrapper for `defun` that supports structured arguments and return values.
-
+  """A wrapper for `Defun` that supports structured arguments and return values.
   """
 
   def __init__(self, func, transformation_name, dataset=None,
@@ -1768,7 +1765,7 @@ class StructuredFunctionWrapper(object):
     # TODO(b/110122868): Enable this support for all `tf.data` functions.
     self._nested_dataset_support = experimental_nested_dataset_support
 
-    @eager_function.defun(input_signature=self._defun_args())
+    @function.Defun(*self._defun_args())
     def tf_data_structured_function_wrapper(*args):
       """Wrapper for passing nested structures to and from tf.data functions."""
       flat_args = []
@@ -1853,43 +1850,36 @@ class StructuredFunctionWrapper(object):
       self._output_shapes = nest.pack_sequence_as(ret, flat_shapes)
       self._output_types = nest.pack_sequence_as(ret, flat_types)
 
-      return flat_ret
+      _warn_if_collections(transformation_name)
 
-    table_initializers_len = len(ops.get_default_graph().get_collection(
-        ops.GraphKeys.TABLE_INITIALIZERS))
+      return flat_ret
 
-    self._function = tf_data_structured_function_wrapper.get_concrete_function()
+    self._function = tf_data_structured_function_wrapper
     if add_to_graph:
       self._function.add_to_graph(ops.get_default_graph())
-    if len(
-        self._function.graph.get_collection(
-            ops.GraphKeys.TABLE_INITIALIZERS)) != table_initializers_len:
-      warnings.warn(
-          "Creating lookup tables inside a function passed to %s is not"
-          " supported. Create each table outside the function, and "
-          "capture it inside the function to use it." % transformation_name)
+    else:
+      # Use the private method that will execute
+      # `tf_data_structured_function_wrapper` but delay adding it to the graph
+      # in case (e.g.) we need to rerun the function.
+      self._function._create_definition_if_needed()  # pylint: disable=protected-access
 
   def _defun_args(self):
-    """Returns a list of `tf.TensorSpec` for the input element structure."""
+    """Returns a flat list of `tf.DType` for the input element structure."""
     ret = []
-    for input_type, input_shape, input_class in zip(
-        nest.flatten(self._input_types), nest.flatten(self._input_shapes),
-        nest.flatten(self._input_classes)):
+    for input_type, input_class in zip(nest.flatten(self._input_types),
+                                       nest.flatten(self._input_classes)):
       # TODO(b/110122868): Add a registration mechanism for new component types.
       if input_class is sparse_tensor_lib.SparseTensor:
-        ret.append(
-            tensor_spec.TensorSpec(
-                tensor_shape.TensorShape(None), dtypes.variant))
+        ret.append(dtypes.variant)
       elif isinstance(input_class, _NestedDatasetComponent):
         if not self._nested_dataset_support:
           raise NotImplementedError(
               "The %s transformation does not currently support nested "
               "datasets as inputs." % self._transformation_name)
-        ret.append(
-            tensor_spec.TensorSpec(tensor_shape.scalar(), dtypes.variant))
+        ret.append(dtypes.variant)
       else:
         assert isinstance(input_type, dtypes.DType)
-        ret.append(tensor_spec.TensorSpec(input_shape, input_type))
+        ret.append(input_type)
     return ret
 
   @property
@@ -2589,6 +2579,24 @@ def _should_unpack_args(args):
   return type(args) is tuple  # pylint: disable=unidiomatic-typecheck
 
 
+def _warn_if_collections(transformation_name):
+  """Prints warning message if the current graph uses common graph collections.
+
+  NOTE(mrry): Currently a warning is only generated for lookup tables. Any
+  variables created will be automatically hoisted out to the outermost scope
+  using `init_scope()`. Some collections (such as for control-flow contexts)
+  are benign and should not generate a warning.
+
+  Args:
+    transformation_name: A human-readable name for the transformation.
+  """
+  if ops.get_default_graph().get_collection(ops.GraphKeys.TABLE_INITIALIZERS):
+    warnings.warn("Creating lookup tables inside a function passed to %s is not"
+                  " supported. Create each table outside the function, and "
+                  "capture it inside the function to use it."
+                  % transformation_name)
+
+
 class MapDataset(UnaryDataset):
   """A `Dataset` that maps a function over elements in its input."""
 
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index aeb1cac3e9..f261d92d64 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -662,11 +662,6 @@ class Function(object):
     outputs = self._inference_function.call(ctx, args)
     return self._build_call_outputs(outputs)
 
-  @property
-  def name(self):
-    """Function name."""
-    return self._inference_function.name
-
   @property
   def graph(self):
     """Returns the graph from which this function was constructed."""
@@ -724,10 +719,6 @@ class Function(object):
     return nest.map_structure(lambda x: x.dtype if x is not None else None,
                               self._func_graph.structured_outputs)
 
-  def add_to_graph(self, g):
-    """Adds this function into the graph g."""
-    return self._inference_function.add_to_graph(g)
-
   def _construct_backprop_function(self):
     """Constructs the backprop function object for this function."""
     backwards_graph = FuncGraph(_backward_name(self._func_graph.name))
@@ -1131,8 +1122,6 @@ class PolymorphicFunction(object):
       *args: inputs to specialize on.
       **kwargs: inputs to specialize on.
     """
-    if self._input_signature:
-      args, kwargs = None, None
     graph_function, _ = self._maybe_define_function(args, kwargs)
     return graph_function
 
@@ -1315,9 +1304,6 @@ def register(func, *args, **kwargs):
   function definition into graph. Register function with different input param
   will result into multiple version of functions registered in graph.
 
-  Also, `args` and `kwargs` are ignored if this `PolymorphicFunction` was
-  created with an `input_signature`.
-
   Args:
     func: the PolymorphicFunction instance that generated by a @defun
     *args: input arguments for the Python function.
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index ac45606eb0..9ce367a837 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -1750,10 +1750,11 @@ class FunctionTest(test.TestCase):
         # pylint: disable=protected-access
         self.assertEqual(len(graph._functions), 3)
 
-        # Test register function with cache, note inputs are ignored.
-        function.register(defun_matmul)
-        graph = ops.get_default_graph()
-        self.assertEqual(len(graph._functions), 3)
+        # Test input param shape mismatch
+        t2 = constant_op.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+        with self.assertRaisesRegexp(
+            ValueError, 'Python inputs incompatible with input_signature'):
+          function.register(defun_matmul, t2, t2)
 
   def testRegisterFunctionWithCache(self):
     def matmul(x, y):
-- 
GitLab


From 9f42ebd5982688511ecc0ef7d23de02b64d8dd1e Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Tue, 2 Oct 2018 20:04:31 -0700
Subject: [PATCH 327/570] Improve error messages and doc strings for eager-mode
 tf.keras.Model.fit() + tf.data objects

- Previously, when validation_steps was missing, the error message incorrectly says "please provide either batch_size or steps_per_epoch". Now it reads "please provide either batch_size or validation_steps".
- Some whitespace-related fixes.

PiperOrigin-RevId: 215503991
---
 tensorflow/python/keras/engine/training.py    |  9 ++++--
 .../python/keras/engine/training_eager.py     |  3 +-
 .../keras/engine/training_eager_test.py       | 30 +++++++++++++++++++
 .../python/keras/engine/training_utils.py     | 15 +++++++---
 4 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index c842b8192e..85233de9b1 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -1419,6 +1419,8 @@ class Model(Network):
               - tuple `(x_val, y_val)` of Numpy arrays or tensors
               - tuple `(x_val, y_val, val_sample_weights)` of Numpy arrays
               - dataset or a dataset iterator
+            For the first two cases, `batch_size` must be provided.
+            For the last case, `validation_steps` must be provided.
         shuffle: Boolean (whether to shuffle the training data
             before each epoch) or str (for 'batch').
             'batch' is a special option for dealing with the
@@ -1454,9 +1456,10 @@ class Model(Network):
             TensorFlow data tensors, the default `None` is equal to
             the number of samples in your dataset divided by
             the batch size, or 1 if that cannot be determined.
-        validation_steps: Only relevant if `steps_per_epoch`
-            is specified. Total number of steps (batches of samples)
-            to validate before stopping.
+        validation_steps: Only relevant if `validation_data` is provided and
+            is a dataset or dataset iterator. Total number of steps (batches of
+            samples) to draw before stopping when performing validation
+            at the end of every epoch.
         max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
             input only. Maximum size for the generator queue.
             If unspecified, `max_queue_size` will default to 10.
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index fb71bf2596..2a62edd698 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -739,7 +739,8 @@ def test_loop(model, inputs, targets,
       y=targets,
       sample_weights=sample_weights,
       batch_size=batch_size,
-      steps_per_epoch=steps)
+      steps_per_epoch=steps,
+      is_validation=True)
   with backend.learning_phase_scope(0):
     return iterator_test_loop(model, inputs, steps, verbose=verbose)
 
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index 1f5176c4d7..943ede1be9 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -125,6 +125,36 @@ class TrainingTest(test.TestCase):
     model.train_on_batch(inputs, targets)
     model.test_on_batch(inputs, targets)
 
+  def test_model_fit_and_validation_with_missing_arg_errors(self):
+    x = keras.layers.Input(shape=(3,), name='input')
+    y = keras.layers.Dense(4, name='dense')(x)
+    model = keras.Model(x, y)
+    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+
+    x = keras.backend.zeros(shape=(10, 3))
+    y = keras.backend.zeros(shape=(10, 4))
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat(10).batch(5)
+    iterator = dataset.make_one_shot_iterator()
+    validation_dataset = dataset_ops.Dataset.from_tensor_slices(
+        (x, y)).repeat(10).batch(5)
+    validation_iterator = validation_dataset.make_one_shot_iterator()
+
+    with self.assertRaisesRegexp(
+        ValueError, r'specify .* `steps_per_epoch`'):
+      model.fit(iterator, epochs=1, verbose=0)
+    with self.assertRaisesRegexp(
+        ValueError, r'provide either `batch_size` or `validation_steps`'):
+      model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
+                validation_data=(x, y))
+    with self.assertRaisesRegexp(
+        ValueError, r'provide either `batch_size` or `validation_steps`'):
+      model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
+                validation_data=validation_dataset)
+    with self.assertRaisesRegexp(
+        ValueError, r'provide either `batch_size` or `validation_steps`'):
+      model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
+                validation_data=validation_iterator)
+
   def test_generator_methods(self):
     model = keras.Sequential()
     model.add(keras.layers.Dense(4, input_shape=(3,)))
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index 9c303f4bed..dd2a7f16ec 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -106,7 +106,8 @@ def convert_to_iterator(x=None,
                         batch_size=None,
                         steps_per_epoch=None,
                         epochs=1,
-                        shuffle=False):
+                        shuffle=False,
+                        is_validation=False):
   """Converts NumPy arrays or EagerTensors to an EagerIterator.
 
   Combines all provided data into a single EagerIterator.
@@ -124,6 +125,9 @@ def convert_to_iterator(x=None,
         epoch.
       epochs: Epochs to repeat iterator for.
       shuffle: Whether to shuffle data after each epoch.
+      is_validation: Whether this call is for validation during a training
+        (e.g., `fit()`) call. This info is used to construct error messages
+        (if any).
 
   Raises:
       ValueError: if steps_per_epoch cannot be calculated from the data
@@ -151,9 +155,12 @@ def convert_to_iterator(x=None,
     steps_per_epoch = int(math.ceil(num_samples / batch_size))
 
   if steps_per_epoch is None:
-    raise ValueError('Could not determine steps_per_epoch.'
-                     'Please provide either batch_size or'
-                     'steps_per_epoch.')
+    alternative_arg_name = (
+        'validation_steps' if is_validation else 'steps_per_epoch')
+    raise ValueError(
+        'Could not determine how to convert EagerTensors into EagerIterator. '
+        'Please provide either `batch_size` or '
+        '`%s`.' % alternative_arg_name)
 
   # TODO(omalleyt) for NumPy arrays in graph mode
   # placeholder ops should be used
-- 
GitLab


From 65b5190065db0074f8722b09ba43423438c40258 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Tue, 2 Oct 2018 21:49:20 -0700
Subject: [PATCH 328/570] Further loosen bounds for depthwise_conv_op_test.

PiperOrigin-RevId: 215512168
---
 tensorflow/python/kernel_tests/depthwise_conv_op_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 9c02b69180..6aee2eb0a3 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -131,7 +131,7 @@ class DepthwiseConv2DTest(test.TestCase):
     with self.session(graph=graph, use_gpu=use_gpu) as sess:
       tolerance = {
           dtypes.float16: 4e-2,
-          dtypes.float32: 1e-7,
+          dtypes.float32: 1e-6,
           dtypes.float64: 1e-12,
       }[data_type]
 
-- 
GitLab


From bbe15eee6779941c54e145d12e16f6473738857c Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Tue, 2 Oct 2018 22:39:09 -0700
Subject: [PATCH 329/570] [XLA] Modify the function that determines whether an
 instruction can change layout so that it can be used by the HLO verifier.

Change the function to a static member function of the LayoutAssignment class.

Add an std::function member to LayoutAssignment to store the function object
passed down from the backend compiler class and use it to decide whether an
instruction can change layouts.

Fix affected test cases.

PiperOrigin-RevId: 215515611
---
 .../compiler/xla/service/cpu/cpu_compiler.cc   |  3 ++-
 .../xla/service/cpu/cpu_layout_assignment.h    |  5 ++++-
 .../service/cpu/cpu_layout_assignment_test.cc  | 10 ++++++----
 .../xla/service/gpu/gpu_layout_assignment.h    |  5 ++++-
 .../service/gpu/gpu_layout_assignment_test.cc  | 17 +++++++++++------
 .../compiler/xla/service/gpu/nvptx_compiler.cc |  3 ++-
 .../xla/service/interpreter/compiler.cc        |  3 ++-
 .../compiler/xla/service/layout_assignment.cc  | 18 ++++++++++++------
 .../compiler/xla/service/layout_assignment.h   | 18 ++++++++++++++----
 .../xla/service/layout_assignment_test.cc      |  3 ++-
 10 files changed, 59 insertions(+), 26 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 18fc144efe..ea8c200dee 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -308,7 +308,8 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
       ReducePrecisionInsertion::PassTiming::AFTER_FUSION);
 
   pipeline.AddPass<CpuLayoutAssignment>(
-      module->mutable_entry_computation_layout(), target_machine_features);
+      module->mutable_entry_computation_layout(),
+      LayoutAssignment::InstructionCanChangeLayout, target_machine_features);
   return pipeline.Run(module).status();
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
index 3c4fe68b83..f4da35dd37 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
@@ -30,8 +30,11 @@ class CpuLayoutAssignment : public LayoutAssignment {
  public:
   explicit CpuLayoutAssignment(
       ComputationLayout* entry_computation_layout,
+      std::function<bool(const HloInstruction*)>
+          instruction_can_change_layout_func,
       const TargetMachineFeatures* target_machine_features)
-      : LayoutAssignment(entry_computation_layout),
+      : LayoutAssignment(entry_computation_layout,
+                         std::move(instruction_can_change_layout_func)),
         target_machine_features_(*target_machine_features) {}
   ~CpuLayoutAssignment() override {}
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
index 4668f3872d..97659b88a7 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
@@ -54,8 +54,9 @@ class CpuLayoutAssignmentTest : public HloTestBase {
         [](int64 shape_size) {
           return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
         });
-    cpu::CpuLayoutAssignment layout_assignment(entry_computation_layout,
-                                               &target_machine_features);
+    cpu::CpuLayoutAssignment layout_assignment(
+        entry_computation_layout, LayoutAssignment::InstructionCanChangeLayout,
+        &target_machine_features);
     EXPECT_IS_OK(layout_assignment.Run(module).status());
   }
 };
@@ -321,8 +322,9 @@ static StatusOr<DotOutputFusionLayoutAssignmentResult> RunDotOutputFusion(
       [](int64 shape_size) {
         return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
       });
-  cpu::CpuLayoutAssignment layout_assignment(&computation_layout,
-                                             &target_machine_features);
+  cpu::CpuLayoutAssignment layout_assignment(
+      &computation_layout, LayoutAssignment::InstructionCanChangeLayout,
+      &target_machine_features);
   TF_ASSIGN_OR_RETURN(result.layout_assignment_changed_something,
                       layout_assignment.Run(module));
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
index e2b96a81d4..4ba7989e9c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
@@ -30,8 +30,11 @@ namespace gpu {
 class GpuLayoutAssignment : public LayoutAssignment {
  public:
   explicit GpuLayoutAssignment(ComputationLayout* entry_computation_layout,
+                               std::function<bool(const HloInstruction*)>
+                                   instruction_can_change_layout_func,
                                se::StreamExecutor* stream_executor)
-      : LayoutAssignment(entry_computation_layout),
+      : LayoutAssignment(entry_computation_layout,
+                         std::move(instruction_can_change_layout_func)),
         stream_executor_(stream_executor) {}
   ~GpuLayoutAssignment() override {}
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
index fbc8ddf599..04681cfcec 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -75,7 +75,8 @@ TEST_F(LayoutAssignmentTest, Elementwise) {
             ShapeLayout(result_shape_with_layout);
 
         GpuLayoutAssignment layout_assignment(
-            &computation_layout, backend().default_stream_executor());
+            &computation_layout, LayoutAssignment::InstructionCanChangeLayout,
+            backend().default_stream_executor());
         EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
 
         for (const HloInstruction* operand : add->operands()) {
@@ -163,7 +164,8 @@ TEST_F(LayoutAssignmentTest, BatchNormInference) {
       }
 
       GpuLayoutAssignment layout_assignment(
-          &computation_layout, backend().default_stream_executor());
+          &computation_layout, LayoutAssignment::InstructionCanChangeLayout,
+          backend().default_stream_executor());
       EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
 
       // The first operand to batchnorm should have the same layout as the
@@ -233,7 +235,8 @@ TEST_F(LayoutAssignmentTest, BatchNormTraining) {
       }
 
       GpuLayoutAssignment layout_assignment(
-          &computation_layout, backend().default_stream_executor());
+          &computation_layout, LayoutAssignment::InstructionCanChangeLayout,
+          backend().default_stream_executor());
       EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
 
       // The first operand to batchnorm should have the same layout as the
@@ -314,7 +317,8 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) {
         }
 
         GpuLayoutAssignment layout_assignment(
-            &computation_layout, backend().default_stream_executor());
+            &computation_layout, LayoutAssignment::InstructionCanChangeLayout,
+            backend().default_stream_executor());
         EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
 
         // The first and fourth operands to the batchnorm call should have the
@@ -348,8 +352,9 @@ TEST_F(LayoutAssignmentTest, DotLayout) {
 
   ComputationLayout computation_layout(
       module->entry_computation()->ComputeProgramShape());
-  GpuLayoutAssignment layout_assignment(&computation_layout,
-                                        backend().default_stream_executor());
+  GpuLayoutAssignment layout_assignment(
+      &computation_layout, LayoutAssignment::InstructionCanChangeLayout,
+      backend().default_stream_executor());
   EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
 
   Shape expected_shape =
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index 0b3b429710..b4ae2e42c7 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -232,7 +232,8 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     // a layout-sensitive verifier!
     HloPassPipeline pipeline("layout assignment");
     pipeline.AddPass<GpuLayoutAssignment>(
-        hlo_module->mutable_entry_computation_layout(), stream_exec);
+        hlo_module->mutable_entry_computation_layout(),
+        LayoutAssignment::InstructionCanChangeLayout, stream_exec);
     TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
   }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index bb69cb9c47..27fe89375d 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -44,7 +44,8 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   HloPassPipeline pipeline("Interpreter");
 
   pipeline.AddPass<LayoutAssignment>(
-      hlo_module->mutable_entry_computation_layout());
+      hlo_module->mutable_entry_computation_layout(),
+      LayoutAssignment::InstructionCanChangeLayout);
   return pipeline.Run(hlo_module).status();
 }
 
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 25d5327561..68a08a0886 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -974,10 +974,15 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) {
 
 LayoutAssignment::LayoutAssignment(
     ComputationLayout* entry_computation_layout,
+    std::function<bool(const HloInstruction*)>
+        instruction_can_change_layout_func,
     ChannelLayoutConstraints* channel_constraints)
     : entry_computation_layout_(entry_computation_layout),
+
       saved_entry_computation_layout_(*entry_computation_layout),
-      channel_layout_constraints_(channel_constraints) {
+      channel_layout_constraints_(channel_constraints),
+      instruction_can_change_layout_func_(
+          std::move(instruction_can_change_layout_func)) {
   if (channel_layout_constraints_ != nullptr) {
     // Save a copy of the input ChannelLayoutConstraints so that we can reset it
     // if we have to undo previous operations (ClearPreviousPassSideEffects()).
@@ -998,7 +1003,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
   if (!ShapeUtil::IsScalar(operand->shape()) &&
       ShapeUtil::Rank(operand->shape()) ==
           ShapeUtil::Rank(instruction->shape()) &&
-      InstructionRequiresInputLayoutEqualToOutputLayout(instruction)) {
+      !instruction_can_change_layout_func_(instruction)) {
     // Propagate the result layout to the operand layout if the instruction
     // requires the same layout out for the result and the operand.
     //
@@ -1076,7 +1081,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
 
   if (!ShapeUtil::IsScalar(operand->shape()) &&
       ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(user->shape()) &&
-      InstructionRequiresInputLayoutEqualToOutputLayout(user)) {
+      !instruction_can_change_layout_func_(user)) {
     // Assign users the same layout as the operand.
     return absl::make_unique<Layout>(operand_layout);
   }
@@ -1842,7 +1847,8 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   return true;
 }
 
-bool LayoutAssignment::InstructionRequiresInputLayoutEqualToOutputLayout(
+/* static */
+bool LayoutAssignment::InstructionCanChangeLayout(
     const HloInstruction* instruction) {
   switch (instruction->opcode()) {
     case HloOpcode::kAbs:
@@ -1908,7 +1914,7 @@ bool LayoutAssignment::InstructionRequiresInputLayoutEqualToOutputLayout(
     case HloOpcode::kTanh:
     case HloOpcode::kTupleSelect:
     case HloOpcode::kWhile:
-      return true;
+      return false;
     case HloOpcode::kBatchNormGrad:
     case HloOpcode::kBatchNormInference:
     case HloOpcode::kBatchNormTraining:
@@ -1939,7 +1945,7 @@ bool LayoutAssignment::InstructionRequiresInputLayoutEqualToOutputLayout(
     case HloOpcode::kTrace:
     case HloOpcode::kTranspose:
     case HloOpcode::kTuple:
-      return false;
+      return true;
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 15f0adcaaf..2d48e12263 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -286,6 +286,11 @@ class LayoutAssignment : public HloModulePass {
   // entry_computation_layout is modified to populate a layout for the result in
   // the case that no particular layout is requested.
   //
+  // instruction_can_change_layout_func is a function object that determines
+  // whether an instruction can change layouts. An instruction not being able to
+  // change layout means that it requires operands with the same rank as the
+  // output to have the same layout as the output.
+  //
   // channel_constraints is both an input and output. Any sends or recvs that
   // are present in channel_constraints will be laid out as constrained. Any
   // unconstrained sends or recvs will be laid out as locally optimal and their
@@ -295,6 +300,8 @@ class LayoutAssignment : public HloModulePass {
   // within any module passed to `Run`.
   explicit LayoutAssignment(
       ComputationLayout* entry_computation_layout,
+      std::function<bool(const HloInstruction*)>
+          instruction_can_change_layout_func = InstructionCanChangeLayout,
       ChannelLayoutConstraints* channel_constraints = nullptr);
   ~LayoutAssignment() override {}
   absl::string_view name() const override { return "layout-assignment"; }
@@ -303,10 +310,10 @@ class LayoutAssignment : public HloModulePass {
   // (any layouts were changed).
   StatusOr<bool> Run(HloModule* module) override;
 
-  // Returns true if the instruction requires that operands with the same rank
-  // as the output have to have the same layout as the output.
-  virtual bool InstructionRequiresInputLayoutEqualToOutputLayout(
-      const HloInstruction* instruction);
+  // Determines whether an instruction can change layouts. An instruction not
+  // being able to change layout means that it requires operands with the same
+  // rank as the output to have the same layout as the output.
+  static bool InstructionCanChangeLayout(const HloInstruction* instruction);
 
  protected:
   // These methods, invoked by PropagateConstraints, propagate a layout
@@ -522,6 +529,9 @@ class LayoutAssignment : public HloModulePass {
   // The set of HLO instructions which lacked any layout constraint, thus
   // receiving propagated default layouts.
   absl::flat_hash_set<const HloInstruction*> unconstrained_layout_instructions_;
+
+  std::function<bool(const HloInstruction*)>
+      instruction_can_change_layout_func_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 10f9a95121..15c16d667c 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -55,7 +55,8 @@ class LayoutAssignmentTest : public HloVerifiedTestBase {
                      ComputationLayout* entry_computation_layout,
                      ChannelLayoutConstraints* channel_constraints = nullptr) {
     LayoutAssignment layout_assignment(
-        entry_computation_layout, /*channel_constraints=*/channel_constraints);
+        entry_computation_layout, LayoutAssignment::InstructionCanChangeLayout,
+        /*channel_constraints=*/channel_constraints);
     EXPECT_IS_OK(layout_assignment.Run(module).status());
   }
 
-- 
GitLab


From b790ac196148b7547bb4da7091973e8f0ae58803 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 23:10:30 -0700
Subject: [PATCH 330/570] [XLA:CPU] Re-enable the inliner pass in the cpu
 compiler.

PiperOrigin-RevId: 215517752
---
 tensorflow/compiler/xla/service/cpu/cpu_compiler.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index ea8c200dee..afc94f2185 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -249,9 +249,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
       &pipeline, module->config().debug_options(),
       ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION);
 
-  // TODO(b/35786417): Re-enable inliner pass after fixing the bug and deciding
-  // where we will take this pass in future.
-  // pipeline.AddPass<Inliner>();
+  pipeline.AddPass<Inliner>();
 
   // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner
   // pass.
-- 
GitLab


From ac15fb000dc0558495b62e897206e2c4ad189c5a Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Tue, 2 Oct 2018 23:18:36 -0700
Subject: [PATCH 331/570] Internal change.

PiperOrigin-RevId: 215518288
---
 tensorflow/python/kernel_tests/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 9490746fd9..44575fc452 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2999,6 +2999,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     shard_count = 20,
+    tags = ["no_oss"],  # b/117185141
 )
 
 cuda_py_test(
-- 
GitLab


From 3d452dbcf7e1a71ba449f6acf7342cdd1dd11859 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 2 Oct 2018 23:37:58 -0700
Subject: [PATCH 332/570] [XLA] In the HLO parser, give the module a non-empty
 default name.

Otherwise, when parsing a single instruction, the parsed module doesn't have a name, which won't pass the hlo verifier check.

PiperOrigin-RevId: 215519412
---
 tensorflow/compiler/xla/service/hlo_parser.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 0440f1b54f..dd62988bcc 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -391,7 +391,7 @@ Status HloParser::Run(HloModule* module) {
   // This means that the text is a single HLO instruction.
   if (!ParseSingleInstruction(module)) {
     return InvalidArgument(
-        "Syntax error when trying to parse the text as single "
+        "Syntax error when trying to parse the text as a single "
         "HloInstruction:\n%s",
         GetError());
   }
@@ -3348,14 +3348,14 @@ bool HloParser::ParseSingleInstruction(HloModule* module) {
 
 StatusOr<std::unique_ptr<HloModule>> ParseHloString(
     absl::string_view str, const HloModuleConfig& config) {
-  auto module = absl::make_unique<HloModule>(/*name=*/"", config);
+  auto module = absl::make_unique<HloModule>(/*name=*/"_", config);
   HloParser parser(str);
   TF_RETURN_IF_ERROR(parser.Run(module.get()));
   return std::move(module);
 }
 
 StatusOr<std::unique_ptr<HloModule>> ParseHloString(absl::string_view str) {
-  auto module = absl::make_unique<HloModule>(/*name=*/"", HloModuleConfig());
+  auto module = absl::make_unique<HloModule>(/*name=*/"_", HloModuleConfig());
   HloParser parser(str);
   TF_RETURN_IF_ERROR(parser.Run(module.get()));
   return std::move(module);
-- 
GitLab


From 946e58e402778606d26056f5decf91ecfb4a9f89 Mon Sep 17 00:00:00 2001
From: YongJoon Lee <joon0351@gmail.com>
Date: Wed, 3 Oct 2018 16:43:55 +0900
Subject: [PATCH 333/570] fix spelling problem

---
 .../contrib/estimator/python/estimator/boosted_trees.py     | 6 +++---
 .../estimator/python/estimator/dnn_linear_combined.py       | 2 +-
 .../python/estimator/dnn_with_layer_annotations.py          | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
index a1f1c5f3d7..b131ed4f12 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -75,7 +75,7 @@ class _BoostedTreesEstimator(canned_boosted_trees._BoostedTreesBase):  # pylint:
         layer.
       head: the `Head` instance defined for Estimator.
       model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator
+        also be used to load checkpoints from the directory into an estimator
         to continue training a previously saved model.
       weight_column: A string or a `_NumericColumn` created by
         `tf.feature_column.numeric_column` defining feature column representing
@@ -199,7 +199,7 @@ def boosted_trees_classifier_train_in_memory(
       the model. All items in the set should be instances of classes derived
       from `FeatureColumn`.
     model_dir: Directory to save model parameters, graph and etc. This can
-      also be used to load checkpoints from the directory into a estimator
+      also be used to load checkpoints from the directory into an estimator
       to continue training a previously saved model.
     n_classes: number of label classes. Default is binary classification.
       Multiclass support is not yet implemented.
@@ -345,7 +345,7 @@ def boosted_trees_regressor_train_in_memory(
       the model. All items in the set should be instances of classes derived
       from `FeatureColumn`.
     model_dir: Directory to save model parameters, graph and etc. This can
-      also be used to load checkpoints from the directory into a estimator
+      also be used to load checkpoints from the directory into an estimator
       to continue training a previously saved model.
     label_dimension: Number of regression targets per example.
       Multi-dimensional support is not yet implemented.
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
index 724bc2c82f..4e7965ef26 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
@@ -118,7 +118,7 @@ class DNNLinearCombinedEstimator(estimator.Estimator):
       head: A `_Head` instance constructed with a method such as
         `tf.contrib.estimator.multi_label_head`.
       model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator
+        also be used to load checkpoints from the directory into an estimator
         to continue training a previously saved model.
       linear_feature_columns: An iterable containing all the feature columns
         used by linear part of the model. All items in the set must be
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations.py b/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations.py
index 6ca7aaf989..40a91175b7 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations.py
@@ -248,7 +248,7 @@ def DNNClassifierWithLayerAnnotations(  # pylint: disable=invalid-name
       model. All items in the set should be instances of classes derived from
       `_FeatureColumn`.
     model_dir: Directory to save model parameters, graph and etc. This can also
-      be used to load checkpoints from the directory into a estimator to
+      be used to load checkpoints from the directory into an estimator to
       continue training a previously saved model.
     n_classes: Number of label classes. Defaults to 2, namely binary
       classification. Must be > 1.
-- 
GitLab


From c248f458c76df89fa3d608dcbe7c4c5e10962c24 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 02:25:06 -0700
Subject: [PATCH 334/570] compat: Update forward compatibility horizon to
 2018-10-03

PiperOrigin-RevId: 215534396
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 3bb95b56c2..d833defb8e 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 2)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 3)
 
 
 @tf_export("compat.forward_compatible")
-- 
GitLab


From dd52e1d30702df5dfc805a1f433061dfbb75c814 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 06:14:11 -0700
Subject: [PATCH 335/570] Fix test that was relying on old lax toco behavior

PiperOrigin-RevId: 215553161
---
 .../contrib/lite/testing/generate_examples.py      | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 18036fac6f..3f2255c454 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -762,8 +762,11 @@ def make_constant_tests(zip_path):
         dtype=parameters["dtype"],
         name="input1",
         shape=parameters["input_shape"])
-    out = tf.constant(
+    constant = tf.constant(
         create_tensor_data(parameters["dtype"], parameters["input_shape"]))
+    # This maximum node is here to avoid the situation where a graph output is
+    # a constant, which is an error in toco.
+    out = tf.maximum(dummy_input, constant)
     return [dummy_input], [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
@@ -2848,7 +2851,14 @@ def make_zeros_like_tests(zip_path):
         dtype=parameters["input_dtype"],
         name="input",
         shape=parameters["input_shape"])
-    out = tf.zeros_like(input_tensor)
+    zeros = tf.zeros_like(input_tensor)
+    # This maximum node is so that toco can perform the constants-propagation
+    # through the above zeros_like, which it can't do if the output of the
+    # zeros_like as an output of the whole graphs (graph outputs can't be
+    # constants). If toco does not perform such constants-propagation then
+    # the resulting tflite graph retains the zeros_like as a Fill op, which
+    # is unsupported by TFLite, even as a custom op.
+    out = tf.maximum(zeros, input_tensor)
     return [input_tensor], [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-- 
GitLab


From c9bdd3938e2b43334a0065b4c198ec9d491c8cb8 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 3 Oct 2018 10:04:37 -0700
Subject: [PATCH 336/570] [tf.data] Switch background threads to use
 `BackgroundWorker`.

PiperOrigin-RevId: 215579950
---
 tensorflow/core/kernels/data/iterator_ops.cc  |  4 ---
 .../kernels/data/map_and_batch_dataset_op.cc  | 10 ++++---
 .../core/kernels/data/model_dataset_op.cc     | 10 ++++---
 .../data/parallel_interleave_dataset_op.cc    | 27 +++++++++++--------
 .../kernels/data/parallel_map_iterator.cc     | 10 ++++---
 .../core/kernels/data/prefetch_dataset_op.cc  | 10 ++++---
 tensorflow/core/kernels/data/writer_ops.cc    | 12 ++++-----
 7 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 7a833668ac..8acd6cc724 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -16,10 +16,8 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/common_runtime/renamed_device.h"
-#include "tensorflow/core/common_runtime/threadpool_device.h"
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/resource_op_kernel.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
@@ -27,13 +25,11 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/data/optional_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 namespace data {
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index bf08970560..6a670f1efb 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -405,9 +406,10 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
           std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
-          runner_thread_.reset(ctx->env()->StartThread(
-              {}, "runner_thread",
-              std::bind(&Iterator::RunnerThread, this, ctx_copy)));
+          runner_thread_ =
+              MakeUnique<BackgroundWorker>(ctx->env(), "runner_thread");
+          runner_thread_->Schedule(
+              std::bind(&Iterator::RunnerThread, this, ctx_copy));
         }
       }
 
@@ -660,7 +662,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_;
       // Buffer for storing the (intermediate) batch results.
       std::deque<std::shared_ptr<BatchResult>> batch_results_ GUARDED_BY(*mu_);
-      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+      std::unique_ptr<BackgroundWorker> runner_thread_ GUARDED_BY(*mu_);
       bool cancelled_ GUARDED_BY(*mu_) = false;
     };
 
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc
index 9aa505f4f1..859df57962 100644
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -126,9 +127,10 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         if (!optimize_thread_) {
           std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-          optimize_thread_.reset(ctx->env()->StartThread(
-              {}, "optimize_thread",
-              [this, new_ctx]() { OptimizeThread(new_ctx); }));
+          optimize_thread_ =
+              MakeUnique<BackgroundWorker>(ctx->env(), "optimize_thread");
+          optimize_thread_->Schedule(
+              [this, new_ctx]() { OptimizeThread(new_ctx); });
         }
         return Status::OK();
       }
@@ -167,7 +169,7 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
       mutex mu_;
       condition_variable cond_var_;
       std::shared_ptr<model::Model> model_;
-      std::unique_ptr<Thread> optimize_thread_ GUARDED_BY(mu_);
+      std::unique_ptr<BackgroundWorker> optimize_thread_ GUARDED_BY(mu_);
       bool cancelled_ GUARDED_BY(mu_) = false;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
     };
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 6b6b3d6ab9..9c836b836e 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -481,9 +482,10 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
           worker_threads_.reserve(dataset()->num_threads());
           for (size_t i = 0; i < dataset()->num_threads(); ++i) {
             std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, "worker_thread",
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
+            worker_threads_.emplace_back(
+                MakeUnique<BackgroundWorker>(ctx->env(), "worker_thread"));
+            worker_threads_.back()->Schedule(
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); });
           }
         }
         return Status::OK();
@@ -580,9 +582,10 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
             }
             workers_[i].SetInputs(s, std::move(args));
             std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, "worker_thread",
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
+            worker_threads_.emplace_back(
+                MakeUnique<BackgroundWorker>(ctx->env(), "worker_thread"));
+            worker_threads_.back()->Schedule(
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); });
             if (i < dataset()->cycle_length_) {
               interleave_indices_.push_back(i);
             } else {
@@ -1047,7 +1050,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       // The worker threads. This must be last to ensure the
       // threads have exited before any other members are deallocated.
       // TODO(b/65178177): Avoid allocating additional threads.
-      std::vector<std::unique_ptr<Thread>> worker_threads_ GUARDED_BY(mu_);
+      std::vector<std::unique_ptr<BackgroundWorker>> worker_threads_
+          GUARDED_BY(mu_);
     };
 
     const DatasetBase* const input_;
@@ -1389,9 +1393,10 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
           std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-          runner_thread_.reset(ctx->env()->StartThread(
-              {}, "runner_thread",
-              [this, new_ctx]() { RunnerThread(new_ctx); }));
+          runner_thread_ =
+              MakeUnique<BackgroundWorker>(ctx->env(), "runner_thread");
+          runner_thread_->Schedule(
+              [this, new_ctx]() { RunnerThread(new_ctx); });
         }
       }
 
@@ -1645,7 +1650,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       int64 num_calls_ GUARDED_BY(*mu_) = 0;
 
       std::unique_ptr<thread::ThreadPool> thread_pool_;
-      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+      std::unique_ptr<BackgroundWorker> runner_thread_ GUARDED_BY(*mu_);
 
       // Identifies whether background activity should be cancelled.
       bool cancelled_ GUARDED_BY(*mu_) = false;
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index 13bd4b6036..626e98af91 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -180,9 +181,10 @@ class ParallelMapIterator : public DatasetBaseIterator {
       EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
     if (!runner_thread_) {
       std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
-      runner_thread_.reset(ctx->env()->StartThread(
-          {}, "runner_thread",
-          std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy)));
+      runner_thread_ =
+          MakeUnique<BackgroundWorker>(ctx->env(), "runner_thread");
+      runner_thread_->Schedule(
+          std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy));
     }
   }
 
@@ -330,7 +332,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   // Buffer for storing the invocation results.
   std::deque<std::shared_ptr<InvocationResult>> invocation_results_
       GUARDED_BY(*mu_);
-  std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+  std::unique_ptr<BackgroundWorker> runner_thread_ GUARDED_BY(*mu_);
   bool cancelled_ GUARDED_BY(*mu_) = false;
 };
 
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 754ed772db..e9c38eb8a0 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -256,10 +257,11 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     Status EnsurePrefetchThreadStarted(IteratorContext* ctx)
         EXCLUSIVE_LOCKS_REQUIRED(mu_) {
       if (!prefetch_thread_) {
+        prefetch_thread_ =
+            MakeUnique<BackgroundWorker>(ctx->env(), "prefetch_thread");
         std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-        prefetch_thread_.reset(ctx->env()->StartThread(
-            {}, "prefetch_thread",
-            [this, new_ctx]() { PrefetchThread(new_ctx); }));
+        prefetch_thread_->Schedule(
+            [this, new_ctx]() { PrefetchThread(new_ctx); });
       }
       return Status::OK();
     }
@@ -363,7 +365,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     string prefix_end_;
     PrefetchAutotuner auto_tuner_ GUARDED_BY(mu_);
     std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
-    std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
+    std::unique_ptr<BackgroundWorker> prefetch_thread_ GUARDED_BY(mu_);
     bool cancelled_ GUARDED_BY(mu_) = false;
     bool prefetch_thread_finished_ GUARDED_BY(mu_) = false;
   };
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc
index 3f76695bb1..7bb2077b62 100644
--- a/tensorflow/core/kernels/data/writer_ops.cc
+++ b/tensorflow/core/kernels/data/writer_ops.cc
@@ -29,10 +29,10 @@ class ToTFRecordOp : public AsyncOpKernel {
  public:
   explicit ToTFRecordOp(OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx),
-        thread_pool_(new thread::ThreadPool(
-            ctx->env(), ThreadOptions(),
-            strings::StrCat("to_tf_record__op_", SanitizeThreadSuffix(name())),
-            1 /* num_threads */, false /* low_latency_hint */)) {}
+        background_worker_(
+            ctx->env(),
+            strings::StrCat("to_tf_record_op_", SanitizeThreadSuffix(name()))) {
+  }
 
   template <typename T>
   Status ParseScalarArgument(OpKernelContext* ctx,
@@ -50,7 +50,7 @@ class ToTFRecordOp : public AsyncOpKernel {
     // The call to `iterator->GetNext()` may block and depend on an
     // inter-op thread pool thread, so we issue the call from the
     // owned thread pool.
-    thread_pool_->Schedule([this, ctx, done]() {
+    background_worker_.Schedule([this, ctx, done]() {
       string filename;
       OP_REQUIRES_OK_ASYNC(
           ctx, ParseScalarArgument<string>(ctx, "filename", &filename), done);
@@ -97,7 +97,7 @@ class ToTFRecordOp : public AsyncOpKernel {
   }
 
  private:
-  std::unique_ptr<thread::ThreadPool> thread_pool_;
+  BackgroundWorker background_worker_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("DatasetToTFRecord").Device(DEVICE_CPU),
-- 
GitLab


From 2af8fd975aaf5c70ebb396895fa15a8f034a8440 Mon Sep 17 00:00:00 2001
From: Tong Shen <endlessroad@google.com>
Date: Wed, 3 Oct 2018 10:09:14 -0700
Subject: [PATCH 337/570] Skip control flow functionalization if there is no
 Switch or Merge node.

PiperOrigin-RevId: 215580891
---
 .../tf2xla/functionalize_control_flow.cc      | 129 ++++++++++++------
 1 file changed, 90 insertions(+), 39 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 36c6f5d316..28e09d7b79 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -79,7 +79,10 @@ Status FunctionalizeControlFlowForFunction(
     const string& func_name, const string& new_func_name,
     const protobuf::Map<string, tensorflow::AttrValue>& attrs,
     FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr,
-    std::map<string, string>* canonicalized_name_to_new_name) {
+    std::map<string, absl::optional<string>>* canonicalized_name_to_new_name,
+    bool* modified) {
+  *modified = false;
+
   // Convert the function to Graph.
   FunctionLibraryRuntime::Handle handle;
   TF_RETURN_IF_ERROR(flr->Instantiate(func_name, AttrSlice(&attrs), &handle));
@@ -92,6 +95,19 @@ Status FunctionalizeControlFlowForFunction(
   });
   const FunctionBody* body = flr->GetFunctionBody(handle);
 
+  // Check if the graph has Switch or Merge node before optimizing the graph.
+  bool has_switch_or_merge = false;
+  for (Node* n : body->graph->nodes()) {
+    if (n->type_string() == "Switch" || n->type_string() == "Merge") {
+      has_switch_or_merge = true;
+      break;
+    }
+  }
+  // We cannot return here directly if the graph has no Switch/Merge.
+  // It might contain function call nodes, or If/While nodes with Switch/Merge
+  // in function body. We still need to rewrite those functions and modify
+  // corresponding nodes.
+
   // Call graph optimizer. The most important optimization we need is constant
   // folding, which will replace ops like Shape/BroadcastGradientArgs with
   // constant shape input. Without this optimization, those ops might become
@@ -129,6 +145,13 @@ Status FunctionalizeControlFlowForFunction(
         absl::StrCat("functionalize_control_flow_after_opt_", func_name),
         *optimized_graph, fld);
   }
+  // Some inlined functions might have Switch/Merge nodes.
+  for (Node* n : optimized_graph->nodes()) {
+    if (n->type_string() == "Switch" || n->type_string() == "Merge") {
+      has_switch_or_merge = true;
+      break;
+    }
+  }
 
   // If any node has associated functions, functionalize them first.
   // Gather nodes with associated functions first, because rewriting those nodes
@@ -151,10 +174,15 @@ Status FunctionalizeControlFlowForFunction(
           Canonicalize(name, AttrSlice(&associated_function.attrs()));
       auto iter = canonicalized_name_to_new_name->find(canonicalized_name);
       string new_name;
+      bool function_modified;
       if (iter != canonicalized_name_to_new_name->end()) {
-        // If we already functionalized this function, skip functionalization
-        // but still rewrite the node.
-        new_name = iter->second;
+        // If we already processed this function, check if it was rewritten. If
+        // the function was rewritten, the entry will be non-empty. Otherwise
+        // the entry will be empty.
+        function_modified = iter->second.has_value();
+        if (function_modified) {
+          new_name = iter->second.value();
+        }
       } else {
         if (associated_function.type() ==
             AssociatedFunctionInfo::AssociatedFunctionType::kSymbolicGradient) {
@@ -166,42 +194,62 @@ Status FunctionalizeControlFlowForFunction(
         }
         TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction(
             name, new_name, associated_function.attrs(), fld, flr,
-            canonicalized_name_to_new_name));
-        (*canonicalized_name_to_new_name)[canonicalized_name] = new_name;
+            canonicalized_name_to_new_name, &function_modified));
+        if (function_modified) {
+          // If the function was rewritten, add an non-empty entry. So later we
+          // know we have processed this function, and it was rewritten into
+          // another function.
+          (*canonicalized_name_to_new_name)[canonicalized_name] = new_name;
+        } else {
+          // If the function was not rewritten, add an empty entry. So later
+          // we know we have processed this function, and it does not need to be
+          // rewritten.
+          (*canonicalized_name_to_new_name)[canonicalized_name] = absl::nullopt;
+        }
+      }
+      if (function_modified) {
+        *modified = true;
+
+        // Notice that if "n" is a function call, RewriteAssociatedFunction()
+        // will delete it and create a new node instead, making "n" an invalid
+        // pointer. That's fine because in that case, associated_functions will
+        // only have one member and the loop will only run once.
+        TF_RETURN_IF_ERROR(RewriteAssociatedFunction(
+            optimized_graph.get(), n, fld, associated_function, new_name));
       }
-      // Notice that if "n" is a function call, RewriteAssociatedFunction() will
-      // delete it and create a new node instead, making "n" an invalid pointer.
-      // That's fine because in that case, associated_functions will only have
-      // one member and the loop will only run once.
-      TF_RETURN_IF_ERROR(RewriteAssociatedFunction(
-          optimized_graph.get(), n, fld, associated_function, new_name));
     }
   }
 
-  // Functionalize the function body.
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("functionalize_control_flow_before_fdef_", func_name),
-        *optimized_graph, fld);
-  }
-  TF_RETURN_IF_ERROR(FunctionalizeControlFlow(optimized_graph.get(), fld));
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("functionalize_control_flow_after_fdef_", func_name),
-        *optimized_graph, fld);
+  if (has_switch_or_merge) {
+    *modified = true;
+
+    // Functionalize the function body.
+    if (VLOG_IS_ON(4)) {
+      dump_graph::DumpGraphToFile(
+          absl::StrCat("functionalize_control_flow_before_fdef_", func_name),
+          *optimized_graph, fld);
+    }
+    TF_RETURN_IF_ERROR(FunctionalizeControlFlow(optimized_graph.get(), fld));
+    if (VLOG_IS_ON(4)) {
+      dump_graph::DumpGraphToFile(
+          absl::StrCat("functionalize_control_flow_after_fdef_", func_name),
+          *optimized_graph, fld);
+    }
   }
-  FunctionDef functionalized_fdef;
-  TF_RETURN_IF_ERROR(GraphToFunctionDef(*optimized_graph, new_func_name,
-                                        &functionalized_fdef));
 
-  // Add rewritten FunctionDef into library.
-  if (func_name == new_func_name) {
-    VLOG(2) << "Replacing function " << func_name;
-    TF_RETURN_IF_ERROR(
-        fld->ReplaceFunction(new_func_name, functionalized_fdef));
-  } else {
-    VLOG(2) << "Adding function " << new_func_name;
-    TF_RETURN_IF_ERROR(fld->AddFunctionDef(functionalized_fdef));
+  if (*modified) {
+    // Add rewritten FunctionDef into library.
+    FunctionDef functionalized_fdef;
+    TF_RETURN_IF_ERROR(GraphToFunctionDef(*optimized_graph, new_func_name,
+                                          &functionalized_fdef));
+    if (func_name == new_func_name) {
+      VLOG(2) << "Replacing function " << func_name;
+      TF_RETURN_IF_ERROR(
+          fld->ReplaceFunction(new_func_name, functionalized_fdef));
+    } else {
+      VLOG(2) << "Adding function " << new_func_name;
+      TF_RETURN_IF_ERROR(fld->AddFunctionDef(functionalized_fdef));
+    }
   }
 
   return ret_status;
@@ -227,7 +275,7 @@ Status FunctionalizeControlFlowPass::Run(
           {"TPUCompile", "function"},
           {"XlaLaunch", "function"},
       };
-  std::map<string, string> canonicalized_name_to_new_name;
+  std::map<string, absl::optional<string>> canonicalized_name_to_new_name;
   for (Node* n : graph->nodes()) {
     auto it = kNodeTypeToFunctionAttrMapping->find(n->type_string());
     if (it == kNodeTypeToFunctionAttrMapping->end()) {
@@ -242,12 +290,15 @@ Status FunctionalizeControlFlowPass::Run(
               << ". Corresponding function: " << func.name();
       string new_func_name = options.flib_def->UniqueFunctionName(
           absl::StrCat(func.name(), "_f15n_"));
+      bool modified;
       TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction(
           func.name(), new_func_name, func.attr(), options.flib_def, flr,
-          &canonicalized_name_to_new_name));
-      n->ClearAttr(func_attr);
-      func.set_name(new_func_name);
-      n->AddAttr(func_attr, func);
+          &canonicalized_name_to_new_name, &modified));
+      if (modified) {
+        n->ClearAttr(func_attr);
+        func.set_name(new_func_name);
+        n->AddAttr(func_attr, func);
+      }
     }
   }
 
-- 
GitLab


From 022af5300701d457d848e60ea511dd8d05f68738 Mon Sep 17 00:00:00 2001
From: Jared Duke <jdduke@google.com>
Date: Wed, 3 Oct 2018 10:18:59 -0700
Subject: [PATCH 338/570] Fix TfLiteTensor invalidation issue when using the
 Java API

Fix an issue where the Java Tensor class would hold a reference
to an invalidated TfLiteTensor instance. This issue was manifest
in certain models that add temporary tensors during execution.

PiperOrigin-RevId: 215582842
---
 .../lite/NativeInterpreterWrapper.java        | 26 +++++++---
 .../main/java/org/tensorflow/lite/Tensor.java | 27 ++++++++--
 .../native/nativeinterpreterwrapper_jni.cc    | 22 +++-----
 .../native/nativeinterpreterwrapper_jni.h     | 24 ++++-----
 .../lite/java/src/main/native/tensor_jni.cc   | 50 +++++++++++++++----
 .../lite/java/src/main/native/tensor_jni.h    | 17 +++++++
 .../java/org/tensorflow/lite/TensorTest.java  | 13 ++++-
 7 files changed, 129 insertions(+), 50 deletions(-)

diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index 9bc44bf797..6f03e7853a 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -18,7 +18,6 @@ package org.tensorflow.lite;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.MappedByteBuffer;
-import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -83,6 +82,19 @@ final class NativeInterpreterWrapper implements AutoCloseable {
   /** Releases resources associated with this {@code NativeInterpreterWrapper}. */
   @Override
   public void close() {
+    // Close the tensors first as they may reference the native interpreter.
+    for (int i = 0; i < inputTensors.length; ++i) {
+      if (inputTensors[i] != null) {
+        inputTensors[i].close();
+        inputTensors[i] = null;
+      }
+    }
+    for (int i = 0; i < outputTensors.length; ++i) {
+      if (outputTensors[i] != null) {
+        outputTensors[i].close();
+        outputTensors[i] = null;
+      }
+    }
     delete(errorHandle, modelHandle, interpreterHandle);
     errorHandle = 0;
     modelHandle = 0;
@@ -91,8 +103,6 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     inputsIndexes = null;
     outputsIndexes = null;
     isMemoryAllocated = false;
-    Arrays.fill(inputTensors, null);
-    Arrays.fill(outputTensors, null);
   }
 
   /** Sets inputs, runs model inference and returns outputs. */
@@ -260,7 +270,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     Tensor inputTensor = inputTensors[index];
     if (inputTensor == null) {
       inputTensor =
-          inputTensors[index] = Tensor.fromHandle(getInputTensor(interpreterHandle, index));
+          inputTensors[index] =
+              Tensor.fromIndex(interpreterHandle, getInputTensorIndex(interpreterHandle, index));
     }
     return inputTensor;
   }
@@ -282,7 +293,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     Tensor outputTensor = outputTensors[index];
     if (outputTensor == null) {
       outputTensor =
-          outputTensors[index] = Tensor.fromHandle(getOutputTensor(interpreterHandle, index));
+          outputTensors[index] =
+              Tensor.fromIndex(interpreterHandle, getOutputTensorIndex(interpreterHandle, index));
     }
     return outputTensor;
   }
@@ -317,9 +329,9 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private static native long allocateTensors(long interpreterHandle, long errorHandle);
 
-  private static native long getInputTensor(long interpreterHandle, int inputIdx);
+  private static native int getInputTensorIndex(long interpreterHandle, int inputIdx);
 
-  private static native long getOutputTensor(long interpreterHandle, int outputIdx);
+  private static native int getOutputTensorIndex(long interpreterHandle, int outputIdx);
 
   private static native int getInputCount(long interpreterHandle);
 
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
index f174178d98..6ca47aa3ed 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
@@ -23,13 +23,26 @@ import java.util.Arrays;
 /**
  * A typed multi-dimensional array used in Tensorflow Lite.
  *
- * <p>The native handle of a {@code Tensor} belongs to {@code NativeInterpreterWrapper}, thus not
- * needed to be closed here.
+ * <p>The native handle of a {@code Tensor} is managed by {@code NativeInterpreterWrapper}, and does
+ * not needed to be closed by the client. However, once the {@code NativeInterpreterWrapper} has
+ * been closed, the tensor handle will be invalidated.
  */
 public final class Tensor {
 
-  static Tensor fromHandle(long nativeHandle) {
-    return new Tensor(nativeHandle);
+  /**
+   * Creates a Tensor wrapper from the provided interpreter instance and tensor index.
+   *
+   * <p>The caller is responsible for closing the created wrapper, and ensuring the provided
+   * native interpreter is valid until the tensor is closed.
+   */
+  static Tensor fromIndex(long nativeInterpreterHandle, int tensorIndex) {
+    return new Tensor(create(nativeInterpreterHandle, tensorIndex));
+  }
+
+  /** Disposes of any resources used by the Tensor wrapper. */
+  void close() {
+    delete(nativeHandle);
+    nativeHandle = 0;
   }
 
   /** Returns the {@link DataType} of elements stored in the Tensor. */
@@ -235,7 +248,7 @@ public final class Tensor {
     return o instanceof ByteBuffer;
   }
 
-  private final long nativeHandle;
+  private long nativeHandle;
   private final DataType dtype;
   private int[] shapeCopy;
 
@@ -249,6 +262,10 @@ public final class Tensor {
     return buffer(nativeHandle).order(ByteOrder.nativeOrder());
   }
 
+  private static native long create(long interpreterHandle, int tensorIndex);
+
+  private static native void delete(long handle);
+
   private static native ByteBuffer buffer(long handle);
 
   private static native void writeDirectBuffer(long handle, ByteBuffer src);
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index abb7320bc5..4dc73fbcf8 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -159,26 +159,20 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_allocateTensors(
   }
 }
 
-JNIEXPORT jlong JNICALL
-Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputTensor(JNIEnv* env,
-                                                                 jclass clazz,
-                                                                 jlong handle,
-                                                                 jint index) {
+JNIEXPORT jint JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputTensorIndex(
+    JNIEnv* env, jclass clazz, jlong handle, jint input_index) {
   tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
   if (interpreter == nullptr) return 0;
-  return reinterpret_cast<jlong>(
-      interpreter->tensor(interpreter->inputs()[index]));
+  return interpreter->inputs()[input_index];
 }
 
-JNIEXPORT jlong JNICALL
-Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputTensor(JNIEnv* env,
-                                                                  jclass clazz,
-                                                                  jlong handle,
-                                                                  jint index) {
+JNIEXPORT jint JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputTensorIndex(
+    JNIEnv* env, jclass clazz, jlong handle, jint output_index) {
   tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
   if (interpreter == nullptr) return 0;
-  return reinterpret_cast<jlong>(
-      interpreter->tensor(interpreter->outputs()[index]));
+  return interpreter->outputs()[output_index];
 }
 
 JNIEXPORT jint JNICALL
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
index aa809dff8a..f8f3e7028c 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
@@ -46,25 +46,21 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_allocateTensors(
 
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
- *  Method:    getInputTensor
- *  Signature: (JI)J
+ *  Method:    getInputTensorIndex
+ *  Signature: (JI)I
  */
-JNIEXPORT jlong JNICALL
-Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputTensor(JNIEnv* env,
-                                                                 jclass clazz,
-                                                                 jlong handle,
-                                                                 jint index);
+JNIEXPORT jint JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputTensorIndex(
+    JNIEnv* env, jclass clazz, jlong handle, jint input_index);
 
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
- *  Method:    getOutputTensor
- *  Signature: (JI)J
+ *  Method:    getOutputTensorIndex
+ *  Signature: (JI)I
  */
-JNIEXPORT jlong JNICALL
-Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputTensor(JNIEnv* env,
-                                                                  jclass clazz,
-                                                                  jlong handle,
-                                                                  jint index);
+JNIEXPORT jint JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputTensorIndex(
+    JNIEnv* env, jclass clazz, jlong handle, jint output_index);
 
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
index 7ff96a3172..d3378f5f14 100644
--- a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
@@ -16,17 +16,36 @@ limitations under the License.
 #include "tensorflow/contrib/lite/java/src/main/native/tensor_jni.h"
 #include <cstring>
 #include <memory>
+#include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/java/src/main/native/exception_jni.h"
 
 namespace {
 
-TfLiteTensor* convertLongToTensor(JNIEnv* env, jlong handle) {
+// Convenience handle for obtaining a TfLiteTensor given an interpreter and
+// tensor index.
+//
+// Historically, the Java Tensor class used a TfLiteTensor pointer as its native
+// handle. However, this approach isn't generally safe, as the interpreter may
+// invalidate all TfLiteTensor* handles during inference or allocation.
+class TensorHandle {
+ public:
+  TensorHandle(tflite::Interpreter* interpreter, int tensor_index)
+      : interpreter_(interpreter), tensor_index_(tensor_index) {}
+
+  TfLiteTensor* tensor() const { return interpreter_->tensor(tensor_index_); }
+
+ private:
+  tflite::Interpreter* const interpreter_;
+  const int tensor_index_;
+};
+
+TfLiteTensor* GetTensorFromHandle(JNIEnv* env, jlong handle) {
   if (handle == 0) {
     throwException(env, kIllegalArgumentException,
                    "Internal error: Invalid handle to TfLiteTensor.");
     return nullptr;
   }
-  return reinterpret_cast<TfLiteTensor*>(handle);
+  return reinterpret_cast<TensorHandle*>(handle)->tensor();
 }
 
 size_t elementByteSize(TfLiteType data_type) {
@@ -192,10 +211,23 @@ size_t writeMultiDimensionalArray(JNIEnv* env, jobject src, TfLiteType type,
 
 }  // namespace
 
+JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_Tensor_create(
+    JNIEnv* env, jclass clazz, jlong interpreter_handle, jint tensor_index) {
+  tflite::Interpreter* interpreter =
+      reinterpret_cast<tflite::Interpreter*>(interpreter_handle);
+  return reinterpret_cast<jlong>(new TensorHandle(interpreter, tensor_index));
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_lite_Tensor_delete(JNIEnv* env,
+                                                              jclass clazz,
+                                                              jlong handle) {
+  delete reinterpret_cast<TensorHandle*>(handle);
+}
+
 JNIEXPORT jobject JNICALL Java_org_tensorflow_lite_Tensor_buffer(JNIEnv* env,
                                                                  jclass clazz,
                                                                  jlong handle) {
-  TfLiteTensor* tensor = convertLongToTensor(env, handle);
+  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
   if (tensor == nullptr) return nullptr;
   if (tensor->data.raw == nullptr) {
     throwException(env, kIllegalArgumentException,
@@ -208,7 +240,7 @@ JNIEXPORT jobject JNICALL Java_org_tensorflow_lite_Tensor_buffer(JNIEnv* env,
 
 JNIEXPORT void JNICALL Java_org_tensorflow_lite_Tensor_writeDirectBuffer(
     JNIEnv* env, jclass clazz, jlong handle, jobject src) {
-  TfLiteTensor* tensor = convertLongToTensor(env, handle);
+  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
   if (tensor == nullptr) return;
 
   char* src_data_raw = static_cast<char*>(env->GetDirectBufferAddress(src));
@@ -226,7 +258,7 @@ Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env,
                                                           jclass clazz,
                                                           jlong handle,
                                                           jobject value) {
-  TfLiteTensor* tensor = convertLongToTensor(env, handle);
+  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
   if (tensor == nullptr) return;
   int num_dims = tensor->dims->size;
   if (num_dims == 0) {
@@ -243,7 +275,7 @@ Java_org_tensorflow_lite_Tensor_writeMultiDimensionalArray(JNIEnv* env,
                                                            jclass clazz,
                                                            jlong handle,
                                                            jobject src) {
-  TfLiteTensor* tensor = convertLongToTensor(env, handle);
+  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
   if (tensor == nullptr) return;
   if (tensor->data.raw == nullptr) {
     throwException(env, kIllegalArgumentException,
@@ -262,14 +294,14 @@ Java_org_tensorflow_lite_Tensor_writeMultiDimensionalArray(JNIEnv* env,
 JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_dtype(JNIEnv* env,
                                                              jclass clazz,
                                                              jlong handle) {
-  TfLiteTensor* tensor = convertLongToTensor(env, handle);
+  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
   if (tensor == nullptr) return 0;
   return static_cast<jint>(tensor->type);
 }
 
 JNIEXPORT jintArray JNICALL
 Java_org_tensorflow_lite_Tensor_shape(JNIEnv* env, jclass clazz, jlong handle) {
-  TfLiteTensor* tensor = convertLongToTensor(env, handle);
+  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
   if (tensor == nullptr) return nullptr;
   int num_dims = tensor->dims->size;
   jintArray result = env->NewIntArray(num_dims);
@@ -280,7 +312,7 @@ Java_org_tensorflow_lite_Tensor_shape(JNIEnv* env, jclass clazz, jlong handle) {
 JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_numBytes(JNIEnv* env,
                                                                 jclass clazz,
                                                                 jlong handle) {
-  const TfLiteTensor* tensor = convertLongToTensor(env, handle);
+  const TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
   if (tensor == nullptr) return 0;
   return static_cast<jint>(tensor->bytes);
 }
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h
index 2f73128bdf..c5e9690e9a 100644
--- a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h
@@ -23,6 +23,23 @@ limitations under the License.
 extern "C" {
 #endif  // __cplusplus
 
+/*
+ * Class:     org_tensorflow_lite_Tensor
+ * Method:    create
+ * Signature: (JI)J
+ */
+JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_Tensor_create(
+    JNIEnv* env, jclass clazz, jlong interpreter_handle, jint tensor_index);
+
+/*
+ * Class:     org_tensorflow_lite_Tensor
+ * Method:    delete
+ * Signature: (J)
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_lite_Tensor_delete(JNIEnv* env,
+                                                              jclass clazz,
+                                                              jlong handle);
+
 /*
  * Class:     org_tensorflow_lite_Tensor
  * Method:    buffer
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
index 85ad393d89..56a38ea3e2 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
@@ -182,7 +182,7 @@ public final class TensorTest {
     dataType = Tensor.dataTypeOf(testFloatArray);
     assertThat(dataType).isEqualTo(DataType.FLOAT32);
     float[][] testMultiDimArray = {testFloatArray, testFloatArray, testFloatArray};
-    dataType = Tensor.dataTypeOf(testFloatArray);
+    dataType = Tensor.dataTypeOf(testMultiDimArray);
     assertThat(dataType).isEqualTo(DataType.FLOAT32);
     try {
       double[] testDoubleArray = {0.783, 0.251};
@@ -238,4 +238,15 @@ public final class TensorTest {
     assertThat(shape[1]).isEqualTo(3);
     assertThat(shape[2]).isEqualTo(1);
   }
+
+  @Test
+  public void testUseAfterClose() {
+    tensor.close();
+    try {
+      tensor.numBytes();
+      fail();
+    } catch (IllegalArgumentException e) {
+      // Expected failure.
+    }
+  }
 }
-- 
GitLab


From a5b3cd8b4d28cfcdcb9adb3d3568b168b9b8a088 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 10:19:55 -0700
Subject: [PATCH 339/570] Fix bug in shape function for transpose: If the rank
 of the input is unknown and the rank derived from the permutation array is 0
 or 1, the shape is ambiguous and cannot be determined at graph construction
 time. In this case, forward the shape of the input.

PiperOrigin-RevId: 215583050
---
 tensorflow/core/ops/array_ops.cc              |  8 +++++
 tensorflow/core/ops/array_ops_test.cc         |  1 +
 tensorflow/python/kernel_tests/BUILD          |  2 +-
 .../python/kernel_tests/transpose_op_test.py  | 29 +++++++++++++++++--
 4 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index c9f80df5e4..f55562ec99 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -133,6 +133,14 @@ Status TransposeShapeFn(InferenceContext* c) {
   } else {
     rank = perm->NumElements();
   }
+  if (!c->RankKnown(input) && rank < 2) {
+    // A permutation array containing a single element is ambiguous. It could
+    // indicate either a scalar or a 1-dimensional array, both of which the
+    // transpose op returns unchanged.
+    c->set_output(0, input);
+    return Status::OK();
+  }
+
   std::vector<DimensionHandle> dims;
   dims.resize(rank);
   TF_RETURN_IF_ERROR(c->WithRank(input, rank, &input));
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index 03dab390a7..1c29cd2491 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -975,6 +975,7 @@ TEST(ArrayOpsTest, Transpose_ShapeFn) {
   INFER_OK(op, "?;[2]", "[?,?]");
   INFER_OK(op, "[?,?];[2]", "[d0_1,d0_0]");
   INFER_OK(op, "[1,?];[2]", "[d0_1,d0_0]");
+  INFER_OK(op, "?;[0]", "in0");
 
   // Invalid arguments.
   perm = test::AsTensor<int32>({1, 2});
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 44575fc452..c0e9a3c975 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2367,7 +2367,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
-    shard_count = 4,
+    shard_count = 10,
     tags = [
         "no_gpu",
         "no_oss",
diff --git a/tensorflow/python/kernel_tests/transpose_op_test.py b/tensorflow/python/kernel_tests/transpose_op_test.py
index f42800226e..a825052dd2 100644
--- a/tensorflow/python/kernel_tests/transpose_op_test.py
+++ b/tensorflow/python/kernel_tests/transpose_op_test.py
@@ -39,7 +39,12 @@ class TransposeTest(test.TestCase):
     return ret
 
   def _compareCpu(self, x, p, conjugate=False):
-    np_ans = self._np_transpose(x, p)
+    if p is None:
+      rank = x.ndim
+      perm = (rank - 1) - np.arange(rank)
+    else:
+      perm = p
+    np_ans = self._np_transpose(x, perm)
     if conjugate:
       np_ans = np.conj(np_ans)
     with self.test_session(use_gpu=False):
@@ -65,7 +70,12 @@ class TransposeTest(test.TestCase):
       return tf_ans, jacob_t
 
   def _compareGpu(self, x, p, conjugate=False):
-    np_ans = self._np_transpose(x, p)
+    if p is None:
+      rank = x.ndim
+      perm = (rank - 1) - np.arange(rank)
+    else:
+      perm = p
+    np_ans = self._np_transpose(x, perm)
     if conjugate:
       np_ans = np.conj(np_ans)
     with self.test_session(use_gpu=True):
@@ -102,6 +112,11 @@ class TransposeTest(test.TestCase):
         self._compareCpu(x, p, conjugate=c)
         if use_gpu:
           self._compareGpu(x, p, conjugate=c)
+    # Test with an empty permutation
+    for c in cs:
+      self._compareCpu(x, None, conjugate=c)
+      if use_gpu:
+        self._compareGpu(x, None, conjugate=c)
 
   def _compare_cpu_gpu(self, x):
     n = np.ndim(x)
@@ -449,6 +464,10 @@ class TransposeTest(test.TestCase):
     self.assertEqual(
         tensor_shape.TensorShape(None),
         array_ops.transpose(array_ops.placeholder(dtypes.int32)).get_shape())
+    self.assertEqual(
+        tensor_shape.TensorShape(None),
+        array_ops.transpose(array_ops.placeholder(dtypes.int32),
+                            [0]).get_shape())
 
   def testNullTensor(self):
     with self.cached_session():
@@ -456,6 +475,12 @@ class TransposeTest(test.TestCase):
       xt = array_ops.transpose(x, [0, 2, 1]).eval()
       self.assertAllEqual(xt.shape, (1, 0, 4))
 
+  def testScalar(self):
+    with self.cached_session():
+      x = constant_op.constant(42, dtype=dtypes.float32, shape=[])
+      xt = array_ops.transpose(x).eval()
+      self.assertAllEqual(xt, x)
+
   def _testError(self, x, p, err):
     with self.cached_session():
       with self.assertRaisesOpError(err):
-- 
GitLab


From 0f9baa02a4e32b672b0cc29e99d5bfcf1329988c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 10:26:41 -0700
Subject: [PATCH 340/570] Re-enable the arithmetic optimizer by default in
 tests. Add a warning to not disable optimizers without consulting with the
 Grappler team.

PiperOrigin-RevId: 215584369
---
 tensorflow/python/framework/test_util.py                    | 6 ++++--
 .../python/kernel_tests/distributions/laplace_test.py       | 4 ++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 6673bc5561..4ec4b41b5e 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1992,10 +1992,12 @@ class TensorFlowTestCase(googletest.TestCase):
       # Don't perform optimizations for tests so we don't inadvertently run
       # gpu ops on cpu
       config.graph_options.optimizer_options.opt_level = -1
+      # Disable Grappler constant folding since some tests & benchmarks
+      # use constant input and become meaningless after constant folding.
+      # DO NOT DISABLE GRAPPLER OPTIMIZERS WITHOUT CONSULTING WITH THE
+      # GRAPPLER TEAM.
       config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
-      config.graph_options.rewrite_options.arithmetic_optimization = (
-          rewriter_config_pb2.RewriterConfig.OFF)
       config.graph_options.rewrite_options.pin_to_host_optimization = (
           rewriter_config_pb2.RewriterConfig.OFF)
       return config
diff --git a/tensorflow/python/kernel_tests/distributions/laplace_test.py b/tensorflow/python/kernel_tests/distributions/laplace_test.py
index 630c2cb424..2610ba23b8 100644
--- a/tensorflow/python/kernel_tests/distributions/laplace_test.py
+++ b/tensorflow/python/kernel_tests/distributions/laplace_test.py
@@ -275,8 +275,8 @@ class LaplaceTest(test.TestCase):
     self.assertAllClose(
         sample_values.var(axis=0),
         stats.laplace.var(loc_bc, scale=scale_bc),
-        rtol=0.10,
-        atol=0.)
+        rtol=0.105,
+        atol=0.0)
     fails = 0
     trials = 0
     for ai, a in enumerate(np.reshape(loc_v, [-1])):
-- 
GitLab


From 26ce26d127587bc1f5dc7950e22f7d935d372abf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 10:31:02 -0700
Subject: [PATCH 341/570] Re-add proto fields temporarily for internal
 compatibility.

PiperOrigin-RevId: 215585187
---
 .../tpu/proto/optimization_parameters.proto     | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
index b9e0747fa4..8529b48c15 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
@@ -28,6 +28,7 @@ message LearningRate {
 // https://www.tensorflow.org/api_docs/python/tf/train/AdagradOptimizer
 // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L151
 message AdagradParameters {
+  float initial_accumulator = 1;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer
@@ -41,6 +42,8 @@ message FtrlParameters {
   float l1 = 1;
   float l2 = 2;
   float lr_power = 3;
+  float initial_accum = 4;
+  float initial_linear = 5;
 }
 
 // The Adam optimizer does not implement hyper-parameter update; use the dynamic
@@ -67,6 +70,8 @@ message AdamParameters {
   float beta1 = 3;
   float beta2 = 4;
   float epsilon = 5;
+  float initial_m = 6;
+  float initial_v = 7;
   bool use_non_lazy_adam = 8;
   bool use_max_with_epsilon = 9;
 }
@@ -76,6 +81,7 @@ message AdamParameters {
 message MomentumParameters {
   float momentum = 1;
   bool use_nesterov = 2;
+  float initial_accum = 3;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
@@ -84,6 +90,8 @@ message RmsPropParameters {
   float rho = 1;
   float momentum = 2;
   float epsilon = 3;
+  float initial_ms = 4;
+  float initial_mom = 5;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
@@ -92,6 +100,9 @@ message CenteredRmsPropParameters {
   float rho = 1;
   float momentum = 2;
   float epsilon = 3;
+  float initial_ms = 4;
+  float initial_mom = 5;
+  float initial_mg = 6;
 }
 
 // Variant of algorithm in http://proceedings.mlr.press/v44/shamir15.pdf
@@ -108,6 +119,9 @@ message MdlAdagradLightParameters {
   float mdl_hard_limit = 10;
   bool hard_limit_min_benefit = 11;
   bool mdl_regularize = 12;
+  float initial_accumulator = 13;
+  float initial_weight = 14;
+  float initial_benefit = 15;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
@@ -115,6 +129,8 @@ message MdlAdagradLightParameters {
 message AdadeltaParameters {
   float rho = 1;
   float epsilon = 2;
+  float initial_accumulator = 3;
+  float initial_update = 4;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
@@ -122,6 +138,7 @@ message AdadeltaParameters {
 message ProximalAdagradParameters {
   float l1 = 1;
   float l2 = 2;
+  float initial_accumulator = 3;
 }
 
 message OptimizationParameters {
-- 
GitLab


From af1458a9c1a3bc8d49a1e55386950b4941ab1815 Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Wed, 3 Oct 2018 10:39:07 -0700
Subject: [PATCH 342/570] Fix filename/line number lookup for logging.

Log messages now show the correct file/function name/line number instead of that of the helper function.

PiperOrigin-RevId: 215586852
---
 tensorflow/python/platform/tf_logging.py | 58 ++++++++++++++++++------
 1 file changed, 45 insertions(+), 13 deletions(-)

diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 5962d2f220..59e60856ae 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -25,6 +25,7 @@ import logging as _logging
 import os as _os
 import sys as _sys
 import time as _time
+import traceback as _traceback
 from logging import DEBUG
 from logging import ERROR
 from logging import FATAL
@@ -36,13 +37,49 @@ import six
 
 from tensorflow.python.util.tf_export import tf_export
 
-
 # Don't use this directly. Use _get_logger() instead.
 _logger = None
 _logger_lock = threading.Lock()
 
 
+def _get_caller(offset=3):
+  """Returns a code and frame object for the lowest non-logging stack frame."""
+  # Use sys._getframe().  This avoids creating a traceback object.
+  # pylint: disable=protected-access
+  f = _sys._getframe(offset)
+  # pylint: enable=protected-access
+  our_file = f.f_code.co_filename
+  f = f.f_back
+  while f:
+    code = f.f_code
+    if code.co_filename != our_file:
+      return code, f
+    f = f.f_back
+  return None, None
+
+
+# The definition of `findCaller` changed in Python 3.2
+if _sys.version_info.major >= 3 and _sys.version_info.minor >= 2:
+  def _logger_find_caller(stack_info=False):  # pylint: disable=g-wrong-blank-lines
+    code, frame = _get_caller(4)
+    sinfo = None
+    if stack_info:
+      sinfo = '\n'.join(_traceback.format_stack())
+    if code:
+      return (code.co_filename, frame.f_lineno, code.co_name, sinfo)
+    else:
+      return '(unknown file)', 0, '(unknown function)', sinfo
+else:
+  def _logger_find_caller():  # pylint: disable=g-wrong-blank-lines
+    code, frame = _get_caller(4)
+    if code:
+      return (code.co_filename, frame.f_lineno, code.co_name)
+    else:
+      return '(unknown file)', 0, '(unknown function)'
+
+
 def _get_logger():
+  """Return TF logger instance."""
   global _logger
 
   # Use double-checked locking to avoid taking lock unnecessarily.
@@ -58,6 +95,9 @@ def _get_logger():
     # Scope the TensorFlow logger to not conflict with users' loggers.
     logger = _logging.getLogger('tensorflow')
 
+    # Override findCaller on the logger to skip internal helper functions
+    logger.findCaller = _logger_find_caller
+
     # Don't further configure the TensorFlow logger if the root logger is
     # already configured. This prevents double logging in those cases.
     if not _logging.getLogger().handlers:
@@ -216,18 +256,10 @@ def log_if(level, msg, condition, *args):
 
 def _GetFileAndLine():
   """Returns (filename, linenumber) for the stack frame."""
-  # Use sys._getframe().  This avoids creating a traceback object.
-  # pylint: disable=protected-access
-  f = _sys._getframe()
-  # pylint: enable=protected-access
-  our_file = f.f_code.co_filename
-  f = f.f_back
-  while f:
-    code = f.f_code
-    if code.co_filename != our_file:
-      return (code.co_filename, f.f_lineno)
-    f = f.f_back
-  return ('<unknown>', 0)
+  code, f = _get_caller()
+  if not code:
+    return ('<unknown>', 0)
+  return (code.co_filename, f.f_lineno)
 
 
 def google2_log_prefix(level, timestamp=None, file_and_line=None):
-- 
GitLab


From 560624bff65b7b502da2c52f9b250d9181c4a3f7 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Wed, 3 Oct 2018 10:51:17 -0700
Subject: [PATCH 343/570] Internal change.

PiperOrigin-RevId: 215589009
---
 tensorflow/contrib/lite/python/interpreter.py | 17 ++++
 .../interpreter_wrapper.cc                    | 19 ++++-
 .../interpreter_wrapper/interpreter_wrapper.h |  1 +
 .../model_coverage/model_coverage_lib.py      | 81 +++++++++++++++++--
 .../model_coverage/model_coverage_lib_test.py | 38 +++++++++
 5 files changed, 147 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py
index 5700bf7892..6300552cbe 100644
--- a/tensorflow/contrib/lite/python/interpreter.py
+++ b/tensorflow/contrib/lite/python/interpreter.py
@@ -129,6 +129,23 @@ class Interpreter(object):
 
     return details
 
+  def get_tensor_details(self):
+    """Gets tensor details for every tensor with valid tensor details.
+
+    Tensors where required information about the tensor is not found are not
+    added to the list. This includes temporary tensors without a name.
+
+    Returns:
+      A list of dictionaries containing tensor information.
+    """
+    tensor_details = []
+    for idx in range(self._interpreter.NumTensors()):
+      try:
+        tensor_details.append(self._get_tensor_details(idx))
+      except ValueError:
+        pass
+    return tensor_details
+
   def get_input_details(self):
     """Gets model input details.
 
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index 418f19a179..1e2384b6d2 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -277,13 +277,20 @@ PyObject* InterpreterWrapper::ResizeInputTensor(int i, PyObject* value) {
   Py_RETURN_NONE;
 }
 
+int InterpreterWrapper::NumTensors() const {
+  if (!interpreter_) {
+    return 0;
+  }
+  return interpreter_->tensors_size();
+}
+
 std::string InterpreterWrapper::TensorName(int i) const {
   if (!interpreter_ || i >= interpreter_->tensors_size() || i < 0) {
     return "";
   }
 
   const TfLiteTensor* tensor = interpreter_->tensor(i);
-  return tensor->name;
+  return tensor->name ? tensor->name : "";
 }
 
 PyObject* InterpreterWrapper::TensorType(int i) const {
@@ -291,6 +298,11 @@ PyObject* InterpreterWrapper::TensorType(int i) const {
   TFLITE_PY_TENSOR_BOUNDS_CHECK(i);
 
   const TfLiteTensor* tensor = interpreter_->tensor(i);
+  if (tensor->type == kTfLiteNoType) {
+    PyErr_Format(PyExc_ValueError, "Tensor with no type found.");
+    return nullptr;
+  }
+
   int code = TfLiteTypeToPyArrayType(tensor->type);
   if (code == -1) {
     PyErr_Format(PyExc_ValueError, "Invalid tflite type code %d", code);
@@ -302,7 +314,12 @@ PyObject* InterpreterWrapper::TensorType(int i) const {
 PyObject* InterpreterWrapper::TensorSize(int i) const {
   TFLITE_PY_ENSURE_VALID_INTERPRETER();
   TFLITE_PY_TENSOR_BOUNDS_CHECK(i);
+
   const TfLiteTensor* tensor = interpreter_->tensor(i);
+  if (tensor->dims == nullptr) {
+    PyErr_Format(PyExc_ValueError, "Tensor with no shape found.");
+    return nullptr;
+  }
   PyObject* np_array =
       PyArrayFromIntVector(tensor->dims->data, tensor->dims->size);
 
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
index f5ca81e62a..b98046fe8a 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
@@ -59,6 +59,7 @@ class InterpreterWrapper {
   PyObject* OutputIndices() const;
   PyObject* ResizeInputTensor(int i, PyObject* value);
 
+  int NumTensors() const;
   std::string TensorName(int i) const;
   PyObject* TensorType(int i) const;
   PyObject* TensorSize(int i) const;
diff --git a/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib.py b/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib.py
index 5ca57d083d..72029ed03c 100644
--- a/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib.py
+++ b/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib.py
@@ -35,9 +35,9 @@ def _convert(converter, **kwargs):
   """Converts the model.
 
   Args:
-    converter: TocoConverter object.
+    converter: TFLiteConverter object.
     **kwargs: Additional arguments to be passed into the converter. Supported
-      flags are {"converter_mode", "post_training_quant"}.
+      flags are {"converter_mode", "post_training_quantize"}.
 
   Returns:
     The converted TFLite model in serialized format.
@@ -174,7 +174,7 @@ def compare_models_random_data(tflite_model, tf_eval_func, tolerance=5):
     tflite_model: Serialized TensorFlow Lite model.
     tf_eval_func: Lambda function that takes in input data and outputs the
       results of the TensorFlow model ([np.ndarray data] : [np.ndarray result]).
-    tolerance: Decimal place to check accuracy to.
+    tolerance: Decimal place to check accuracy to. (default 5)
   """
   input_data = _generate_random_input_data(tflite_model)
   tf_results = tf_eval_func(input_data)
@@ -183,6 +183,71 @@ def compare_models_random_data(tflite_model, tf_eval_func, tolerance=5):
     np.testing.assert_almost_equal(tf_result, tflite_result, tolerance)
 
 
+def test_frozen_graph_quant(filename,
+                            input_arrays,
+                            output_arrays,
+                            input_shapes=None,
+                            **kwargs):
+  """Sanity check to validate post quantize flag alters the graph.
+
+  This test does not check correctness of the converted model. It converts the
+  TensorFlow frozen graph to TFLite with and without the post_training_quantized
+  flag. It ensures some tensors have different types between the float and
+  quantized models in the case of an all TFLite model or mix-and-match model.
+  It ensures tensor types do not change in the case of an all Flex model.
+
+  Args:
+    filename: Full filepath of file containing frozen GraphDef.
+    input_arrays: List of input tensors to freeze graph with.
+    output_arrays: List of output tensors to freeze graph with.
+    input_shapes: Dict of strings representing input tensor names to list of
+      integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
+      Automatically determined when input shapes is None (e.g., {"foo" : None}).
+        (default None)
+    **kwargs: Additional arguments to be passed into the converter.
+
+  Raises:
+    ValueError: post_training_quantize flag doesn't act as intended.
+  """
+  # Convert and load the float model.
+  converter = _lite.TFLiteConverter.from_frozen_graph(
+      filename, input_arrays, output_arrays, input_shapes)
+  tflite_model_float = _convert(converter, **kwargs)
+
+  interpreter_float = _lite.Interpreter(model_content=tflite_model_float)
+  interpreter_float.allocate_tensors()
+  float_tensors = interpreter_float.get_tensor_details()
+
+  # Convert and load the quantized model.
+  converter = _lite.TFLiteConverter.from_frozen_graph(filename, input_arrays,
+                                                      output_arrays)
+  tflite_model_quant = _convert(
+      converter, post_training_quantize=True, **kwargs)
+
+  interpreter_quant = _lite.Interpreter(model_content=tflite_model_quant)
+  interpreter_quant.allocate_tensors()
+  quant_tensors = interpreter_quant.get_tensor_details()
+  quant_tensors_map = {
+      tensor_detail["name"]: tensor_detail for tensor_detail in quant_tensors
+  }
+
+  # Check if weights are of different types in the float and quantized models.
+  num_tensors_float = len(float_tensors)
+  num_tensors_same_dtypes = sum(
+      float_tensor["dtype"] == quant_tensors_map[float_tensor["name"]]["dtype"]
+      for float_tensor in float_tensors)
+  has_quant_tensor = num_tensors_float != num_tensors_same_dtypes
+
+  if ("converter_mode" in kwargs and
+      kwargs["converter_mode"] == _lite.ConverterMode.TOCO_FLEX_ALL):
+    if has_quant_tensor:
+      raise ValueError("--post_training_quantize flag unexpectedly altered the "
+                       "full Flex mode graph.")
+  elif not has_quant_tensor:
+    raise ValueError("--post_training_quantize flag was unable to quantize the "
+                     "graph as expected in TFLite and mix-and-match mode.")
+
+
 def test_frozen_graph(filename,
                       input_arrays,
                       output_arrays,
@@ -203,8 +268,8 @@ def test_frozen_graph(filename,
         (default None)
     **kwargs: Additional arguments to be passed into the converter.
   """
-  converter = _lite.TocoConverter.from_frozen_graph(filename, input_arrays,
-                                                    output_arrays, input_shapes)
+  converter = _lite.TFLiteConverter.from_frozen_graph(
+      filename, input_arrays, output_arrays, input_shapes)
   tflite_model = _convert(converter, **kwargs)
 
   tf_eval_func = evaluate_frozen_graph(filename, input_arrays, output_arrays)
@@ -224,8 +289,8 @@ def test_saved_model(directory, tag_set=None, signature_key=None, **kwargs):
     signature_key: Key identifying SignatureDef containing inputs and outputs.
     **kwargs: Additional arguments to be passed into the converter.
   """
-  converter = _lite.TocoConverter.from_saved_model(directory, tag_set,
-                                                   signature_key)
+  converter = _lite.TFLiteConverter.from_saved_model(directory, tag_set,
+                                                     signature_key)
   tflite_model = _convert(converter, **kwargs)
 
   tf_eval_func = evaluate_saved_model(directory, tag_set, signature_key)
@@ -242,7 +307,7 @@ def test_keras_model(filename, **kwargs):
     filename: Full filepath of HDF5 file containing the tf.keras model.
     **kwargs: Additional arguments to be passed into the converter.
   """
-  converter = _lite.TocoConverter.from_keras_model_file(filename)
+  converter = _lite.TFLiteConverter.from_keras_model_file(filename)
   tflite_model = _convert(converter, **kwargs)
 
   tf_eval_func = evaluate_keras_model(filename)
diff --git a/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib_test.py b/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib_test.py
index 1498f86c6f..e07202b1a6 100644
--- a/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib_test.py
+++ b/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import os
 import tempfile
+import numpy as np
 
 from tensorflow.contrib.lite.python import lite
 from tensorflow.contrib.lite.testing.model_coverage import model_coverage_lib as model_coverage
@@ -66,6 +67,43 @@ class EvaluateFrozenGraph(test.TestCase):
     model_coverage.test_frozen_graph(filename, ['inputA', 'inputB'],
                                      ['add', 'Mean'])
 
+  def _getQuantizedModel(self):
+    np.random.seed(0)
+    with session.Session().as_default() as sess:
+      # The tensor needs to have more than 1024 elements for quantize_weights to
+      # kick in. Thus, the [33, 33] shape.
+      in_tensor_1 = array_ops.placeholder(
+          shape=[33, 33], dtype=dtypes.float32, name='inputA')
+      in_tensor_2 = constant_op.constant(
+          np.random.uniform(low=-10., high=10., size=(33, 33)),
+          shape=[33, 33],
+          dtype=dtypes.float32,
+          name='inputB')
+      _ = math_ops.matmul(in_tensor_1, in_tensor_2, name='output')
+
+    filename = self._saveFrozenGraph(sess)
+    return filename
+
+  def testQuantized(self):
+    filename = self._getQuantizedModel()
+    model_coverage.test_frozen_graph_quant(filename, ['inputA', 'inputB'],
+                                           ['output'])
+
+  def testQuantizedInputShapes(self):
+    filename = self._getQuantizedModel()
+    model_coverage.test_frozen_graph_quant(
+        filename, ['inputA', 'inputB'], ['output'],
+        input_shapes={
+            'inputA': [33, 33],
+            'inputB': [33, 33],
+        })
+
+  def testQuantizedFlexAll(self):
+    filename = self._getQuantizedModel()
+    model_coverage.test_frozen_graph_quant(
+        filename, ['inputA', 'inputB'], ['output'],
+        converter_mode=lite.ConverterMode.TOCO_FLEX_ALL)
+
 
 class EvaluateSavedModel(test.TestCase):
 
-- 
GitLab


From 0796d711f17c8c981d19461c9edd0e16837c8ab7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 10:51:56 -0700
Subject: [PATCH 344/570] Update _check_shape to accept six.integer_types
 instead of int

Currently _check_shape requires that a shape be an `int` or sequence of `int`s.  This CL allows `six.integer_type`s so now (1L,) would be a valid shape.

PiperOrigin-RevId: 215589131
---
 tensorflow/python/feature_column/feature_column.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 618e70f3a5..5352796174 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -2829,7 +2829,7 @@ def _check_shape(shape, key):
     shape = [shape]
   shape = tuple(shape)
   for dimension in shape:
-    if not isinstance(dimension, int):
+    if not isinstance(dimension, six.integer_types):
       raise TypeError('shape dimensions must be integer. '
                       'shape: {}, key: {}'.format(shape, key))
     if dimension < 1:
-- 
GitLab


From b25ef3877da28b7ec31d0bd69a7a6268f5e8a4b4 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Wed, 3 Oct 2018 10:58:53 -0700
Subject: [PATCH 345/570] Add a new GetRunFilesDir function to Env.

PiperOrigin-RevId: 215590440
---
 tensorflow/core/platform/env.h          |  6 ++++++
 tensorflow/core/platform/posix/env.cc   | 11 +++++++++++
 tensorflow/core/platform/windows/env.cc | 11 +++++++++++
 3 files changed, 28 insertions(+)

diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 5b237c4736..5732271f15 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -228,6 +228,10 @@ class Env {
   /// |suffix|. Returns true if success.
   bool CreateUniqueFileName(string* prefix, const string& suffix);
 
+  /// \brief Return the runfiles directory if running under bazel. Returns
+  /// the directory the executable is located in if not running under bazel.
+  virtual string GetRunfilesDir() = 0;
+
   // TODO(jeff,sanjay): Add back thread/thread-pool support if needed.
   // TODO(jeff,sanjay): if needed, tighten spec so relative to epoch, or
   // provide a routine to get the absolute time.
@@ -360,6 +364,8 @@ class EnvWrapper : public Env {
     return target_->FormatLibraryFileName(name, version);
   }
 
+  string GetRunfilesDir() override { return target_->GetRunfilesDir(); }
+
  private:
   void GetLocalTempDirectories(std::vector<string>* list) override {
     target_->GetLocalTempDirectories(list);
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index 418874d340..af95d8201e 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -119,6 +119,17 @@ class PosixEnv : public Env {
     return tensorflow::internal::FormatLibraryFileName(name, version);
   }
 
+  string GetRunfilesDir() override {
+    string bin_path = this->GetExecutablePath();
+    string runfiles_path = bin_path + ".runfiles/org_tensorflow";
+    Status s = this->IsDirectory(runfiles_path);
+    if (!s.ok()) {
+      return runfiles_path;
+    } else {
+      return bin_path.substr(0, bin_path.find_last_of("/\\"));
+    }
+  }
+
  private:
   void GetLocalTempDirectories(std::vector<string>* list) override;
 };
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index 68ee3595a2..f26ccd1662 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -160,6 +160,17 @@ class WindowsEnv : public Env {
     return filename;
   }
 
+  string GetRunfilesDir() override {
+    string bin_path = this->GetExecutablePath();
+    string runfiles_path = bin_path + ".runfiles\\org_tensorflow";
+    Status s = this->IsDirectory(runfiles_path);
+    if (!s.ok()) {
+      return runfiles_path;
+    } else {
+      return bin_path.substr(0, bin_path.find_last_of("/\\"));
+    }
+  }
+
  private:
   void GetLocalTempDirectories(std::vector<string>* list) override;
 
-- 
GitLab


From 55ea7f89ee6aa45c5a7623ac9ba671044467e807 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 11:00:21 -0700
Subject: [PATCH 346/570] Supports TPUEstimatorSpec in multi_head for TRAIN and
 PREDICT modes.

PiperOrigin-RevId: 215590676
---
 .../estimator/python/estimator/multi_head.py  | 67 ++++++++++++-----
 .../python/estimator/multi_head_test.py       | 75 ++++++++++++++++---
 2 files changed, 111 insertions(+), 31 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head.py b/tensorflow/contrib/estimator/python/estimator/multi_head.py
index ce75899214..6e793c8302 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head.py
@@ -233,6 +233,22 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
       self, features, mode, logits, labels=None, optimizer=None,
       train_op_fn=None):
     """See `_Head`."""
+    return self._create_estimator_spec(
+        features=features, mode=mode, logits=logits, labels=labels,
+        optimizer=optimizer, train_op_fn=train_op_fn, use_tpu=False)
+
+  def _create_tpu_estimator_spec(
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None):
+    """See `_Head`."""
+    return self._create_estimator_spec(
+        features=features, mode=mode, logits=logits, labels=labels,
+        optimizer=optimizer, train_op_fn=train_op_fn, use_tpu=True)
+
+  def _create_estimator_spec(
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None, use_tpu=False):
+    """Returns `EstimatorSpec` or `TPUEstimatorSpec`."""
     if isinstance(logits, dict):
       logits_dict = logits
     else:
@@ -255,14 +271,15 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
       spec = self._merge_train(
           all_estimator_spec=all_estimator_spec,
           optimizer=optimizer,
-          train_op_fn=train_op_fn)
+          train_op_fn=train_op_fn,
+          use_tpu=use_tpu)
       with ops.name_scope(''):
         summary.scalar(metric_keys.MetricKeys.LOSS, spec.loss)
       return spec
     if mode == model_fn.ModeKeys.PREDICT:
-      return self._merge_predict(all_estimator_spec)
+      return self._merge_predict(all_estimator_spec, use_tpu=use_tpu)
     if mode == model_fn.ModeKeys.EVAL:
-      return self._merge_eval(all_estimator_spec)
+      return self._merge_eval(all_estimator_spec, use_tpu=use_tpu)
     raise ValueError('mode={} unrecognized'.format(mode))
 
   def _split_logits(self, logits):
@@ -284,28 +301,28 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
         begin_idx += head.logits_dimension
     return logits_dict
 
-  def _merge_train(self, all_estimator_spec, optimizer, train_op_fn):
-    """Merges list of `EstimatorSpec` for training.
+  def _merge_train(
+      self, all_estimator_spec, optimizer, train_op_fn, use_tpu=False):
+    """Merges list of `EstimatorSpec` or `TPUEstimatorSpec` for training.
 
     Args:
-      all_estimator_spec: list of `EstimatorSpec` for the individual heads.
+      all_estimator_spec: list of `EstimatorSpec` or `TPUEstimatorSpec` for the
+        individual heads.
       optimizer: `Optimizer` instance to create train op. See
         `create_estimator_spec` documentation for more details.
       train_op_fn: Function to create train op. Used if `optimizer` is `None`.
+      use_tpu: If `True`, returns `TPUEstimatorSpec`.
 
     Returns:
-      `EstimatorSpec` that merges all heads for TRAIN.
+      `EstimatorSpec` or `TPUEstimatorSpec` that merges all heads for TRAIN.
 
     Raises:
       ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
         mode.
     """
     losses = []
-    metrics = {}
     for spec in all_estimator_spec:
       losses.append(spec.loss)
-      # Metric keys already contain head.name.
-      metrics.update(spec.eval_metric_ops or {})
     loss = _merge_losses(losses, self._head_weights)
     if optimizer is not None:
       if train_op_fn is not None:
@@ -317,20 +334,23 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
     else:
       raise ValueError('train_op_fn and optimizer cannot both be None.')
 
-    return model_fn.EstimatorSpec(
+    spec_type = (
+        model_fn._TPUEstimatorSpec if use_tpu else model_fn.EstimatorSpec)  # pylint:disable=protected-access
+    return spec_type(
         mode=model_fn.ModeKeys.TRAIN,
         loss=loss,
-        train_op=train_op,
-        eval_metric_ops=metrics)
+        train_op=train_op)
 
-  def _merge_predict(self, all_estimator_spec):
-    """Merges list of `EstimatorSpec` for prediction.
+  def _merge_predict(self, all_estimator_spec, use_tpu=False):
+    """Merges list of `EstimatorSpec` or `TPUEstimatorSpec` for prediction.
 
     Args:
-      all_estimator_spec: list of `EstimatorSpec` for the individual heads.
+      all_estimator_spec: list of `EstimatorSpec` or `TPUEstimatorSpec` for the
+        individual heads.
+      use_tpu: If `True`, returns `TPUEstimatorSpec`.
 
     Returns:
-      `EstimatorSpec` that merges all heads for PREDICT.
+      `EstimatorSpec` or `TPUEstimatorSpec` that merges all heads for PREDICT.
     """
     predictions = {}
     export_outputs = {
@@ -357,20 +377,29 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
     export_outputs[head_lib._PREDICT_SERVING_KEY] = (  # pylint:disable=protected-access
         export_output_lib.PredictOutput(merged_predict_outputs))
 
-    return model_fn.EstimatorSpec(
+    spec_type = (
+        model_fn._TPUEstimatorSpec if use_tpu else model_fn.EstimatorSpec)  # pylint:disable=protected-access
+    return spec_type(
         mode=model_fn.ModeKeys.PREDICT,
         predictions=predictions,
         export_outputs=export_outputs)
 
-  def _merge_eval(self, all_estimator_spec):
+  def _merge_eval(self, all_estimator_spec, use_tpu=False):
     """Merges list of `EstimatorSpec` for eval.
 
     Args:
       all_estimator_spec: list of `EstimatorSpec` for the individual heads.
+      use_tpu: If `True`, will raise `NotImplementedError`, because TPU is not
+        yet supported for eval.
 
     Returns:
       `EstimatorSpec` that merges all heads for EVAL.
+    Raises:
+      NotImplementedError: If `use_tpu` is `True`.
     """
+    if use_tpu:
+      raise NotImplementedError(
+          'TPU evaluation is not implemented for multi_head.')
     predictions = {}
     metrics = {}
     losses = []
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
index 2b4d5f5261..a602f87b4a 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
@@ -106,7 +106,7 @@ class MultiHeadTest(test.TestCase):
     multi_head = multi_head_lib.multi_head([head1, head2])
     self.assertEqual('head1_head2', multi_head.name)
 
-  def test_predict_two_heads_logits_dict(self):
+  def _test_predict_two_heads_logits_dict(self, use_tpu):
     """Tests predict with logits as dict."""
     head1 = head_lib.multi_label_head(n_classes=2, name='head1')
     head2 = head_lib.multi_label_head(n_classes=3, name='head2')
@@ -121,10 +121,16 @@ class MultiHeadTest(test.TestCase):
         'head2': _sigmoid(logits['head2']),
     }
 
-    spec = multi_head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
+    if use_tpu:
+      spec = multi_head._create_tpu_estimator_spec(
+          features={'x': np.array(((42,),), dtype=np.int32)},
+          mode=model_fn.ModeKeys.PREDICT,
+          logits=logits).as_estimator_spec()
+    else:
+      spec = multi_head.create_estimator_spec(
+          features={'x': np.array(((42,),), dtype=np.int32)},
+          mode=model_fn.ModeKeys.PREDICT,
+          logits=logits)
 
     self.assertItemsEqual(
         (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/classification',
@@ -175,6 +181,12 @@ class MultiHeadTest(test.TestCase):
           sess.run(
               spec.export_outputs['head2/predict'].outputs['probabilities']))
 
+  def test_predict_two_heads_logits_dict(self):
+    self._test_predict_two_heads_logits_dict(use_tpu=False)
+
+  def test_predict_two_heads_logits_dict_tpu(self):
+    self._test_predict_two_heads_logits_dict(use_tpu=True)
+
   def test_predict_two_heads_logits_tensor(self):
     """Tests predict with logits as Tensor."""
     head1 = head_lib.multi_label_head(n_classes=2, name='head1')
@@ -350,6 +362,31 @@ class MultiHeadTest(test.TestCase):
           rtol=tol,
           atol=tol)
 
+  def test_eval_tpu(self):
+    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
+    head2 = head_lib.multi_label_head(n_classes=3, name='head2')
+    multi_head = multi_head_lib.multi_head(
+        [head1, head2], head_weights=[1., 2.])
+
+    logits = {
+        'head1': np.array([[-10., 10.], [-15., 10.]], dtype=np.float32),
+        'head2': np.array([[20., -20., 20.], [-30., 20., -20.]],
+                          dtype=np.float32),
+    }
+    labels = {
+        'head1': np.array([[1, 0], [1, 1]], dtype=np.int64),
+        'head2': np.array([[0, 1, 0], [1, 1, 0]], dtype=np.int64),
+    }
+
+    with self.assertRaisesRegexp(
+        NotImplementedError,
+        r'TPU evaluation is not implemented for multi_head\.'):
+      multi_head._create_tpu_estimator_spec(
+          features={'x': np.array(((42,),), dtype=np.int32)},
+          mode=model_fn.ModeKeys.EVAL,
+          logits=logits,
+          labels=labels)
+
   def test_train_create_loss_one_head(self):
     head1 = head_lib.multi_label_head(n_classes=2, name='head1')
     multi_head = multi_head_lib.multi_head([head1])
@@ -587,7 +624,7 @@ class MultiHeadTest(test.TestCase):
           six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
           train_result)
 
-  def test_train_two_heads_with_weights(self):
+  def _test_train_two_heads_with_weights(self, use_tpu):
     head1 = head_lib.multi_label_head(n_classes=2, name='head1')
     head2 = head_lib.multi_label_head(n_classes=3, name='head2')
     multi_head = multi_head_lib.multi_head(
@@ -619,12 +656,20 @@ class MultiHeadTest(test.TestCase):
           [constant_op.constant(expected_train_result),
            string_ops.as_string(loss, precision=3)])
 
-    spec = multi_head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
+    if use_tpu:
+      spec = multi_head._create_tpu_estimator_spec(
+          features={'x': np.array(((42,),), dtype=np.int32)},
+          mode=model_fn.ModeKeys.TRAIN,
+          logits=logits,
+          labels=labels,
+          train_op_fn=_train_op_fn).as_estimator_spec()
+    else:
+      spec = multi_head.create_estimator_spec(
+          features={'x': np.array(((42,),), dtype=np.int32)},
+          mode=model_fn.ModeKeys.TRAIN,
+          logits=logits,
+          labels=labels,
+          train_op_fn=_train_op_fn)
 
     self.assertIsNotNone(spec.loss)
     self.assertEqual({}, spec.eval_metric_ops)
@@ -649,6 +694,12 @@ class MultiHeadTest(test.TestCase):
           metric_keys.MetricKeys.LOSS + '/head2': expected_loss_head2,
       }, summary_str, tol)
 
+  def test_train_two_heads_with_weights(self):
+    self._test_train_two_heads_with_weights(use_tpu=False)
+
+  def test_train_two_heads_with_weights_tpu(self):
+    self._test_train_two_heads_with_weights(use_tpu=True)
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 51b266fba181dffb6b3f9207280cde6b7670dd90 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Wed, 3 Oct 2018 11:09:44 -0700
Subject: [PATCH 347/570] [tf.data] Fix noisy warning.

PiperOrigin-RevId: 215592456
---
 tensorflow/python/data/ops/dataset_ops.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 46ce191f7b..3693cc88f2 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -1830,10 +1830,11 @@ class StructuredFunctionWrapper(object):
           component = _NestedDatasetComponent(t)
           flat_classes.append(component)
           flat_shapes.append(component)
-          flat_types.append(component)
-          if t.options() is not None:  # pylint: disable=protected-access
-            warnings.warn("Encountered a nested dataset with options. These "
-                          "options will not be applied to the outer dataset.")
+          flat_types.append(component)          
+          if t.options() != Options():  # pylint: disable=protected-access
+            warnings.warn("Encountered a nested dataset with non-default "
+                          "options. These options will not be propagated to "
+                          "the outer dataset.")
         else:
           try:
             t = ops.convert_to_tensor(t)
-- 
GitLab


From 880dcb7a91e5ee497045614d9c5f4ab93c9ffacf Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Wed, 3 Oct 2018 11:17:48 -0700
Subject: [PATCH 348/570] Automated rollback of commit
 51b266fba181dffb6b3f9207280cde6b7670dd90

PiperOrigin-RevId: 215593867
---
 tensorflow/python/data/ops/dataset_ops.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 3693cc88f2..46ce191f7b 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -1830,11 +1830,10 @@ class StructuredFunctionWrapper(object):
           component = _NestedDatasetComponent(t)
           flat_classes.append(component)
           flat_shapes.append(component)
-          flat_types.append(component)          
-          if t.options() != Options():  # pylint: disable=protected-access
-            warnings.warn("Encountered a nested dataset with non-default "
-                          "options. These options will not be propagated to "
-                          "the outer dataset.")
+          flat_types.append(component)
+          if t.options() is not None:  # pylint: disable=protected-access
+            warnings.warn("Encountered a nested dataset with options. These "
+                          "options will not be applied to the outer dataset.")
         else:
           try:
             t = ops.convert_to_tensor(t)
-- 
GitLab


From 47eafbaf43c763dc65a2cd3cfd9ecbd8fbbdf668 Mon Sep 17 00:00:00 2001
From: Rachel Lim <rachelim@google.com>
Date: Wed, 3 Oct 2018 11:24:41 -0700
Subject: [PATCH 349/570] [tf.data] Add utility to deduplicate graph node names
 (after vectorization)

PiperOrigin-RevId: 215595078
---
 tensorflow/core/graph/graph.cc                |  5 ++++
 tensorflow/core/graph/graph.h                 |  1 +
 .../core/grappler/optimizers/data/BUILD       |  2 ++
 .../grappler/optimizers/data/graph_utils.cc   | 21 ++++++++++++++
 .../grappler/optimizers/data/graph_utils.h    |  9 ++++++
 .../optimizers/data/graph_utils_test.cc       | 28 +++++++++++++++++++
 .../optimizers/data/vectorization_utils.cc    |  2 ++
 7 files changed, 68 insertions(+)

diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 4c0cd14ff1..7a4a0096fa 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -192,6 +192,11 @@ void Node::ClearAttr(const string& name) {
   (*props_->node_def.mutable_attr()).erase(name);
 }
 
+void Node::set_name(string name) {
+  MaybeCopyOnWrite();
+  props_->node_def.set_name(std::move(name));
+}
+
 void Node::set_requested_device(const string& device) {
   MaybeCopyOnWrite();
   props_->node_def.set_device(device);
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 72cef07072..2944951f82 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -72,6 +72,7 @@ class Node {
   int id() const { return id_; }
   int cost_id() const { return cost_id_; }
   const string& name() const;
+  void set_name(string name);
   const string& type_string() const;
 
   // def() provides the NodeDef the user supplied, but the specifics
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index 5a3abbb545..755af3361e 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -129,6 +129,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
 )
 
@@ -138,6 +139,7 @@ tf_cc_test(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
index 3eaaf8fbef..b863a25dc5 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
@@ -272,6 +273,26 @@ void ConcatAttributeList(const string& attribute_name, const NodeDef& first,
       ->MergeFrom(second.attr().at(attribute_name).list());
 }
 
+Status EnsureNodeNamesUnique(Graph* g) {
+  // Modeled after Scope::Impl::GetUniqueName
+  std::unordered_map<string, int> name_map;
+
+  for (auto node : g->op_nodes()) {
+    const string& prefix = node->name();
+    if (auto entry = gtl::FindOrNull(name_map, prefix)) {
+      string unique_name;
+      do {
+        unique_name = strings::StrCat(prefix, "_", ++(*entry));
+      } while (name_map.find(unique_name) != name_map.end());
+      name_map.insert({unique_name, 0});
+      node->set_name(std::move(unique_name));
+    } else {
+      name_map.insert({node->name(), 0});
+    }
+  }
+
+  return Status::OK();
+}
 }  // end namespace graph_utils
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.h b/tensorflow/core/grappler/optimizers/data/graph_utils.h
index 3af34f6904..d130fee204 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -131,6 +132,14 @@ void CopyAttribute(const string& attribute_name, const NodeDef& from,
 void ConcatAttributeList(const string& attribute_name, const NodeDef& first,
                          const NodeDef& second, NodeDef* to_node);
 
+// Checks that all nodes in the graphs have unique names, and sets their names
+// to be unique if they are not already.  This is necessary as Graph does not
+// have the provisions to deduplicate names, and name deduplication elsewhere
+// in tensorflow happens in other layers (for example, in the Scope class of the
+// C++ API). Note that the nodes in the graph are identified by their id,
+// and renaming nodes does not mutate any edges.
+Status EnsureNodeNamesUnique(Graph* g);
+
 }  // end namespace graph_utils
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
index db986542b2..4ab6d71532 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -229,6 +230,33 @@ TEST(GraphUtilsTest, GetInputNode) {
   EXPECT_EQ(GetInputNode(*node1, graph), nullptr);
 }
 
+TEST(GraphUtilsTest, EnsureNodeNamesUnique) {
+  Graph g(OpRegistry::Global());
+
+  Node *const_0, *const_1, *const_2;
+
+  // Arbitrary const
+  Tensor tensor(DT_INT32, {});
+  tensor.scalar<int32>()() = 5;
+
+  for (auto node : {&const_0, &const_1}) {
+    TF_EXPECT_OK(NodeBuilder("Const", "Const")
+                     .Attr("value", tensor)
+                     .Attr("dtype", DT_INT32)
+                     .Finalize(&g, node));
+  }
+  // Make sure generated name doesn't clash with existing name either
+  TF_EXPECT_OK(NodeBuilder("Const_1", "Const")
+                   .Attr("value", tensor)
+                   .Attr("dtype", DT_INT32)
+                   .Finalize(&g, &const_2));
+
+  TF_EXPECT_OK(EnsureNodeNamesUnique(&g));
+  EXPECT_NE(const_0->name(), const_1->name());
+  EXPECT_NE(const_1->name(), const_2->name());
+  EXPECT_NE(const_0->name(), const_2->name());
+}
+
 }  // namespace
 }  // namespace graph_utils
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
index cea667f668..2d6cf562b1 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
@@ -352,6 +352,8 @@ Status Vectorization::Initialize(const FunctionDef& outer_scope,
 
 Status Vectorization::GetResult(FunctionDef** vectorized_function) {
   TF_RETURN_IF_ERROR(status_);
+  TF_RETURN_IF_ERROR(graph_utils::EnsureNodeNamesUnique(outer_scope_.get()));
+  TF_RETURN_IF_ERROR(graph_utils::EnsureNodeNamesUnique(map_defun_fn_->graph));
 
   if (!map_defun_fn_->ret_nodes.empty()) {
     FunctionDef* map_defun_fn = lib_->add_function();
-- 
GitLab


From 3d76a83037388b61bcda1571d3b3e175a2f53f2e Mon Sep 17 00:00:00 2001
From: Jeremy Lau <lauj@google.com>
Date: Wed, 3 Oct 2018 12:25:25 -0700
Subject: [PATCH 350/570] Disable XLA for Android builds.

PiperOrigin-RevId: 215605865
---
 tensorflow/tools/ci_build/builds/configured | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/tools/ci_build/builds/configured b/tensorflow/tools/ci_build/builds/configured
index 868a3beac5..3eee11fd7e 100755
--- a/tensorflow/tools/ci_build/builds/configured
+++ b/tensorflow/tools/ci_build/builds/configured
@@ -32,6 +32,10 @@ COMMAND=("$@")
 
 export CI_BUILD_PYTHON="${CI_BUILD_PYTHON:-python}"
 export PYTHON_BIN_PATH="${PYTHON_BIN_PATH:-$(which ${CI_BUILD_PYTHON})}"
+# XLA currently does not build under Android, so disable it for now.
+if [[ "${CONTAINER_TYPE}" -eq 'android' ]]; then
+  export TF_ENABLE_XLA=0
+fi
 
 pushd "${CI_TENSORFLOW_SUBMODULE_PATH:-.}"
 yes "" | $PYTHON_BIN_PATH configure.py
-- 
GitLab


From 295b3c80555cc82d8d70faf96a47681e1d904b9c Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 3 Oct 2018 12:32:16 -0700
Subject: [PATCH 351/570] Automated rollback of commit
 c9bdd3938e2b43334a0065b4c198ec9d491c8cb8

PiperOrigin-RevId: 215607038
---
 tensorflow/core/kernels/data/iterator_ops.cc  |  4 +++
 .../kernels/data/map_and_batch_dataset_op.cc  | 10 +++----
 .../core/kernels/data/model_dataset_op.cc     | 10 +++----
 .../data/parallel_interleave_dataset_op.cc    | 27 ++++++++-----------
 .../kernels/data/parallel_map_iterator.cc     | 10 +++----
 .../core/kernels/data/prefetch_dataset_op.cc  | 10 +++----
 tensorflow/core/kernels/data/writer_ops.cc    | 12 ++++-----
 7 files changed, 37 insertions(+), 46 deletions(-)

diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 8acd6cc724..7a833668ac 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -16,8 +16,10 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/common_runtime/renamed_device.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/resource_op_kernel.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
@@ -25,11 +27,13 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/data/optional_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 namespace data {
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index 6a670f1efb..bf08970560 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/tracing.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -406,10 +405,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
           std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
-          runner_thread_ =
-              MakeUnique<BackgroundWorker>(ctx->env(), "runner_thread");
-          runner_thread_->Schedule(
-              std::bind(&Iterator::RunnerThread, this, ctx_copy));
+          runner_thread_.reset(ctx->env()->StartThread(
+              {}, "runner_thread",
+              std::bind(&Iterator::RunnerThread, this, ctx_copy)));
         }
       }
 
@@ -662,7 +660,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_;
       // Buffer for storing the (intermediate) batch results.
       std::deque<std::shared_ptr<BatchResult>> batch_results_ GUARDED_BY(*mu_);
-      std::unique_ptr<BackgroundWorker> runner_thread_ GUARDED_BY(*mu_);
+      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
       bool cancelled_ GUARDED_BY(*mu_) = false;
     };
 
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc
index 859df57962..9aa505f4f1 100644
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -127,10 +126,9 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         if (!optimize_thread_) {
           std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-          optimize_thread_ =
-              MakeUnique<BackgroundWorker>(ctx->env(), "optimize_thread");
-          optimize_thread_->Schedule(
-              [this, new_ctx]() { OptimizeThread(new_ctx); });
+          optimize_thread_.reset(ctx->env()->StartThread(
+              {}, "optimize_thread",
+              [this, new_ctx]() { OptimizeThread(new_ctx); }));
         }
         return Status::OK();
       }
@@ -169,7 +167,7 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
       mutex mu_;
       condition_variable cond_var_;
       std::shared_ptr<model::Model> model_;
-      std::unique_ptr<BackgroundWorker> optimize_thread_ GUARDED_BY(mu_);
+      std::unique_ptr<Thread> optimize_thread_ GUARDED_BY(mu_);
       bool cancelled_ GUARDED_BY(mu_) = false;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
     };
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 9c836b836e..6b6b3d6ab9 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -482,10 +481,9 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
           worker_threads_.reserve(dataset()->num_threads());
           for (size_t i = 0; i < dataset()->num_threads(); ++i) {
             std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(
-                MakeUnique<BackgroundWorker>(ctx->env(), "worker_thread"));
-            worker_threads_.back()->Schedule(
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); });
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, "worker_thread",
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
           }
         }
         return Status::OK();
@@ -582,10 +580,9 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
             }
             workers_[i].SetInputs(s, std::move(args));
             std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(
-                MakeUnique<BackgroundWorker>(ctx->env(), "worker_thread"));
-            worker_threads_.back()->Schedule(
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); });
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, "worker_thread",
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
             if (i < dataset()->cycle_length_) {
               interleave_indices_.push_back(i);
             } else {
@@ -1050,8 +1047,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       // The worker threads. This must be last to ensure the
       // threads have exited before any other members are deallocated.
       // TODO(b/65178177): Avoid allocating additional threads.
-      std::vector<std::unique_ptr<BackgroundWorker>> worker_threads_
-          GUARDED_BY(mu_);
+      std::vector<std::unique_ptr<Thread>> worker_threads_ GUARDED_BY(mu_);
     };
 
     const DatasetBase* const input_;
@@ -1393,10 +1389,9 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
           std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-          runner_thread_ =
-              MakeUnique<BackgroundWorker>(ctx->env(), "runner_thread");
-          runner_thread_->Schedule(
-              [this, new_ctx]() { RunnerThread(new_ctx); });
+          runner_thread_.reset(ctx->env()->StartThread(
+              {}, "runner_thread",
+              [this, new_ctx]() { RunnerThread(new_ctx); }));
         }
       }
 
@@ -1650,7 +1645,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       int64 num_calls_ GUARDED_BY(*mu_) = 0;
 
       std::unique_ptr<thread::ThreadPool> thread_pool_;
-      std::unique_ptr<BackgroundWorker> runner_thread_ GUARDED_BY(*mu_);
+      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
 
       // Identifies whether background activity should be cancelled.
       bool cancelled_ GUARDED_BY(*mu_) = false;
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index 626e98af91..13bd4b6036 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -22,7 +22,6 @@ limitations under the License.
 
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -181,10 +180,9 @@ class ParallelMapIterator : public DatasetBaseIterator {
       EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
     if (!runner_thread_) {
       std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
-      runner_thread_ =
-          MakeUnique<BackgroundWorker>(ctx->env(), "runner_thread");
-      runner_thread_->Schedule(
-          std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy));
+      runner_thread_.reset(ctx->env()->StartThread(
+          {}, "runner_thread",
+          std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy)));
     }
   }
 
@@ -332,7 +330,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   // Buffer for storing the invocation results.
   std::deque<std::shared_ptr<InvocationResult>> invocation_results_
       GUARDED_BY(*mu_);
-  std::unique_ptr<BackgroundWorker> runner_thread_ GUARDED_BY(*mu_);
+  std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
   bool cancelled_ GUARDED_BY(*mu_) = false;
 };
 
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index e9c38eb8a0..754ed772db 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -257,11 +256,10 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     Status EnsurePrefetchThreadStarted(IteratorContext* ctx)
         EXCLUSIVE_LOCKS_REQUIRED(mu_) {
       if (!prefetch_thread_) {
-        prefetch_thread_ =
-            MakeUnique<BackgroundWorker>(ctx->env(), "prefetch_thread");
         std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-        prefetch_thread_->Schedule(
-            [this, new_ctx]() { PrefetchThread(new_ctx); });
+        prefetch_thread_.reset(ctx->env()->StartThread(
+            {}, "prefetch_thread",
+            [this, new_ctx]() { PrefetchThread(new_ctx); }));
       }
       return Status::OK();
     }
@@ -365,7 +363,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     string prefix_end_;
     PrefetchAutotuner auto_tuner_ GUARDED_BY(mu_);
     std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
-    std::unique_ptr<BackgroundWorker> prefetch_thread_ GUARDED_BY(mu_);
+    std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
     bool cancelled_ GUARDED_BY(mu_) = false;
     bool prefetch_thread_finished_ GUARDED_BY(mu_) = false;
   };
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc
index 7bb2077b62..3f76695bb1 100644
--- a/tensorflow/core/kernels/data/writer_ops.cc
+++ b/tensorflow/core/kernels/data/writer_ops.cc
@@ -29,10 +29,10 @@ class ToTFRecordOp : public AsyncOpKernel {
  public:
   explicit ToTFRecordOp(OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx),
-        background_worker_(
-            ctx->env(),
-            strings::StrCat("to_tf_record_op_", SanitizeThreadSuffix(name()))) {
-  }
+        thread_pool_(new thread::ThreadPool(
+            ctx->env(), ThreadOptions(),
+            strings::StrCat("to_tf_record__op_", SanitizeThreadSuffix(name())),
+            1 /* num_threads */, false /* low_latency_hint */)) {}
 
   template <typename T>
   Status ParseScalarArgument(OpKernelContext* ctx,
@@ -50,7 +50,7 @@ class ToTFRecordOp : public AsyncOpKernel {
     // The call to `iterator->GetNext()` may block and depend on an
     // inter-op thread pool thread, so we issue the call from the
     // owned thread pool.
-    background_worker_.Schedule([this, ctx, done]() {
+    thread_pool_->Schedule([this, ctx, done]() {
       string filename;
       OP_REQUIRES_OK_ASYNC(
           ctx, ParseScalarArgument<string>(ctx, "filename", &filename), done);
@@ -97,7 +97,7 @@ class ToTFRecordOp : public AsyncOpKernel {
   }
 
  private:
-  BackgroundWorker background_worker_;
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("DatasetToTFRecord").Device(DEVICE_CPU),
-- 
GitLab


From d4e9282dc53697432178a68940634612c4ab2baa Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Wed, 3 Oct 2018 12:32:57 -0700
Subject: [PATCH 352/570] [tf.data] Fix noisy warning.

PiperOrigin-RevId: 215607171
---
 tensorflow/python/data/ops/dataset_ops.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 46ce191f7b..b7e19055f2 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -1831,9 +1831,10 @@ class StructuredFunctionWrapper(object):
           flat_classes.append(component)
           flat_shapes.append(component)
           flat_types.append(component)
-          if t.options() is not None:  # pylint: disable=protected-access
-            warnings.warn("Encountered a nested dataset with options. These "
-                          "options will not be applied to the outer dataset.")
+          if t.options() != Options():
+            warnings.warn("Encountered a nested dataset with non-default "
+                          "options. These options will not be propagated to "
+                          "the outer dataset.")
         else:
           try:
             t = ops.convert_to_tensor(t)
-- 
GitLab


From 506ea0b8d3af1b54f42721584a414957e1525c8a Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Wed, 3 Oct 2018 12:36:16 -0700
Subject: [PATCH 353/570] Change hierarchical_tree_broadcaster_test from small
 to medium.

PiperOrigin-RevId: 215607769
---
 tensorflow/core/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 0aae29d10c..6a3ee3c1cb 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -3750,7 +3750,7 @@ tf_cc_tests_gpu(
 
 tf_cc_tests_gpu(
     name = "hierarchical_tree_broadcaster_test",
-    size = "small",
+    size = "medium",
     srcs = [
         "common_runtime/hierarchical_tree_broadcaster_test.cc",
     ],
-- 
GitLab


From 19833284cc8fa555115aacde350ad66652b250dc Mon Sep 17 00:00:00 2001
From: Tong Shen <endlessroad@google.com>
Date: Wed, 3 Oct 2018 12:39:32 -0700
Subject: [PATCH 354/570] Automated rollback of commit
 2af8fd975aaf5c70ebb396895fa15a8f034a8440

PiperOrigin-RevId: 215608349
---
 .../tf2xla/functionalize_control_flow.cc      | 129 ++++++------------
 1 file changed, 39 insertions(+), 90 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 28e09d7b79..36c6f5d316 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -79,10 +79,7 @@ Status FunctionalizeControlFlowForFunction(
     const string& func_name, const string& new_func_name,
     const protobuf::Map<string, tensorflow::AttrValue>& attrs,
     FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr,
-    std::map<string, absl::optional<string>>* canonicalized_name_to_new_name,
-    bool* modified) {
-  *modified = false;
-
+    std::map<string, string>* canonicalized_name_to_new_name) {
   // Convert the function to Graph.
   FunctionLibraryRuntime::Handle handle;
   TF_RETURN_IF_ERROR(flr->Instantiate(func_name, AttrSlice(&attrs), &handle));
@@ -95,19 +92,6 @@ Status FunctionalizeControlFlowForFunction(
   });
   const FunctionBody* body = flr->GetFunctionBody(handle);
 
-  // Check if the graph has Switch or Merge node before optimizing the graph.
-  bool has_switch_or_merge = false;
-  for (Node* n : body->graph->nodes()) {
-    if (n->type_string() == "Switch" || n->type_string() == "Merge") {
-      has_switch_or_merge = true;
-      break;
-    }
-  }
-  // We cannot return here directly if the graph has no Switch/Merge.
-  // It might contain function call nodes, or If/While nodes with Switch/Merge
-  // in function body. We still need to rewrite those functions and modify
-  // corresponding nodes.
-
   // Call graph optimizer. The most important optimization we need is constant
   // folding, which will replace ops like Shape/BroadcastGradientArgs with
   // constant shape input. Without this optimization, those ops might become
@@ -145,13 +129,6 @@ Status FunctionalizeControlFlowForFunction(
         absl::StrCat("functionalize_control_flow_after_opt_", func_name),
         *optimized_graph, fld);
   }
-  // Some inlined functions might have Switch/Merge nodes.
-  for (Node* n : optimized_graph->nodes()) {
-    if (n->type_string() == "Switch" || n->type_string() == "Merge") {
-      has_switch_or_merge = true;
-      break;
-    }
-  }
 
   // If any node has associated functions, functionalize them first.
   // Gather nodes with associated functions first, because rewriting those nodes
@@ -174,15 +151,10 @@ Status FunctionalizeControlFlowForFunction(
           Canonicalize(name, AttrSlice(&associated_function.attrs()));
       auto iter = canonicalized_name_to_new_name->find(canonicalized_name);
       string new_name;
-      bool function_modified;
       if (iter != canonicalized_name_to_new_name->end()) {
-        // If we already processed this function, check if it was rewritten. If
-        // the function was rewritten, the entry will be non-empty. Otherwise
-        // the entry will be empty.
-        function_modified = iter->second.has_value();
-        if (function_modified) {
-          new_name = iter->second.value();
-        }
+        // If we already functionalized this function, skip functionalization
+        // but still rewrite the node.
+        new_name = iter->second;
       } else {
         if (associated_function.type() ==
             AssociatedFunctionInfo::AssociatedFunctionType::kSymbolicGradient) {
@@ -194,62 +166,42 @@ Status FunctionalizeControlFlowForFunction(
         }
         TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction(
             name, new_name, associated_function.attrs(), fld, flr,
-            canonicalized_name_to_new_name, &function_modified));
-        if (function_modified) {
-          // If the function was rewritten, add an non-empty entry. So later we
-          // know we have processed this function, and it was rewritten into
-          // another function.
-          (*canonicalized_name_to_new_name)[canonicalized_name] = new_name;
-        } else {
-          // If the function was not rewritten, add an empty entry. So later
-          // we know we have processed this function, and it does not need to be
-          // rewritten.
-          (*canonicalized_name_to_new_name)[canonicalized_name] = absl::nullopt;
-        }
-      }
-      if (function_modified) {
-        *modified = true;
-
-        // Notice that if "n" is a function call, RewriteAssociatedFunction()
-        // will delete it and create a new node instead, making "n" an invalid
-        // pointer. That's fine because in that case, associated_functions will
-        // only have one member and the loop will only run once.
-        TF_RETURN_IF_ERROR(RewriteAssociatedFunction(
-            optimized_graph.get(), n, fld, associated_function, new_name));
+            canonicalized_name_to_new_name));
+        (*canonicalized_name_to_new_name)[canonicalized_name] = new_name;
       }
+      // Notice that if "n" is a function call, RewriteAssociatedFunction() will
+      // delete it and create a new node instead, making "n" an invalid pointer.
+      // That's fine because in that case, associated_functions will only have
+      // one member and the loop will only run once.
+      TF_RETURN_IF_ERROR(RewriteAssociatedFunction(
+          optimized_graph.get(), n, fld, associated_function, new_name));
     }
   }
 
-  if (has_switch_or_merge) {
-    *modified = true;
-
-    // Functionalize the function body.
-    if (VLOG_IS_ON(4)) {
-      dump_graph::DumpGraphToFile(
-          absl::StrCat("functionalize_control_flow_before_fdef_", func_name),
-          *optimized_graph, fld);
-    }
-    TF_RETURN_IF_ERROR(FunctionalizeControlFlow(optimized_graph.get(), fld));
-    if (VLOG_IS_ON(4)) {
-      dump_graph::DumpGraphToFile(
-          absl::StrCat("functionalize_control_flow_after_fdef_", func_name),
-          *optimized_graph, fld);
-    }
+  // Functionalize the function body.
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile(
+        absl::StrCat("functionalize_control_flow_before_fdef_", func_name),
+        *optimized_graph, fld);
   }
+  TF_RETURN_IF_ERROR(FunctionalizeControlFlow(optimized_graph.get(), fld));
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile(
+        absl::StrCat("functionalize_control_flow_after_fdef_", func_name),
+        *optimized_graph, fld);
+  }
+  FunctionDef functionalized_fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(*optimized_graph, new_func_name,
+                                        &functionalized_fdef));
 
-  if (*modified) {
-    // Add rewritten FunctionDef into library.
-    FunctionDef functionalized_fdef;
-    TF_RETURN_IF_ERROR(GraphToFunctionDef(*optimized_graph, new_func_name,
-                                          &functionalized_fdef));
-    if (func_name == new_func_name) {
-      VLOG(2) << "Replacing function " << func_name;
-      TF_RETURN_IF_ERROR(
-          fld->ReplaceFunction(new_func_name, functionalized_fdef));
-    } else {
-      VLOG(2) << "Adding function " << new_func_name;
-      TF_RETURN_IF_ERROR(fld->AddFunctionDef(functionalized_fdef));
-    }
+  // Add rewritten FunctionDef into library.
+  if (func_name == new_func_name) {
+    VLOG(2) << "Replacing function " << func_name;
+    TF_RETURN_IF_ERROR(
+        fld->ReplaceFunction(new_func_name, functionalized_fdef));
+  } else {
+    VLOG(2) << "Adding function " << new_func_name;
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(functionalized_fdef));
   }
 
   return ret_status;
@@ -275,7 +227,7 @@ Status FunctionalizeControlFlowPass::Run(
           {"TPUCompile", "function"},
           {"XlaLaunch", "function"},
       };
-  std::map<string, absl::optional<string>> canonicalized_name_to_new_name;
+  std::map<string, string> canonicalized_name_to_new_name;
   for (Node* n : graph->nodes()) {
     auto it = kNodeTypeToFunctionAttrMapping->find(n->type_string());
     if (it == kNodeTypeToFunctionAttrMapping->end()) {
@@ -290,15 +242,12 @@ Status FunctionalizeControlFlowPass::Run(
               << ". Corresponding function: " << func.name();
       string new_func_name = options.flib_def->UniqueFunctionName(
           absl::StrCat(func.name(), "_f15n_"));
-      bool modified;
       TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction(
           func.name(), new_func_name, func.attr(), options.flib_def, flr,
-          &canonicalized_name_to_new_name, &modified));
-      if (modified) {
-        n->ClearAttr(func_attr);
-        func.set_name(new_func_name);
-        n->AddAttr(func_attr, func);
-      }
+          &canonicalized_name_to_new_name));
+      n->ClearAttr(func_attr);
+      func.set_name(new_func_name);
+      n->AddAttr(func_attr, func);
     }
   }
 
-- 
GitLab


From 808b1dcb318b1feb5a8c9fed5558f95cd05728e4 Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Wed, 3 Oct 2018 12:44:47 -0700
Subject: [PATCH 355/570] [data-stats] Sets user given `tag` and
 `counter_prefix` with `set_stats_aggregator`. `tag` would get prep-end with
 all the statistics recorded as summary and `counter_prefix` would set the
 prefix for the statistics recorded as counter. Note: `counter` defaults to
 `\tensorflow`, and `tag` and `prefix` gets associated with the dataset (not
 the stats_aggregator).

PiperOrigin-RevId: 215609159
---
 tensorflow/core/framework/dataset.h           | 22 +-----
 tensorflow/core/kernels/data/BUILD            |  1 +
 .../experimental/threadpool_dataset_op.cc     |  2 +-
 .../kernels/data/parse_example_dataset_op.cc  |  4 +-
 .../data/stats_aggregator_dataset_op.cc       | 78 +++++++++++++++++--
 .../core/kernels/data/stats_aggregator_ops.cc | 11 +--
 .../core/ops/compat/ops_history.v1.pbtxt      |  8 ++
 tensorflow/core/ops/dataset_ops.cc            |  2 +
 .../kernel_tests/stats_dataset_ops_test.py    | 69 ++++++++++++++++
 .../python/data/experimental/ops/stats_ops.py | 17 +++-
 .../v1/tensorflow.data.experimental.pbtxt     |  2 +-
 .../v2/tensorflow.data.experimental.pbtxt     |  2 +-
 12 files changed, 179 insertions(+), 39 deletions(-)

diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 8c1151cb56..964a7d5f8c 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -278,15 +278,8 @@ class IteratorContext {
     // Function call support.
     std::function<void(std::function<void()>)> runner = nullptr;
 
-    // A function that returns the current `StatsAggregator` instance to be
-    // used when recording statistics about the iterator.
-    //
-    // NOTE(mrry): This is somewhat awkward, because (i) the `StatsAggregator`
-    // is a property of the `IteratorResource` (which this class does not know
-    // about), and (ii) it can change after the `IteratorContext` has been
-    // created. Better suggestions are welcome!
-    std::function<std::shared_ptr<StatsAggregator>()> stats_aggregator_getter =
-        nullptr;
+    // The `StatsAggregator` object to record statistics about the iterator.
+    std::shared_ptr<StatsAggregator> stats_aggregator = nullptr;
 
     // The FunctionLibraryRuntime object to be used to make function calls.
     FunctionLibraryRuntime* lib = nullptr;
@@ -320,13 +313,6 @@ class IteratorContext {
     return &params_.runner;
   }
 
-  std::shared_ptr<StatsAggregator> stats_aggregator() {
-    if (params_.stats_aggregator_getter) {
-      return params_.stats_aggregator_getter();
-    } else {
-      return nullptr;
-    }
-  }
 
   std::shared_ptr<const FunctionLibraryDefinition> function_library() {
     return params_.function_library;
@@ -344,8 +330,8 @@ class IteratorContext {
     return params_.allocator_getter;
   }
 
-  std::function<std::shared_ptr<StatsAggregator>()> stats_aggregator_getter() {
-    return params_.stats_aggregator_getter;
+  std::shared_ptr<StatsAggregator> stats_aggregator() {
+    return params_.stats_aggregator;
   }
 
   std::shared_ptr<model::Model> model() { return params_.model; }
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 6333853cdf..451f8c1a6c 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -458,6 +458,7 @@ tf_kernel_library(
     srcs = ["stats_aggregator_dataset_op.cc"],
     deps = [
         ":dataset",
+        "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib_internal",
     ],
diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
index c80493d3a1..8d561ca0e3 100644
--- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
@@ -191,7 +191,7 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
         params.runner = [pool](std::function<void()> c) {
           pool->Schedule(std::move(c));
         };
-        params.stats_aggregator_getter = ctx->stats_aggregator_getter();
+        params.stats_aggregator = ctx->stats_aggregator();
         params.lib = ctx->lib();
         params.function_library = ctx->function_library();
         params.allocator_getter = ctx->allocator_getter();
diff --git a/tensorflow/core/kernels/data/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/parse_example_dataset_op.cc
index c28c06da62..1d1a717062 100644
--- a/tensorflow/core/kernels/data/parse_example_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parse_example_dataset_op.cc
@@ -253,7 +253,7 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
               for (example::PerExampleFeatureStats feature_stats :
                    example_result.feature_stats) {
                 stats_aggregator->AddToHistogram(
-                    strings::StrCat("record_stats", ":features"),
+                    "features",
                     {static_cast<double>(feature_stats.features_count)});
                 stats_aggregator->IncrementCounter(
                     "features_count", "trainer", feature_stats.features_count);
@@ -261,7 +261,7 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
                     "feature_values_count", "trainer",
                     feature_stats.feature_values_count);
                 stats_aggregator->AddToHistogram(
-                    strings::StrCat("record_stats", ":feature-values"),
+                    "feature-values",
                     {static_cast<double>(feature_stats.feature_values_count)});
               }
             }
diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
index c8abfb9eb5..c09a73fff1 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <memory>
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
@@ -22,6 +24,52 @@ namespace tensorflow {
 namespace data {
 namespace {
 
+class StatsAggregatorWithTagAndPrefix : public StatsAggregator {
+ public:
+  StatsAggregatorWithTagAndPrefix(
+      std::shared_ptr<StatsAggregator> stats_aggregator, const string& tag,
+      const string& prefix)
+      : wrapped_(stats_aggregator), tag_(tag), prefix_(prefix) {}
+
+  void AddToHistogram(const string& name,
+                      gtl::ArraySlice<double> values) override {
+    if (!tag_.empty()) {
+      wrapped_->AddToHistogram(strings::StrCat(tag_, "_", name), values);
+    } else {
+      wrapped_->AddToHistogram(name, values);
+    }
+  }
+
+  void AddScalar(const string& name, float value) override {
+    if (!tag_.empty()) {
+      wrapped_->AddScalar(strings::StrCat(tag_, "_", name), value);
+    } else {
+      wrapped_->AddScalar(name, value);
+    }
+  }
+
+  void EncodeToProto(Summary* out_summary) override {
+    wrapped_->EncodeToProto(out_summary);
+  }
+
+  void IncrementCounter(const string& name, const string& label,
+                        int64 val) override {
+    if (!prefix_.empty()) {
+      wrapped_->IncrementCounter(strings::StrCat(prefix_, "/", name), label,
+                                 val);
+    } else {
+      wrapped_->IncrementCounter(strings::StrCat("/tensorflow/", name), label,
+                                 val);
+    }
+  }
+
+ private:
+  std::shared_ptr<StatsAggregator> wrapped_;
+  string tag_;
+  string prefix_;
+  TF_DISALLOW_COPY_AND_ASSIGN(StatsAggregatorWithTagAndPrefix);
+};
+
 class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit SetStatsAggregatorDatasetOp(OpKernelConstruction* ctx)
@@ -33,8 +81,13 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
                                        &stats_aggregator_resource));
     core::ScopedUnref unref_stats_aggregator(stats_aggregator_resource);
+    string tag;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "tag", &tag));
+    string prefix;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "counter_prefix", &prefix));
 
-    *output = new Dataset(ctx, input, ctx->input(1), stats_aggregator_resource);
+    *output = new Dataset(ctx, input, ctx->input(1), stats_aggregator_resource,
+                          tag, prefix);
   }
 
  private:
@@ -42,11 +95,14 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
    public:
     explicit Dataset(OpKernelContext* ctx, const DatasetBase* input,
                      const Tensor& resource_handle,
-                     StatsAggregatorResource* stats_aggregator_resource)
+                     StatsAggregatorResource* stats_aggregator_resource,
+                     const string& tag, const string& prefix)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           resource_handle_(resource_handle),
-          stats_aggregator_resource_(stats_aggregator_resource) {
+          stats_aggregator_resource_(stats_aggregator_resource),
+          tag_(tag),
+          prefix_(prefix) {
       input_->Ref();
       stats_aggregator_resource_->Ref();
     }
@@ -81,8 +137,13 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
       TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* resource_handle_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddTensor(resource_handle_, &resource_handle_node));
+      Node* tag_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(tag_, &tag_node));
+      Node* prefix_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(prefix_, &prefix_node));
       TF_RETURN_IF_ERROR(b->AddDataset(
-          this, {input_graph_node, resource_handle_node}, output));
+          this, {input_graph_node, resource_handle_node, tag_node, prefix_node},
+          output));
       return Status::OK();
     }
 
@@ -105,9 +166,10 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
         IteratorContext::Params params;
         params.env = ctx->env();
         params.runner = *(ctx->runner());
-        params.stats_aggregator_getter = [stats_aggregator_resource]() {
-          return stats_aggregator_resource->stats_aggregator();
-        };
+        params.stats_aggregator = std::shared_ptr<StatsAggregator>(
+            new StatsAggregatorWithTagAndPrefix(
+                stats_aggregator_resource->stats_aggregator(), dataset()->tag_,
+                dataset()->prefix_));
         params.lib = ctx->lib();
         params.function_library = ctx->function_library();
         params.allocator_getter = ctx->allocator_getter();
@@ -136,6 +198,8 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
     const DatasetBase* const input_;
     const Tensor resource_handle_;
     StatsAggregatorResource* stats_aggregator_resource_;
+    string tag_;
+    string prefix_;
   };
 };
 
diff --git a/tensorflow/core/kernels/data/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
index a7ded67876..2d51467616 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
@@ -82,11 +82,12 @@ class StatsAggregatorImpl : public StatsAggregator {
     auto counters_map = get_counters_map();
     if (counters_map->find(name) == counters_map->end()) {
       counters_map->emplace(
-          name, monitoring::Counter<1>::New(
-                    /*streamz name*/ "/tensorflow/" + name,
-                    /*streamz description*/
-                    name + " generated or consumed by the component.",
-                    /*streamz label name*/ "component_descriptor"));
+          name,
+          monitoring::Counter<1>::New(
+              /*streamz name*/ name,
+              /*streamz description*/
+              strings::StrCat(name, " generated or consumed by the component."),
+              /*streamz label name*/ "component_descriptor"));
     }
     counters_map->at(name)->GetCell(label)->IncrementBy(val);
   }
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 4845767405..33f18ae13f 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -59785,6 +59785,14 @@ op {
     name: "stats_aggregator"
     type: DT_RESOURCE
   }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "counter_prefix"
+    type: DT_STRING
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 71f4cc3c4c..889a6a4640 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -185,6 +185,8 @@ REGISTER_OP("ParseExampleDataset")
 REGISTER_OP("SetStatsAggregatorDataset")
     .Input("input_dataset: variant")
     .Input("stats_aggregator: resource")
+    .Input("tag: string")
+    .Input("counter_prefix: string")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
index 6761fbd16b..19f5a62d45 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base
 from tensorflow.python.data.experimental.ops import stats_ops
 from tensorflow.python.data.ops import dataset_ops
@@ -248,6 +249,74 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         sess.run(next_element)
       self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
 
+  def testMultipleDatasetWithTags(self):
+    stats_aggregator = stats_ops.StatsAggregator()
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator, "dataset1"))
+    dataset2 = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator, "dataset2"))
+    iterator_0 = dataset.make_initializable_iterator()
+    iterator_1 = dataset2.make_initializable_iterator()
+    next_element = iterator_0.get_next() + iterator_1.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run([iterator_0.initializer, iterator_1.initializer])
+      for i in range(100):
+        self.assertEqual(i * 2, sess.run(next_element))
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "dataset1_record_latency", float(i + 1))
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "dataset2_record_latency", float(i + 1))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      self._assertSummaryHasCount(
+          sess.run(summary_t), "dataset1_record_latency", 100.0)
+      self._assertSummaryHasCount(
+          sess.run(summary_t), "dataset2_record_latency", 100.0)
+
+
+class FeatureStatsDatasetTest(
+    stats_dataset_test_base.StatsDatasetTestBase,
+    reader_dataset_ops_test_base.ReadBatchFeaturesTestBase):
+
+  def testFeaturesStats(self):
+    num_epochs = 5
+    total_records = num_epochs * self._num_records
+    batch_size = 2
+    stats_aggregator = stats_ops.StatsAggregator()
+    dataset = self.make_batch_feature(
+        filenames=self.test_filenames[0],
+        num_epochs=num_epochs,
+        batch_size=batch_size,
+        shuffle=True,
+        shuffle_seed=5,
+        drop_final_batch=False).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator, "record_stats"))
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for _ in range(total_records // batch_size + 1 if total_records %
+                     batch_size else total_records // batch_size):
+        sess.run(next_element)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      self._assertSummaryHasCount(
+          sess.run(summary_t), "record_stats_features", total_records)
+      self._assertSummaryHasCount(
+          sess.run(summary_t), "record_stats_feature-values", total_records)
+      self._assertSummaryHasSum(
+          sess.run(summary_t), "record_stats_features", total_records * 4)
+      self._assertSummaryHasSum(
+          sess.run(summary_t), "record_stats_feature-values",
+          self._sum_keywords(1) * num_epochs + 3 * total_records)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/ops/stats_ops.py b/tensorflow/python/data/experimental/ops/stats_ops.py
index c918d223e8..54ef6fc3e8 100644
--- a/tensorflow/python/data/experimental/ops/stats_ops.py
+++ b/tensorflow/python/data/experimental/ops/stats_ops.py
@@ -89,15 +89,19 @@ class StatsAggregator(object):
 class _SetStatsAggregatorDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that acts as an identity, and sets given stats_aggregator."""
 
-  def __init__(self, input_dataset, stats_aggregator):
+  def __init__(self, input_dataset, stats_aggregator, tag, prefix):
     super(_SetStatsAggregatorDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._stats_aggregator = stats_aggregator
+    self._tag = tag
+    self._prefix = prefix
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.set_stats_aggregator_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._stats_aggregator._resource,  # pylint: disable=protected-access
+        self._tag,
+        self._prefix,
         **dataset_ops.flat_structure(self))
 
   @property
@@ -114,11 +118,15 @@ class _SetStatsAggregatorDataset(dataset_ops.UnaryDataset):
 
 
 @tf_export("data.experimental.set_stats_aggregator")
-def set_stats_aggregator(stats_aggregator):
+def set_stats_aggregator(stats_aggregator, tag="", counter_prefix=""):
   """Set the given `stats_aggregator` for aggregating the input dataset stats.
 
   Args:
-    stats_aggregator: A `tf.data.experimental.StatsAggregator` object.
+    stats_aggregator: A `tf.contrib.data.StatsAggregator` object.
+    tag: (Optional) String, all statistics recorded for the input `dataset`
+      will have given `tag` prepend with the name.
+    counter_prefix: (Optional) String, all statistics recorded as `counters`
+      will have the given `prefix` for the counter. Defaults to "/tesorflow".
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -126,7 +134,8 @@ def set_stats_aggregator(stats_aggregator):
   """
 
   def _apply_fn(dataset):
-    return _SetStatsAggregatorDataset(dataset, stats_aggregator)
+    return _SetStatsAggregatorDataset(dataset, stats_aggregator, tag,
+                                      counter_prefix)
 
   return _apply_fn
 
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
index b14585f8d7..2a1f899dc0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -122,7 +122,7 @@ tf_module {
   }
   member_method {
     name: "set_stats_aggregator"
-    argspec: "args=[\'stats_aggregator\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'stats_aggregator\', \'tag\', \'counter_prefix\'], varargs=None, keywords=None, defaults=[\'\', \'\'], "
   }
   member_method {
     name: "shuffle_and_repeat"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
index b14585f8d7..2a1f899dc0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -122,7 +122,7 @@ tf_module {
   }
   member_method {
     name: "set_stats_aggregator"
-    argspec: "args=[\'stats_aggregator\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'stats_aggregator\', \'tag\', \'counter_prefix\'], varargs=None, keywords=None, defaults=[\'\', \'\'], "
   }
   member_method {
     name: "shuffle_and_repeat"
-- 
GitLab


From 7566f3d5ad690c71c36e78611b1ae5913ec3e845 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 13:22:52 -0700
Subject: [PATCH 356/570] Fix handling of tuples in CreateCopyWithNewLayout.

If the layout of a single tensor in a tuple is different from its use, then
CreateCopyWithNewLayout will do a deep copy of the entire tuple.  Not only does
this operation create unnecessary copies of elements where the layout is the
same, it will throw an error if the tuple contains elements like token[] that
cannot be copied.  As a result, layout assignment on TPU occassionally causes
mysterious compilation failures for code that runs correctly on CPU and GPU.

PiperOrigin-RevId: 215615731
---
 .../compiler/xla/service/layout_assignment.cc | 28 +++++----
 .../xla/service/layout_assignment_test.cc     | 59 +++++++++++++++++++
 2 files changed, 76 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 68a08a0886..cc4a342e9d 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -792,21 +792,27 @@ StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
       << " instruction: " << instruction->ToString();
 
   if (ShapeUtil::IsTuple(instruction->shape())) {
-    // Deep-copy tuples.
+    // Copy tuple elements which have differing layouts.
     std::vector<HloInstruction*> element_copies;
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
          ++i) {
+      const Shape& target_shape =
+          ShapeUtil::GetSubshape(shape_with_layout, {i});
+      const Shape& instr_shape =
+          ShapeUtil::GetSubshape(instruction->shape(), {i});
       HloInstruction* gte = instruction->parent()->AddInstruction(
-          HloInstruction::CreateGetTupleElement(
-              ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction,
-              i));
-      SetupCopiedInstruction(*instruction, gte, {i});
-      // Recurse to copy each elements.
-      TF_ASSIGN_OR_RETURN(
-          HloInstruction * element_copy,
-          CreateCopyWithNewLayout(
-              ShapeUtil::GetSubshape(shape_with_layout, {i}), gte));
-      element_copies.push_back(element_copy);
+          HloInstruction::CreateGetTupleElement(instr_shape, instruction, i));
+
+      if (ShapeUtil::Equal(target_shape, instr_shape)) {
+        // Shapes and layouts are equal, no need to copy.
+        element_copies.push_back(gte);
+      } else {
+        SetupCopiedInstruction(*instruction, gte, {i});
+        // Recurse to copy each element.
+        TF_ASSIGN_OR_RETURN(HloInstruction * element_copy,
+                            CreateCopyWithNewLayout(target_shape, gte));
+        element_copies.push_back(element_copy);
+      }
     }
     // Gather element copies into a tuple with a new Tuple instruction.
     HloInstruction* tuple_copy = instruction->parent()->AddInstruction(
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 15c16d667c..2c549cd872 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -1043,5 +1043,64 @@ TEST_F(LayoutAssignmentTest, PropagatingLayoutFromResultToOperand) {
                                     op::ShapeWithLayout(shape_copy))));
 }
 
+TEST_F(LayoutAssignmentTest, TupleCopyOnLayoutMismatch) {
+  // The first infeed uses layout {0,1}, while the second uses layout {1,0}.
+  // The mismatch forces a copy of the tuple.  The tuple contains a token, so
+  // layout assignment will fail if it tries to copy the whole tuple.
+  const char* module_str = R"(
+    HloModule TupleCopyOnLayoutMismatch
+
+    condition.1 (tup: (s32[], token[], f32[512,1024]{0,1})) -> pred[] {
+      tup.1 = (s32[], token[], f32[512,1024]{0,1}) parameter(0)
+      counter.1 = s32[] get-tuple-element(tup.1), index=0
+      five = s32[] constant(5)
+      ROOT lt = pred[] less-than(counter.1, five)
+    }
+
+    body.2 (tup: (s32[], token[], f32[512,1024]{0,1})) -> (s32[], token[], f32[512,1024]{0,1}) {
+      tup.2 = (s32[], token[], f32[512,1024]{0,1}) parameter(0)
+      counter.2 = s32[] get-tuple-element(tup.2), index=0
+      tok.2 = token[] get-tuple-element(tup.2), index=1
+
+      ifeed.2 = (f32[512,1024]{1,0}, token[]) infeed(tok.2)
+      next_tok = token[] get-tuple-element(ifeed.2), index=1
+      next_buf = f32[512,1024]{1,0} get-tuple-element(ifeed.2), index=0
+
+      one = s32[] constant(1)
+      next_counter = s32[] add(counter.2, one)
+      ROOT tup = (s32[], token[], f32[512,1024]{0,1}) tuple(next_counter, next_tok, next_buf)
+    }
+
+    ENTRY main () -> f32[512,1024]{0,1} {
+      start_tok = token[] after-all()
+
+      ifeed.3 = (f32[512,1024]{0,1}, token[]) infeed(start_tok)
+      itok = token[] get-tuple-element(ifeed.3), index=1
+      ibuf = f32[512,1024]{0,1} get-tuple-element(ifeed.3), index=0
+
+      zero = s32[] constant(0)
+      itup = (s32[], token[], f32[512,1024]{0,1}) tuple(zero, itok, ibuf)
+
+      loop = (s32[], token[], f32[512,1024]{0,1}) while(itup), condition=condition.1, body=body.2
+      ROOT result = f32[512,1024]{0,1} get-tuple-element(loop), index=2
+    }
+  )";
+
+  ParseAndVerifyModule(module_str);
+  ComputationLayout computation_layout(
+      module().entry_computation()->ComputeProgramShape());
+
+  // Sanity check to verify that there's a layout mismatch.
+  EXPECT_THAT(LayoutOf(&module(), "ibuf"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(&module(), "next_buf"), ElementsAre(1, 0));
+
+  AssignLayouts(&module(), &computation_layout);
+
+  // Make sure that layout assignment did not magically eliminate the mismatch,
+  // in which case the test didn't prove anything.
+  EXPECT_THAT(LayoutOf(&module(), "ibuf"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(&module(), "next_buf"), ElementsAre(1, 0));
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From c2c8cfe22492cf7fab804d32283b623632270035 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 13:25:22 -0700
Subject: [PATCH 357/570] Add the option of merging bidirectional RNN and LSTM
 outputs into a single output tensor.

This is useful if the output of both directions will be passed to the next layer as a single output, as it avoids adding a concatenation op, which can be expensive on mobile devices where memory movement is relatively expensive.

PiperOrigin-RevId: 215616140
---
 tensorflow/contrib/lite/c/builtin_op_data.h   |  16 ++
 .../contrib/lite/c/builtin_op_data_test.cc    |   2 +
 .../lite/core/api/flatbuffer_conversions.cc   |  34 ++-
 .../kernels/bidirectional_sequence_lstm.cc    | 116 +++++----
 .../bidirectional_sequence_lstm_test.cc       | 186 +++++++++++++-
 .../kernels/bidirectional_sequence_rnn.cc     |  85 +++---
 .../bidirectional_sequence_rnn_test.cc        |  56 +++-
 tensorflow/contrib/lite/schema/schema.fbs     |  12 +
 .../contrib/lite/schema/schema_generated.h    | 243 +++++++++++++++++-
 9 files changed, 640 insertions(+), 110 deletions(-)

diff --git a/tensorflow/contrib/lite/c/builtin_op_data.h b/tensorflow/contrib/lite/c/builtin_op_data.h
index be9d551ee4..44daf7adaa 100644
--- a/tensorflow/contrib/lite/c/builtin_op_data.h
+++ b/tensorflow/contrib/lite/c/builtin_op_data.h
@@ -99,6 +99,12 @@ typedef struct {
   TfLiteFusedActivation activation;
 } TfLiteSequenceRNNParams;
 
+typedef struct {
+  bool time_major;
+  TfLiteFusedActivation activation;
+  bool merge_outputs;
+} TfLiteBidirectionalSequenceRNNParams;
+
 typedef enum {
   kTfLiteFullyConnectedWeightsFormatDefault = 0,
   kTfLiteFullyConnectedWeightsFormatShuffled4x16Int8 = 1,
@@ -180,6 +186,16 @@ typedef struct {
   TfLiteLSTMKernelType kernel_type;
 } TfLiteLSTMParams;
 
+typedef struct {
+  // Parameters for the LSTM kernel.
+  TfLiteFusedActivation activation;
+  float cell_clip;
+  float proj_clip;
+
+  // If true, store the outputs of both directions in the first output.
+  bool merge_outputs;
+} TfLiteBidirectionalSequenceLSTMParams;
+
 typedef struct {
   bool align_corners;
 } TfLiteResizeBilinearParams;
diff --git a/tensorflow/contrib/lite/c/builtin_op_data_test.cc b/tensorflow/contrib/lite/c/builtin_op_data_test.cc
index 4d0ba75e68..ba458b4252 100644
--- a/tensorflow/contrib/lite/c/builtin_op_data_test.cc
+++ b/tensorflow/contrib/lite/c/builtin_op_data_test.cc
@@ -73,6 +73,8 @@ TEST(IntArray, CanCompileStructs) {
   TfLiteFakeQuantParams fake_quant_params;
   TfLitePackParams pack_params;
   TfLiteOneHotParams one_hot_params;
+  TfLiteBidirectionalSequenceRNNParams bidi_sequence_rnn_params;
+  TfLiteBidirectionalSequenceLSTMParams bidi_sequence_lstm_params;
 }
 
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc b/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc
index e6900e0950..eac7db9a88 100644
--- a/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc
@@ -224,10 +224,8 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
     case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: {
-      TfLiteSequenceRNNParams* params =
-          allocator->AllocatePOD<TfLiteSequenceRNNParams>();
+      auto params = allocator->AllocatePOD<TfLiteSequenceRNNParams>();
       if (auto* sequence_rnn_params =
               op->builtin_options_as_SequenceRNNOptions()) {
         params->activation =
@@ -237,6 +235,19 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN: {
+      auto params =
+          allocator->AllocatePOD<TfLiteBidirectionalSequenceRNNParams>();
+      if (auto* bidi_sequence_rnn_params =
+              op->builtin_options_as_BidirectionalSequenceRNNOptions()) {
+        params->activation = parse_activation(
+            bidi_sequence_rnn_params->fused_activation_function());
+        params->time_major = bidi_sequence_rnn_params->time_major();
+        params->merge_outputs = bidi_sequence_rnn_params->merge_outputs();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_RNN: {
       TfLiteRNNParams* params = allocator->AllocatePOD<TfLiteRNNParams>();
       if (auto* rnn_params = op->builtin_options_as_RNNOptions()) {
@@ -360,10 +371,9 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
     case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
     case BuiltinOperator_LSTM: {
-      TfLiteLSTMParams* params = allocator->AllocatePOD<TfLiteLSTMParams>();
+      auto params = allocator->AllocatePOD<TfLiteLSTMParams>();
       if (auto* lstm_params = op->builtin_options_as_LSTMOptions()) {
         params->activation =
             parse_activation(lstm_params->fused_activation_function());
@@ -381,6 +391,20 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM: {
+      auto params =
+          allocator->AllocatePOD<TfLiteBidirectionalSequenceLSTMParams>();
+      if (auto* bidi_lstm_params =
+              op->builtin_options_as_BidirectionalSequenceLSTMOptions()) {
+        params->activation =
+            parse_activation(bidi_lstm_params->fused_activation_function());
+        params->cell_clip = bidi_lstm_params->cell_clip();
+        params->proj_clip = bidi_lstm_params->proj_clip();
+        params->merge_outputs = bidi_lstm_params->merge_outputs();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_RESIZE_BILINEAR: {
       auto* params = allocator->AllocatePOD<TfLiteResizeBilinearParams>();
       if (auto* schema_params =
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
index 66b947771c..0532528f52 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
@@ -119,7 +119,7 @@ constexpr int kBwAuxInputToOutputWeightsTensor = 47;  // Optional
 
 // Output tensors.
 constexpr int kFwOutputTensor = 0;
-constexpr int kBwOutputTensor = 1;
+constexpr int kBwOutputTensor = 1;  // Ignored if merge_outputs is set.
 
 // Temporary tensors.
 enum TemporaryTensor {
@@ -162,7 +162,8 @@ TfLiteStatus CheckLstmTensorDimensions(
     int input_gate_bias_tensor, int forget_gate_bias_tensor,
     int cell_gate_bias_tensor, int output_gate_bias_tensor,
     int projection_weights_tensor, int projection_bias_tensor) {
-  const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>(
+      node->builtin_data);
 
   // Making sure clipping parameters have valid values.
   // == 0 means no clipping
@@ -347,10 +348,13 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 // tensors. Also check that the size of the input tensors match each other.
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>(
+      node->builtin_data);
 
   // Check we have all the inputs and outputs we need.
   TF_LITE_ENSURE_EQ(context, node->inputs->size, 48);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size,
+                    params->merge_outputs ? 1 : 2);
 
   // Inferring batch size, number of outputs and sequence length and
   // number of cells from the input tensors.
@@ -368,6 +372,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, fw_input_to_output_weights->dims->data[1],
                     n_input);
 
+  const TfLiteTensor* bw_input_to_output_weights =
+      GetInput(context, node, kBwInputToOutputWeightsTensor);
+  const int n_bw_cell = bw_input_to_output_weights->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->data[1],
+                    n_input);
+
   const TfLiteTensor* fw_recurrent_to_output_weights =
       GetInput(context, node, kFwRecurrentToOutputWeightsTensor);
   TF_LITE_ENSURE_EQ(context, fw_recurrent_to_output_weights->dims->size, 2);
@@ -375,6 +386,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                     n_fw_cell);
   const int n_fw_output = fw_recurrent_to_output_weights->dims->data[1];
 
+  const TfLiteTensor* bw_recurrent_to_output_weights =
+      GetInput(context, node, kBwRecurrentToOutputWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->data[0],
+                    n_bw_cell);
+  const int n_bw_output = bw_recurrent_to_output_weights->dims->data[1];
+
   // Check that input tensor dimensions matches with each other.
   TF_LITE_ENSURE_OK(
       context, CheckInputTensorDimensions(context, node, n_input, n_fw_output,
@@ -440,7 +458,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteIntArray* fw_output_size = TfLiteIntArrayCreate(3);
   fw_output_size->data[0] = max_time;
   fw_output_size->data[1] = n_batch;
-  fw_output_size->data[2] = n_fw_output;
+  fw_output_size->data[2] =
+      params->merge_outputs ? n_bw_output + n_fw_output : n_fw_output;
   TF_LITE_ENSURE_OK(context,
                     context->ResizeTensor(context, fw_output, fw_output_size));
 
@@ -479,39 +498,28 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_scratch_buffer,
                                                    fw_scratch_buffer_size));
   // Same for the backward cell.
-  const TfLiteTensor* bw_input_to_output_weights =
-      GetInput(context, node, kBwInputToOutputWeightsTensor);
-  const int n_bw_cell = bw_input_to_output_weights->dims->data[0];
-  TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->data[1],
-                    n_input);
-
-  const TfLiteTensor* bw_recurrent_to_output_weights =
-      GetInput(context, node, kBwRecurrentToOutputWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->data[0],
-                    n_bw_cell);
-  const int n_bw_output = bw_recurrent_to_output_weights->dims->data[1];
 
   // Check that input tensor dimensions matches with each other.
   TF_LITE_ENSURE_OK(
       context, CheckInputTensorDimensions(context, node, n_input, n_bw_output,
                                           n_bw_cell));
 
-  // Get the pointer to output, activation_state and cell_state buffer tensors.
-  TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
+  // Get the pointer to activation_state and cell_state buffer tensors.
   TfLiteTensor* bw_activation_state =
       GetVariableInput(context, node, kBwInputActivationStateTensor);
   TfLiteTensor* bw_cell_state =
       GetVariableInput(context, node, kBwInputCellStateTensor);
 
   // Resize the output tensors.
-  TfLiteIntArray* bw_output_size = TfLiteIntArrayCreate(3);
-  bw_output_size->data[0] = max_time;
-  bw_output_size->data[1] = n_batch;
-  bw_output_size->data[2] = n_bw_output;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, bw_output, bw_output_size));
+  if (!params->merge_outputs) {
+    TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
+    TfLiteIntArray* bw_output_size = TfLiteIntArrayCreate(3);
+    bw_output_size->data[0] = max_time;
+    bw_output_size->data[1] = n_batch;
+    bw_output_size->data[2] = n_bw_output;
+    TF_LITE_ENSURE_OK(
+        context, context->ResizeTensor(context, bw_output, bw_output_size));
+  }
 
   // Check the shape of input state tensors.
   // These tensor may be 1D or 2D. It's fine as long as the total size is
@@ -705,7 +713,7 @@ TfLiteStatus EvalFloat(
     const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
     const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
     const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, bool forward_sequence,
+    const TfLiteLSTMParams* params, bool forward_sequence, int output_offset,
     TfLiteTensor* scratch_buffer, TfLiteTensor* activation_state,
     TfLiteTensor* cell_state, TfLiteTensor* output) {
   const int max_time = input->dims->data[0];
@@ -771,12 +779,13 @@ TfLiteStatus EvalFloat(
 
   // Loop through the sequence.
   const int input_step = n_batch * n_input;
-  const int output_step = n_batch * n_output;
+  const int output_step = n_batch * output->dims->data[2];
   for (int t = 0; t < max_time; t++) {
     // If this is the forward_sequence, step forward, otherwise step backwards.
     const int t_rel = forward_sequence ? t : max_time - t - 1;
     const float* input_ptr = input->data.f + t_rel * input_step;
-    float* output_ptr_time = output->data.f + t_rel * output_step;
+    float* output_ptr_time =
+        output->data.f + t_rel * output_step + output_offset;
 
     kernel_utils::LstmStepWithAuxInput(
         input_ptr, input_to_input_weights_ptr, input_to_forget_weights->data.f,
@@ -816,7 +825,7 @@ TfLiteStatus EvalHybrid(
     const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
     const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
     const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, bool forward_sequence,
+    const TfLiteLSTMParams* params, bool forward_sequence, int output_offset,
     TfLiteTensor* scratch_buffer, TfLiteTensor* scaling_factors,
     TfLiteTensor* prod_scaling_factors, TfLiteTensor* recovered_cell_weights,
     TfLiteTensor* input_quantized, TfLiteTensor* aux_input_quantized,
@@ -972,12 +981,12 @@ TfLiteStatus EvalHybrid(
 
   // Feed the sequence into the LSTM step-by-step.
   const int input_step = n_batch * n_input;
-  const int output_step = n_batch * n_output;
+  const int output_step = n_batch * output->dims->data[2];
   for (int t = 0; t < max_time; t++) {
     // If this is the forward_sequence, step forward, otherwise step backwards.
     const int t_rel = forward_sequence ? t : max_time - t - 1;
     const float* input_ptr = input->data.f + t_rel * input_step;
-    float* output_ptr = output->data.f + t_rel * output_step;
+    float* output_ptr = output->data.f + t_rel * output_step + output_offset;
 
     kernel_utils::LstmStepWithAuxInput(
         input_ptr, input_to_input_weights_ptr, input_to_input_weights_scale,
@@ -1011,7 +1020,8 @@ TfLiteStatus EvalHybrid(
 
 // The LSTM Op engine.
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>(
+      node->builtin_data);
 
   // Input tensor.
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
@@ -1107,7 +1117,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       GetVariableInput(context, node, kBwInputActivationStateTensor);
   TfLiteTensor* bw_cell_state =
       GetVariableInput(context, node, kBwInputCellStateTensor);
-  TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
+  TfLiteTensor* bw_output = params->merge_outputs
+                                ? nullptr
+                                : GetOutput(context, node, kBwOutputTensor);
 
   // Temporary tensors.
   TfLiteTensor* fw_scratch_buffer =
@@ -1135,6 +1147,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* bw_aux_input_to_output_weights =
       GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor);
 
+  // Populate a TfLiteLSTMParams struct for the evaluation functions.
+  TfLiteLSTMParams lstm_params = {params->activation, params->cell_clip,
+                                  params->proj_clip, kTfLiteLSTMFullKernel};
+
+  const int bw_output_offset =
+      params->merge_outputs ? fw_recurrent_to_output_weights->dims->data[1] : 0;
+  const auto actual_bw_output = params->merge_outputs ? fw_output : bw_output;
+
   switch (fw_input_to_output_weights->type) {
     case kTfLiteFloat32: {
       TfLiteStatus fw_pass_status = EvalFloat(
@@ -1147,9 +1167,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights,
           fw_aux_input_to_output_weights, fw_input_gate_bias,
           fw_forget_gate_bias, fw_cell_bias, fw_output_gate_bias,
-          fw_projection_weights, fw_projection_bias, params,
-          /*forward_sequence=*/true, fw_scratch_buffer, fw_activation_state,
-          fw_cell_state, fw_output);
+          fw_projection_weights, fw_projection_bias, &lstm_params,
+          /*forward_sequence=*/true, /*output_offset=*/0, fw_scratch_buffer,
+          fw_activation_state, fw_cell_state, fw_output);
       TF_LITE_ENSURE_OK(context, fw_pass_status);
 
       TfLiteStatus bw_pass_status = EvalFloat(
@@ -1162,9 +1182,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           bw_aux_input_to_forget_weights, bw_aux_input_to_cell_weights,
           bw_aux_input_to_output_weights, bw_input_gate_bias,
           bw_forget_gate_bias, bw_cell_bias, bw_output_gate_bias,
-          bw_projection_weights, bw_projection_bias, params,
-          /*forward_sequence=*/false, bw_scratch_buffer, bw_activation_state,
-          bw_cell_state, bw_output);
+          bw_projection_weights, bw_projection_bias, &lstm_params,
+          /*forward_sequence=*/false, bw_output_offset, bw_scratch_buffer,
+          bw_activation_state, bw_cell_state, actual_bw_output);
       TF_LITE_ENSURE_OK(context, bw_pass_status);
       return kTfLiteOk;
     }
@@ -1198,10 +1218,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights,
           fw_aux_input_to_output_weights, fw_input_gate_bias,
           fw_forget_gate_bias, fw_cell_bias, fw_output_gate_bias,
-          fw_projection_weights, fw_projection_bias, params,
-          /*forward_sequence=*/true, fw_scratch_buffer, scaling_factors,
-          prod_scaling_factors, recovered_cell_weights, input_quantized,
-          aux_input_quantized, fw_activation_state_quantized,
+          fw_projection_weights, fw_projection_bias, &lstm_params,
+          /*forward_sequence=*/true, /*output_offset=*/0, fw_scratch_buffer,
+          scaling_factors, prod_scaling_factors, recovered_cell_weights,
+          input_quantized, aux_input_quantized, fw_activation_state_quantized,
           fw_cell_state_quantized, fw_activation_state, fw_cell_state,
           fw_output);
       TF_LITE_ENSURE_OK(context, fw_pass_status);
@@ -1216,12 +1236,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights,
           fw_aux_input_to_output_weights, bw_input_gate_bias,
           bw_forget_gate_bias, bw_cell_bias, bw_output_gate_bias,
-          bw_projection_weights, bw_projection_bias, params,
-          /*forward_sequence=*/false, bw_scratch_buffer, scaling_factors,
-          prod_scaling_factors, recovered_cell_weights, input_quantized,
-          aux_input_quantized, bw_activation_state_quantized,
+          bw_projection_weights, bw_projection_bias, &lstm_params,
+          /*forward_sequence=*/false, bw_output_offset, bw_scratch_buffer,
+          scaling_factors, prod_scaling_factors, recovered_cell_weights,
+          input_quantized, aux_input_quantized, bw_activation_state_quantized,
           bw_cell_state_quantized, bw_activation_state, bw_cell_state,
-          bw_output);
+          actual_bw_output);
       TF_LITE_ENSURE_OK(context, bw_pass_status);
       return kTfLiteOk;
     }
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc
index 74ba8021c2..9cc04907e1 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc
@@ -35,8 +35,8 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
   BidirectionalLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
                            int sequence_length, bool use_cifg,
                            bool use_peephole, bool use_projection_weights,
-                           bool use_projection_bias, float cell_clip,
-                           float proj_clip,
+                           bool use_projection_bias, bool merge_outputs,
+                           float cell_clip, float proj_clip,
                            const std::vector<std::vector<int>>& input_shapes)
       : n_batch_(n_batch),
         n_input_(n_input),
@@ -175,7 +175,9 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
 
     fw_output_ = AddOutput(TensorType_FLOAT32);
 
-    bw_output_ = AddOutput(TensorType_FLOAT32);
+    if (!merge_outputs) {
+      bw_output_ = AddOutput(TensorType_FLOAT32);
+    }
 
     aux_input_ = AddNullInput();
     fw_aux_input_to_input_weights_ = AddNullInput();
@@ -188,9 +190,10 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
     bw_aux_input_to_output_weights_ = AddNullInput();
 
     SetBuiltinOp(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
-                 BuiltinOptions_LSTMOptions,
-                 CreateLSTMOptions(builder_, ActivationFunctionType_TANH,
-                                   cell_clip, proj_clip)
+                 BuiltinOptions_BidirectionalSequenceLSTMOptions,
+                 CreateBidirectionalSequenceLSTMOptions(
+                     builder_, ActivationFunctionType_TANH, cell_clip,
+                     proj_clip, merge_outputs)
                      .Union());
     BuildInterpreter(input_shapes);
   }
@@ -380,7 +383,8 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -526,6 +530,162 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
               ElementsAreArray(ArrayFloatNear(bw_expected)));
 }
 
+// Same as the previous test, yet with a single merged output tensor.
+TEST(LSTMOpTest, BlackBoxTestMergedOutput) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+
+  BidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
+      /*use_peephole=*/false, /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*merge_outputs=*/true, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          // Forward cell
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          // Backward cell
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {n_batch, sequence_length, 0},  // aux_input tensor
+          {n_cell, 0},                    // aux_fw_input_to_input tensor
+          {n_cell, 0},                    // aux_fw_input_to_forget tensor
+          {n_cell, 0},                    // aux_fw_input_to_cell tensor
+          {n_cell, 0},                    // aux_fw_input_to_output tensor
+          {n_cell, 0},                    // aux_bw_input_to_input tensor
+          {n_cell, 0},                    // aux_bw_input_to_forget tensor
+          {n_cell, 0},                    // aux_bw_input_to_cell tensor
+          {n_cell, 0},                    // aux_bw_input_to_output tensor
+      });
+
+  lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
+                               -0.34550029, 0.04266912, -0.15680569,
+                               -0.34856534, 0.43890524});
+
+  lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163,
+                              -0.20583314, 0.44344562, 0.22077113,
+                              -0.29909778});
+
+  lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935,
+                                -0.31343272, -0.40032279, 0.44781327,
+                                0.01387155, -0.35593212});
+
+  lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829,
+                                0.40525138, 0.44272184, 0.03897077, -0.1556896,
+                                0.19487578});
+
+  lstm.SetInputGateBias({0., 0., 0., 0.});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324,
+       -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322,
+       -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841,
+       -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659,
+       -0.46367589, 0.26016325, -0.03894562, -0.16368064});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892,
+       -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436,
+       0.28053468, 0.01560611, -0.20127171, -0.01140004});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793,
+       0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421,
+       -0.51818722, -0.15390486, 0.0468148, 0.39922136});
+
+  // Input should have n_input * sequence_length many values.
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_fw_golden_output[] = {
+      -0.02973187, 0.1229473,  0.20885126, -0.15358765,
+      -0.03716109, 0.12507336, 0.41193449, -0.20860538,
+      -0.15053082, 0.09120187, 0.24278517, -0.12222792};
+  static float lstm_bw_golden_output[] = {
+      -0.0806187, 0.139077, 0.400476,   -0.197842, -0.0332076, 0.123838,
+      0.309777,   -0.17621, -0.0490733, 0.0739237, 0.067706,   -0.0208124};
+
+  float* batch0_start = lstm_input;
+  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+
+  lstm.Invoke();
+
+  std::vector<float> merged_expected;
+  for (int k = 0; k < lstm.sequence_length(); k++) {
+    merged_expected.insert(
+        merged_expected.end(),
+        lstm_fw_golden_output + k * lstm.num_fw_outputs(),
+        lstm_fw_golden_output + (k + 1) * lstm.num_fw_outputs());
+    merged_expected.insert(
+        merged_expected.end(),
+        lstm_bw_golden_output + k * lstm.num_bw_outputs(),
+        lstm_bw_golden_output + (k + 1) * lstm.num_bw_outputs());
+  }
+  EXPECT_THAT(lstm.GetFwOutput(),
+              ElementsAreArray(ArrayFloatNear(merged_expected)));
+}
+
 TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClippingReverse) {
   const int n_batch = 1;
   const int n_input = 2;
@@ -537,7 +697,8 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClippingReverse) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -696,7 +857,8 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
       /*use_peephole=*/true, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -845,7 +1007,8 @@ TEST(LSTMOpTest,
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
       /*use_peephole=*/true, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -994,7 +1157,8 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/true, /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
index 2f896c5289..9f62ac3f2c 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
@@ -47,7 +47,7 @@ constexpr int kFwAuxWeightsTensor = 10;  // Optional.
 constexpr int kBwAuxWeightsTensor = 11;  // Optional.
 // Output tensors.
 constexpr int kFwOutputTensor = 0;
-constexpr int kBwOutputTensor = 1;
+constexpr int kBwOutputTensor = 1;  // Only if merge_outputs is false.
 
 // Temporary tensors.
 enum TemporaryTensor {
@@ -70,9 +70,13 @@ void Free(TfLiteContext* context, void* buffer) {
 }
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceRNNParams*>(
+      node->builtin_data);
+
   // Check we have all the inputs and outputs we need.
   TF_LITE_ENSURE_EQ(context, node->inputs->size, 12);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size,
+                    params->merge_outputs ? 1 : 2);
 
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const TfLiteTensor* fw_input_weights =
@@ -142,9 +146,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                       bw_aux_input_weights->dims->data[1]);
   }
 
-  TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
-  TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
-
   const bool is_hybrid_op =
       (fw_input_weights->type == kTfLiteUInt8 && input->type == kTfLiteFloat32);
 
@@ -233,18 +234,23 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   }
 
   // Resize outputs.
+  TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
   TfLiteIntArray* fw_output_size_array = TfLiteIntArrayCreate(3);
   fw_output_size_array->data[0] = batch_size;
   fw_output_size_array->data[1] = max_time;
-  fw_output_size_array->data[2] = fw_num_units;
+  fw_output_size_array->data[2] =
+      params->merge_outputs ? fw_num_units + bw_num_units : fw_num_units;
   TF_LITE_ENSURE_OK(
       context, context->ResizeTensor(context, fw_output, fw_output_size_array));
-  TfLiteIntArray* bw_output_size_array = TfLiteIntArrayCreate(3);
-  bw_output_size_array->data[0] = batch_size;
-  bw_output_size_array->data[1] = max_time;
-  bw_output_size_array->data[2] = bw_num_units;
-  TF_LITE_ENSURE_OK(
-      context, context->ResizeTensor(context, bw_output, bw_output_size_array));
+  if (!params->merge_outputs) {
+    TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
+    TfLiteIntArray* bw_output_size_array = TfLiteIntArrayCreate(3);
+    bw_output_size_array->data[0] = batch_size;
+    bw_output_size_array->data[1] = max_time;
+    bw_output_size_array->data[2] = bw_num_units;
+    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_output,
+                                                     bw_output_size_array));
+  }
 
   return kTfLiteOk;
 }
@@ -256,9 +262,9 @@ TfLiteStatus EvalFloat(
     const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias,
     const TfLiteTensor* aux_input, const TfLiteTensor* fw_aux_input_weights,
     const TfLiteTensor* bw_aux_input_weights,
-    const TfLiteSequenceRNNParams* params, TfLiteTensor* fw_hidden_state,
-    TfLiteTensor* fw_output, TfLiteTensor* bw_hidden_state,
-    TfLiteTensor* bw_output) {
+    const TfLiteBidirectionalSequenceRNNParams* params,
+    TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
+    TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) {
   const int batch_size = input->dims->data[0];
   const int max_time = input->dims->data[1];
   const int input_size = input->dims->data[2];
@@ -281,10 +287,15 @@ TfLiteStatus EvalFloat(
                                               ? bw_aux_input_weights->data.f
                                               : nullptr;
 
+  const int fw_output_step =
+      params->merge_outputs ? fw_num_units + bw_num_units : fw_num_units;
+  const int bw_output_step =
+      params->merge_outputs ? fw_num_units + bw_num_units : bw_num_units;
   for (int b = 0; b < batch_size; b++) {
     // Forward cell.
     float* fw_hidden_state_ptr_batch =
         fw_hidden_state->data.f + b * fw_num_units;
+    float* fw_output_offset = fw_output->data.f + b * fw_output_step * max_time;
     for (int s = 0; s < max_time; s++) {
       const float* input_ptr_batch =
           input->data.f + b * input_size * max_time + s * input_size;
@@ -292,8 +303,7 @@ TfLiteStatus EvalFloat(
           (aux_input != nullptr)
               ? aux_input->data.f + b * input_size * max_time + s * input_size
               : nullptr;
-      float* output_ptr_batch =
-          fw_output->data.f + b * fw_num_units * max_time + s * fw_num_units;
+      float* output_ptr_batch = fw_output_offset + s * fw_output_step;
 
       kernel_utils::RnnBatchStep(
           input_ptr_batch, fw_input_weights_ptr, aux_input_ptr_batch,
@@ -304,6 +314,10 @@ TfLiteStatus EvalFloat(
     // Backward cell.
     float* bw_hidden_state_ptr_batch =
         bw_hidden_state->data.f + b * bw_num_units;
+    float* bw_output_offset =
+        params->merge_outputs
+            ? fw_output->data.f + b * bw_output_step * max_time + fw_num_units
+            : bw_output->data.f + b * bw_output_step * max_time;
     for (int s = max_time - 1; s >= 0; s--) {
       const float* input_ptr_batch =
           input->data.f + b * input_size * max_time + s * input_size;
@@ -311,8 +325,7 @@ TfLiteStatus EvalFloat(
           (aux_input != nullptr)
               ? aux_input->data.f + b * input_size * max_time + s * input_size
               : nullptr;
-      float* output_ptr_batch =
-          bw_output->data.f + b * bw_num_units * max_time + s * bw_num_units;
+      float* output_ptr_batch = bw_output_offset + s * bw_output_step;
 
       kernel_utils::RnnBatchStep(
           input_ptr_batch, bw_input_weights_ptr, aux_input_ptr_batch,
@@ -331,11 +344,12 @@ TfLiteStatus EvalHybrid(
     const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias,
     const TfLiteTensor* aux_input, const TfLiteTensor* aux_fw_input_weights,
     const TfLiteTensor* aux_bw_input_weights,
-    const TfLiteSequenceRNNParams* params, TfLiteTensor* scaling_factors,
-    TfLiteTensor* input_quantized, TfLiteTensor* aux_input_quantized,
-    TfLiteTensor* fw_hidden_state_quantized, TfLiteTensor* fw_hidden_state,
-    TfLiteTensor* fw_output, TfLiteTensor* bw_hidden_state_quantized,
-    TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) {
+    const TfLiteBidirectionalSequenceRNNParams* params,
+    TfLiteTensor* scaling_factors, TfLiteTensor* input_quantized,
+    TfLiteTensor* aux_input_quantized, TfLiteTensor* fw_hidden_state_quantized,
+    TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
+    TfLiteTensor* bw_hidden_state_quantized, TfLiteTensor* bw_hidden_state,
+    TfLiteTensor* bw_output) {
   const int batch_size = input->dims->data[0];
   const int max_time = input->dims->data[1];
   const int input_size = input->dims->data[2];
@@ -384,10 +398,15 @@ TfLiteStatus EvalHybrid(
       reinterpret_cast<int8_t*>(bw_hidden_state_quantized->data.uint8);
   float* scaling_factors_ptr = scaling_factors->data.f;
 
+  const int fw_output_step =
+      params->merge_outputs ? fw_num_units + bw_num_units : fw_num_units;
+  const int bw_output_step =
+      params->merge_outputs ? fw_num_units + bw_num_units : bw_num_units;
   for (int b = 0; b < batch_size; b++) {
     // Forward cell.
     float* fw_hidden_state_ptr_batch =
         fw_hidden_state->data.f + b * fw_num_units;
+    float* fw_output_offset = fw_output->data.f + b * fw_output_step * max_time;
     for (int s = 0; s < max_time; s++) {
       const float* input_ptr_batch =
           input->data.f + b * input_size * max_time + s * input_size;
@@ -395,8 +414,7 @@ TfLiteStatus EvalHybrid(
           (aux_input != nullptr)
               ? aux_input->data.f + b * input_size * max_time + s * input_size
               : nullptr;
-      float* output_ptr_batch =
-          fw_output->data.f + b * fw_num_units * max_time + s * fw_num_units;
+      float* output_ptr_batch = fw_output_offset + s * fw_output_step;
 
       kernel_utils::RnnBatchStep(
           input_ptr_batch, fw_input_weights_ptr, fw_input_weights_scale,
@@ -411,6 +429,10 @@ TfLiteStatus EvalHybrid(
     // Backward cell.
     float* bw_hidden_state_ptr_batch =
         bw_hidden_state->data.f + b * bw_num_units;
+    float* bw_output_offset =
+        params->merge_outputs
+            ? fw_output->data.f + b * bw_output_step * max_time
+            : bw_output->data.f + b * bw_output_step * max_time;
     for (int s = max_time - 1; s >= 0; s--) {
       const float* input_ptr_batch =
           input->data.f + b * input_size * max_time + s * input_size;
@@ -418,8 +440,7 @@ TfLiteStatus EvalHybrid(
           (aux_input != nullptr)
               ? aux_input->data.f + b * input_size * max_time + s * input_size
               : nullptr;
-      float* output_ptr_batch =
-          bw_output->data.f + b * bw_num_units * max_time + s * bw_num_units;
+      float* output_ptr_batch = bw_output_offset + s * bw_output_step;
 
       kernel_utils::RnnBatchStep(
           input_ptr_batch, bw_input_weights_ptr, bw_input_weights_scale,
@@ -436,8 +457,8 @@ TfLiteStatus EvalHybrid(
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const auto* params =
-      reinterpret_cast<TfLiteSequenceRNNParams*>(node->builtin_data);
+  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceRNNParams*>(
+      node->builtin_data);
 
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const TfLiteTensor* fw_input_weights =
@@ -465,7 +486,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       GetVariableInput(context, node, kBwHiddenStateTensor);
 
   TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
-  TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
+  TfLiteTensor* bw_output = params->merge_outputs
+                                ? nullptr
+                                : GetOutput(context, node, kBwOutputTensor);
 
   switch (fw_input_weights->type) {
     case kTfLiteFloat32:
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
index 3e34ba6196..f555c472f5 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
@@ -654,7 +654,7 @@ const std::initializer_list<float> recurrent_weights = {
 class BidirectionalRNNOpModel : public SingleOpModel {
  public:
   BidirectionalRNNOpModel(int batches, int sequence_len, int fw_units,
-                          int bw_units, int input_size)
+                          int bw_units, int input_size, bool merge_outputs)
       : batches_(batches),
         sequence_len_(sequence_len),
         fw_units_(fw_units),
@@ -675,12 +675,15 @@ class BidirectionalRNNOpModel : public SingleOpModel {
     aux_bw_weights_ = AddNullInput();
 
     fw_output_ = AddOutput(TensorType_FLOAT32);
-    bw_output_ = AddOutput(TensorType_FLOAT32);
+    if (!merge_outputs) {
+      bw_output_ = AddOutput(TensorType_FLOAT32);
+    }
 
     SetBuiltinOp(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
-                 BuiltinOptions_SequenceRNNOptions,
-                 CreateSequenceRNNOptions(builder_, /*time_major=*/false,
-                                          ActivationFunctionType_RELU)
+                 BuiltinOptions_BidirectionalSequenceRNNOptions,
+                 CreateBidirectionalSequenceRNNOptions(
+                     builder_, /*time_major=*/false,
+                     ActivationFunctionType_RELU, merge_outputs)
                      .Union());
     BuildInterpreter({
         {batches_, sequence_len_, input_size_},  // input
@@ -767,7 +770,7 @@ class BidirectionalRNNOpModel : public SingleOpModel {
 TEST(BidirectionalRNNOpTest, BlackBoxTest) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8);
+                              /*input_size=*/8, /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
   rnn.SetFwBias(biases);
@@ -800,12 +803,49 @@ TEST(BidirectionalRNNOpTest, BlackBoxTest) {
   EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected)));
 }
 
+// Same as the previous test, yet with merged outputs.
+TEST(BidirectionalRNNOpTest, BlackBoxTestMergeOutputs) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8, /*merge_outputs=*/true);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  float* batch_start = rnn_input;
+  float* batch_end = batch_start + input_sequence_size;
+  rnn.SetInput(0, batch_start, batch_end);
+  rnn.SetInput(input_sequence_size, batch_start, batch_end);
+
+  rnn.Invoke();
+
+  std::vector<float> merged_expected;
+  for (int bid = 0; bid < rnn.num_batches(); bid++) {
+    for (int step = 0; step < rnn.sequence_len(); step++) {
+      merged_expected.insert(
+          merged_expected.end(),
+          rnn_golden_fw_output + rnn.num_fw_units() * step,
+          rnn_golden_fw_output + rnn.num_fw_units() * (step + 1));
+      merged_expected.insert(
+          merged_expected.end(),
+          rnn_golden_bw_output + rnn.num_bw_units() * step,
+          rnn_golden_bw_output + rnn.num_bw_units() * (step + 1));
+    }
+  }
+  EXPECT_THAT(rnn.GetFwOutput(),
+              ElementsAreArray(ArrayFloatNear(merged_expected)));
+}
+
 // Check that if the input sequence is reversed the outputs are the same just
 // forward and backward are swapped (and reversed).
 TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8);
+                              /*input_size=*/8, /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
   rnn.SetFwBias(biases);
@@ -851,7 +891,7 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
 TEST(BidirectionalRNNOpTest, EndToEndTest) {
   BidirectionalRNNOpModel rnn(/*batches=*/1, /*sequence_len=*/4,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8);
+                              /*input_size=*/8, /*merge_outputs=*/false);
   const int output_size = 4;
   float dnn_weights[] = {
       -0.5782342,  -0.052212059, 0.73036242,  -0.81216097, -0.80088139,
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 3da3188c3a..ff8430827c 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -248,6 +248,8 @@ union BuiltinOptions {
   SquareOptions,
   ZerosLikeOptions,
   FillOptions,
+  BidirectionalSequenceLSTMOptions,
+  BidirectionalSequenceRNNOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -327,6 +329,7 @@ table SequenceRNNOptions {
 table BidirectionalSequenceRNNOptions {
   time_major:bool;
   fused_activation_function:ActivationFunctionType;
+  merge_outputs: bool;
 }
 
 enum FullyConnectedOptionsWeightsFormat: byte {
@@ -391,6 +394,15 @@ table LSTMOptions {
   kernel_type: LSTMKernelType = FULL;
 }
 
+table BidirectionalSequenceLSTMOptions {
+  fused_activation_function:ActivationFunctionType;
+  cell_clip: float; // Optional, 0.0 means no clipping
+  proj_clip: float; // Optional, 0.0 means no clipping
+
+  // If true, store the outputs of both directions into the first output.
+  merge_outputs: bool;
+}
+
 table ResizeBilinearOptions {
   new_height: int (deprecated);
   new_width: int (deprecated);
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 23ac8484de..f3cb113c9c 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -79,6 +79,9 @@ struct LocalResponseNormalizationOptionsT;
 struct LSTMOptions;
 struct LSTMOptionsT;
 
+struct BidirectionalSequenceLSTMOptions;
+struct BidirectionalSequenceLSTMOptionsT;
+
 struct ResizeBilinearOptions;
 struct ResizeBilinearOptionsT;
 
@@ -676,11 +679,13 @@ enum BuiltinOptions {
   BuiltinOptions_SquareOptions = 66,
   BuiltinOptions_ZerosLikeOptions = 67,
   BuiltinOptions_FillOptions = 68,
+  BuiltinOptions_BidirectionalSequenceLSTMOptions = 69,
+  BuiltinOptions_BidirectionalSequenceRNNOptions = 70,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_FillOptions
+  BuiltinOptions_MAX = BuiltinOptions_BidirectionalSequenceRNNOptions
 };
 
-inline const BuiltinOptions (&EnumValuesBuiltinOptions())[69] {
+inline const BuiltinOptions (&EnumValuesBuiltinOptions())[71] {
   static const BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -750,7 +755,9 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[69] {
     BuiltinOptions_FloorDivOptions,
     BuiltinOptions_SquareOptions,
     BuiltinOptions_ZerosLikeOptions,
-    BuiltinOptions_FillOptions
+    BuiltinOptions_FillOptions,
+    BuiltinOptions_BidirectionalSequenceLSTMOptions,
+    BuiltinOptions_BidirectionalSequenceRNNOptions
   };
   return values;
 }
@@ -826,6 +833,8 @@ inline const char * const *EnumNamesBuiltinOptions() {
     "SquareOptions",
     "ZerosLikeOptions",
     "FillOptions",
+    "BidirectionalSequenceLSTMOptions",
+    "BidirectionalSequenceRNNOptions",
     nullptr
   };
   return names;
@@ -1112,6 +1121,14 @@ template<> struct BuiltinOptionsTraits<FillOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_FillOptions;
 };
 
+template<> struct BuiltinOptionsTraits<BidirectionalSequenceLSTMOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_BidirectionalSequenceLSTMOptions;
+};
+
+template<> struct BuiltinOptionsTraits<BidirectionalSequenceRNNOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_BidirectionalSequenceRNNOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1687,6 +1704,22 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_FillOptions ?
       reinterpret_cast<const FillOptionsT *>(value) : nullptr;
   }
+  BidirectionalSequenceLSTMOptionsT *AsBidirectionalSequenceLSTMOptions() {
+    return type == BuiltinOptions_BidirectionalSequenceLSTMOptions ?
+      reinterpret_cast<BidirectionalSequenceLSTMOptionsT *>(value) : nullptr;
+  }
+  const BidirectionalSequenceLSTMOptionsT *AsBidirectionalSequenceLSTMOptions() const {
+    return type == BuiltinOptions_BidirectionalSequenceLSTMOptions ?
+      reinterpret_cast<const BidirectionalSequenceLSTMOptionsT *>(value) : nullptr;
+  }
+  BidirectionalSequenceRNNOptionsT *AsBidirectionalSequenceRNNOptions() {
+    return type == BuiltinOptions_BidirectionalSequenceRNNOptions ?
+      reinterpret_cast<BidirectionalSequenceRNNOptionsT *>(value) : nullptr;
+  }
+  const BidirectionalSequenceRNNOptionsT *AsBidirectionalSequenceRNNOptions() const {
+    return type == BuiltinOptions_BidirectionalSequenceRNNOptions ?
+      reinterpret_cast<const BidirectionalSequenceRNNOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -2834,9 +2867,11 @@ struct BidirectionalSequenceRNNOptionsT : public flatbuffers::NativeTable {
   typedef BidirectionalSequenceRNNOptions TableType;
   bool time_major;
   ActivationFunctionType fused_activation_function;
+  bool merge_outputs;
   BidirectionalSequenceRNNOptionsT()
       : time_major(false),
-        fused_activation_function(ActivationFunctionType_NONE) {
+        fused_activation_function(ActivationFunctionType_NONE),
+        merge_outputs(false) {
   }
 };
 
@@ -2844,7 +2879,8 @@ struct BidirectionalSequenceRNNOptions FLATBUFFERS_FINAL_CLASS : private flatbuf
   typedef BidirectionalSequenceRNNOptionsT NativeTableType;
   enum {
     VT_TIME_MAJOR = 4,
-    VT_FUSED_ACTIVATION_FUNCTION = 6
+    VT_FUSED_ACTIVATION_FUNCTION = 6,
+    VT_MERGE_OUTPUTS = 8
   };
   bool time_major() const {
     return GetField<uint8_t>(VT_TIME_MAJOR, 0) != 0;
@@ -2852,10 +2888,14 @@ struct BidirectionalSequenceRNNOptions FLATBUFFERS_FINAL_CLASS : private flatbuf
   ActivationFunctionType fused_activation_function() const {
     return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
+  bool merge_outputs() const {
+    return GetField<uint8_t>(VT_MERGE_OUTPUTS, 0) != 0;
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<uint8_t>(verifier, VT_TIME_MAJOR) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           VerifyField<uint8_t>(verifier, VT_MERGE_OUTPUTS) &&
            verifier.EndTable();
   }
   BidirectionalSequenceRNNOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -2872,6 +2912,9 @@ struct BidirectionalSequenceRNNOptionsBuilder {
   void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
     fbb_.AddElement<int8_t>(BidirectionalSequenceRNNOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
   }
+  void add_merge_outputs(bool merge_outputs) {
+    fbb_.AddElement<uint8_t>(BidirectionalSequenceRNNOptions::VT_MERGE_OUTPUTS, static_cast<uint8_t>(merge_outputs), 0);
+  }
   explicit BidirectionalSequenceRNNOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2887,8 +2930,10 @@ struct BidirectionalSequenceRNNOptionsBuilder {
 inline flatbuffers::Offset<BidirectionalSequenceRNNOptions> CreateBidirectionalSequenceRNNOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
     bool time_major = false,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE,
+    bool merge_outputs = false) {
   BidirectionalSequenceRNNOptionsBuilder builder_(_fbb);
+  builder_.add_merge_outputs(merge_outputs);
   builder_.add_fused_activation_function(fused_activation_function);
   builder_.add_time_major(time_major);
   return builder_.Finish();
@@ -3424,6 +3469,96 @@ inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
 
 flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct BidirectionalSequenceLSTMOptionsT : public flatbuffers::NativeTable {
+  typedef BidirectionalSequenceLSTMOptions TableType;
+  ActivationFunctionType fused_activation_function;
+  float cell_clip;
+  float proj_clip;
+  bool merge_outputs;
+  BidirectionalSequenceLSTMOptionsT()
+      : fused_activation_function(ActivationFunctionType_NONE),
+        cell_clip(0.0f),
+        proj_clip(0.0f),
+        merge_outputs(false) {
+  }
+};
+
+struct BidirectionalSequenceLSTMOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef BidirectionalSequenceLSTMOptionsT NativeTableType;
+  enum {
+    VT_FUSED_ACTIVATION_FUNCTION = 4,
+    VT_CELL_CLIP = 6,
+    VT_PROJ_CLIP = 8,
+    VT_MERGE_OUTPUTS = 10
+  };
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  float cell_clip() const {
+    return GetField<float>(VT_CELL_CLIP, 0.0f);
+  }
+  float proj_clip() const {
+    return GetField<float>(VT_PROJ_CLIP, 0.0f);
+  }
+  bool merge_outputs() const {
+    return GetField<uint8_t>(VT_MERGE_OUTPUTS, 0) != 0;
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           VerifyField<float>(verifier, VT_CELL_CLIP) &&
+           VerifyField<float>(verifier, VT_PROJ_CLIP) &&
+           VerifyField<uint8_t>(verifier, VT_MERGE_OUTPUTS) &&
+           verifier.EndTable();
+  }
+  BidirectionalSequenceLSTMOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(BidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<BidirectionalSequenceLSTMOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct BidirectionalSequenceLSTMOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(BidirectionalSequenceLSTMOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  }
+  void add_cell_clip(float cell_clip) {
+    fbb_.AddElement<float>(BidirectionalSequenceLSTMOptions::VT_CELL_CLIP, cell_clip, 0.0f);
+  }
+  void add_proj_clip(float proj_clip) {
+    fbb_.AddElement<float>(BidirectionalSequenceLSTMOptions::VT_PROJ_CLIP, proj_clip, 0.0f);
+  }
+  void add_merge_outputs(bool merge_outputs) {
+    fbb_.AddElement<uint8_t>(BidirectionalSequenceLSTMOptions::VT_MERGE_OUTPUTS, static_cast<uint8_t>(merge_outputs), 0);
+  }
+  explicit BidirectionalSequenceLSTMOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  BidirectionalSequenceLSTMOptionsBuilder &operator=(const BidirectionalSequenceLSTMOptionsBuilder &);
+  flatbuffers::Offset<BidirectionalSequenceLSTMOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<BidirectionalSequenceLSTMOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<BidirectionalSequenceLSTMOptions> CreateBidirectionalSequenceLSTMOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE,
+    float cell_clip = 0.0f,
+    float proj_clip = 0.0f,
+    bool merge_outputs = false) {
+  BidirectionalSequenceLSTMOptionsBuilder builder_(_fbb);
+  builder_.add_proj_clip(proj_clip);
+  builder_.add_cell_clip(cell_clip);
+  builder_.add_merge_outputs(merge_outputs);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<BidirectionalSequenceLSTMOptions> CreateBidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct ResizeBilinearOptionsT : public flatbuffers::NativeTable {
   typedef ResizeBilinearOptions TableType;
   bool align_corners;
@@ -6347,6 +6482,12 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const FillOptions *builtin_options_as_FillOptions() const {
     return builtin_options_type() == BuiltinOptions_FillOptions ? static_cast<const FillOptions *>(builtin_options()) : nullptr;
   }
+  const BidirectionalSequenceLSTMOptions *builtin_options_as_BidirectionalSequenceLSTMOptions() const {
+    return builtin_options_type() == BuiltinOptions_BidirectionalSequenceLSTMOptions ? static_cast<const BidirectionalSequenceLSTMOptions *>(builtin_options()) : nullptr;
+  }
+  const BidirectionalSequenceRNNOptions *builtin_options_as_BidirectionalSequenceRNNOptions() const {
+    return builtin_options_type() == BuiltinOptions_BidirectionalSequenceRNNOptions ? static_cast<const BidirectionalSequenceRNNOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -6650,6 +6791,14 @@ template<> inline const FillOptions *Operator::builtin_options_as<FillOptions>()
   return builtin_options_as_FillOptions();
 }
 
+template<> inline const BidirectionalSequenceLSTMOptions *Operator::builtin_options_as<BidirectionalSequenceLSTMOptions>() const {
+  return builtin_options_as_BidirectionalSequenceLSTMOptions();
+}
+
+template<> inline const BidirectionalSequenceRNNOptions *Operator::builtin_options_as<BidirectionalSequenceRNNOptions>() const {
+  return builtin_options_as_BidirectionalSequenceRNNOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -7407,6 +7556,7 @@ inline void BidirectionalSequenceRNNOptions::UnPackTo(BidirectionalSequenceRNNOp
   (void)_resolver;
   { auto _e = time_major(); _o->time_major = _e; };
   { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  { auto _e = merge_outputs(); _o->merge_outputs = _e; };
 }
 
 inline flatbuffers::Offset<BidirectionalSequenceRNNOptions> BidirectionalSequenceRNNOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceRNNOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -7419,10 +7569,12 @@ inline flatbuffers::Offset<BidirectionalSequenceRNNOptions> CreateBidirectionalS
   struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BidirectionalSequenceRNNOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
   auto _time_major = _o->time_major;
   auto _fused_activation_function = _o->fused_activation_function;
+  auto _merge_outputs = _o->merge_outputs;
   return tflite::CreateBidirectionalSequenceRNNOptions(
       _fbb,
       _time_major,
-      _fused_activation_function);
+      _fused_activation_function,
+      _merge_outputs);
 }
 
 inline FullyConnectedOptionsT *FullyConnectedOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@@ -7657,6 +7809,41 @@ inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(flatbuffers::FlatBuffe
       _kernel_type);
 }
 
+inline BidirectionalSequenceLSTMOptionsT *BidirectionalSequenceLSTMOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new BidirectionalSequenceLSTMOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void BidirectionalSequenceLSTMOptions::UnPackTo(BidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  { auto _e = cell_clip(); _o->cell_clip = _e; };
+  { auto _e = proj_clip(); _o->proj_clip = _e; };
+  { auto _e = merge_outputs(); _o->merge_outputs = _e; };
+}
+
+inline flatbuffers::Offset<BidirectionalSequenceLSTMOptions> BidirectionalSequenceLSTMOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateBidirectionalSequenceLSTMOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<BidirectionalSequenceLSTMOptions> CreateBidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BidirectionalSequenceLSTMOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _fused_activation_function = _o->fused_activation_function;
+  auto _cell_clip = _o->cell_clip;
+  auto _proj_clip = _o->proj_clip;
+  auto _merge_outputs = _o->merge_outputs;
+  return tflite::CreateBidirectionalSequenceLSTMOptions(
+      _fbb,
+      _fused_activation_function,
+      _cell_clip,
+      _proj_clip,
+      _merge_outputs);
+}
+
 inline ResizeBilinearOptionsT *ResizeBilinearOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new ResizeBilinearOptionsT();
   UnPackTo(_o, _resolver);
@@ -9425,6 +9612,14 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const FillOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_BidirectionalSequenceLSTMOptions: {
+      auto ptr = reinterpret_cast<const BidirectionalSequenceLSTMOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_BidirectionalSequenceRNNOptions: {
+      auto ptr = reinterpret_cast<const BidirectionalSequenceRNNOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -9715,6 +9910,14 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const FillOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_BidirectionalSequenceLSTMOptions: {
+      auto ptr = reinterpret_cast<const BidirectionalSequenceLSTMOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_BidirectionalSequenceRNNOptions: {
+      auto ptr = reinterpret_cast<const BidirectionalSequenceRNNOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -9993,6 +10196,14 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const FillOptionsT *>(value);
       return CreateFillOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_BidirectionalSequenceLSTMOptions: {
+      auto ptr = reinterpret_cast<const BidirectionalSequenceLSTMOptionsT *>(value);
+      return CreateBidirectionalSequenceLSTMOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_BidirectionalSequenceRNNOptions: {
+      auto ptr = reinterpret_cast<const BidirectionalSequenceRNNOptionsT *>(value);
+      return CreateBidirectionalSequenceRNNOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -10271,6 +10482,14 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new FillOptionsT(*reinterpret_cast<FillOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_BidirectionalSequenceLSTMOptions: {
+      value = new BidirectionalSequenceLSTMOptionsT(*reinterpret_cast<BidirectionalSequenceLSTMOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_BidirectionalSequenceRNNOptions: {
+      value = new BidirectionalSequenceRNNOptionsT(*reinterpret_cast<BidirectionalSequenceRNNOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -10618,6 +10837,16 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_BidirectionalSequenceLSTMOptions: {
+      auto ptr = reinterpret_cast<BidirectionalSequenceLSTMOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_BidirectionalSequenceRNNOptions: {
+      auto ptr = reinterpret_cast<BidirectionalSequenceRNNOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
-- 
GitLab


From 261b6958fb95db18cd28c1aba140a627deb790a1 Mon Sep 17 00:00:00 2001
From: Ayush Dubey <ayushd@google.com>
Date: Wed, 3 Oct 2018 13:25:23 -0700
Subject: [PATCH 358/570] Enable collective graph key test for GPU builds.

In the process, properly place nodes on devices in the collective graph key
test.

PiperOrigin-RevId: 215616146
---
 .../common_runtime/direct_session_test.cc     | 58 +++++++++----------
 1 file changed, 26 insertions(+), 32 deletions(-)

diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index e3e431f800..a6440c55ad 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -2262,8 +2262,8 @@ class DirectSessionCollectiveTest : public ::testing::Test {
     TF_RETURN_IF_ERROR(session->Create(g));
     std::vector<Tensor> outputs;
     TF_RETURN_IF_ERROR(
-        session->Run({{"input1:0", t1}, {"input2:0", t2}}, {},
-                     {"collective_call1:0", "collective_call2:0"}, &outputs));
+        session->Run({{"input0:0", t1}, {"input1:0", t2}}, {},
+                     {"collective_call0:0", "collective_call1:0"}, &outputs));
     DirectSession* direct_session = static_cast<DirectSession*>(session.get());
     {
       mutex_lock l(direct_session->collective_graph_key_lock_);
@@ -2301,6 +2301,26 @@ class DirectSessionCollectiveTest : public ::testing::Test {
         }});
   }
 
+  NodeDef Input(int id) {
+    AttrValue dtype_attr;
+    SetAttrValue(DT_FLOAT, &dtype_attr);
+    NodeDef input;
+    input.set_name(strings::StrCat("input", id));
+    input.set_op("Placeholder");
+    input.mutable_attr()->insert({"dtype", dtype_attr});
+    return input;
+  }
+
+  NodeDef CollectiveCall(const string& op, const string& input, int cpu_id) {
+    NodeDef collective_call;
+    collective_call.set_name(strings::StrCat("collective_call", cpu_id));
+    collective_call.set_op(op);
+    collective_call.add_input(input);
+    collective_call.set_device(
+        strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", cpu_id));
+    return collective_call;
+  }
+
   // Creates a GraphDef that adds two CollectiveFunctions, one each on CPU0 and
   // CPU1, with instance_key 1, and appropriate placeholder inputs.  If
   // `add_unused_function` is true, adds another CollectiveFunction with
@@ -2317,42 +2337,17 @@ class DirectSessionCollectiveTest : public ::testing::Test {
       *lib->add_function() = unused_function;
     }
 
-    // Inputs.
-    AttrValue dtype_attr;
-    SetAttrValue(DT_FLOAT, &dtype_attr);
-    NodeDef input1;
-    input1.set_name("input1");
-    input1.set_op("Placeholder");
-    input1.mutable_attr()->insert({"dtype", dtype_attr});
-    NodeDef input2;
-    input2.set_name("input2");
-    input2.set_op("Placeholder");
-    input2.mutable_attr()->insert({"dtype", dtype_attr});
-
+    *g.add_node() = Input(0);
+    *g.add_node() = Input(1);
     // CollectiveReduce on CPU0 with instance_key 1.
-    NodeDef collective_call1;
-    collective_call1.set_name("collective_call1");
-    collective_call1.set_op("CollectiveFunction1");
-    collective_call1.add_input("input1");
-    collective_call1.set_device("/job:localhost/replica:0/task:0/device:CPU:0");
+    *g.add_node() = CollectiveCall("CollectiveFunction1", "input0", 0);
     // CollectiveReduce on CPU1 with instance_key 1.
-    NodeDef collective_call2;
-    collective_call2.set_name("collective_call2");
-    collective_call2.set_op("CollectiveFunction1");
-    collective_call2.add_input("input2");
-    collective_call1.set_device("/job:localhost/replica:0/task:0/device:CPU:1");
-
-    *g.add_node() = input1;
-    *g.add_node() = input2;
-    *g.add_node() = collective_call1;
-    *g.add_node() = collective_call2;
+    *g.add_node() = CollectiveCall("CollectiveFunction1", "input1", 1);
 
     return g;
   }
 };
 
-#ifndef GOOGLE_CUDA
-// TODO(ayushd): enable this test for GPU builds.
 TEST_F(DirectSessionCollectiveTest,
        TestCollectiveGraphKeyUsesOnlyCalledFunctions) {
   int64 key1;
@@ -2361,6 +2356,5 @@ TEST_F(DirectSessionCollectiveTest,
   TF_ASSERT_OK(RunGraphWithCollectiveFunctions(true, &key2));
   ASSERT_EQ(key1, key2);
 }
-#endif
 
 }  // namespace tensorflow
-- 
GitLab


From d66aac16855ddb70c8d3d5b4c9d4da24a34dffec Mon Sep 17 00:00:00 2001
From: Mustafa Ispir <ispir@google.com>
Date: Wed, 3 Oct 2018 13:33:12 -0700
Subject: [PATCH 359/570] Updates the doc of SyncReplicasOptimizer. It notes
 that some worker can consume multiple mini-batches while some may not even
 one.

PiperOrigin-RevId: 215617588
---
 tensorflow/python/training/sync_replicas_optimizer.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index 7afaa92699..6a3756fba9 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -78,7 +78,11 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
   4. Only after all variables have been updated, increment the global step.
   5. Only after step 4, pushes `global_step` in the `token_queue`, once for
      each worker replica. The workers can now fetch the global step, use it to
-     update its local_step variable and start the next batch.
+     update its local_step variable and start the next batch. Please note that
+     some workers can consume multiple minibatches, while some may not consume
+     even one. This is because each worker fetches minibatches as long as
+     a token exists. If one worker is stuck for some reason and does not
+     consume a token, another worker can use it.
 
   For the replicas:
 
-- 
GitLab


From 43073e9d4dc957367d8e2b73c37733ff1dc376c1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 13:34:21 -0700
Subject: [PATCH 360/570] Update ops-related pbtxt files.

PiperOrigin-RevId: 215617800
---
 tensorflow/core/ops/ops.pbtxt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 229022b64c..0e58a9475d 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -28603,6 +28603,14 @@ op {
     name: "stats_aggregator"
     type: DT_RESOURCE
   }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "counter_prefix"
+    type: DT_STRING
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
-- 
GitLab


From ce9a5d143f89a37ab029a29c62433883323987e8 Mon Sep 17 00:00:00 2001
From: Priya Gupta <priyag@google.com>
Date: Wed, 3 Oct 2018 13:39:44 -0700
Subject: [PATCH 361/570] Tests for metrics correctness with TPU strategy

PiperOrigin-RevId: 215618809
---
 tensorflow/contrib/distribute/python/BUILD    |  17 ++-
 .../contrib/distribute/python/combinations.py |   4 +-
 .../distribute/python/metrics_v1_test.py      | 121 ++++++++++--------
 3 files changed, 86 insertions(+), 56 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index defa82f98a..8267612236 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -737,18 +737,27 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "metrics_v1_test",
+py_library(
+    name = "metrics_v1_test_lib",
+    testonly = 1,
     srcs = ["metrics_v1_test.py"],
-    additional_deps = [
+    deps = [
         ":combinations",
-        "@absl_py//absl/testing:parameterized",
         "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
         "//tensorflow/python:variables",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/eager:test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+cuda_py_test(
+    name = "metrics_v1_test",
+    srcs = ["metrics_v1_test.py"],
+    additional_deps = [
+        ":metrics_v1_test_lib",
     ],
     tags = [
         "multi_and_single_gpu",
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 82ca041cc2..cff4b0a463 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -329,10 +329,10 @@ one_device_strategy = NamedDistribution(
     required_gpus=None)
 tpu_strategy = NamedDistribution(
     "TPU", lambda: tpu_lib.TPUStrategy(
-        TPUClusterResolver(""), steps_per_run=5),
+        TPUClusterResolver(""), steps_per_run=2),
     required_tpu=True)
 tpu_strategy_one_step = NamedDistribution(
-    "TPU", lambda: tpu_lib.TPUStrategy(
+    "TPUOneStep", lambda: tpu_lib.TPUStrategy(
         TPUClusterResolver(""), steps_per_run=1),
     required_tpu=True)
 # Note that we disable prefetching for testing since prefetching makes
diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py
index 8163494c8e..ae4189eb1c 100644
--- a/tensorflow/contrib/distribute/python/metrics_v1_test.py
+++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import tpu_strategy
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import test
 from tensorflow.python.framework import ops
@@ -35,7 +36,8 @@ def _labeled_dataset_fn():
   #  8: 3, 2 -> False;  9: 4, 0 -> False; 10: 0, 1 -> False; 11: 1, 2 -> False
   # 12: 2, 0 -> False; 13: 3, 1 -> False; 14: 4, 2 -> False; 15: 0, 0 -> True
   return dataset_ops.Dataset.range(1000).map(
-      lambda x: {"labels": x % 5, "predictions": x % 3}).batch(4)
+      lambda x: {"labels": x % 5, "predictions": x % 3}).batch(
+          4, drop_remainder=True)
 
 
 def _boolean_dataset_fn():
@@ -47,7 +49,8 @@ def _boolean_dataset_fn():
   #   F, T -> FP;  T, F -> FN;   F, F -> TN
   return dataset_ops.Dataset.from_tensor_slices({
       "labels": [True, False, True, False],
-      "predictions": [True, True, False, False]}).repeat().batch(3)
+      "predictions": [True, True, False, False]}).repeat().batch(
+          3, drop_remainder=True)
 
 
 def _threshold_dataset_fn():
@@ -59,7 +62,8 @@ def _threshold_dataset_fn():
   #  False, .75 -> FP;   True, .25 -> FN;  False, 0.0 -> TN
   return dataset_ops.Dataset.from_tensor_slices({
       "labels": [True, False, True, False],
-      "predictions": [1.0, 0.75, 0.25, 0.]}).repeat().batch(3)
+      "predictions": [1.0, 0.75, 0.25, 0.]}).repeat().batch(
+          3, drop_remainder=True)
 
 
 def _regression_dataset_fn():
@@ -79,6 +83,12 @@ def all_combinations():
       mode=["graph"])
 
 
+def tpu_combinations():
+  return combinations.combine(distribution=[combinations.tpu_strategy_one_step,
+                                            combinations.tpu_strategy],
+                              mode=["graph"])
+
+
 # TODO(josh11b): Test metrics.recall_at_top_k, metrics.average_precision_at_k,
 # metrics.precision_at_k
 class MetricsV1Test(test.TestCase, parameterized.TestCase):
@@ -87,42 +97,50 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     with ops.Graph().as_default(), distribution.scope():
       iterator = distribution.distribute_dataset(
           dataset_fn).make_one_shot_iterator()
-      value, update = distribution.call_for_each_tower(
-          metric_fn, iterator.get_next())
-      update = distribution.group(update)
+      if isinstance(distribution, tpu_strategy.TPUStrategy):
+        def step_fn(ctx, inputs):
+          value, update = distribution.call_for_each_tower(
+              metric_fn, inputs)
+          ctx.set_non_tensor_output(name="value", output=value)
+          return distribution.group(update)
+
+        ctx = distribution.run_steps_on_dataset(
+            step_fn, iterator, iterations=distribution.steps_per_run)
+        update = ctx.run_op
+        value = ctx.non_tensor_outputs["value"]
+        # In each run, we run multiple steps, and each steps consumes as many
+        # batches as number of towers.
+        batches_per_update = (
+            distribution.num_towers * distribution.steps_per_run)
+      else:
+        value, update = distribution.call_for_each_tower(
+            metric_fn, iterator.get_next())
+        update = distribution.group(update)
+        # TODO(josh11b): Once we switch to using a global batch size for input,
+        # replace "distribution.num_towers" with "1".
+        batches_per_update = distribution.num_towers
+
+      self.evaluate(distribution.initialize())
       self.evaluate(variables.local_variables_initializer())
-      # TODO(josh11b): Once we switch to using a global batch size for input,
-      # replace "distribution.num_towers" with "1".
-      batches_per_update = distribution.num_towers
-
-      # Update variables using the first `num_towers` batches.
-      self.evaluate(update)
-      self.assertAllClose(expected_fn(batches_per_update), self.evaluate(value),
-                          0.001, msg="After first update")
-
-      # Update variables using the second `num_towers` batches.
-      self.evaluate(update)
-      self.assertAllClose(expected_fn(2 * batches_per_update),
-                          self.evaluate(value),
-                          0.001,
-                          msg="After second update")
-
-      if batches_per_update == 1:  # Consume 4 input batches
-        self.evaluate(update)
-        self.assertAllClose(expected_fn(3 * batches_per_update),
-                            self.evaluate(value),
-                            0.001,
-                            msg="After third update")
+
+      batches_consumed = 0
+      for i in range(4):
         self.evaluate(update)
-        self.assertAllClose(expected_fn(4 * batches_per_update),
+        batches_consumed += batches_per_update
+        self.assertAllClose(expected_fn(batches_consumed),
                             self.evaluate(value),
                             0.001,
-                            msg="After fourth update")
+                            msg="After update #" + str(i+1))
+        if batches_consumed >= 4:  # Consume 4 input batches in total.
+          break
 
-  @combinations.generate(all_combinations())
+      self.evaluate(distribution.finalize())
+
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testMean(self, distribution):
     def _dataset_fn():
-      return dataset_ops.Dataset.range(1000).map(math_ops.to_float).batch(4)
+      return dataset_ops.Dataset.range(1000).map(math_ops.to_float).batch(
+          4, drop_remainder=True)
 
     def _expected_fn(num_batches):
       # Mean(0..3) = 1.5, Mean(0..7) = 3.5, Mean(0..11) = 5.5, etc.
@@ -130,7 +148,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
 
     self._test_metric(distribution, _dataset_fn, metrics.mean, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testAccuracy(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -143,6 +161,8 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _labeled_dataset_fn, _metric_fn, _expected_fn)
 
+  # TODO(priyag, jhseu): Enable TPU for this test once scatter_add is added
+  # for TPUMirroredVariable.
   @combinations.generate(all_combinations())
   def testMeanPerClassAccuracy(self, distribution):
     def _metric_fn(x):
@@ -161,6 +181,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _labeled_dataset_fn, _metric_fn, _expected_fn)
 
+  # NOTE(priyag): This metric doesn't work on TPUs yet.
   @combinations.generate(all_combinations())
   def testMeanIOU(self, distribution):
     def _metric_fn(x):
@@ -179,7 +200,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _labeled_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testMeanTensor(self, distribution):
     def _dataset_fn():
       dataset = dataset_ops.Dataset.range(1000).map(math_ops.to_float)
@@ -198,7 +219,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _dataset_fn, metrics.mean_tensor, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testAUCROC(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -212,7 +233,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testAUCPR(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -226,7 +247,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testFalseNegatives(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -239,7 +260,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testFalseNegativesAtThresholds(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -252,7 +273,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testTrueNegatives(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -265,7 +286,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testTrueNegativesAtThresholds(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -278,7 +299,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testFalsePositives(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -291,7 +312,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testFalsePositivesAtThresholds(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -304,7 +325,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testTruePositives(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -317,7 +338,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testTruePositivesAtThresholds(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -330,7 +351,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testPrecision(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -343,7 +364,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testPrecisionAtThreshold(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -356,7 +377,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testRecall(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -369,7 +390,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testRecallAtThreshold(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -382,7 +403,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testMeanSquaredError(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
@@ -395,7 +416,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
     self._test_metric(
         distribution, _regression_dataset_fn, _metric_fn, _expected_fn)
 
-  @combinations.generate(all_combinations())
+  @combinations.generate(all_combinations() + tpu_combinations())
   def testRootMeanSquaredError(self, distribution):
     def _metric_fn(x):
       labels = x["labels"]
-- 
GitLab


From c26b5e9685b05fafc509d8ebc88c8304be5974a4 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Wed, 3 Oct 2018 13:45:59 -0700
Subject: [PATCH 362/570] Some tiny speed improvements for defun.

Before:
entry {
  name: "MicroBenchmarks.benchmark_defun_matmul_2_by_2_CPU"
  iters: 30000
  wall_time: 48.4476327896
  extras {
    key: "examples_per_sec"
    value {
      double_value: 20640.8433688
    }
  }
}

After:
entry {
  name: "MicroBenchmarks.benchmark_defun_matmul_2_by_2_CPU"
  iters: 30000
  wall_time: 45.2344338099
  extras {
    key: "examples_per_sec"
    value {
      double_value: 22107.0524327
    }
  }
}
PiperOrigin-RevId: 215619902
---
 tensorflow/python/eager/function.py | 36 +++++++++++++++--------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index f261d92d64..dd9f5e233c 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -1152,23 +1152,22 @@ class PolymorphicFunction(object):
       del args, kwargs
       cache_key = self._flat_input_signature
 
+    ctx = context.context()
     with ops.init_scope():
-      init_graph = ops.get_default_graph()
-
       # The graph, or whether we're executing eagerly, should be a part of the
       # cache key so we don't improperly capture tensors such as variables.
-      executing_eagerly = context.executing_eagerly()
-      execution_context = executing_eagerly or init_graph
-
-    default_graph = ops.get_default_graph()
-    # Putting the device in the cache key ensures that call-site device
-    # annotations are respected.
-    device_functions = _get_device_functions(context.context(), default_graph)
+      executing_eagerly = ctx.executing_eagerly()
+      execution_context = executing_eagerly or ops.get_default_graph()
 
-    # `ops.colocate_with` directives translate into `ops.device` directives when
-    # eager execution is enabled.
-    colocation_stack = (() if executing_eagerly else
-                        tuple(default_graph._colocation_stack.peek_objs()))  # pylint: disable=protected-access
+    if executing_eagerly:
+      device_functions = (pydev.merge_device(ctx.device_name),)
+      colocation_stack = ()
+    else:
+      default_graph = ops.get_default_graph()
+      # Putting the device in the cache key ensures that call-site device
+      # annotations are respected.
+      device_functions = tuple(default_graph._device_functions_outer_to_inner)  # pylint: disable=protected-access
+      colocation_stack = tuple(default_graph._colocation_stack.peek_objs())  # pylint: disable=protected-access
 
     return (cache_key, execution_context, device_functions, colocation_stack)
 
@@ -1195,9 +1194,6 @@ class PolymorphicFunction(object):
     """
     args = self._args_to_prepend + args
     kwargs = dict(kwargs, **self._kwargs_to_include)
-    # Maps from index of arg to its corresponding value, according to `args`
-    # and `kwargs`; seeded with the default values for the named args that
-    # aren't in `args`.
     if not kwargs:
       if self._default_values:
         inputs = args + self._default_values[len(args) -
@@ -1205,6 +1201,9 @@ class PolymorphicFunction(object):
       else:
         inputs = args
     else:
+      # Maps from index of arg to its corresponding value, according to `args`
+      # and `kwargs`; seeded with the default values for the named args that
+      # aren't in `args`.
       arg_indices_to_values = {
           index: default for index, default in six.iteritems(
               self._arg_indices_to_default_values) if index >= len(args)
@@ -1227,9 +1226,12 @@ class PolymorphicFunction(object):
     flat_inputs = nest.flatten(inputs)
 
     # Check for NumPy arrays in arguments and convert them to Tensors.
+    # TODO(nareshmodi): Skip ndarray conversion to tensor altogether, perhaps
+    # finding a way to store them directly in the cache key (currently not
+    # possible since ndarrays are not hashable).
     need_packing = False
     for index, value in enumerate(flat_inputs):
-      if isinstance(value, np.ndarray):
+      if type(value) == np.ndarray:
         flat_inputs[index] = constant_op.constant(value)
         need_packing = True
     if need_packing:
-- 
GitLab


From 0b7a3df432f0e607b39ab17d1b85fb0b04e05bd5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 13:46:19 -0700
Subject: [PATCH 363/570] Fixes bug in Conv2D unit test that made it test a
 SeparableConv2D layer instead of a Conv2D layer.

PiperOrigin-RevId: 215619966
---
 tensorflow/python/keras/layers/convolutional_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py
index 2d3d38a5ce..cad5e4c8bd 100644
--- a/tensorflow/python/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/layers/convolutional_test.py
@@ -113,7 +113,7 @@ class Conv2DTest(test.TestCase):
       test_kwargs[arg] = value
       with self.test_session(use_gpu=True):
         testing_utils.layer_test(
-            keras.layers.SeparableConv2D,
+            keras.layers.Conv2D,
             kwargs=test_kwargs,
             input_shape=(num_samples, num_row, num_col, stack_size))
 
-- 
GitLab


From ed904611009a74ae530335d3bd16b7070238cec3 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Wed, 3 Oct 2018 14:01:16 -0700
Subject: [PATCH 364/570] Update reference to tools/bazel.rc to .bazelrc after
 cl/215483141

PiperOrigin-RevId: 215623215
---
 configure.py             | 4 ++--
 tensorflow/workspace.bzl | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/configure.py b/configure.py
index 2d2da11700..a88fdb3555 100644
--- a/configure.py
+++ b/configure.py
@@ -1676,8 +1676,8 @@ def main():
   # TODO(pcloudy): remove the following if check when they make sense on Windows
   if not is_windows():
     print('Preconfigured Bazel build configs. You can use any of the below by '
-          'adding "--config=<>" to your build command. See tools/bazel.rc for '
-          'more details.')
+          'adding "--config=<>" to your build command. See .bazelrc for more '
+          'details.')
     config_info_line('mkl', 'Build with MKL support.')
     config_info_line('monolithic', 'Config for mostly static monolithic build.')
     config_info_line('gdr', 'Build with GDR support.')
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index bcc89ef729..d27732a801 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -888,7 +888,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     # why we can't depend on the canonical build target.
 
     # gRPC wants a cares dependency but its contents is not actually
-    # important since we have set GRPC_ARES=0 in tools/bazel.rc
+    # important since we have set GRPC_ARES=0 in .bazelrc
     native.bind(
         name = "cares",
         actual = "@grpc//third_party/nanopb:nanopb",
-- 
GitLab


From 94267ccc14516ad9df67897bea8ede20cbad24ca Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 3 Oct 2018 14:09:05 -0700
Subject: [PATCH 365/570] Move out-params to end of argument list and add an
 out_ prefix; NFC

PiperOrigin-RevId: 215624875
---
 tensorflow/compiler/jit/kernels/xla_ops.cc    |  2 +-
 .../compiler/jit/xla_compilation_cache.cc     | 33 ++++++++++---------
 .../compiler/jit/xla_compilation_cache.h      | 29 ++++++++--------
 .../compiler/jit/xla_compile_on_demand_op.cc  |  2 +-
 4 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc
index cfd27a6510..accc86a86d 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.cc
+++ b/tensorflow/compiler/jit/kernels/xla_ops.cc
@@ -277,7 +277,7 @@ static Status CompileToLocalExecutable(
   compile_options.always_return_tuple = false;
 
   return cache->Compile(options, function, constant_args, *variables, ctx,
-                        kernel, executable, compile_options);
+                        compile_options, kernel, executable);
 }
 
 void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 3aa9e9c7ed..0471995015 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -228,37 +228,38 @@ Status XlaCompilationCache::Compile(
     const XlaCompiler::Options& options, const NameAttrList& function,
     const std::map<int, Tensor>& constant_args,
     const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
-    const XlaCompiler::CompilationResult** compilation_result,
-    xla::LocalExecutable** executable,
-    const XlaCompiler::CompileOptions& compile_options) {
+    const XlaCompiler::CompileOptions& compile_options,
+    const XlaCompiler::CompilationResult** out_compilation_result,
+    xla::LocalExecutable** out_executable) {
   return CompileImpl(options, function, constant_args, variable_args, ctx,
-                     compilation_result, executable, compile_options, false);
+                     compile_options, /*compile_single_op=*/false,
+                     out_compilation_result, out_executable);
 }
 
 Status XlaCompilationCache::CompileSingleOp(
     const XlaCompiler::Options& options,
     const std::map<int, Tensor>& constant_args,
     const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
-    const XlaCompiler::CompilationResult** compilation_result,
-    xla::LocalExecutable** executable,
-    const XlaCompiler::CompileOptions& compile_options) {
+    const XlaCompiler::CompileOptions& compile_options,
+    const XlaCompiler::CompilationResult** out_compilation_result,
+    xla::LocalExecutable** out_executable) {
   const NodeDef& def = ctx->op_kernel().def();
   NameAttrList name;
   name.set_name(def.op());
   *name.mutable_attr() = def.attr();
-  return CompileImpl(options, name, constant_args, variable_args, ctx,
-                     compilation_result, executable, compile_options, true);
+  return CompileImpl(
+      options, name, constant_args, variable_args, ctx, compile_options,
+      /*compile_single_op=*/true, out_compilation_result, out_executable);
 }
 
 Status XlaCompilationCache::CompileImpl(
     const XlaCompiler::Options& options, const NameAttrList& function,
     const std::map<int, Tensor>& constant_args,
     const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
-    const XlaCompiler::CompilationResult** compilation_result,
-    xla::LocalExecutable** executable,
-    const XlaCompiler::CompileOptions& compile_options,
-    bool compile_single_op) {
-  CHECK_NE(executable, nullptr);
+    const XlaCompiler::CompileOptions& compile_options, bool compile_single_op,
+    const XlaCompiler::CompilationResult** out_compilation_result,
+    xla::LocalExecutable** out_executable) {
+  DCHECK_NE(out_executable, nullptr);
   VLOG(2) << "XlaCompilationCache::Compile " << DebugString();
 
   if (VLOG_IS_ON(2)) {
@@ -357,8 +358,8 @@ Status XlaCompilationCache::CompileImpl(
     }
   }
   TF_RETURN_IF_ERROR(entry->compilation_status);
-  *compilation_result = &entry->compilation_result;
-  *executable = entry->executable.get();
+  *out_compilation_result = &entry->compilation_result;
+  *out_executable = entry->executable.get();
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index 17c0321c1e..75c7758f73 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -68,9 +68,9 @@ class XlaCompilationCache : public ResourceBase {
                  const std::map<int, Tensor>& constant_args,
                  const std::map<int, OptionalTensor>& variable_args,
                  OpKernelContext* ctx,
-                 const XlaCompiler::CompilationResult** compilation_result,
-                 xla::LocalExecutable** executable,
-                 const XlaCompiler::CompileOptions& compile_options);
+                 const XlaCompiler::CompileOptions& compile_options,
+                 const XlaCompiler::CompilationResult** out_compilation_result,
+                 xla::LocalExecutable** out_executable);
 
   // As above, but calls XlaCompiler::CompileSingleOp instead of
   // XlaCompiler::CompileFunction.
@@ -78,9 +78,9 @@ class XlaCompilationCache : public ResourceBase {
       const XlaCompiler::Options& options,
       const std::map<int, Tensor>& constant_args,
       const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
-      const XlaCompiler::CompilationResult** compilation_result,
-      xla::LocalExecutable** executable,
-      const XlaCompiler::CompileOptions& compile_options);
+      const XlaCompiler::CompileOptions& compile_options,
+      const XlaCompiler::CompilationResult** out_compilation_result,
+      xla::LocalExecutable** out_executable);
 
   xla::LocalClient* client() const { return client_; }
   const DeviceType& device_type() const { return device_type_; }
@@ -89,15 +89,14 @@ class XlaCompilationCache : public ResourceBase {
 
  private:
   // Common implementation of Compile and CompileSingleOp.
-  Status CompileImpl(const XlaCompiler::Options& options,
-                     const NameAttrList& function,
-                     const std::map<int, Tensor>& constant_args,
-                     const std::map<int, OptionalTensor>& variable_args,
-                     OpKernelContext* ctx,
-                     const XlaCompiler::CompilationResult** compilation_result,
-                     xla::LocalExecutable** executable,
-                     const XlaCompiler::CompileOptions& compile_options,
-                     bool compile_single_op);
+  Status CompileImpl(
+      const XlaCompiler::Options& options, const NameAttrList& function,
+      const std::map<int, Tensor>& constant_args,
+      const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+      const XlaCompiler::CompileOptions& compile_options,
+      bool compile_single_op,
+      const XlaCompiler::CompilationResult** out_compilation_result,
+      xla::LocalExecutable** out_executable);
 
   // Takes `result` which has been compiled from a Tensorflow subgraph to a
   // XLA computation already, and generates an XLA LocalExecutable `executable`.
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index b98c0cb028..79976c85df 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -180,7 +180,7 @@ Status XlaCompileOnDemandOp::Compile(
 
   std::map<int, OptionalTensor> variable_args = GetVariables(ctx);
   return cache->CompileSingleOp(options, constant_arguments, variable_args, ctx,
-                                result, executable, compile_options);
+                                compile_options, result, executable);
 }
 
 void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) {
-- 
GitLab


From f5f8dff270b9f2cdf36bba9d671c324a4f7c6fac Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 14:28:25 -0700
Subject: [PATCH 366/570] Add NNAPI padding enums to NeuralNetworksShim.h

PiperOrigin-RevId: 215628561
---
 tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index 687944023b..eccf4aefb6 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -179,6 +179,14 @@ enum {
   ANEURALNETWORKS_BAD_STATE = 6,
 };
 
+/**
+ * Implicit padding algorithms.
+ */
+enum {
+  ANEURALNETWORKS_PADDING_SAME = 1,
+  ANEURALNETWORKS_PADDING_VALID = 2,
+};
+
 /**
  * ANeuralNetworksMemory is an opaque type that represents memory.
  *
-- 
GitLab


From 2e11deba60cb00027de4373af17703676fa74bd7 Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Wed, 3 Oct 2018 14:37:57 -0700
Subject: [PATCH 367/570] [XLA] Disable a test for layout changing elementwise
 operations.

Rename the test to make it obvious that it is for testing the codegen
correctness in handling layout changing elementwise operations.

Keep the test only for the CPU backend.

PiperOrigin-RevId: 215630611
---
 tensorflow/compiler/xla/tests/fusion_test.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index 9c94acb437..fd79a9d041 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -764,8 +764,9 @@ XLA_TEST_F(FusionTest, Clamp2D) {
   TestElementwise2D<float, 3>(HloOpcode::kClamp);
 }
 
-// TODO(b/73903144): Enable on interpreter once interpreter supports bitcast.
-XLA_TEST_F(FusionTest, DISABLED_ON_INTERPRETER(FusionWithLayout)) {
+// TODO(b/117156505): Remove this test when the bug is fixed.
+XLA_TEST_F(FusionTest, DISABLED_ON_GPU(DISABLED_ON_INTERPRETER(
+                           LayoutChangingElementWiseOp))) {
   const string hlo_text = R"(
 HloModule Cluster
 
-- 
GitLab


From c1b3b0b9e041d82e80c2cdcc623a387753daf0b4 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 3 Oct 2018 14:42:56 -0700
Subject: [PATCH 368/570] Internal change.

PiperOrigin-RevId: 215631612
---
 tensorflow/contrib/lite/kernels/BUILD | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index daaf6714cc..b349a2863c 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -337,7 +337,10 @@ tf_cc_test(
     name = "activations_test",
     size = "small",
     srcs = ["activations_test.cc"],
-    tags = ["tflite_not_portable_ios"],
+    tags = [
+        "nomac",
+        "tflite_not_portable_ios",
+    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
-- 
GitLab


From 312e37cee391b0d207293d59d8882db3c8030f9d Mon Sep 17 00:00:00 2001
From: Anjali Sridhar <anjalisridhar@google.com>
Date: Wed, 3 Oct 2018 14:51:08 -0700
Subject: [PATCH 369/570] Add a require_static_shapes argument to
 DistributionStrategy class. This allows us to identify if we need to set the
 drop_remainder option when creating Dataset objects.

PiperOrigin-RevId: 215633097
---
 tensorflow/contrib/distribute/python/tpu_strategy.py |  4 +++-
 tensorflow/python/keras/engine/training.py           | 11 +++++------
 tensorflow/python/training/distribute.py             |  7 +++++++
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index c3c7df3cd8..1d9e299b38 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -132,7 +132,7 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
     """
     # TODO(sourabhbajaj): OneDeviceStrategy should be initialized with the
     # master node fetched from the cluster resolver.
-    super(TPUStrategy, self).__init__('/device:CPU:0')
+    super(TPUStrategy, self).__init__("/device:CPU:0")
 
     self._tpu_cluster_resolver = tpu_cluster_resolver
     self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
@@ -152,6 +152,8 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
     # at a time is comparable to multiple steps.
     self.steps_per_run = steps_per_run
 
+    self._require_static_shapes = True
+
   def _get_enqueue_op_per_host(self, host_id, iterator, input_shapes,
                                iterations):
     """Create an enqueue op for a single host identified using host_id.
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 85233de9b1..d81bd83f7f 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -814,6 +814,9 @@ class Model(Network):
       x_shape = first_x_value.shape
       if batch_size is None:
         batch_size = x_shape[0] // steps
+      # We need to use the drop_remainder argument to allow for a static
+      # input shape which is required for TPUs.
+      drop_remainder = self._distribution_strategy.require_static_shapes
       if y is not None:
         var_x = distributed_training_utils.get_var_for_numpy(
             self._distribution_strategy, x)
@@ -824,9 +827,7 @@ class Model(Network):
         # TODO(anjalisridhar): What should the buffer size be?
         x = x.shuffle(10000)
         x = x.repeat()
-        # We need to use the drop_remainder argument to allow for a static
-        # input shape which is required for TPUs.
-        x = x.batch(batch_size, drop_remainder=True)
+        x = x.batch(batch_size, drop_remainder=drop_remainder)
         y = None
       else:
         # This case is for the predict call where the dataset only contains
@@ -838,9 +839,7 @@ class Model(Network):
             self._distribution_strategy, x)
         x = dataset_ops.Dataset.from_tensor_slices(var_x)
         x = x.repeat()
-        # We need to use the drop_remainder argument to allow for a static
-        # input shape which is required for TPUs.
-        x = x.batch(batch_size, drop_remainder=True)
+        x = x.batch(batch_size, drop_remainder=drop_remainder)
 
     # TODO(anjalisridhar): Can we use the iterator and getnext op cache?
     # We require users to pass Datasets since we distribute the dataset across
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index a92a1bdee7..b3f3c29b2f 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -436,6 +436,9 @@ class DistributionStrategy(object):
 
   def __init__(self):
     self._default_device = None
+    # This property is used to determine if we should set drop_remainder=True
+    # when creating Datasets from numpy array inputs.
+    self._require_static_shapes = False
 
   def scope(self):
     """Returns a context manager selecting this DistributionStrategy as current.
@@ -898,6 +901,10 @@ class DistributionStrategy(object):
     """
     raise NotImplementedError("must be implemented in descendants")
 
+  @property
+  def require_static_shapes(self):
+    return self._require_static_shapes
+
   @property
   def num_towers(self):
     """Returns number of towers, for purposes of averaging across towers."""
-- 
GitLab


From 148bc62dba0a0b9d26945ce48b6dcd903613de14 Mon Sep 17 00:00:00 2001
From: Smit Hinsu <hinsu@google.com>
Date: Wed, 3 Oct 2018 15:14:32 -0700
Subject: [PATCH 370/570] Update size of multi_device_iterator_test to medium
 to fix timeouts

PiperOrigin-RevId: 215637785
---
 tensorflow/python/data/kernel_tests/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index bf76860aa4..c7295d6e69 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -291,7 +291,7 @@ tf_py_test(
 
 cuda_py_test(
     name = "multi_device_iterator_test",
-    size = "small",
+    size = "medium",
     srcs = ["multi_device_iterator_test.py"],
     additional_deps = [
         ":test_base",
-- 
GitLab


From efbee1ab2cac59f511cc0850d84414e711bbda3b Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Wed, 3 Oct 2018 15:15:23 -0700
Subject: [PATCH 371/570] Fix ci_parameterized_build to pass environment
 variables to tests.

This is particularly important when using --run_under with
parallel_gpu_execute, since the envvars control the execution.

PiperOrigin-RevId: 215637931
---
 .../tools/ci_build/ci_parameterized_build.sh   | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 49a9048c03..99bdedf7b4 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -65,8 +65,6 @@
 #   TF_GPU_COUNT:
 #                      Run this many parallel tests for serial builds.
 #                      For now, only can be edited for PIP builds.
-#                      TODO(gunan): Find a way to pass this environment variable
-#                      to the script bazel runs (using --run_under).
 #   TF_BUILD_TEST_TUTORIALS:
 #                      If set to any non-empty and non-0 value, will perform
 #                      tutorials tests (Applicable only if TF_BUILD_IS_PIP is
@@ -150,6 +148,13 @@ ANDROID_FULL_CMD="${CI_BUILD_DIR}/builds/android_full.sh"
 TF_GPU_COUNT=${TF_GPU_COUNT:-4}
 PARALLEL_GPU_TEST_CMD='//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute'
 
+# Environment variables to set when running bazel tests.  These are especially
+# important when using --run_under with parallel_gpu_execute.
+BAZEL_TEST_ENV=""\
+"--test_env=TF_GPU_COUNT=${TF_GPU_COUNT} "\
+"--test_env=TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU} "\
+"--test_env=TF_PER_DEVICE_MEMORY_LIMIT_MB=${TF_PER_DEVICE_MEMORY_LIMIT_MB} "
+
 BENCHMARK_CMD="${CI_BUILD_DIR}/builds/benchmark.sh"
 
 EXTRA_PARAMS=""
@@ -410,13 +415,14 @@ if [[ ${TF_BUILD_IS_PIP} == "no_pip" ]] ||
   if [[ ${CTYPE} == cpu* ]] || \
      [[ ${CTYPE} == "debian.jessie.cpu" ]]; then
     # CPU only command, fully parallel.
-    NO_PIP_MAIN_CMD="${MAIN_CMD} ${BAZEL_CMD} ${OPT_FLAG} ${EXTRA_ARGS} -- "\
-"${BAZEL_TARGET}"
+    NO_PIP_MAIN_CMD="${MAIN_CMD} ${BAZEL_CMD} ${BAZEL_TEST_ENV} ${OPT_FLAG} "\
+      "${EXTRA_ARGS} -- ${BAZEL_TARGET}"
   elif [[ ${CTYPE} == gpu* ]]; then
     # GPU only command, run as many jobs as the GPU count only.
-    NO_PIP_MAIN_CMD="${BAZEL_CMD} ${OPT_FLAG} "\
+    NO_PIP_MAIN_CMD="${BAZEL_CMD} ${BAZEL_TEST_ENV} ${OPT_FLAG} "\
 "--local_test_jobs=${TF_GPU_COUNT} "\
-"--run_under=${PARALLEL_GPU_TEST_CMD} ${EXTRA_ARGS} -- ${BAZEL_TARGET}"
+"--run_under=${PARALLEL_GPU_TEST_CMD} "\
+"${EXTRA_ARGS} -- ${BAZEL_TARGET}"
   elif [[ ${CTYPE} == "android" ]]; then
     # Run android specific script for android build.
     NO_PIP_MAIN_CMD="${ANDROID_CMD} ${OPT_FLAG} "
-- 
GitLab


From 0dfde8ab8addef36f90a445f0d604618a199508c Mon Sep 17 00:00:00 2001
From: Smit Hinsu <hinsu@google.com>
Date: Wed, 3 Oct 2018 15:48:53 -0700
Subject: [PATCH 372/570] Disable norm_op_test and svd_op_test under msan

PiperOrigin-RevId: 215643600
---
 tensorflow/python/kernel_tests/BUILD | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index c0e9a3c975..9303c70c60 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2999,7 +2999,10 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     shard_count = 20,
-    tags = ["no_oss"],  # b/117185141
+    tags = [
+        "no_oss",  # b/117185141.
+        "nomsan",  # TODO(b/117236102): Re-enable in msan build.
+    ],
 )
 
 cuda_py_test(
@@ -3014,7 +3017,11 @@ cuda_py_test(
         "//tensorflow/python:linalg_ops",
     ],
     shard_count = 20,
-    tags = ["no_windows_gpu"],
+    # TODO(b/117236102): Re-enable in msan build.
+    tags = [
+        "no_windows_gpu",
+        "nomsan",
+    ],
 )
 
 cuda_py_test(
-- 
GitLab


From 041c347df995e6c6d9206920ae061f558e120b92 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 3 Oct 2018 15:59:41 -0700
Subject: [PATCH 373/570] [TF:XLA] Bump open source abseil revision to
 f21d187b80e3b7f08fb279775ea9c8b48c636030

PiperOrigin-RevId: 215645351
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index d27732a801..72f3fd0cf8 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -110,11 +110,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "com_google_absl",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
-        sha256 = "7dd09690ae7ca4551de3111d4a86b75b23ec17445f273d3c42bdcdc1c7b02e4e",
-        strip_prefix = "abseil-cpp-48cd2c3f351ff188bc85684b84a91b6e6d17d896",
+        sha256 = "507903ef9353cb25cccd0a6840048fdd348fd20e98314d694f04a990c0f277e3",
+        strip_prefix = "abseil-cpp-f21d187b80e3b7f08fb279775ea9c8b48c636030",
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/48cd2c3f351ff188bc85684b84a91b6e6d17d896.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/48cd2c3f351ff188bc85684b84a91b6e6d17d896.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/f21d187b80e3b7f08fb279775ea9c8b48c636030.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/f21d187b80e3b7f08fb279775ea9c8b48c636030.tar.gz",
         ],
     )
 
-- 
GitLab


From 207bea0e35ab635e66137520963761a6e94354ea Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Wed, 3 Oct 2018 16:34:05 -0700
Subject: [PATCH 374/570] [XLA] Revise the way to express a CPU specific test.

Use #ifdef XLA_TEST_BACKEND_CPU to protect the test instead of disabling it for
all the other backends except for the CPU backend.

PiperOrigin-RevId: 215651036
---
 tensorflow/compiler/xla/tests/fusion_test.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index fd79a9d041..4d4b676a53 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -764,9 +764,10 @@ XLA_TEST_F(FusionTest, Clamp2D) {
   TestElementwise2D<float, 3>(HloOpcode::kClamp);
 }
 
-// TODO(b/117156505): Remove this test when the bug is fixed.
-XLA_TEST_F(FusionTest, DISABLED_ON_GPU(DISABLED_ON_INTERPRETER(
-                           LayoutChangingElementWiseOp))) {
+// TODO(b/117156505): Remove this test when the bug is fixed and the CPU backend
+// should not generate layout changing elementwise operations.
+#ifdef XLA_TEST_BACKEND_CPU
+XLA_TEST_F(FusionTest, LayoutChangingElementWiseOp) {
   const string hlo_text = R"(
 HloModule Cluster
 
@@ -795,6 +796,7 @@ ENTRY main {
       LiteralUtil::CreateR3<float>({{{0.}, {0.76159415595}}, {{0.}, {0.}}}),
       result));
 }
+#endif
 
 class FusionClientLibraryTest : public ClientLibraryTestBase {};
 
-- 
GitLab


From 9801b8810e07859141d4417746317cc3dbebc227 Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Wed, 3 Oct 2018 16:36:23 -0700
Subject: [PATCH 375/570] Reduce batch sizes for some eager tests to prevert
 OOMs in OSS runs

PiperOrigin-RevId: 215651413
---
 .../python/examples/resnet50/resnet50_graph_test.py    | 10 +++++++---
 .../eager/python/examples/revnet/revnet_test.py        |  3 +++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
index 551c76b0df..f3bb978875 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
@@ -51,7 +51,9 @@ def random_batch(batch_size):
 class ResNet50GraphTest(tf.test.TestCase):
 
   def testApply(self):
-    batch_size = 64
+    # Use small batches for tests because the OSS version runs
+    # in constrained GPU environment with 1-2GB of memory.
+    batch_size = 8
     with tf.Graph().as_default():
       images = tf.placeholder(tf.float32, image_shape(None))
       model = resnet50.ResNet50(data_format())
@@ -63,7 +65,7 @@ class ResNet50GraphTest(tf.test.TestCase):
         sess.run(init)
         np_images, _ = random_batch(batch_size)
         out = sess.run(predictions, feed_dict={images: np_images})
-        self.assertAllEqual([64, 1000], out.shape)
+        self.assertAllEqual([batch_size, 1000], out.shape)
 
   def testTrainWithSummary(self):
     with tf.Graph().as_default():
@@ -87,7 +89,9 @@ class ResNet50GraphTest(tf.test.TestCase):
       init = tf.global_variables_initializer()
       self.assertEqual(321, len(tf.global_variables()))
 
-      batch_size = 32
+      # Use small batches for tests because the OSS version runs
+      # in constrained GPU environment with 1-2GB of memory.
+      batch_size = 2
       with tf.Session() as sess:
         sess.run(init)
         sess.run(tf.contrib.summary.summary_writer_initializer_op())
diff --git a/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py b/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
index 6a921e1997..4f4cc3af6f 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
@@ -50,6 +50,9 @@ class RevNetTest(tf.test.TestCase):
     # Reconstruction could cause numerical error, use double precision for tests
     config.dtype = tf.float64
     config.fused = False  # Fused batch norm does not support tf.float64
+    # Reduce the batch size for tests because the OSS version runs
+    # in constrained GPU environment with 1-2GB of memory.
+    config.batch_size = 2
     shape = (config.batch_size,) + config.input_shape
     self.model = revnet.RevNet(config=config)
     self.x = tf.random_normal(shape=shape, dtype=tf.float64)
-- 
GitLab


From d5b362a67a57f53f610536ed6068a5b67bc37b88 Mon Sep 17 00:00:00 2001
From: Smit Hinsu <hinsu@google.com>
Date: Wed, 3 Oct 2018 16:38:22 -0700
Subject: [PATCH 376/570] Update size of mvn_diag_test and core_rnn_cell_test
 to medium to fix timeouts

PiperOrigin-RevId: 215651746
---
 tensorflow/contrib/distributions/BUILD | 2 +-
 tensorflow/contrib/rnn/BUILD           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 3ff7da4f89..60f6b90edc 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -299,7 +299,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "mvn_diag_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/mvn_diag_test.py"],
     additional_deps = [
         ":distributions_py",
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 4e67d80558..1385a9ddc1 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -108,7 +108,7 @@ cuda_py_tests(
 
 cuda_py_tests(
     name = "core_rnn_cell_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/core_rnn_cell_test.py"],
     additional_deps = [
         ":rnn_py",
-- 
GitLab


From aeb044c9784d30a25c0d15fa31f479001be55052 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Wed, 3 Oct 2018 16:41:21 -0700
Subject: [PATCH 377/570] assert_nontrivial_match in
 tf.keras.Model.load_weights (TF format)

Adds a bit of sanity checking by default to load_weights (e.g. for the case when absolutely nothing matches) while still supporting restore-on-create and the addition of new Layers to checkpointed models.

PiperOrigin-RevId: 215652168
---
 tensorflow/python/keras/engine/network.py     |  1 +
 tensorflow/python/keras/engine/saving_test.py | 13 +++++
 .../python/training/checkpointable/util.py    | 56 +++++++++++++++++--
 .../training/checkpointable/util_test.py      |  5 ++
 4 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 5ef8d13487..8d34006967 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -1526,6 +1526,7 @@ class Network(base_layer.Layer):
         # Restore existing variables (if any) immediately, and set up a
         # streaming restore for any variables created in the future.
         checkpointable_utils.streaming_restore(status=status, session=session)
+      status.assert_nontrivial_match()
       return status
     if h5py is None:
       raise ImportError(
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py
index 02d99d5d69..f5045be907 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/engine/saving_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import training as training_module
+from tensorflow.python.training.checkpointable import util as checkpointable
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -922,6 +923,18 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
         SubclassedModel, SubclassedModelRestore,
         _restore_init_fn)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_incompatible_checkpoint(self):
+    save_path = checkpointable.Checkpoint().save(
+        os.path.join(self.get_temp_dir(), 'ckpt'))
+    m = keras.Model()
+    with self.assertRaisesRegexp(AssertionError, 'Nothing to load'):
+      m.load_weights(save_path)
+    m.dense = keras.layers.Dense(2)
+    m.dense(constant_op.constant([[1.]]))
+    with self.assertRaisesRegexp(
+        AssertionError, 'Nothing except the root object matched'):
+      m.load_weights(save_path)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py
index eff15b24ce..edab6cc6eb 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -853,6 +853,11 @@ class _LoadStatus(object):
     """Raises an exception unless existing Python objects have been matched."""
     pass
 
+  @abc.abstractmethod
+  def assert_nontrivial_match(self):
+    """Raises an exception if only the root object matched."""
+    pass
+
   @abc.abstractmethod
   def run_restore_ops(self, session=None):
     """Runs restore ops from the checkpoint. Requires a valid checkpoint."""
@@ -975,6 +980,26 @@ class CheckpointLoadStatus(_LoadStatus):
           % (list(unused_python_objects),))
     return self
 
+  def assert_nontrivial_match(self):
+    """Raises an exception if only the root object matched."""
+    for checkpointable_object in list_objects(self._root_checkpointable):
+      self._checkpoint.all_python_objects.add(checkpointable_object)
+    if len(self._checkpoint.object_by_proto_id) <= 1:
+      unused_python_objects = (
+          _ObjectIdentitySet(self._checkpoint.all_python_objects)
+          - _ObjectIdentitySet(self._checkpoint.object_by_proto_id.values()))
+      if unused_python_objects:
+        raise AssertionError(
+            ("Nothing except the root object matched a checkpointed value. "
+             "Typically this means that the checkpoint does not match the "
+             "Python program. The following objects have no matching "
+             "checkpointed value: %s") % (list(unused_python_objects),))
+      else:
+        raise AssertionError(
+            "Nothing to load. No dependencies have been added to %s yet." % (
+                self._root_checkpointable,))
+    return self
+
   def run_restore_ops(self, session=None):
     """Run operations to restore objects in the dependency graph."""
     if context.executing_eagerly():
@@ -1039,6 +1064,11 @@ class InitializationOnlyStatus(_LoadStatus):
     raise AssertionError(
         "No checkpoint specified (save_path=None); nothing is being restored.")
 
+  def assert_nontrivial_match(self):
+    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
+    raise AssertionError(
+        "No checkpoint specified (save_path=None); nothing is being restored.")
+
   def run_restore_ops(self, session=None):
     """For consistency with `CheckpointLoadStatus`.
 
@@ -1122,6 +1152,14 @@ class NameBasedSaverStatus(_LoadStatus):
     # useful since we don't touch Python objects or Python state).
     return self.assert_consumed()
 
+  def assert_nontrivial_match(self):
+    """Raises an exception if currently created objects are unmatched."""
+    # For name-based checkpoints there's no object information in the
+    # checkpoint, so there's no distinction between
+    # assert_nontrivial_match and assert_consumed (and both are less
+    # useful since we don't touch Python objects or Python state).
+    return self.assert_consumed()
+
   def _gather_saveable_objects(self):
     """Walk the object graph, using global names for SaveableObjects."""
     objects = list_objects(self._root_checkpointable)
@@ -1779,13 +1817,15 @@ class Checkpoint(tracking.Checkpointable):
       status of a checkpoint restoration and run initialization/restore ops.
 
       The returned status object has the following methods:
-      - `assert_consumed()`:
+
+      * `assert_consumed()`:
           Raises an exception if any variables/objects are unmatched: either
           checkpointed values which don't have a matching Python object or
           Python objects in the dependency graph with no values in the
           checkpoint. This method returns the status object, and so may be
           chained with `initialize_or_restore` or `run_restore_ops`.
-      -  `assert_existing_objects_matched()`:
+
+      * `assert_existing_objects_matched()`:
           Raises an exception if any existing Python objects in the dependency
           graph are unmatched. Unlike `assert_consumed`, this assertion will
           pass if values in the checkpoint have no corresponding Python
@@ -1796,12 +1836,20 @@ class Checkpoint(tracking.Checkpointable):
           a `tf.train.Optimizer` was saved but only the state required for
           inference is being loaded. This method returns the status object, and
           so may be chained with `initialize_or_restore` or `run_restore_ops`.
-      - `initialize_or_restore(session=None)`:
+
+      * `assert_nontrivial_match()`: Asserts that something aside from the root
+          object was matched. This is a very weak assertion, but is useful for
+          sanity checking in library code where objects may exist in the
+          checkpoint which haven't been created in Python and some Python
+          objects may not have a checkpointed value.
+
+      * `initialize_or_restore(session=None)`:
           When graph building, runs variable initializers if `save_path` is
           `None`, but otherwise runs restore operations. If no `session` is
           explicitly specified, the default session is used. No effect when
           executing eagerly (variables are initialized or restored eagerly).
-      - `run_restore_ops(session=None)`:
+
+      * `run_restore_ops(session=None)`:
           When graph building, runs restore operations. If no `session` is
           explicitly specified, the default session is used. No effect when
           executing eagerly (restore operations are run eagerly). May only be
diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py
index f8b5bd8501..14b47a1940 100644
--- a/tensorflow/python/training/checkpointable/util_test.py
+++ b/tensorflow/python/training/checkpointable/util_test.py
@@ -437,6 +437,7 @@ class CheckpointingTests(test.TestCase):
         optimizer=on_create_optimizer, model=on_create_model)
     # Deferred restoration
     status = on_create_root.restore(save_path=save_path)
+    status.assert_nontrivial_match()
     status.assert_existing_objects_matched()
     with self.assertRaises(AssertionError):
       status.assert_consumed()
@@ -1509,6 +1510,8 @@ class CheckpointCompatibilityTests(test.TestCase):
           status.assert_consumed()
         with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
           status.assert_existing_objects_matched()
+        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
+          status.assert_nontrivial_match()
       else:
         # When graph building, we haven't read any keys, so we don't know
         # whether the restore will be complete.
@@ -1516,6 +1519,8 @@ class CheckpointCompatibilityTests(test.TestCase):
           status.assert_consumed()
         with self.assertRaisesRegexp(AssertionError, "not restored"):
           status.assert_existing_objects_matched()
+        with self.assertRaisesRegexp(AssertionError, "not restored"):
+          status.assert_nontrivial_match()
       status.run_restore_ops()
       self._check_sentinels(root)
       self._set_sentinels(root)
-- 
GitLab


From 13941241e984e4a4296891f4e61a9ed5b3107b22 Mon Sep 17 00:00:00 2001
From: Dimitris Vardoulakis <dimvar@google.com>
Date: Wed, 3 Oct 2018 16:47:49 -0700
Subject: [PATCH 378/570] [TF:XLA] Improve the accounting for subcomputations
 in the heap simulator.

Subtract the size of the aliased buffers from the subcomputation estimate instead of from the current computation. This way, the memory estimate for the current computation is more accurate.

For the newly added test, the heap simulation calculates 48 bytes at head instead of the correct 64 bytes.

PiperOrigin-RevId: 215653047
---
 .../compiler/xla/service/heap_simulator.cc    |  34 +++--
 .../compiler/xla/service/heap_simulator.h     |  13 +-
 .../xla/service/heap_simulator_test.cc        | 118 +++++++++++++++++
 .../xla/service/hlo_memory_scheduler_test.cc  | 120 ------------------
 4 files changed, 136 insertions(+), 149 deletions(-)

diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index b343305554..9220865867 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -240,6 +240,7 @@ Status HeapSimulator::RunComputation(
 
     // Make sure each buffer get reused at most once.
     flat_hash_set<const BufferValue*> reused_buffers;
+    int64 alloc_size_by_instruction = 0;
     for (const BufferValue* buffer : buffers_defined_by_instruction) {
       if (IgnoreBuffer(buffer)) {
         continue;
@@ -272,14 +273,15 @@ Status HeapSimulator::RunComputation(
 
       if (!shared) {
         VLOG(3) << "  Allocating: " << buffer->ToString();
+        alloc_size_by_instruction += size_fn_(*buffer);
         Alloc(buffer, instruction);
       }
     }
     // Account for the memory used by subcomputations when estimating the
     // current heap size.
     if (memory_by_computation_ != nullptr) {
-      algorithm_->AccountForSubcomputationMemory(instruction,
-                                                 *memory_by_computation_);
+      algorithm_->AccountForSubcomputationMemory(
+          instruction, alloc_size_by_instruction, *memory_by_computation_);
     }
 
     // If all computations in the module have been scheduled, we can save memory
@@ -385,10 +387,8 @@ void HeapSimulator::Alloc(const BufferValue* buffer,
 
   allocated_buffers_.insert(buffer);
   const int64 size = size_fn_(*buffer);
-  const HloInstruction* instruction_to_calc_aliasing =
-      memory_by_computation_ == nullptr ? nullptr : instruction;
-  algorithm_->Alloc(buffer, size, instruction_to_calc_aliasing);
-  no_fragmentation_stats_->Alloc(buffer, size, instruction_to_calc_aliasing);
+  algorithm_->Alloc(buffer, size);
+  no_fragmentation_stats_->Alloc(buffer, size);
   FillDebugTrace(HeapSimulatorTrace::Event::ALLOC, buffer, instruction,
                  nullptr);
 }
@@ -526,20 +526,8 @@ void NoFragmentationStatsHeap::Alloc(const BufferValue* buffer, int64 size) {
   }
 }
 
-void NoFragmentationStatsHeap::Alloc(const BufferValue* buffer, int64 size,
-                                     const HloInstruction* instruction) {
-  // The output buffer of while/call/conditional is always aliased with the
-  // output buffer of the root instruction in the body. Don't double count.
-  if (instruction == nullptr ||
-      (instruction->opcode() != HloOpcode::kWhile &&
-       instruction->opcode() != HloOpcode::kCall &&
-       instruction->opcode() != HloOpcode::kConditional)) {
-    Alloc(buffer, size);
-  }
-}
-
 void NoFragmentationStatsHeap::AccountForSubcomputationMemory(
-    const HloInstruction* instruction,
+    const HloInstruction* instruction, int64 alloc_size_by_instruction,
     const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
   // We only count the memory usage of the largest subcomputation, instead of
@@ -554,6 +542,14 @@ void NoFragmentationStatsHeap::AccountForSubcomputationMemory(
       }
     }
   }
+  if (max_subcomputation_bytes > 0 &&
+      (instruction->opcode() == HloOpcode::kWhile ||
+       instruction->opcode() == HloOpcode::kCall ||
+       instruction->opcode() == HloOpcode::kConditional)) {
+    // The output buffer of while/call/conditional is always aliased with the
+    // output buffer of the root instruction in the body. Don't double count.
+    max_subcomputation_bytes -= alloc_size_by_instruction;
+  }
   max_heap_size_ =
       std::max(max_heap_size_, current_heap_size_ + max_subcomputation_bytes);
 }
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index b0295a6163..dbbf43082f 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -218,12 +218,6 @@ class HeapAlgorithm {
   // Alloc allocates a buffer of 'size' bytes.
   virtual void Alloc(const BufferValue* buffer, int64 size) = 0;
 
-  // NoFragmentationStatsHeap overrides this method.
-  virtual void Alloc(const BufferValue* buffer, int64 size,
-                     const HloInstruction* instruction) {
-    Alloc(buffer, size);
-  }
-
   // Takes memory usage of subcomputations into account when calculating the
   // memory usage of a computation. Currently, we don't handle buffer aliasing
   // between computations entirely correctly. We are careful to not double count
@@ -235,6 +229,8 @@ class HeapAlgorithm {
   // analysis, it's not worth making major changes to HeapSimulator now.
   virtual void AccountForSubcomputationMemory(
       const HloInstruction* instruction,
+      // The total number of bytes allocated by instruction.
+      int64 alloc_size_by_instruction,
       const absl::flat_hash_map<const HloComputation*, int64>&
           memory_by_computation) {}
 
@@ -257,11 +253,8 @@ class NoFragmentationStatsHeap : public HeapAlgorithm {
 
   void Alloc(const BufferValue* buffer, int64 size) override;
 
-  void Alloc(const BufferValue* buffer, int64 size,
-             const HloInstruction* instruction) override;
-
   void AccountForSubcomputationMemory(
-      const HloInstruction* instruction,
+      const HloInstruction* instruction, int64 alloc_size_by_instruction,
       const absl::flat_hash_map<const HloComputation*, int64>&
           memory_by_computation) override;
 
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index ea0bced923..e30e7667f3 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -98,6 +98,124 @@ TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
       HeapSimulator::MinimumMemoryForModule(schedule, size_fn).ValueOrDie());
 }
 
+TEST_F(MinimumMemoryForSequenceTest, SubcomputationAccounting) {
+  // HloModule SubcomputationAccounting
+
+  // %WhileBody (body_param: f32[4]) -> f32[4] {
+  //   %body_param = f32[4]{0} parameter(0)
+  //   %constant.1 = f32[4]{0} constant({1, 1, 1, 1})
+  //   ROOT %subtract = f32[4]{0} subtract(f32[4]{0} %body_param, f32[4]{0}
+  //   %constant.1)
+  // }
+
+  // %WhileCond (cond_param: f32[4]) -> pred[] {
+  //   %cond_param = f32[4]{0} parameter(0)
+  //   %slice = f32[1]{0} slice(f32[4]{0} %cond_param), slice={[0:1]}
+  //   %reshape = f32[] reshape(f32[1]{0} %slice)
+  //   %constant = f32[] constant(0)
+  //   ROOT %not-equal-to = pred[] not-equal-to(f32[] %reshape, f32[] %constant)
+  // }
+
+  // ENTRY %SubcomputationAccounting () -> f32[2,4] {
+  //   %constant.3 = f32[2,4]{1,0} constant(f32[2,4] { { 1, 2, 3, 4 }, { 1, 2,
+  //   3, 4 } }) %transpose = f32[2,4]{1,0} transpose(f32[2,4]{1,0}
+  //   %constant.3), dimensions={0,1} %constant.2 = f32[4]{0} constant({1, 1, 1,
+  //   1}) %while = f32[4]{0} while(f32[4]{0} %constant.2),
+  //   condition=%WhileCond, body=%WhileBody %broadcast = f32[2,4]{1,0}
+  //   broadcast(f32[4]{0} %while), dimensions={1} ROOT %add = f32[2,4]{1,0}
+  //   add(f32[2,4]{1,0} %transpose, f32[2,4]{1,0} %broadcast)
+  // }
+
+  auto module = CreateNewVerifiedModule();
+  const Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  const Shape r1f32 = ShapeUtil::MakeShape(F32, {4});
+  const Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 4});
+
+  // reshape(slice(param)) != 0
+  // Needs 5 bytes
+  auto cond_builder = HloComputation::Builder("WhileCond");
+  HloInstruction* cond_param = cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "cond_param"));
+  HloInstruction* slice =
+      cond_builder.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {1}), cond_param, {0}, {1}, {1}));
+  HloInstruction* reshape =
+      cond_builder.AddInstruction(HloInstruction::CreateReshape(r0f32, slice));
+  HloInstruction* zero = cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0)));
+  HloInstruction* cond_comparison =
+      cond_builder.AddInstruction(HloInstruction::CreateBinary(
+          ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, reshape, zero));
+  auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build());
+
+  // param - 1
+  // Needs 16 bytes
+  auto body_builder = HloComputation::Builder("WhileBody");
+  HloInstruction* body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "body_param"));
+  HloInstruction* one_vector =
+      body_builder.AddInstruction(HloInstruction::CreateConstant(
+          LiteralUtil::CreateR1<float>({1, 1, 1, 1})));
+  HloInstruction* subtract =
+      body_builder.AddInstruction(HloInstruction::CreateBinary(
+          r1f32, HloOpcode::kSubtract, body_param, one_vector));
+  auto body_computation = module->AddEmbeddedComputation(body_builder.Build());
+
+  // transpose(matrix) + bcast(while)
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* while_init =
+      builder.AddInstruction(HloInstruction::CreateConstant(
+          LiteralUtil::CreateR1<float>({1, 1, 1, 1})));
+  // Creates 16 bytes, ignoring subcomputations
+  HloInstruction* while_loop =
+      builder.AddInstruction(HloInstruction::CreateWhile(
+          r1f32, cond_computation, body_computation, while_init));
+
+  // Creates 32 bytes and frees 16
+  HloInstruction* bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r2f32, while_loop, {1}));
+
+  HloInstruction* matrix = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR2<float>(
+          {{1.0, 2.0, 3.0, 4.0}, {1.0, 2.0, 3.0, 4.0}})));
+  // Creates 32 bytes
+  HloInstruction* transpose = builder.AddInstruction(
+      HloInstruction::CreateTranspose(r2f32, matrix, {0, 1}));
+
+  // Creates 32 bytes and frees 64
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, transpose, bcast));
+
+  auto entry_computation = module->AddEntryComputation(builder.Build());
+
+  HloSchedule schedule(module.get());
+  std::vector<HloInstruction*> cond_vec = {cond_param, slice, reshape, zero,
+                                           cond_comparison};
+  std::vector<HloInstruction*> while_body_vec = {body_param, one_vector,
+                                                 subtract};
+  std::vector<HloInstruction*> entry_comp_vec = {while_init, while_loop, bcast,
+                                                 matrix,     transpose,  add};
+  schedule.set_sequence(cond_computation, cond_vec);
+  schedule.set_sequence(body_computation, while_body_vec);
+  schedule.set_sequence(entry_computation, entry_comp_vec);
+
+  auto size_fn = [](const BufferValue& buffer) {
+    return ShapeUtil::ByteSizeOf(buffer.shape());
+  };
+  absl::flat_hash_map<const HloComputation*, int64> memory_by_computation;
+  memory_by_computation[cond_computation] = 5;
+  memory_by_computation[body_computation] = 16;
+  std::unique_ptr<TuplePointsToAnalysis> points_to_analysis =
+      TuplePointsToAnalysis::Run(module.get()).ValueOrDie();
+
+  // HeapSimulator accounts for subcomputations. The output buffer is aliased,
+  // so we don't double count.
+  EXPECT_EQ(64, HeapSimulator::MinimumMemoryForComputation(
+                    *entry_computation, schedule.sequence(entry_computation),
+                    *points_to_analysis, size_fn, &memory_by_computation)
+                    .ValueOrDie());
+}
+
 const char kAlloc[] = "Alloc";
 const char kFree[] = "Free";
 const char kFinish[] = "Finish";
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
index 5a9fccc7dd..214119fba8 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
@@ -147,126 +147,6 @@ ENTRY root {
                                       instructions_by_name.at("e")));
 }
 
-TEST_F(HloSchedulingTest, ListAccountsForSubcomputations) {
-  // %WhileCond (cond_param: f32[4]) -> pred[] {
-  //   %cond_param = f32[4]{0} parameter(0)
-  //   %constant = f32[1,4]{1,0} constant(f32[1,4] { { 0, 0, 0, 0 } })
-  //   ROOT %not-equal-to = pred[] not-equal-to(
-  //     f32[4]{0} %cond_param, f32[1,4]{1,0} %constant)
-  // }
-  // %WhileBody (body_param: f32[4]) -> f32[4] {
-  //   %body_param = f32[4]{0} parameter(0)
-  //   %constant.1 = f32[1,4]{1,0} constant(f32[1,4] { { 1, 1, 1, 1 } })
-  //   ROOT %subtract = f32[4]{0} subtract(
-  //     f32[4]{0} %body_param, f32[1,4]{1,0} %constant.1)
-  // }
-  // %ListAccountsForSubcomputations () -> f32[2,4] {
-  //   %constant.3 = f32[2,4]{1,0} constant(
-  //     f32[2,4] { { 1, 2, 3, 4 }, { 1, 2, 3, 4 } })
-  //   %transpose = f32[2,4]{1,0} transpose(
-  //     f32[2,4]{1,0} %constant.3), dimensions={0,1}
-  //   %constant.2 = f32[1,4]{1,0} constant(f32[1,4] { { 1, 1, 1, 1 } })
-  //   %while = f32[4]{0} while(f32[1,4]{1,0} %constant.2),
-  //      condition=%WhileCond,
-  //      body=%WhileBody
-  //   %broadcast = f32[2,4]{1,0} broadcast(f32[4]{0} %while), dimensions={0}
-  //   ROOT %add = f32[2,4]{1,0} add(
-  //     f32[2,4]{1,0} %transpose, f32[2,4]{1,0} %broadcast)
-  // }
-
-  auto module = CreateNewModule();
-  const Shape r1f32 = ShapeUtil::MakeShape(F32, {4});
-  const Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 4});
-
-  // param != 0
-  // Needs 17 bytes
-  auto cond_builder = HloComputation::Builder("WhileCond");
-  HloInstruction* cond_param = cond_builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r1f32, "cond_param"));
-  HloInstruction* zero_vector =
-      cond_builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR2<float>({{0, 0, 0, 0}})));
-  cond_builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, cond_param, zero_vector));
-  auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build());
-
-  // param - 1
-  // Needs 16 bytes
-  auto body_builder = HloComputation::Builder("WhileBody");
-  HloInstruction* body_param = body_builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r1f32, "body_param"));
-  HloInstruction* one_vector =
-      body_builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR2<float>({{1, 1, 1, 1}})));
-  body_builder.AddInstruction(HloInstruction::CreateBinary(
-      r1f32, HloOpcode::kSubtract, body_param, one_vector));
-  auto body_computation = module->AddEmbeddedComputation(body_builder.Build());
-
-  // transpose(matrix) + bcast(while)
-  auto builder = HloComputation::Builder(TestName());
-  HloInstruction* while_init =
-      builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR2<float>({{1, 1, 1, 1}})));
-  // Creates 16 bytes, ignoring subcomputations
-  HloInstruction* while_loop =
-      builder.AddInstruction(HloInstruction::CreateWhile(
-          r1f32, cond_computation, body_computation, while_init));
-
-  // Creates 32 bytes and frees 16
-  HloInstruction* bcast = builder.AddInstruction(
-      HloInstruction::CreateBroadcast(r2f32, while_loop, {0}));
-
-  HloInstruction* matrix = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR2<float>(
-          {{1.0, 2.0, 3.0, 4.0}, {1.0, 2.0, 3.0, 4.0}})));
-  // Creates 32 bytes
-  HloInstruction* transpose = builder.AddInstruction(
-      HloInstruction::CreateTranspose(r2f32, matrix, {0, 1}));
-
-  // Creates 32 bytes and frees 64
-  HloInstruction* add = builder.AddInstruction(
-      HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, transpose, bcast));
-
-  module->AddEntryComputation(builder.Build());
-
-  auto size_fn = [](const BufferValue& buffer) {
-    return ShapeUtil::ByteSizeOf(buffer.shape());
-  };
-  TF_ASSERT_OK_AND_ASSIGN(
-      HloSchedule schedule,
-      ScheduleModule(*module, size_fn, ListMemoryScheduler));
-  // Verify that all instructions are in the sequence.
-  auto entry_computation = module->entry_computation();
-  EXPECT_EQ(entry_computation->instruction_count(),
-            schedule.sequence(entry_computation).size());
-  SequentialHloOrdering ordering(schedule);
-  // This schedule is an example of List's greedy heuristics being suboptimal.
-  // The while_loop is more expensive than transpose, so it would have been
-  // better to schedule it first, instead of during the busy time.
-  EXPECT_TRUE(ordering.ExecutesBefore(transpose, while_loop));
-  EXPECT_TRUE(ordering.ExecutesBefore(transpose, bcast));
-  EXPECT_TRUE(ordering.ExecutesBefore(bcast, add));
-  EXPECT_TRUE(ordering.ExecutesBefore(transpose, add));
-
-  absl::flat_hash_map<const HloComputation*, int64> memory_by_computation;
-  memory_by_computation[cond_computation] = 17;
-  memory_by_computation[body_computation] = 16;
-  std::unique_ptr<TuplePointsToAnalysis> points_to_analysis =
-      TuplePointsToAnalysis::Run(module.get()).ValueOrDie();
-
-  // HeapSimulator doesn't account for subcomputations
-  EXPECT_EQ(80, HeapSimulator::MinimumMemoryForComputation(
-                    *entry_computation, schedule.sequence(entry_computation),
-                    *points_to_analysis, size_fn)
-                    .ValueOrDie());
-  // HeapSimulator accounts for subcomputations. The output buffer is aliased,
-  // so we don't double count.
-  EXPECT_EQ(64, HeapSimulator::MinimumMemoryForComputation(
-                    *entry_computation, schedule.sequence(entry_computation),
-                    *points_to_analysis, size_fn, &memory_by_computation)
-                    .ValueOrDie());
-}
-
 TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) {
   auto builder = HloComputation::Builder(TestName());
   const auto TUPLE_SIZE = 1;
-- 
GitLab


From caaf9a89750a9a0b3d66f3ce3e9bd507f4c6514c Mon Sep 17 00:00:00 2001
From: Anjali Sridhar <anjalisridhar@google.com>
Date: Wed, 3 Oct 2018 16:51:30 -0700
Subject: [PATCH 379/570] Create new classes for Keras tests to allow us to
 create new test targets.

PiperOrigin-RevId: 215653650
---
 .../contrib/distribute/python/keras_test.py   | 256 +++++++++---------
 1 file changed, 131 insertions(+), 125 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index 993cb2bac3..3511b7761f 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -355,48 +355,9 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
     gfile.DeleteRecursively(self._config.model_dir)
 
 
-class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
-
-  def test_validating_dataset_input_tensors_with_shape_mismatch(self):
-    with self.cached_session():
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
-      a = constant_op.constant([1, 2], shape=(1, 2))
-      b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2))
-      x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b})
-      y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a})
-      with strategy.scope():
-        # Removed device and input tensor shape details from the error message
-        # since the order of the device and the corresponding input tensor shape
-        # is not deterministic over different runs.
-        with self.assertRaisesRegexp(ValueError,
-                                     'Input tensor shapes do not match for '
-                                     'distributed tensor inputs '
-                                     'DistributedValues:.+'):
-          distributed_training_utils.validate_distributed_dataset_inputs(
-              strategy, x, y)
-
-  def test_validating_dataset_input_tensors_with_dtype_mismatch(self):
-    with self.cached_session():
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
-      a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
-      b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64)
-      x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b})
-      y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a})
-      with strategy.scope():
-        # Removed device and input tensor dtype details from the error message
-        # since the order of the device and the corresponding input tensor dtype
-        # is not deterministic over different runs.
-        with self.assertRaisesRegexp(ValueError,
-                                     'Input tensor dtypes do not match for '
-                                     'distributed tensor inputs '
-                                     'DistributedValues:.+'):
-          distributed_training_utils.validate_distributed_dataset_inputs(
-              strategy, x, y)
+class TestDistributionStrategyWithNumpyArrays(test.TestCase,
+                                              parameterized.TestCase):
 
-  # TODO(anjalisridhar): Move this test along with other numpy related tests to
-  # its own class.
   @combinations.generate(strategy_combinations())
   def test_creating_var_with_numpy_arrays(self, distribution):
     with self.cached_session():
@@ -479,6 +440,10 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
       # with batch_size
       model.predict(inputs, batch_size=8)
 
+
+class TestDistributionStrategyWithDatasets(test.TestCase,
+                                           parameterized.TestCase):
+
   @combinations.generate(strategy_combinations())
   def test_calling_model_on_same_dataset(self, distribution):
     with self.cached_session():
@@ -572,86 +537,6 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
       model.evaluate(dataset, steps=2, verbose=1)
       model.predict(get_predict_dataset(distribution), steps=2)
 
-  def test_unsupported_features(self):
-    with self.cached_session():
-      model = get_model()
-
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      loss = 'mse'
-      metrics = ['mae']
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
-
-      dataset = get_dataset(strategy)
-
-      # Test with validation split
-      with self.assertRaisesRegexp(
-          ValueError, '`validation_split` argument is not '
-                      'supported when input `x` is a dataset or a '
-                      'dataset iterator.+'):
-        model.fit(dataset,
-                  epochs=1, steps_per_epoch=2, verbose=0,
-                  validation_split=0.5, validation_steps=2)
-
-      # Test with sample weight.
-      sample_weight = np.random.random((10,))
-      with self.assertRaisesRegexp(
-          NotImplementedError, '`sample_weight` is currently not supported '
-                               'when using DistributionStrategy.'):
-        model.fit(
-            dataset,
-            epochs=1,
-            steps_per_epoch=2,
-            verbose=0,
-            sample_weight=sample_weight)
-
-      # Test with not specifying the `steps` argument.
-      with self.assertRaisesRegexp(
-          ValueError, 'you should specify the `steps_per_epoch` argument'):
-        model.fit(dataset, epochs=1, verbose=0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'you should specify the `steps` argument'):
-        model.evaluate(dataset, verbose=0)
-
-      with self.assertRaisesRegexp(ValueError,
-                                   'you should specify the `steps` argument'):
-        model.predict(dataset, verbose=0)
-
-  def test_calling_with_unsupported_predefined_callbacks(self):
-    with self.cached_session():
-      model = get_model()
-
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      loss = 'mse'
-      metrics = ['mae']
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
-
-      dataset = get_dataset(strategy)
-
-      def schedule(_):
-        return 0.001
-      with self.assertRaisesRegexp(ValueError,
-                                   'LearningRateScheduler callback is not '
-                                   'supported with DistributionStrategy.'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                  callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
-
-      with self.assertRaisesRegexp(ValueError,
-                                   'ReduceLROnPlateau callback is not '
-                                   'supported with DistributionStrategy.'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                  callbacks=[keras.callbacks.ReduceLROnPlateau()])
-      with self.assertRaisesRegexp(ValueError,
-                                   'histogram_freq in the TensorBoard callback '
-                                   'is not supported when using '
-                                   'DistributionStrategy.'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                  callbacks=[keras.callbacks.TensorBoard(histogram_freq=10)])
-
   def test_dataset_input_shape_validation(self):
     with self.cached_session():
       model = get_model()
@@ -736,7 +621,128 @@ class TestWithDistributionStrategy(test.TestCase, parameterized.TestCase):
       self.assertNotEqual(np.mean(predict_output), 0)
 
 
-class LossMaskingWithDistributionStrategyTest(test.TestCase):
+class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
+
+  def test_validating_dataset_input_tensors_with_shape_mismatch(self):
+    with self.cached_session():
+      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
+                                                     '/device:CPU:0'])
+      a = constant_op.constant([1, 2], shape=(1, 2))
+      b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2))
+      x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b})
+      y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a})
+      with strategy.scope():
+        # Removed device and input tensor shape details from the error message
+        # since the order of the device and the corresponding input tensor shape
+        # is not deterministic over different runs.
+        with self.assertRaisesRegexp(ValueError,
+                                     'Input tensor shapes do not match for '
+                                     'distributed tensor inputs '
+                                     'DistributedValues:.+'):
+          distributed_training_utils.validate_distributed_dataset_inputs(
+              strategy, x, y)
+
+  def test_validating_dataset_input_tensors_with_dtype_mismatch(self):
+    with self.cached_session():
+      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
+                                                     '/device:CPU:0'])
+      a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
+      b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64)
+      x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b})
+      y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a})
+      with strategy.scope():
+        # Removed device and input tensor dtype details from the error message
+        # since the order of the device and the corresponding input tensor dtype
+        # is not deterministic over different runs.
+        with self.assertRaisesRegexp(ValueError,
+                                     'Input tensor dtypes do not match for '
+                                     'distributed tensor inputs '
+                                     'DistributedValues:.+'):
+          distributed_training_utils.validate_distributed_dataset_inputs(
+              strategy, x, y)
+
+  def test_unsupported_features(self):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
+                                                     '/device:GPU:0'])
+
+      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+
+      dataset = get_dataset(strategy)
+
+      # Test with validation split
+      with self.assertRaisesRegexp(
+          ValueError, '`validation_split` argument is not '
+                      'supported when input `x` is a dataset or a '
+                      'dataset iterator.+'):
+        model.fit(dataset,
+                  epochs=1, steps_per_epoch=2, verbose=0,
+                  validation_split=0.5, validation_steps=2)
+
+      # Test with sample weight.
+      sample_weight = np.random.random((10,))
+      with self.assertRaisesRegexp(
+          NotImplementedError, '`sample_weight` is currently not supported '
+                               'when using DistributionStrategy.'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            sample_weight=sample_weight)
+
+      # Test with not specifying the `steps` argument.
+      with self.assertRaisesRegexp(
+          ValueError, 'you should specify the `steps_per_epoch` argument'):
+        model.fit(dataset, epochs=1, verbose=0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'you should specify the `steps` argument'):
+        model.evaluate(dataset, verbose=0)
+
+      with self.assertRaisesRegexp(ValueError,
+                                   'you should specify the `steps` argument'):
+        model.predict(dataset, verbose=0)
+
+  def test_calling_with_unsupported_predefined_callbacks(self):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
+                                                     '/device:GPU:0'])
+      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+
+      dataset = get_dataset(strategy)
+
+      def schedule(_):
+        return 0.001
+      with self.assertRaisesRegexp(ValueError,
+                                   'LearningRateScheduler callback is not '
+                                   'supported with DistributionStrategy.'):
+        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+                  callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
+
+      with self.assertRaisesRegexp(ValueError,
+                                   'ReduceLROnPlateau callback is not '
+                                   'supported with DistributionStrategy.'):
+        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+                  callbacks=[keras.callbacks.ReduceLROnPlateau()])
+      with self.assertRaisesRegexp(ValueError,
+                                   'histogram_freq in the TensorBoard callback '
+                                   'is not supported when using '
+                                   'DistributionStrategy.'):
+        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+                  callbacks=[keras.callbacks.TensorBoard(histogram_freq=10)])
+
+
+class TestDistributionStrategyWithLossMasking(test.TestCase):
 
   # TODO(priyag): Enable all strategies for this test. Currently it does not
   # work for TPU due to some invalid datatype.
@@ -763,7 +769,7 @@ class LossMaskingWithDistributionStrategyTest(test.TestCase):
       self.assertEqual(hist.history['loss'][0], 0)
 
 
-class NormalizationLayerWithDistributionStrategyTest(
+class TestDistributionStrategyWithNormalizationLayer(
     test.TestCase, parameterized.TestCase):
 
   @combinations.generate(strategy_combinations())
@@ -795,8 +801,8 @@ class NormalizationLayerWithDistributionStrategyTest(
       np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
 
 
-class CorrectnessWithDistributionStrategyTest(test.TestCase,
-                                              parameterized.TestCase):
+class TestDistributionStrategyCorrectness(test.TestCase,
+                                          parameterized.TestCase):
 
   @combinations.generate(strategy_combinations())
   def test_metric_correctness(self, distribution):
-- 
GitLab


From 3a9a3664fe1aa9e5c81ca4959f028c2a8161520e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 16:52:30 -0700
Subject: [PATCH 380/570] Fix 1970s-style bug in LogSoftmax eval.

PiperOrigin-RevId: 215653797
---
 tensorflow/contrib/lite/kernels/activations.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
index cf9441aee3..9aed4f09b8 100644
--- a/tensorflow/contrib/lite/kernels/activations.cc
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -616,13 +616,15 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
-    case kTfLiteFloat32:
+    case kTfLiteFloat32: {
       SoftmaxParams op_params;
       optimized_ops::LogSoftmax(
           op_params, GetTensorShape(input), GetTensorData<float>(input),
           GetTensorShape(output), GetTensorData<float>(output));
       return kTfLiteOk;
-    case kTfLiteUInt8:
+    }
+    case kTfLiteUInt8: {
+      SoftmaxParams op_params;
       op_params.input_multiplier = data->input_multiplier;
       op_params.input_left_shift = data->input_left_shift;
       op_params.reverse_scaling_divisor = data->reverse_scaling_divisor;
@@ -632,6 +634,7 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
           op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
           GetTensorShape(output), GetTensorData<uint8_t>(output));
       return kTfLiteOk;
+    }
     default:
       context->ReportError(context, "Only float32 supported currently., got %d",
                            input->type);
-- 
GitLab


From d340eb9f7ea46012b7ead202f4c12fb6b32cc56d Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Wed, 3 Oct 2018 16:56:14 -0700
Subject: [PATCH 381/570] Increase error-epsilon for
 ProfilingTest::ProfilesAreCollected.

PiperOrigin-RevId: 215654327
---
 tensorflow/contrib/lite/profiling/profiler_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/profiling/profiler_test.cc b/tensorflow/contrib/lite/profiling/profiler_test.cc
index 0fba0450a0..cf56eed2a4 100644
--- a/tensorflow/contrib/lite/profiling/profiler_test.cc
+++ b/tensorflow/contrib/lite/profiling/profiler_test.cc
@@ -83,8 +83,8 @@ TEST(ProfilingTest, ProfilesAreCollected) {
   EXPECT_EQ("SleepForQuarter", profile_events[4]->tag);
 
 #ifndef ADDRESS_SANITIZER
-  // ASAN build is sometimes very slow.
-  const int eps_ms = 10;
+  // ASAN build is sometimes very slow. Set a large epsilon to avoid flakiness.
+  const int eps_ms = 50;
   AssertDurationOfEventAroundMs(profile_events[0], /*expected_ms*/ 500, eps_ms);
   AssertDurationOfEventAroundMs(profile_events[1], /*expected_ms*/ 250, eps_ms);
   AssertDurationOfEventAroundMs(profile_events[2], /*expected_ms*/ 250, eps_ms);
-- 
GitLab


From c842d38978a0babb373fe2acbb0231960aa1c1d0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 17:05:38 -0700
Subject: [PATCH 382/570] Add MinimalRNN cell.

The implementation is based on: https://arxiv.org/pdf/1806.05394v2.pdf.

PiperOrigin-RevId: 215655857
---
 .../rnn/python/kernel_tests/rnn_cell_test.py  |  72 +++++++++++
 tensorflow/contrib/rnn/python/ops/rnn_cell.py | 116 ++++++++++++++++++
 2 files changed, 188 insertions(+)

diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 6689664fb9..0a27200015 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -29,6 +29,9 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
@@ -40,7 +43,9 @@ from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
+from tensorflow.python.training import training
 from tensorflow.python.util import nest
 
 
@@ -1115,6 +1120,73 @@ class RNNCellTest(test.TestCase):
             r"input size \(3\) must be divisible by number_of_groups \(2\)"):
           gcell(glstm_input, gcell_zero_state)
 
+  def testMinimalRNNCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root"):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = contrib_rnn_cell.MinimalRNNCell(
+            units=2,
+            kernel_initializer=initializers.Constant(0.5))
+        g, _ = cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.18899589, 0.18899589]])
+      with variable_scope.variable_scope(
+          "other"):
+        # Test MinimalRNN with input_size != num_units.
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        cell = contrib_rnn_cell.MinimalRNNCell(
+            units=2,
+            kernel_initializer=initializers.Constant(0.5))
+        g, _ = cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.19554167, 0.19554167]])
+
+  def testMinimalRNNCellEndToEnd(self):
+    with self.cached_session() as sess:
+      input_shape = 10
+      output_shape = 5
+      timestep = 4
+      batch = 100
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = utils.to_categorical(y_train)
+      cell = contrib_rnn_cell.MinimalRNNCell(output_shape)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape))
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape))
+
+      outputs, state = rnn.dynamic_rnn(
+          cell, inputs, dtype=dtypes.float32)
+      self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape])
+      self.assertEqual(state.shape.as_list(), [None, output_shape])
+      loss = losses.softmax_cross_entropy(predict, state)
+      train_op = training.GradientDescentOptimizer(0.001).minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      _, outputs, state = sess.run(
+          [train_op, outputs, state], {inputs: x_train, predict: y_train})
+
+      self.assertEqual(len(outputs), batch)
+      self.assertEqual(len(state), batch)
+
 
 class LayerNormBasicLSTMCellTest(test.TestCase):
 
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 06c481672c..59a61af7b3 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -28,6 +28,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras import activations
+from tensorflow.python.keras import initializers
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
@@ -3394,3 +3396,117 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
 
     new_state = rnn_cell_impl.LSTMStateTuple(new_c, new_h)
     return new_h, new_state
+
+
+class MinimalRNNCell(rnn_cell_impl.LayerRNNCell):
+  """MinimalRNN cell.
+
+  The implementation is based on:
+
+    https://arxiv.org/pdf/1806.05394v2.pdf
+
+  Minmin Chen, Jeffrey Pennington, Samuel S. Schoenholz.
+  "Dynamical Isometry and a Mean Field Theory of RNNs: Gating Enables Signal
+   Propagation in Recurrent Neural Networks." ICML, 2018.
+
+  A MinimalRNN cell first projects the input to the hidden space. The new
+  hidden state is then calcuated as a weighted sum of the projected input and
+  the previous hidden state, using a single update gate.
+  """
+
+  def __init__(self,
+               units,
+               activation="tanh",
+               kernel_initializer="glorot_uniform",
+               bias_initializer="ones",
+               name=None,
+               dtype=None,
+               **kwargs):
+    """Initialize the parameters for a MinimalRNN cell.
+
+    Args:
+      units: int, The number of units in the MinimalRNN cell.
+      activation: Nonlinearity to use in the feedforward network. Default:
+        `tanh`.
+      kernel_initializer: The initializer to use for the weight in the update
+        gate and feedforward network. Default: `glorot_uniform`.
+      bias_initializer: The initializer to use for the bias in the update
+        gate. Default: `ones`.
+      name: String, the name of the cell.
+      dtype: Default dtype of the cell.
+      **kwargs: Dict, keyword named properties for common cell attributes.
+    """
+    super(MinimalRNNCell, self).__init__(name=name, dtype=dtype, **kwargs)
+
+    # Inputs must be 2-dimensional.
+    self.input_spec = base_layer.InputSpec(ndim=2)
+
+    self.units = units
+    self.activation = activations.get(activation)
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+
+  @property
+  def state_size(self):
+    return self.units
+
+  @property
+  def output_size(self):
+    return self.units
+
+  def build(self, inputs_shape):
+    if inputs_shape[-1] is None:
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
+                       % str(inputs_shape))
+
+    input_size = inputs_shape[-1]
+    # pylint: disable=protected-access
+    # self._kernel contains W_x, W, V
+    self.kernel = self.add_weight(
+        name=rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+        shape=[input_size + 2 * self.units, self.units],
+        initializer=self.kernel_initializer)
+    self.bias = self.add_weight(
+        name=rnn_cell_impl._BIAS_VARIABLE_NAME,
+        shape=[self.units],
+        initializer=self.bias_initializer)
+    # pylint: enable=protected-access
+
+    self.built = True
+
+  def call(self, inputs, state):
+    """Run one step of MinimalRNN.
+
+    Args:
+      inputs: input Tensor, must be 2-D, `[batch, input_size]`.
+      state: state Tensor, must be 2-D, `[batch, state_size]`.
+
+    Returns:
+      A tuple containing:
+
+      - Output: A `2-D` tensor with shape `[batch_size, state_size]`.
+      - New state: A `2-D` tensor with shape `[batch_size, state_size]`.
+
+    Raises:
+      ValueError: If input size cannot be inferred from inputs via
+        static shape inference.
+    """
+    input_size = inputs.get_shape()[1]
+    if input_size.value is None:
+      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+    feedforward_weight, gate_weight = array_ops.split(
+        value=self.kernel,
+        num_or_size_splits=[input_size.value, 2 * self.units],
+        axis=0)
+
+    feedforward = math_ops.matmul(inputs, feedforward_weight)
+    feedforward = self.activation(feedforward)
+
+    gate_inputs = math_ops.matmul(
+        array_ops.concat([feedforward, state], 1), gate_weight)
+    gate_inputs = nn_ops.bias_add(gate_inputs, self.bias)
+    u = math_ops.sigmoid(gate_inputs)
+
+    new_h = u * state + (1 - u) * feedforward
+    return new_h, new_h
-- 
GitLab


From 4da5b350e1c062b9d55896ee872e0e4790f30bcb Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Wed, 3 Oct 2018 17:25:46 -0700
Subject: [PATCH 383/570] TFLite Flex: Blacklist Control Flow Ops

PiperOrigin-RevId: 215658384
---
 tensorflow/contrib/lite/toco/tflite/export.cc | 132 +++++++++++++-----
 tensorflow/contrib/lite/toco/tflite/export.h  |  20 ++-
 .../contrib/lite/toco/tflite/export_test.cc   |  40 ++++++
 3 files changed, 152 insertions(+), 40 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index 0c9fac249c..45ca7f7f0c 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -47,6 +47,22 @@ using ::tflite::Tensor;
 
 namespace {
 
+// Check if a TensorFlow Op is a control flow op by its name.
+bool IsControlFlowOp(const string& tensorflow_op) {
+  // Technically this is equalivent to `::tensorflow::Node::IsControlFlow()`.
+  // It requires to construct a `::tensorflow::Graph` to use that helper
+  // function, so we simply hardcode the list of control flow ops here.
+  if (tensorflow_op == "Switch" || tensorflow_op == "RefSwitch" ||
+      tensorflow_op == "Merge" || tensorflow_op == "RefMerge" ||
+      tensorflow_op == "Enter" || tensorflow_op == "RefEnter" ||
+      tensorflow_op == "Exit" || tensorflow_op == "RefExit" ||
+      tensorflow_op == "NextIteration" || tensorflow_op == "RefNextIteration") {
+    return true;
+  }
+  // TODO(ycling): Also check how to handle Variable ops and Assign ops.
+  return false;
+}
+
 details::OperatorKey GetOperatorKey(
     const ::toco::Operator& op,
     const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
@@ -55,21 +71,13 @@ details::OperatorKey GetOperatorKey(
   if (op.type == OperatorType::kUnsupported) {
     const TensorFlowUnsupportedOperator& unsupported_op =
         static_cast<const TensorFlowUnsupportedOperator&>(op);
-
-    // TODO(b/113715895): When `allow_flex_ops` is on, for now there's no way
-    // to populate a regular custom op. We need to find a way to fix this.
-    if (allow_flex_ops) {
-      custom_code = string(::tflite::kFlexCustomCodePrefix) +
-                    unsupported_op.tensorflow_op;
-    } else {
-      custom_code = unsupported_op.tensorflow_op;
-    }
+    custom_code = unsupported_op.tensorflow_op;
   }
   int version = 1;
   if (ops_by_type.count(op.type) != 0) {
     version = ops_by_type.at(op.type)->GetVersion(op);
   }
-  return details::OperatorKey(op.type, custom_code, version);
+  return details::OperatorKey(op.type, custom_code, version, allow_flex_ops);
 }
 
 void WriteModelToString(const flatbuffers::FlatBufferBuilder& builder,
@@ -83,6 +91,29 @@ void WriteModelToString(const flatbuffers::FlatBufferBuilder& builder,
 
 namespace details {
 
+OperatorKey::OperatorKey(OperatorType type, const std::string& custom_code,
+                         int version, bool allow_flex_ops) {
+  this->type = type;
+  this->custom_code = custom_code;
+  this->version = version;
+
+  if (type == OperatorType::kUnsupported) {
+    // TODO(b/113715895): When `allow_flex_ops` is on, for now there's no way
+    // to populate a regular custom op. We need to find a way to fix this.
+    if (allow_flex_ops) {
+      // Memorize the original TensorFlow op name.
+      this->flex_tensorflow_op = custom_code;
+      // Prefix the custom code of the flex op.
+      this->custom_code = string(::tflite::kFlexCustomCodePrefix) + custom_code;
+      this->is_flex_op = true;
+
+      if (IsControlFlowOp(this->flex_tensorflow_op)) {
+        is_unsupported_flex_op = true;
+      }
+    }
+  }
+}
+
 void LoadTensorsMap(const Model& model, TensorsMap* tensors_map) {
   // First find a list of unique array names.
   std::set<string> names;
@@ -199,7 +230,7 @@ Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
     const Model& model,
     const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
     const details::OperatorsMap& operators_map, FlatBufferBuilder* builder,
-    std::set<string>* error_summary, const ExportParams& params) {
+    std::set<string>* unsupported_ops, const ExportParams& params) {
   // Map from operator name to TF Lite enum value, for all builtins.
   std::map<string, BuiltinOperator> builtin_ops;
   for (int i = BuiltinOperator_MIN; i <= BuiltinOperator_MAX; ++i) {
@@ -240,8 +271,8 @@ Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
       }
       // Either way, this is an operator that is not supported by TF Lite,
       // so we output it as a custom op and add it to the error summary.
-      if (error_summary) {
-        error_summary->insert(name);
+      if (unsupported_ops) {
+        unsupported_ops->insert(name);
       }
       ordered_opcodes[op_index] =
           CreateOperatorCode(*builder, BuiltinOperator_CUSTOM,
@@ -355,9 +386,9 @@ void Export(
   Array empty_array;
   buffers_to_write.push_back(&empty_array);
 
-  std::set<string> error_summary;
+  std::set<string> unsupported_ops;
   auto op_codes = ExportOperatorCodes(model, ops_by_type, operators_map,
-                                      &builder, &error_summary, params);
+                                      &builder, &unsupported_ops, params);
 
   for (const auto& op : model.operators) {
     if (op->type == OperatorType::kFakeQuant) {
@@ -367,30 +398,61 @@ void Export(
                       "for --std_values and --mean_values.";
     }
   }
-  if (!params.allow_custom_ops && !error_summary.empty()) {
-    // Remove ExpandDims and ReorderAxes from unimplemented list unless they
-    // compose the list. Both ops are removed during graph transformations.
-    // However, if an op is unimplemented earlier in the model, the graph
-    // transformation is unable to run because the output shape is not defined.
-    // This causes unnecessary confusion during model conversion time.
-    std::set<string> error_summary_final;
-    for (const auto& op_type : error_summary) {
-      if (op_type != "ReorderAxes" && op_type != "ExpandDims") {
-        error_summary_final.insert(op_type);
+  if (!unsupported_ops.empty()) {
+    if (!params.allow_custom_ops) {
+      // Remove ExpandDims and ReorderAxes from unimplemented list unless they
+      // compose the list. Both ops are removed during graph transformations.
+      // However, if an op is unimplemented earlier in the model, the graph
+      // transformation is unable to run because the output shape is not
+      // defined. This causes unnecessary confusion during model conversion
+      // time.
+      std::set<string> unsupported_ops_final;
+      for (const auto& op_type : unsupported_ops) {
+        if (op_type != "ReorderAxes" && op_type != "ExpandDims") {
+          unsupported_ops_final.insert(op_type);
+        }
+      }
+      if (unsupported_ops_final.empty()) {
+        unsupported_ops_final = unsupported_ops;
+      }
+
+      LOG(QFATAL)
+          << "Some of the operators in the model are not supported by "
+             "the standard TensorFlow Lite runtime. If you have a custom "
+             "implementation for them you can disable this error with "
+             "--allow_custom_ops, or by setting allow_custom_ops=True "
+             "when calling tf.contrib.lite.TFLiteConverter(). Here is a list "
+             "of operators for which  you will need custom implementations: "
+          << absl::StrJoin(unsupported_ops_final, ", ") << ".";
+    }
+
+    std::set<string> unsupported_control_flow_ops;
+    // Check if unsupported ops contains control flow ops. It's impossible
+    // to implement these ops as custom ops at the moment.
+    for (const auto& op : unsupported_ops) {
+      if (IsControlFlowOp(op)) {
+        unsupported_control_flow_ops.insert(op);
       }
     }
-    if (error_summary_final.empty()) {
-      error_summary_final = error_summary;
+    if (!unsupported_control_flow_ops.empty()) {
+      LOG(QFATAL)
+          << "TensorFlow Lite currently doesn't support control flow ops: "
+          << absl::StrJoin(unsupported_control_flow_ops, ", ") << ".";
     }
+  }
+
+  std::set<string> unsupported_flex_ops;
+  for (const auto& it : operators_map) {
+    const details::OperatorKey& key = it.first;
+    if (key.is_unsupported_flex_op) {
+      unsupported_flex_ops.insert(key.custom_code);
+    }
+  }
 
-    LOG(QFATAL)
-        << "Some of the operators in the model are not supported by "
-           "the standard TensorFlow Lite runtime. If you have a custom "
-           "implementation for them you can disable this error with "
-           "--allow_custom_ops, or by setting allow_custom_ops=True "
-           "when calling tf.contrib.lite.TFLiteConverter(). Here is a list "
-           "of operators for which  you will need custom implementations: "
-        << absl::StrJoin(error_summary_final, ", ") << ".";
+  if (!unsupported_flex_ops.empty()) {
+    LOG(QFATAL) << "Some of the operators in the model are not supported by "
+                   "TensorFlow Flex runtime: "
+                << absl::StrJoin(unsupported_flex_ops, ", ") << ".";
   }
 
   std::set<int32_t> variable_tensor_indices;
diff --git a/tensorflow/contrib/lite/toco/tflite/export.h b/tensorflow/contrib/lite/toco/tflite/export.h
index 29d6de4049..9efb282c6c 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.h
+++ b/tensorflow/contrib/lite/toco/tflite/export.h
@@ -81,11 +81,21 @@ using TensorsMap = std::unordered_map<string, int>;
 // Only when `type` is `kUnsupported`, `custom_code` is filled to
 // identify which operation is used.
 struct OperatorKey {
-  OperatorKey(OperatorType type, const std::string& custom_code, int version)
-      : type(type), custom_code(custom_code), version(version) {}
-  const OperatorType type;
-  const std::string custom_code;
-  const int version;
+  OperatorKey(OperatorType type, const std::string& custom_code, int version,
+              bool allow_flex_ops = false);
+
+  // Only `type`, `custom_code` and `version` is used to compute hash and
+  // identity.
+  OperatorType type;
+  std::string custom_code;
+  int version;
+
+  // THe fields below are not used to compute hash and identity.
+  bool is_flex_op = false;
+  bool is_unsupported_flex_op = false;
+  // The original TensorFlow op name for the flex op. Filled only when
+  // `is_flex_op` is true.
+  std::string flex_tensorflow_op;
 
   bool operator<(const OperatorKey& other) const {
     if (type < other.type) return true;
diff --git a/tensorflow/contrib/lite/toco/tflite/export_test.cc b/tensorflow/contrib/lite/toco/tflite/export_test.cc
index 93882a91a7..a71a64d56f 100644
--- a/tensorflow/contrib/lite/toco/tflite/export_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export_test.cc
@@ -313,6 +313,46 @@ TEST_F(VersionedOpExportTest, Export) {
   EXPECT_EQ(1, (*operators)[1]->opcode_index());
 }
 
+TEST(OperatorKeyTest, TestBuiltinOp) {
+  details::OperatorKey key(OperatorType::kConv, "", 2);
+  EXPECT_EQ(key.type, OperatorType::kConv);
+  EXPECT_EQ(key.custom_code, "");
+  EXPECT_EQ(key.version, 2);
+}
+
+TEST(OperatorKeyTest, TestFlexOp) {
+  {
+    details::OperatorKey key(OperatorType::kUnsupported, "SomeUnsupportedOp", 1,
+                             false);
+    EXPECT_EQ(key.type, OperatorType::kUnsupported);
+    // It shouldn't be converted to Flex op if `allow_flex_op` is false.
+    EXPECT_EQ(key.custom_code, "SomeUnsupportedOp");
+    EXPECT_EQ(key.version, 1);
+    EXPECT_FALSE(key.is_flex_op);
+  }
+
+  {
+    details::OperatorKey key(OperatorType::kUnsupported, "SomeUnsupportedOp", 1,
+                             true);
+    EXPECT_EQ(key.type, OperatorType::kUnsupported);
+    // Verify that the custom op name is prefixed by "Flex" and `is_flex_op`
+    // is true.
+    EXPECT_EQ(key.custom_code, "FlexSomeUnsupportedOp");
+    EXPECT_EQ(key.version, 1);
+    EXPECT_TRUE(key.is_flex_op);
+  }
+}
+
+TEST(OperatorKeyTest, TestFlexWithControlFlowOp) {
+  details::OperatorKey key(OperatorType::kUnsupported, "Merge", 1, true);
+  EXPECT_EQ(key.type, OperatorType::kUnsupported);
+  EXPECT_EQ(key.custom_code, "FlexMerge");
+  EXPECT_EQ(key.version, 1);
+  EXPECT_TRUE(key.is_flex_op);
+  // The control flow ops should be marked as unsupported.
+  EXPECT_TRUE(key.is_unsupported_flex_op);
+}
+
 // TODO(ahentz): tests for tensors, inputs, outputs, opcodes and operators.
 
 }  // namespace
-- 
GitLab


From d6e14a53835eed5eed279c83e475440f8f814f0e Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 3 Oct 2018 17:28:57 -0700
Subject: [PATCH 384/570] Automated rollback of commit
 c1b3b0b9e041d82e80c2cdcc623a387753daf0b4

PiperOrigin-RevId: 215658770
---
 tensorflow/contrib/lite/kernels/BUILD | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index b349a2863c..daaf6714cc 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -337,10 +337,7 @@ tf_cc_test(
     name = "activations_test",
     size = "small",
     srcs = ["activations_test.cc"],
-    tags = [
-        "nomac",
-        "tflite_not_portable_ios",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
-- 
GitLab


From f7edc2d308523fa6c2d233c09e3f2da1c98e3dbc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 18:00:17 -0700
Subject: [PATCH 385/570] PinToHostOptimizer: Refactored code. Update
 blacklist. Added recursive lookback for Identity op. This fixes many
 performance regressions.

PiperOrigin-RevId: 215662393
---
 .../core/grappler/costs/graph_properties.h    |   4 +
 tensorflow/core/grappler/graph_view.cc        |  33 +-
 tensorflow/core/grappler/graph_view.h         |   3 +-
 tensorflow/core/grappler/graph_view_test.cc   |  22 +-
 tensorflow/core/grappler/op_types.cc          | 114 ++++---
 tensorflow/core/grappler/op_types.h           |   2 +
 .../optimizers/pin_to_host_optimizer.cc       | 303 ++++++++++++------
 .../optimizers/pin_to_host_optimizer_test.cc  |  42 +++
 8 files changed, 366 insertions(+), 157 deletions(-)

diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index f716cd72c9..28fd7565cc 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -74,6 +74,10 @@ class GraphProperties {
   // shape information.
   void ClearInputProperties(const string& node_name);
   void ClearOutputProperties(const string& node_name);
+  // Returns true if we have *any* properties.
+  bool has_properties() const {
+    return input_properties_.size() > 0 || output_properties_.size() > 0;
+  }
 
  private:
   // Relaxes shapes <shapes_and_types>, determined from an EnqueueV2 node, into
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
index 0b8cb5e919..de0a63fc4e 100644
--- a/tensorflow/core/grappler/graph_view.cc
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -20,23 +20,25 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-int OpOutputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id) {
-  for (int output_arg_id = 0; output_arg_id < op.output_arg_size();
-       ++output_arg_id) {
+namespace {
+int OpPortIdToArgId(const NodeDef& node,
+                    const protobuf::RepeatedPtrField<OpDef::ArgDef>& args,
+                    int port_id) {
+  for (int arg_id = 0; arg_id < args.size(); ++arg_id) {
     if (port_id < 0) {
       return -1;
     } else if (port_id == 0) {
-      return output_arg_id;
+      return arg_id;
     }
 
-    // Default is 1 port per output arg.
+    // Default is 1 port per arg.
     int n = 1;
 
-    const auto& output_arg = op.output_arg(output_arg_id);
-    if (!output_arg.number_attr().empty()) {
-      n = node.attr().at(output_arg.number_attr()).i();
-    } else if (!output_arg.type_list_attr().empty()) {
-      n = node.attr().at(output_arg.type_list_attr()).list().type_size();
+    const auto& arg = args.Get(arg_id);
+    if (!arg.number_attr().empty()) {
+      n = node.attr().at(arg.number_attr()).i();
+    } else if (!arg.type_list_attr().empty()) {
+      n = node.attr().at(arg.type_list_attr()).list().type_size();
     }
 
     if (n < 0) {
@@ -44,13 +46,22 @@ int OpOutputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id) {
       DCHECK_GE(n, 0);
       return -1;
     } else if (port_id < n) {
-      return output_arg_id;
+      return arg_id;
     }
     port_id -= n;
   }
 
   return -1;
 }
+}  // end namespace
+
+int OpOutputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id) {
+  return OpPortIdToArgId(node, op.output_arg(), port_id);
+}
+
+int OpInputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id) {
+  return OpPortIdToArgId(node, op.input_arg(), port_id);
+}
 
 GraphView::GraphView(GraphDef* graph) : graph_(graph) {
   for (int i = 0; i < graph_->node_size(); i++) {
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index ec946ca3b5..09c36a1368 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -26,7 +26,7 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-// Map a node/op's output port_id to arg_id.
+// Map a node/op's input/output port_id to arg_id.
 //
 // The port_id refers to the n-th tensor of the node, while the arg_id refers to
 // the n-th arg of the op. These two can be different if an op's arg is a list
@@ -34,6 +34,7 @@ namespace grappler {
 //
 // We return -1 for any invalid port_id (i.e., no corresponding arg_id).
 int OpOutputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id);
+int OpInputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id);
 
 // A utility class to simplify the traversal of a GraphDef.
 class GraphView {
diff --git a/tensorflow/core/grappler/graph_view_test.cc b/tensorflow/core/grappler/graph_view_test.cc
index 3d7d2faf7c..f90e2c8cfc 100644
--- a/tensorflow/core/grappler/graph_view_test.cc
+++ b/tensorflow/core/grappler/graph_view_test.cc
@@ -26,7 +26,7 @@ namespace {
 
 class GraphViewTest : public ::testing::Test {};
 
-TEST_F(GraphViewTest, OpOutputPortIdToArgIdShapeN) {
+TEST_F(GraphViewTest, OpPortIdToArgIdShapeN) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
   ops::ShapeN b(s.WithOpName("b"), {a, a, a});
@@ -45,9 +45,16 @@ TEST_F(GraphViewTest, OpOutputPortIdToArgIdShapeN) {
   EXPECT_TRUE(
       OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def).ok());
 
-  EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *a_op_def, 0));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(b_node_def, *a_op_def, 1));
+  // Const has 0 inputs, 1 output.
+  EXPECT_EQ(-1, OpInputPortIdToArgId(a_node_def, *a_op_def, 0));
+  EXPECT_EQ(0, OpOutputPortIdToArgId(a_node_def, *a_op_def, 0));
+  EXPECT_EQ(-1, OpOutputPortIdToArgId(a_node_def, *a_op_def, 1));
 
+  // ShapeN has N=3 inputs and outputs.
+  EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 0));
+  EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 1));
+  EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 2));
+  EXPECT_EQ(-1, OpInputPortIdToArgId(b_node_def, *b_op_def, 3));
   EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *b_op_def, 0));
   EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *b_op_def, 1));
   EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *b_op_def, 2));
@@ -55,7 +62,7 @@ TEST_F(GraphViewTest, OpOutputPortIdToArgIdShapeN) {
   EXPECT_EQ(-1, OpOutputPortIdToArgId(b_node_def, *b_op_def, 4));
 }
 
-TEST_F(GraphViewTest, OpOutputPortIdToArgIdSparseSplit) {
+TEST_F(GraphViewTest, OpPortIdToArgIdSparseSplit) {
   for (int num_splits : {1, 2}) {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();
     Output a = ops::Const<int64>(s.WithOpName("a"), 1, {10, 10});
@@ -70,6 +77,13 @@ TEST_F(GraphViewTest, OpOutputPortIdToArgIdSparseSplit) {
     EXPECT_TRUE(
         OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def).ok());
 
+    // We have 4 inputs.
+    EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 0));
+    EXPECT_EQ(1, OpInputPortIdToArgId(b_node_def, *b_op_def, 1));
+    EXPECT_EQ(2, OpInputPortIdToArgId(b_node_def, *b_op_def, 2));
+    EXPECT_EQ(3, OpInputPortIdToArgId(b_node_def, *b_op_def, 3));
+    EXPECT_EQ(-1, OpInputPortIdToArgId(b_node_def, *b_op_def, 4));
+
     for (int port_id = 0; port_id <= num_splits * 3; ++port_id) {
       int arg_id = -1;
       if (port_id < num_splits * 3) {
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 9f0d9dbf28..1b5a215987 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -13,14 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <unordered_set>
-
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -102,6 +101,18 @@ bool IsConjugateTranspose(const NodeDef& node) {
   return node.op() == "ConjugateTranspose";
 }
 
+bool IsControlFlow(const NodeDef& node) {
+  // clang-format off
+  return node.op() == "ControlTrigger" ||
+         node.op() == "Enter" ||
+         node.op() == "Exit" ||
+         node.op() == "LoopCond" ||
+         node.op() == "Merge" ||
+         node.op() == "NextIteration" ||
+         node.op() == "Switch";
+  // clang-format on
+}
+
 bool IsConv2D(const NodeDef& node) { return node.op() == "Conv2D"; }
 
 bool IsConv2DBackpropFilter(const NodeDef& node) {
@@ -140,26 +151,26 @@ bool IsDiv(const NodeDef& node) { return node.op() == "Div"; }
 // e.g. sqrt, exp. *is_non_decreasing is false, the function is non-increasing,
 // e.g. inv.
 bool IsElementWiseMonotonic(const NodeDef& node, bool* is_non_decreasing) {
-  static const std::unordered_set<string>* monotonic_non_decreasing_ops =
-      CHECK_NOTNULL((new std::unordered_set<string>{
+  static const gtl::FlatSet<string>* const kMonotonicNonDecreasingOps =
+      CHECK_NOTNULL((new gtl::FlatSet<string>{
           "Asinh", "Atanh",   "Ceil",  "Elu",  "Erf",  "Exp",   "Expm1",
           "Floor", "Log",     "Log1p", "Relu", "Relu", "Relu6", "Rint",
           "Selu",  "Sigmoid", "Sign",  "Sinh", "Sqrt", "Tanh",
       }));
-  static const std::unordered_set<string>* monotonic_non_increasing_ops =
-      CHECK_NOTNULL((new std::unordered_set<string>{
+  static const gtl::FlatSet<string>* const kMonotonicNonIncreasingOps =
+      CHECK_NOTNULL((new gtl::FlatSet<string>{
           "Inv",
           "Reciprocal",
           "Erfc",
           "Rsqrt",
           "Neg",
       }));
-  if (monotonic_non_decreasing_ops->count(node.op()) > 0) {
+  if (kMonotonicNonDecreasingOps->count(node.op()) > 0) {
     if (is_non_decreasing) {
       *is_non_decreasing = true;
     }
     return true;
-  } else if (monotonic_non_increasing_ops->count(node.op()) > 0) {
+  } else if (kMonotonicNonIncreasingOps->count(node.op()) > 0) {
     if (is_non_decreasing) {
       *is_non_decreasing = false;
     }
@@ -431,6 +442,38 @@ bool IsSymbolicGradient(const NodeDef& node) {
 
 bool IsTanhGrad(const NodeDef& node) { return node.op() == "TanhGrad"; }
 
+bool IsTensorArray(const NodeDef& node) {
+  static const gtl::FlatSet<string>* const kTensorArrayOps =
+      CHECK_NOTNULL((new gtl::FlatSet<string>{
+          "TensorArray",
+          "TensorArrayV2",
+          "TensorArrayV3",
+          "TensorArrayGrad",
+          "TensorArrayGradV2",
+          "TensorArrayGradV3",
+          "TensorArrayGradWithShape",
+          "TensorArrayWrite",
+          "TensorArrayWriteV2",
+          "TensorArrayWriteV3",
+          "TensorArrayRead",
+          "TensorArrayReadV2",
+          "TensorArrayReadV3",
+          "TensorArrayConcat",
+          "TensorArrayConcatV2",
+          "TensorArrayConcatV3",
+          "TensorArraySplit",
+          "TensorArraySplitV2",
+          "TensorArraySplitV3",
+          "TensorArraySize",
+          "TensorArraySizeV2",
+          "TensorArraySizeV3",
+          "TensorArrayClose",
+          "TensorArrayCloseV2",
+          "TensorArrayCloseV3",
+      }));
+  return kTensorArrayOps->count(node.op()) > 0;
+}
+
 bool IsTile(const NodeDef& node) { return node.op() == "Tile"; }
 
 bool IsTranspose(const NodeDef& node) { return node.op() == "Transpose"; }
@@ -542,30 +585,29 @@ OPDEF_PROPERTY_HELPER(Aggregate, aggregate)
 OPDEF_PROPERTY_HELPER(Commutative, commutative)
 
 bool IsInvolution(const NodeDef& node) {
-  static const std::unordered_set<string>* involution_ops =
-      CHECK_NOTNULL((new std::unordered_set<string>{
-          "Conj", "Reciprocal", "Invert", "Neg", "LogicalNot"}));
-  return involution_ops->count(node.op()) > 0;
+  static const gtl::FlatSet<string>* const kInvolutionOps =
+      CHECK_NOTNULL((new gtl::FlatSet<string>{"Conj", "Reciprocal", "Invert",
+                                              "Neg", "LogicalNot"}));
+  return kInvolutionOps->count(node.op()) > 0;
 }
 
 bool IsValueAndOrderAndShapePreserving(const NodeDef& node) {
   if (NumNonControlInputs(node) == 1 && IsAggregate(node)) {
     return true;
   }
-  static const std::unordered_set<string>*
-      value_and_order_and_shape_preserving_ops =
-          CHECK_NOTNULL((new const std::unordered_set<string>{
-              "CheckNumerics",
-              "DebugGradientIdentity",
-              "DeepCopy"
-              "Enter",
-              "Exit",
-              "PreventGradient",
-              "Print",
-              "Snapshot",
-              "StopGradient",
-          }));
-  return value_and_order_and_shape_preserving_ops->count(node.op()) > 0 ||
+  static const gtl::FlatSet<string>* const kValueAndOrderAndShapePreservingOps =
+      CHECK_NOTNULL((new const gtl::FlatSet<string>{
+          "CheckNumerics",
+          "DebugGradientIdentity",
+          "DeepCopy"
+          "Enter",
+          "Exit",
+          "PreventGradient",
+          "Print",
+          "Snapshot",
+          "StopGradient",
+      }));
+  return kValueAndOrderAndShapePreservingOps->count(node.op()) > 0 ||
          IsIdentity(node);
 }
 
@@ -573,31 +615,31 @@ bool IsValueAndOrderPreserving(const NodeDef& node) {
   if (NumNonControlInputs(node) == 1 && IsAggregate(node)) {
     return true;
   }
-  static const std::unordered_set<string>* value_and_order_preserving_ops =
-      CHECK_NOTNULL((new const std::unordered_set<string>{
+  static const gtl::FlatSet<string>* const kValueAndOrderPreservingOps =
+      CHECK_NOTNULL((new const gtl::FlatSet<string>{
           "ExpandDims",
           "Reshape",
           "Squeeze",
       }));
-  return value_and_order_preserving_ops->count(node.op()) > 0 ||
+  return kValueAndOrderPreservingOps->count(node.op()) > 0 ||
          IsValueAndOrderAndShapePreserving(node);
 }
 
 bool IsValuePreserving(const NodeDef& node) {
-  static const std::unordered_set<string>* value_preserving_ops =
-      CHECK_NOTNULL((new std::unordered_set<string>{
+  static const gtl::FlatSet<string>* const kValuePreservingOps =
+      CHECK_NOTNULL((new gtl::FlatSet<string>{
           "InvertPermutation",
           "Reverse",
           "Roll",
           "Transpose",
       }));
   return IsValueAndOrderPreserving(node) ||
-         value_preserving_ops->count(node.op()) > 0;
+         kValuePreservingOps->count(node.op()) > 0;
 }
 
 bool IsUnaryElementWise(const NodeDef& node) {
-  static const std::unordered_set<string>* element_wise_ops =
-      CHECK_NOTNULL((new std::unordered_set<string>{
+  static const gtl::FlatSet<string>* const kElementWiseOps =
+      CHECK_NOTNULL((new gtl::FlatSet<string>{
           "Abs",
           "Acos",
           "Acosh",
@@ -646,7 +688,7 @@ bool IsUnaryElementWise(const NodeDef& node) {
           "Tan"
           "Tanh",
       }));
-  return element_wise_ops->count(node.op()) > 0 ||
+  return kElementWiseOps->count(node.op()) > 0 ||
          IsValueAndOrderAndShapePreserving(node);
 }
 
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 7f86a5f295..d4e0159e81 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -46,6 +46,7 @@ bool IsConjugateTranspose(const NodeDef& node);
 bool IsConcat(const NodeDef& node);
 bool IsConcatOffset(const NodeDef& node);
 bool IsConstant(const NodeDef& node);
+bool IsControlFlow(const NodeDef& node);
 bool IsConv2D(const NodeDef& node);
 bool IsConv2DBackpropFilter(const NodeDef& node);
 bool IsConv2DBackpropInput(const NodeDef& node);
@@ -151,6 +152,7 @@ bool IsSum(const NodeDef& node);
 bool IsSwitch(const NodeDef& node);
 bool IsSymbolicGradient(const NodeDef& node);
 bool IsTanhGrad(const NodeDef& node);
+bool IsTensorArray(const NodeDef& node);
 bool IsTile(const NodeDef& node);
 bool IsTranspose(const NodeDef& node);
 bool IsTruncateDiv(const NodeDef& node);
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
index 89eb76046e..8ed4271fa4 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
@@ -35,13 +35,44 @@ namespace internal {
 // dynamically determined.
 constexpr int64 kTensorMaxSize = 64;
 
-// Find KernelDef for `node`.
-Status TryFindKernelDef(const NodeDef& node, const KernelDef** kdef) {
-  // Try find KernelDef for node.device, else GPU or CPU.
-  for (const DeviceType& device :
-       {node.device().c_str(), DEVICE_GPU, DEVICE_CPU}) {
-    Status s = FindKernelDef(device, node, kdef, nullptr);
+// All the nodes that should be blacklisted and not swapped.
+bool IsBlacklisted(const NodeDef& node) {
+  return
+      // Collective ops should not be swapped.
+      IsCollective(node) ||
+      // ControlFlow ops should not be swapped.
+      IsControlFlow(node) ||
+      // NoOp ops should not be swapped (due to group dependencies).
+      IsNoOp(node);
+}
+
+// Check if Tensor is integer and small size.
+bool IsTensorIntegerAndSmall(const OpInfo::TensorProperties& prop) {
+  // Check type to be int32 or int64.
+  if (prop.dtype() != DataType::DT_INT32 &&
+      prop.dtype() != DataType::DT_INT64) {
+    return false;
+  }
+
+  // Check size known and small.
+  const int64 size = NumCoefficients(prop.shape());
+  if (size < 0 || size > kTensorMaxSize) {
+    return false;
+  }
+
+  return true;
+}
+
+// Find KernelDef for `node`, greedily return first found from `devices`.
+Status TryFindKernelDef(const std::vector<DeviceType>& devices,
+                        const NodeDef& node, const KernelDef** kdef) {
+  for (const DeviceType& device : devices) {
+    const KernelDef* kernel = nullptr;
+    Status s = FindKernelDef(device, node, &kernel, nullptr);
     if (s.ok()) {
+      if (kdef) {
+        *kdef = kernel;
+      }
       return Status::OK();
     }
   }
@@ -49,88 +80,183 @@ Status TryFindKernelDef(const NodeDef& node, const KernelDef** kdef) {
   return errors::NotFound("Could not find KernelDef for op: ", node.op());
 }
 
-// Check if all node's inputs are pinned to CPU memory.
-bool AreAllNodeInputsPinnedToHost(const GraphView& graph, const NodeDef& node) {
-  // Loop through all the inputs excluding the controlling nodes.
-  for (const GraphView::OutputPort& fanin : graph.GetFanins(node, false)) {
-    // Check if (the fanin) op's device is on CPU.
-    if (str_util::StrContains(fanin.node->device(), DEVICE_CPU)) {
-      continue;
-    }
-
-    // Check if (the fanin) op's output port is pinned to HostMemory.
-    const OpDef* fanin_odef = nullptr;
-    Status s = OpRegistry::Global()->LookUpOpDef(fanin.node->op(), &fanin_odef);
-    if (!s.ok()) {
-      LOG(INFO) << "Could not find OpDef for : " << fanin.node->op();
-      return false;
-    }
+// Checks if a node's output port is host friendly.
+// Roughly this means checking if the output port is on Host memory.
+Status IsNodeOutputPortHostFriendly(const GraphView& graph,
+                                    GraphProperties* properties,
+                                    const NodeDef& node, int port_id,
+                                    bool* is_candidate) {
+  *is_candidate = false;
 
-    const int output_arg_id =
-        OpOutputPortIdToArgId(*fanin.node, *fanin_odef, fanin.port_id);
-    if (output_arg_id < 0) {
-      LOG(WARNING) << "Invalid port: " << fanin.port_id << "!\n"
-                   << node.DebugString() << "\n"
-                   << fanin.node->DebugString() << "\n"
-                   << fanin_odef->DebugString();
-      return false;
-    }
+  // Make sure we are not a blacklisted op.
+  if (IsBlacklisted(node)) {
+    return Status::OK();
+  }
 
-    const KernelDef* fanin_kdef = nullptr;
-    s = TryFindKernelDef(*fanin.node, &fanin_kdef);
-    if (!s.ok()) {
-      LOG(INFO) << "Could not find KernelDef for : " << fanin.node->op();
-      return false;
-    }
+  // Check to make sure we have the right properties (i.e., statically shaped).
+  if (!properties->has_properties()) {
+    // This is an expensive call, call it lazily.
+    TF_RETURN_IF_ERROR(properties->InferStatically(
+        /*assume_valid_feeds=*/false));
+  }
+  const auto& output_properties = properties->GetOutputProperties(node.name());
+  if (port_id >= output_properties.size()) {
+    LOG(WARNING) << "port_id=" << port_id
+                 << " but output_properties.size()=" << output_properties.size()
+                 << "\n"
+                 << node.DebugString();
+    return Status::OK();
+  }
+  if (!IsTensorIntegerAndSmall(output_properties[port_id])) {
+    return Status::OK();
+  }
 
-    bool fanin_pinned = false;
-    for (const string& host_memory_arg : fanin_kdef->host_memory_arg()) {
-      if (fanin_odef->output_arg(output_arg_id).name() == host_memory_arg) {
-        fanin_pinned = true;
-        break;
+  // These nodes may be optimized away downstream (even if pinned to Host), we
+  // should (recusively) check their source.
+  if (IsIdentity(node)) {
+    for (const auto& fanin : graph.GetFanins(node, false)) {
+      bool fanin_candidate = false;
+      TF_RETURN_IF_ERROR(IsNodeOutputPortHostFriendly(
+          graph, properties, *fanin.node, fanin.port_id, &fanin_candidate));
+      if (!fanin_candidate) {
+        return Status::OK();
       }
     }
+    *is_candidate = true;
+    return Status::OK();
+  }
 
-    if (!fanin_pinned) {
-      return false;
+  // Check if op's device is on CPU.
+  if (str_util::StrContains(node.device(), DEVICE_CPU)) {
+    *is_candidate = true;
+    return Status::OK();
+  }
+
+  // Check if op's output port is pinned to HostMemory.
+  const OpDef* op = nullptr;
+  Status s = OpRegistry::Global()->LookUpOpDef(node.op(), &op);
+  if (!s.ok()) {
+    LOG(WARNING) << "Could not find OpDef for : " << node.op();
+    return Status::OK();
+  }
+
+  // Map the port_id to output_arg_id.
+  const int output_arg_id = OpOutputPortIdToArgId(node, *op, port_id);
+  if (output_arg_id < 0) {
+    LOG(WARNING) << "Invalid port: " << port_id << "!\n"
+                 << node.DebugString() << "\n"
+                 << op->DebugString();
+    return Status::OK();
+  }
+
+  // Find the kernel.
+  const KernelDef* kernel = nullptr;
+  s = TryFindKernelDef({node.device().c_str(), DEVICE_GPU, DEVICE_CPU}, node,
+                       &kernel);
+  if (!s.ok()) {
+    LOG(INFO) << "Could not find KernelDef for: " << node.op();
+    return Status::OK();
+  }
+
+  // Check if the output_arg is pinned to Host.
+  for (const string& host_memory_arg : kernel->host_memory_arg()) {
+    if (op->output_arg(output_arg_id).name() == host_memory_arg) {
+      *is_candidate = true;
+      break;
     }
   }
 
-  return true;
+  return Status::OK();
 }
 
-bool IsTensorIntegerAndSmall(const OpInfo::TensorProperties& prop) {
-  // Check if Tensor is integer and small size.
+// Checks if a node's input port is Host friendly.
+// Roughly this means checking if the input port is on Host memory.
+bool IsNodeInputPortHostFriendly(const NodeDef& node, int port_id) {
+  // If node is on Host, assume its inputs are Host friendly.
+  if (str_util::StrContains(node.device(), DEVICE_CPU)) {
+    return true;
+  }
 
-  // Check type to be int32 or int64.
-  if (prop.dtype() != DataType::DT_INT32 &&
-      prop.dtype() != DataType::DT_INT64) {
+  // Check if op's input port is pinned to HostMemory.
+  const OpDef* op = nullptr;
+  Status s = OpRegistry::Global()->LookUpOpDef(node.op(), &op);
+  if (!s.ok()) {
+    LOG(WARNING) << "Could not find OpDef for : " << node.op();
     return false;
   }
-
-  // Check size known and small.
-  const int64 size = NumCoefficients(prop.shape());
-  if (size < 0 || size > kTensorMaxSize) {
+  const int input_arg_id = OpInputPortIdToArgId(node, *op, port_id);
+
+  // Find the kernel.
+  const KernelDef* kernel = nullptr;
+  s = internal::TryFindKernelDef(
+      {node.device().c_str(), DEVICE_GPU, DEVICE_CPU}, node, &kernel);
+  if (!s.ok()) {
+    LOG(INFO) << "Could not find KernelDef for: " << node.op();
     return false;
   }
 
-  return true;
+  // Check if the input_arg is pinned to Host.
+  for (const string& host_memory_arg : kernel->host_memory_arg()) {
+    if (op->input_arg(input_arg_id).name() == host_memory_arg) {
+      return true;
+    }
+  }
+
+  return false;
 }
 
-bool AreAllNodeInputsAndOutputsIntsAndSmall(const GraphProperties& properties,
-                                            const NodeDef& node) {
-  for (const auto& prop : properties.GetInputProperties(node.name())) {
-    if (!IsTensorIntegerAndSmall(prop)) {
-      return false;
+// Checks if a node is a candidate to pin to Host.
+// The rough algorithm is as follows:
+// 1] Check if node is blacklisted.
+// 2] Check if node can run on Host.
+// 3] Check all input/outputs are Host "friendly" (atm, friendly means small,
+//    ints, and pinned to Host).
+Status IsNodeHostCandidate(const GraphView& graph, GraphProperties* properties,
+                           const NodeDef& node, bool* is_candidate) {
+  *is_candidate = false;
+
+  // Check if node already on CPU.
+  if (str_util::StrContains(node.device(), DEVICE_CPU)) {
+    *is_candidate = true;
+    return Status::OK();
+  }
+
+  // Skip these node types.
+  if (IsBlacklisted(node)) {
+    return Status::OK();
+  }
+
+  // Check the node can be run on CPU.
+  Status s = TryFindKernelDef({DEVICE_CPU}, node, nullptr);
+  if (!s.ok()) {
+    return Status::OK();
+  }
+
+  // Check all inputs are Host friendly.
+  for (const GraphView::OutputPort& fanin :
+       graph.GetFanins(node, /*include_controlling_nodes=*/false)) {
+    bool fanin_candidate = false;
+    TF_RETURN_IF_ERROR(IsNodeOutputPortHostFriendly(
+        graph, properties, *fanin.node, fanin.port_id, &fanin_candidate));
+    if (!fanin_candidate) {
+      return Status::OK();
     }
   }
 
-  for (const auto& prop : properties.GetOutputProperties(node.name())) {
+  // Check all outputs are Host friendly.
+  if (!properties->has_properties()) {
+    // This is an expensive call, call it lazily.
+    TF_RETURN_IF_ERROR(properties->InferStatically(
+        /*assume_valid_feeds=*/false));
+  }
+  for (const auto& prop : properties->GetOutputProperties(node.name())) {
     if (!IsTensorIntegerAndSmall(prop)) {
-      return false;
+      return Status::OK();
     }
   }
-  return true;
+
+  *is_candidate = true;
+  return Status::OK();
 }
 
 string TryFindHostDevice(const gtl::FlatSet<string>& devices,
@@ -167,15 +293,6 @@ bool IsTPUGraphDef(const GraphDef& def) {
   }
   return false;
 }
-
-// All the nodes that should be blacklisted and not swapped.
-bool IsBlacklisted(const NodeDef& node) {
-  return
-      // Collective ops should not be swapped.
-      IsCollective(node) ||
-      // NoOp breaks perf regression tests (probably due to group dependencies).
-      IsNoOp(node);
-}
 }  // end namespace internal
 
 Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
@@ -188,7 +305,6 @@ Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   }
 
   GraphProperties properties(item);
-  bool has_properties = false;
   GraphView graph(optimized_graph);
 
   gtl::FlatSet<string> devices;
@@ -209,35 +325,10 @@ Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   std::vector<std::pair<NodeDef*, string>> const_nodes;
 
   for (auto& node : *optimized_graph->mutable_node()) {
-    // Check if node already on CPU.
-    if (str_util::StrContains(node.device(), DEVICE_CPU)) {
-      continue;
-    }
-
-    // Skip these node types.
-    if (internal::IsBlacklisted(node)) {
-      continue;
-    }
-
-    // Check the node can be run on CPU.
-    Status s = FindKernelDef(DEVICE_CPU, node, nullptr, nullptr);
-    if (!s.ok()) {
-      continue;
-    }
-
-    // Check all input's are pinned to CPU.
-    if (!internal::AreAllNodeInputsPinnedToHost(graph, node)) {
-      continue;
-    }
-
-    if (!has_properties) {
-      // This is an expensive call, call it lazily.
-      TF_RETURN_IF_ERROR(properties.InferStatically(false));
-      has_properties = true;
-    }
-
-    // Check all inputs and outputs are integers and small.
-    if (!internal::AreAllNodeInputsAndOutputsIntsAndSmall(properties, node)) {
+    bool is_candidate = false;
+    TF_RETURN_IF_ERROR(
+        internal::IsNodeHostCandidate(graph, &properties, node, &is_candidate));
+    if (!is_candidate) {
       continue;
     }
 
@@ -254,10 +345,12 @@ Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     NodeDef* node = it.first;
     const string& device = it.second;
 
-    // Check all the consumers of this node, if any of them are on the original
-    // device, swap this node back onto the original device.
+    // Check all the consumers of this node, if any of them are not on CPU, swap
+    // this node back onto the original device.
     for (const GraphView::InputPort& fanout : graph.GetFanouts(*node, false)) {
-      if (fanout.node->device() == device) {
+      // The consumer is not Host friendly, swap it back to the original device.
+      if (!internal::IsNodeInputPortHostFriendly(*fanout.node,
+                                                 fanout.port_id)) {
         node->set_device(device);
         break;
       }
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
index 173cb3fe3c..7c64529441 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
@@ -160,6 +160,48 @@ TEST_F(PinToHostOptimizerTest, NoSwap) {
   EXPECT_EQ(found, 3);
 }
 
+TEST_F(PinToHostOptimizerTest, Identity) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  // `a,c` is on GPU, `e` is on CPU, consequently `e` should not be swapped.
+  // `b` should be placed onto Host since `c` pins the input to Host memory.
+  Output a =
+      ops::Const(s.WithOpName("a").WithDevice("/device:GPU:0"), 1, {64, 64});
+  Output b = ops::Const(s.WithOpName("b"), {0, 1}, {2});
+  Output c =
+      ops::ReduceProd(s.WithOpName("c").WithDevice("/device:GPU:0"), a, b);
+  Output d = ops::Identity(s.WithDevice("/device:CPU:0").WithOpName("d"), c);
+  Output e = ops::Multiply(s.WithOpName("e"), d, d);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  PinToHostOptimizer optimizer(RewriterConfig::ON);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "a" || node.name() == "c") {
+      EXPECT_EQ(node.device(), "/device:GPU:0");
+    } else if (node.name() == "b") {
+      // If CUDA, then there is a GPU kernel registration that is pinned to Host
+      // memory. Consequently, `b` will be mapped to Host correct if there is
+      // a GPU kernel registered.
+#if GOOGLE_CUDA
+      EXPECT_EQ(node.device(), "/device:CPU:0");
+#else
+      EXPECT_TRUE(node.device().empty());
+#endif
+    } else if (node.name() == "d") {
+      EXPECT_EQ(node.device(), "/device:CPU:0");
+    } else if (node.name() == "e") {
+      EXPECT_TRUE(node.device().empty());
+    }
+    ++found;
+  }
+  EXPECT_EQ(found, 5);
+}
+
 TEST_F(PinToHostOptimizerTest, PortIdToArgId) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output a = ops::Const(s.WithOpName("a"), 1, {1, 2, 3});
-- 
GitLab


From 18f589350f0cb244e2373480048d17cbacd241e1 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Wed, 3 Oct 2018 18:05:22 -0700
Subject: [PATCH 386/570] [XLA] Add a size limit to the constant folder to
 avoid forming giant constants during compilation.

PiperOrigin-RevId: 215663002
---
 .../xla/service/hlo_constant_folding.cc       | 17 ++++++++++++++++
 .../xla/service/hlo_constant_folding_test.cc  | 20 +++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index f837816cea..538816a353 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -76,6 +76,22 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
         continue;
       }
 
+      // Don't constant fold unless it's a net positive or the output is small.
+      int64 elements_in_removed_operands = 0;
+      for (HloInstruction* operand : instruction->operands()) {
+        if (operand->user_count() == 1) {
+          elements_in_removed_operands +=
+              ShapeUtil::ElementsIn(operand->shape());
+        }
+      }
+      int64 elements_in_constant = ShapeUtil::ElementsIn(instruction->shape());
+
+      static const int64 kMaximumConstantSizeElements = 2 * 1000 * 1000;
+      if (elements_in_constant > elements_in_removed_operands &&
+          elements_in_constant > kMaximumConstantSizeElements) {
+        continue;
+      }
+
       Literal result;
       // Currently we skip unimplemented operations.
       // TODO(b/35975797): Fold constant computations for more operations.
@@ -84,6 +100,7 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
                 << instruction->ToString();
         continue;
       }
+      VLOG(4) << "Constant folded: " << instruction->ToString();
 
       TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
           instruction, HloInstruction::CreateConstant(std::move(result))));
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index 3e0def5d26..e45f905f71 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -242,5 +242,25 @@ TEST_F(HloConstantFoldingTest, ConstantFoldReduceNoLayout) {
   EXPECT_THAT(module().entry_computation()->root_instruction(), op::Reduce());
 }
 
+const char* const kConstantFoldLargePad = R"(
+  HloModule ConstantFoldLargePad
+
+  ENTRY r {
+    a = f32[1,1,1] constant(f32[1,1,1]{{{7}}})
+    b = f32[] constant(42)
+    ROOT pad = f32[2048,2048,128] pad(a, b), padding=1024_1023x1024_1023x64_63
+  })";
+
+TEST_F(HloConstantFoldingTest, DoesNotFoldLargePad) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kConstantFoldLargePad));
+  HloConstantFolding const_folder;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
+  EXPECT_FALSE(result);
+
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Pad(op::Constant(), op::Constant()));
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 54bebc286bbe7d6a866a3bdbcefd8af55adbe39a Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Wed, 3 Oct 2018 18:26:28 -0700
Subject: [PATCH 387/570] Fix a test. - SetCustomOp also sets the name of the
 custom op. Test was checking against the wrong name in the profile.

PiperOrigin-RevId: 215665359
---
 .../contrib/lite/profiling/profile_summarizer_test.cc       | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/profiling/profile_summarizer_test.cc b/tensorflow/contrib/lite/profiling/profile_summarizer_test.cc
index 67a5eecfa0..465c294962 100644
--- a/tensorflow/contrib/lite/profiling/profile_summarizer_test.cc
+++ b/tensorflow/contrib/lite/profiling/profile_summarizer_test.cc
@@ -31,6 +31,8 @@ namespace profiling {
 
 namespace {
 
+const char* kOpName = "SimpleOpEval";
+
 #ifdef TFLITE_PROFILING_ENABLED
 TfLiteStatus SimpleOpEval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input1 = tflite::GetInput(context, node, /*index=*/0);
@@ -63,7 +65,7 @@ TfLiteRegistration* RegisterSimpleOpWithProfilingDetails() {
                                             SimpleOpEval,
                                             SimpleOpProfilingString,
                                             tflite::BuiltinOperator_CUSTOM,
-                                            "SimpleOpEval",
+                                            kOpName,
                                             1};
   return &registration;
 }
@@ -89,7 +91,7 @@ void SimpleOpModel::Init(
   inputs_[0] = AddInput({TensorType_INT32, {1}});
   inputs_[1] = AddInput({TensorType_INT32, {1}});
   output_ = AddOutput({TensorType_INT32, {}});
-  SetCustomOp("SimpleAdd", {}, registration);
+  SetCustomOp(kOpName, {}, registration);
   BuildInterpreter({GetShape(inputs_[0]), GetShape(inputs_[1])});
 }
 
-- 
GitLab


From 9bd6f5ed55e533ccac055a5bc7fbb771e2d432c5 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Wed, 3 Oct 2018 18:56:00 -0700
Subject: [PATCH 388/570] [TF:XLA] Use xla::Iota rather than expanding Range
 ops to constants.

PiperOrigin-RevId: 215668016
---
 .../compiler/tf2xla/kernels/sequence_ops.cc   | 39 +++++++++----------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
index 25a5bcbe1d..0c32b8def0 100644
--- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
@@ -18,7 +18,9 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -55,10 +57,10 @@ Status GetIntValue(int index, XlaOpKernelContext* ctx, int64* value) {
 
 // The type-specific part of the implementation of Range.
 template <typename T>
-Status CreateRangeTensor(const xla::LiteralSlice& start_literal,
-                         const xla::LiteralSlice& limit_literal,
-                         const xla::LiteralSlice& delta_literal,
-                         Tensor* output) {
+xla::StatusOr<xla::XlaOp> CreateRangeTensor(
+    const xla::LiteralSlice& start_literal,
+    const xla::LiteralSlice& limit_literal,
+    const xla::LiteralSlice& delta_literal, xla::XlaBuilder* builder) {
   T start = start_literal.Get<T>({});
   T limit = limit_literal.Get<T>({});
   T delta = delta_literal.Get<T>({});
@@ -82,14 +84,10 @@ Status CreateRangeTensor(const xla::LiteralSlice& start_literal,
            ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta))
            : std::ceil(std::abs((limit - start) / delta)));
 
-  *output = Tensor(DataTypeToEnum<T>::v(), TensorShape({size}));
-  auto flat = output->flat<T>();
-  T val = start;
-  for (int64 i = 0; i < size; ++i) {
-    flat(i) = val;
-    val += delta;
-  }
-  return Status::OK();
+  return xla::ConstantR0(builder, start) +
+         xla::ConstantR0(builder, delta) *
+             xla::Iota(builder, xla::primitive_util::NativeToPrimitiveType<T>(),
+                       size);
 }
 
 class RangeOp : public XlaOpKernel {
@@ -115,27 +113,26 @@ class RangeOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->ConstantInput(2, &delta));
 
     DataType type = input_type(0);
-    Tensor output;
-    Status status;
+    xla::StatusOr<xla::XlaOp> output;
     switch (type) {
       case DT_INT32:
-        status = CreateRangeTensor<int32>(start, limit, delta, &output);
+        output = CreateRangeTensor<int32>(start, limit, delta, ctx->builder());
         break;
       case DT_INT64:
-        status = CreateRangeTensor<int64>(start, limit, delta, &output);
+        output = CreateRangeTensor<int64>(start, limit, delta, ctx->builder());
         break;
       case DT_FLOAT:
-        status = CreateRangeTensor<float>(start, limit, delta, &output);
+        output = CreateRangeTensor<float>(start, limit, delta, ctx->builder());
         break;
       case DT_DOUBLE:
-        status = CreateRangeTensor<double>(start, limit, delta, &output);
+        output = CreateRangeTensor<double>(start, limit, delta, ctx->builder());
         break;
       default:
-        status = errors::InvalidArgument("Invalid type for Range ",
+        output = errors::InvalidArgument("Invalid type for Range ",
                                          DataTypeString(type));
     }
-    OP_REQUIRES_OK(ctx, status);
-    ctx->SetConstantOutput(0, output);
+    OP_REQUIRES_OK(ctx, output.status());
+    ctx->SetOutput(0, output.ValueOrDie());
   }
 };
 
-- 
GitLab


From 2e19f32d28ab88b5bd3dd4f6d42a54040591dfbb Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Wed, 3 Oct 2018 20:48:35 -0700
Subject: [PATCH 389/570] [XLA] Fix handling of tuple constants in HLO constant
 folding.

PiperOrigin-RevId: 215676675
---
 .../xla/service/hlo_constant_folding.cc       | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 538816a353..4f898ce61c 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -77,19 +77,23 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
       }
 
       // Don't constant fold unless it's a net positive or the output is small.
-      int64 elements_in_removed_operands = 0;
-      for (HloInstruction* operand : instruction->operands()) {
-        if (operand->user_count() == 1) {
-          elements_in_removed_operands +=
-              ShapeUtil::ElementsIn(operand->shape());
+      if (ShapeUtil::IsArray(instruction->shape())) {
+        int64 elements_in_removed_operands = 0;
+        for (HloInstruction* operand : instruction->operands()) {
+          if (operand->user_count() == 1 &&
+              ShapeUtil::IsArray(operand->shape())) {
+            elements_in_removed_operands +=
+                ShapeUtil::ElementsIn(operand->shape());
+          }
         }
-      }
-      int64 elements_in_constant = ShapeUtil::ElementsIn(instruction->shape());
+        int64 elements_in_constant =
+            ShapeUtil::ElementsIn(instruction->shape());
 
-      static const int64 kMaximumConstantSizeElements = 2 * 1000 * 1000;
-      if (elements_in_constant > elements_in_removed_operands &&
-          elements_in_constant > kMaximumConstantSizeElements) {
-        continue;
+        static const int64 kMaximumConstantSizeElements = 2 * 1000 * 1000;
+        if (elements_in_constant > elements_in_removed_operands &&
+            elements_in_constant > kMaximumConstantSizeElements) {
+          continue;
+        }
       }
 
       Literal result;
-- 
GitLab


From 8a437200e14c8e09fcc8e952679d489909f175c8 Mon Sep 17 00:00:00 2001
From: Mingxing Tan <tanmingxing@google.com>
Date: Wed, 3 Oct 2018 21:06:27 -0700
Subject: [PATCH 390/570] BEGIN_PUBLIC Rollback some quantization changes that
 breaks some models. END_PUBLIC

Automated rollback of commit d3f14ef70cdf113f9d330c1f7c638003429a1dc4. Revert #19894.

PiperOrigin-RevId: 215678307
---
 .../contrib/quantize/python/quantize.py       | 115 +++++++-----------
 .../quantize/python/quantize_graph_test.py    |  37 ------
 2 files changed, 41 insertions(+), 111 deletions(-)

diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index afb9de8370..5e63d33db8 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -461,8 +461,8 @@ class _LayerMatch(object):
     return self._bias_add_op
 
 
-def _GetFollowingFakeQuantOp(tensor):
-  """Returns the following FakeQuant op if it exists else None."""
+def _FollowedByFakeQuant(tensor):
+  """Returns True if the tensor is followed by a FakeQuant."""
   fake_quant_ops = set([
       'FakeQuantWithMinMaxVars', 'FakeQuantWithMinMaxArgs',
       'FakeQuantWithMinMaxVarsPerChannel'
@@ -472,11 +472,11 @@ def _GetFollowingFakeQuantOp(tensor):
   while consumers:
     c = consumers.pop()
     if c.type in fake_quant_ops:
-      return c
+      return True
     elif c.type in pass_through_ops:
       for output in c.outputs:
         consumers.extend(output.consumers())
-  return None
+  return False
 
 
 def _InsertQuantOp(context,
@@ -559,77 +559,44 @@ def _InsertQuantOp(context,
   # Prevent ops from being quantized multiple times. Bypass ops can sometimes
   # overlap between multiple matches, so we need to ensure that we don't
   # add duplicate FakeQuant operations.
-  fake_quant_op = _GetFollowingFakeQuantOp(inputs)
-
-  # If we find that we are attempting to insert a fake quant op following
-  # a fake quant, we skip inserting a fake quant op
-
-  if fake_quant_op is None:
-    if moving_avg:
-      quant = (
-          quant_ops.MovingAvgQuantize(
-              inputs,
-              init_min=init_min,
-              init_max=init_max,
-              ema_decay=ema_decay,
-              is_training=is_training,
-              num_bits=bits,
-              narrow_range=narrow_range,
-              vars_collection=vars_collection,
-              name_prefix=name_prefix))
-    else:
-      quant = (
-          quant_ops.LastValueQuantize(
-              inputs,
-              init_min=init_min,
-              init_max=init_max,
-              is_training=is_training,
-              num_bits=bits,
-              narrow_range=narrow_range,
-              vars_collection=vars_collection,
-              name_prefix=name_prefix))
-
-    if quant_delay and quant_delay > 0:
-      activate_quant = math_ops.greater_equal(
-          common.CreateOrGetQuantizationStep(),
-          quant_delay,
-          name=name_prefix + '/activate_quant')
-      quant = control_flow_ops.cond(
-          activate_quant,
-          lambda: quant,
-          lambda: inputs,
-          name=name_prefix + '/delayed_quant')
+  if _FollowedByFakeQuant(inputs):
+    return
+
+  if moving_avg:
+    quant = (
+        quant_ops.MovingAvgQuantize(
+            inputs,
+            init_min=init_min,
+            init_max=init_max,
+            ema_decay=ema_decay,
+            is_training=is_training,
+            num_bits=bits,
+            narrow_range=narrow_range,
+            vars_collection=vars_collection,
+            name_prefix=name_prefix))
   else:
-    # If a fake quant op is present already, make sure that
-    # any downstream use of the tensor reroutes to the appropriate quantized
-    # tensor. If there is no quant_delay, this is simply the output of the
-    # fake quant op. If there is a quant delay, we reroute to the output
-    # of the delayed quant operation, which inserts quantization only after
-    # a specified quant_delay
-
-    quant = fake_quant_op.outputs[0]
-    if quant_delay and quant_delay > 0:
-      name_prefix = '/'.join(quant.name.split('/')[:-1])
-      quant = quant.graph.get_tensor_by_name(name_prefix +
-                                             '/delayed_quant/Merge:0')
-    pruned_consumer_set = set()
-    for consumer in consumers:
-      fake_quant_dest_op = _GetFollowingFakeQuantOp(consumer.outputs[0])
-      if (fake_quant_dest_op is None or
-          fake_quant_dest_op.name != fake_quant_op.name):
-        pruned_consumer_set.add(consumer)
-    consumers = pruned_consumer_set
-
-    # If we have
-    # input->pass_through->fake_quant
-    # there is nothing to reroute.
-    #
-    # If we have
-    #  input-> pass_through->fake_quant
-    #                |-> consumer
-    # Then we reroute such that:
-    # input-> pass_through->fake_quant
-    #                            |-> consumer
+    quant = (
+        quant_ops.LastValueQuantize(
+            inputs,
+            init_min=init_min,
+            init_max=init_max,
+            is_training=is_training,
+            num_bits=bits,
+            narrow_range=narrow_range,
+            vars_collection=vars_collection,
+            name_prefix=name_prefix))
+
+  if quant_delay and quant_delay > 0:
+    activate_quant = math_ops.greater_equal(
+        common.CreateOrGetQuantizationStep(),
+        quant_delay,
+        name=name_prefix + '/activate_quant')
+    quant = control_flow_ops.cond(
+        activate_quant,
+        lambda: quant,
+        lambda: inputs,
+        name=name_prefix + '/delayed_quant')
+
   if consumers:
     tensors_modified_count = common.RerouteTensor(
         quant, inputs, can_modify=consumers)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index a9fc6c3c61..e80d2183a6 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -27,7 +27,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import template
 from tensorflow.python.platform import googletest
 
 
@@ -307,42 +306,6 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
     # No ops should be inserted or removed.
     self.assertEqual(op_names_before_rewrite, op_names_after_rewrite)
 
-  def testWithSharedWeights(self):
-
-    self._RunTestOverAllRewrites(self._TestWithSharedWeights)
-    self._RunTestOverTrainingRewrites(self._TestRewriteWithSharedWeights)
-
-  def _TestRewriteWithSharedWeights(self, rewrite_fn, quant_delay=1):
-    self._TestWithSharedWeights(rewrite_fn, quant_delay)
-
-  def _TestWithSharedWeights(self, rewrite_fn, quant_delay=None):
-    with ops.Graph().as_default() as g:
-      conv = template.make_template('shared_weights_conv', self._ConvLayer)
-      conv()
-      conv()
-      if quant_delay is None:
-        rewrite_fn()
-      else:
-        rewrite_fn(quant_delay=quant_delay)
-
-    conv_ops = [op for op in g.get_operations() if op.type == 'Conv2D']
-    weights_quants = [
-        op for op in g.get_operations()
-        if 'weights_quant' in op.name and op.type == 'FakeQuantWithMinMaxVars'
-    ]
-    # Check that the shared weights variable is not quantized multiple times
-    self.assertTrue(len(weights_quants) == 1)
-    weights_quant_tensor = weights_quants[0].outputs[0]
-    if quant_delay:
-      delayed_weights_quants = [
-          op for op in g.get_operations()
-          if 'weights_quant' in op.name and op.type == 'Merge'
-      ]
-      self.assertTrue(len(delayed_weights_quants) == 1)
-      weights_quant_tensor = delayed_weights_quants[0].outputs[0]
-    # Check that the Conv2D operations get the quantized weights
-    self.assertTrue(all(weights_quant_tensor in op.inputs for op in conv_ops))
-
   def _ConvLayer(
       self, input_tensor=None, scope='test', pre_activation_bypass=False,
       post_activation_bypass=False):
-- 
GitLab


From d3ced638f0496c70c3a063be82b30b358179e369 Mon Sep 17 00:00:00 2001
From: Yuanzhong Xu <yuanzx@google.com>
Date: Wed, 3 Oct 2018 21:41:43 -0700
Subject: [PATCH 391/570] [XLA] Delete IsInplaceSlice.

PiperOrigin-RevId: 215681153
---
 .../xla/service/hlo_dataflow_analysis.cc      | 24 -------------------
 .../xla/service/hlo_dataflow_analysis.h       |  1 -
 .../compiler/xla/service/hlo_instruction.cc   |  4 ----
 .../compiler/xla/service/hlo_instruction.h    |  3 ---
 .../compiler/xla/service/hlo_instructions.h   | 14 -----------
 .../xla/service/tuple_points_to_analysis.cc   | 23 ++++--------------
 .../xla/service/tuple_points_to_analysis.h    |  1 -
 7 files changed, 4 insertions(+), 66 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 44cde4a3d2..c22adcdd8d 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -356,23 +356,6 @@ bool HloDataflowAnalysis::UpdateBitcastValueSet(HloInstruction* bitcast) {
   return false;
 }
 
-bool HloDataflowAnalysis::UpdateSliceValueSet(HloInstruction* slice) {
-  CHECK_EQ(slice->opcode(), HloOpcode::kSlice);
-  if (!slice->IsInPlaceSlice()) {
-    return false;
-  }
-  // If this slice is lowered to an in-place version, then it forwards the
-  // operand value to the output.
-  const InstructionValueSet& operand_set =
-      GetInstructionValueSet(slice->operand(0));
-  InstructionValueSet& slice_set = GetInstructionValueSet(slice);
-  if (operand_set != slice_set) {
-    slice_set = operand_set;
-    return true;
-  }
-  return false;
-}
-
 bool HloDataflowAnalysis::UpdateSendValueSet(HloInstruction* send) {
   CHECK_EQ(send->opcode(), HloOpcode::kSend);
   bool changed = false;
@@ -641,8 +624,6 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet(
   switch (instruction->opcode()) {
     case HloOpcode::kBitcast:
       return UpdateBitcastValueSet(instruction);
-    case HloOpcode::kSlice:
-      return UpdateSliceValueSet(instruction);
     case HloOpcode::kDomain:
       return UpdateDomainValueSet(instruction);
     case HloOpcode::kCopy:
@@ -814,11 +795,6 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() {
             define_all_values();
           }
           break;
-        case HloOpcode::kSlice:
-          if (!instruction->IsInPlaceSlice()) {
-            define_all_values();
-          }
-          break;
         case HloOpcode::kWhile:
         case HloOpcode::kCall:
         case HloOpcode::kConditional:
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
index e62c1c2ac8..abac398c04 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
@@ -182,7 +182,6 @@ class HloDataflowAnalysis {
   // Updates the value set for a particular instruction type. Returns whether
   // the instruction value set changed.
   bool UpdateBitcastValueSet(HloInstruction* bitcast);
-  bool UpdateSliceValueSet(HloInstruction* slice);
   bool UpdateCallValueSet(HloInstruction* call);
   bool UpdateConditionalValueSet(HloInstruction* conditional);
   bool UpdateCopyValueSet(HloInstruction* copy);
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 8bddaa8c96..fb91adc302 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -3076,10 +3076,6 @@ const std::vector<int64>& HloInstruction::slice_strides() const {
   return Cast<HloSliceInstruction>(this)->slice_strides();
 }
 
-bool HloInstruction::IsInPlaceSlice() const {
-  return Cast<HloSliceInstruction>(this)->IsInPlaceSlice();
-}
-
 const Literal& HloInstruction::literal() const {
   return Cast<HloConstantInstruction>(this)->literal();
 }
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 9deed20e5d..374862c4b6 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -1330,9 +1330,6 @@ class HloInstruction {
   int64 slice_strides(int64 dimension) const;
   const std::vector<int64>& slice_strides() const;
 
-  // Delegates to HloSliceInstruction::IsInPlaceSlice.
-  bool IsInPlaceSlice() const;
-
   // Returns the literal associated with this instruction.
   const Literal& literal() const;
 
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index c929867bb9..ab168800f6 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -546,17 +546,6 @@ class HloSliceInstruction : public HloInstruction {
   }
   const std::vector<int64>& slice_strides() const { return slice_strides_; }
 
-  // Returns the flag that describes whether a slice must be lowered into an
-  // offset into the original operand.
-  bool IsInPlaceSlice() const { return is_in_place_slice_; }
-
-  // Sets and returns the flag that describes whether a slice must be lowered
-  // into an offset into the original operand.
-  bool SetIsInPlaceSlice(bool value) {
-    is_in_place_slice_ = value;
-    return value;
-  }
-
  private:
   std::vector<string> ExtraAttributesToStringImpl(
       const HloPrintOptions& options) const override;
@@ -573,9 +562,6 @@ class HloSliceInstruction : public HloInstruction {
   std::vector<int64> slice_starts_;
   std::vector<int64> slice_limits_;
   std::vector<int64> slice_strides_;
-
-  // Describes whether the slice can be lowered to an offset into the operand.
-  bool is_in_place_slice_ = false;
 };
 
 class HloConstantInstruction : public HloInstruction {
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 6fed7c76d0..811ac55e2d 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -280,16 +280,6 @@ Status TuplePointsToAnalysis::HandleDomain(HloInstruction* domain) {
   return Status::OK();
 }
 
-Status TuplePointsToAnalysis::HandleSlice(HloInstruction* slice) {
-  // A kSlice instruction aliases its operand if the backend lowers it to an
-  // in-place implementation.
-  if (slice->IsInPlaceSlice()) {
-    CreateCopiedPointsToSet(slice, slice->operand(0));
-    return Status::OK();
-  }
-  return DefaultAction(slice);
-}
-
 Status TuplePointsToAnalysis::HandleRecvDone(HloInstruction* recv_done) {
   // RecvDone aliases its input (Recv) tuple element {0} to element {0} of its
   // output. The other indices ({} and {1}) define their own buffers.
@@ -455,15 +445,10 @@ bool TuplePointsToAnalysis::InstructionDefinesBufferAtIndex(
 
 Status TuplePointsToAnalysis::VerifyBuffer(const LogicalBuffer& buffer) const {
   if (!InstructionDefinesBufferAtIndex(buffer.instruction(), buffer.index())) {
-    // kSlice ops that are lowered to an in-place version are expected to not
-    // define their output buffer.
-    if (buffer.instruction()->opcode() != HloOpcode::kSlice ||
-        !buffer.instruction()->IsInPlaceSlice()) {
-      return FailedPrecondition(
-          "LogicalBuffer %s is ill-defined: instruction %s does not define a "
-          "buffer at that index",
-          buffer.ToString(), buffer.instruction()->name());
-    }
+    return FailedPrecondition(
+        "LogicalBuffer %s is ill-defined: instruction %s does not define a "
+        "buffer at that index",
+        buffer.ToString(), buffer.instruction()->name());
   }
 
   if (buffer.id() < 0 ||
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index 64ad1dc80e..30c365053c 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -247,7 +247,6 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
   Status HandleDomain(HloInstruction* domain) override;
-  Status HandleSlice(HloInstruction* slice) override;
   Status HandleCopy(HloInstruction* copy) override;
   Status HandleRecvDone(HloInstruction* recv_done) override;
   Status HandleSend(HloInstruction* send) override;
-- 
GitLab


From 54cde61fbf473270ce19f8b40e9511373fbc12c7 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 3 Oct 2018 22:00:51 -0700
Subject: [PATCH 392/570] [tf.data] Fix bug in
 `tf.data.experimental.unbatch()`.

Previously, if the rank of the input to this transformation was
statically unknown, we would erroneously report that the output is a
scalar, and violate downstream shape integrity checks. Instead, in
that case the output shape should be unknown.

PiperOrigin-RevId: 215683027
---
 tensorflow/core/kernels/data/unbatch_dataset_op.cc | 13 +++++++++----
 .../kernel_tests/batch_dataset_op_test.py          | 14 ++++++++++++++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
index 81c432b938..74908994b4 100644
--- a/tensorflow/core/kernels/data/unbatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
@@ -41,11 +41,16 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
         : DatasetBase(DatasetContext(ctx)), input_(input) {
       input_->Ref();
       for (const PartialTensorShape& shape : input->output_shapes()) {
-        gtl::InlinedVector<int64, 4> partial_dim_sizes;
-        for (int i = 1; i < shape.dims(); ++i) {
-          partial_dim_sizes.push_back(shape.dim_size(i));
+        if (!shape.unknown_rank()) {
+          gtl::InlinedVector<int64, 4> partial_dim_sizes;
+          for (int i = 1; i < shape.dims(); ++i) {
+            partial_dim_sizes.push_back(shape.dim_size(i));
+          }
+          shapes_.emplace_back(std::move(partial_dim_sizes));
+        } else {
+          // If the input shape is unknown, the output shape will be unknown.
+          shapes_.emplace_back();
         }
-        shapes_.emplace_back(std::move(partial_dim_sizes));
       }
     }
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
index 8703b2810e..956b4518f6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
@@ -131,6 +131,20 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
                                    "larger than the row shape"):
         sess.run(get_next)
 
+  def testUnbatchWithUnknownRankInput(self):
+    placeholder = array_ops.placeholder(dtypes.int32)
+    dataset = dataset_ops.Dataset.from_tensors(placeholder).apply(
+        batching.unbatch())
+    iterator = dataset.make_initializable_iterator()
+    next_elem = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(iterator.initializer, feed_dict={placeholder: [0, 1, 2, 3]})
+      for i in range(4):
+        self.assertEqual(i, sess.run(next_elem))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_elem)
+
   def testUnbatchScalarDataset(self):
     data = tuple([math_ops.range(10) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
-- 
GitLab


From 1f1fe5a01af616707b8554d59651fb4925d7faee Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Wed, 3 Oct 2018 22:23:08 -0700
Subject: [PATCH 393/570] Include .inc files for absl headers

---
 tensorflow/tools/pip_package/setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index d864a7a039..54a7b7ffbe 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -232,6 +232,8 @@ headers = (list(find_files('*.h', 'tensorflow/core')) +
            list(find_files('*', 'third_party/eigen3')) +
            list(find_files('*.h',
                            'tensorflow/include/external/com_google_absl')) +
+           list(find_files('*.inc',
+                           'tensorflow/include/external/com_google_absl')) +
            list(find_files('*', 'tensorflow/include/external/eigen_archive')))
 
 setup(
-- 
GitLab


From 6795491bcc0c276e27be6a9e1a4a14c019c2ba37 Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Wed, 3 Oct 2018 22:24:14 -0700
Subject: [PATCH 394/570] Pin wheel=0.31.1 in install_auditwheel.sh to work
 around issue https://github.com/pypa/auditwheel/issues/102

PiperOrigin-RevId: 215685104
---
 tensorflow/tools/ci_build/install/install_auditwheel.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/tools/ci_build/install/install_auditwheel.sh b/tensorflow/tools/ci_build/install/install_auditwheel.sh
index e6f6124d56..0e6d98c0a8 100755
--- a/tensorflow/tools/ci_build/install/install_auditwheel.sh
+++ b/tensorflow/tools/ci_build/install/install_auditwheel.sh
@@ -18,6 +18,10 @@ set -e
 
 sudo pip3 install auditwheel==1.5.0
 
+# Pin wheel==0.31.1 to work around issue
+# https://github.com/pypa/auditwheel/issues/102
+sudo pip3 install wheel==0.31.1
+
 set +e
 patchelf_location=$(which patchelf)
 if [[ -z "$patchelf_location" ]]; then
-- 
GitLab


From e57874169fca3cfdd15cf0dda3717a6374a7dcb9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 3 Oct 2018 23:03:11 -0700
Subject: [PATCH 395/570] [XLA] Update Tf2Xla bridge to use Scatter HLO.

PiperOrigin-RevId: 215687800
---
 tensorflow/compiler/tf2xla/lib/scatter.cc     | 213 ++++++++++--------
 tensorflow/compiler/tf2xla/lib/scatter.h      |   6 +-
 tensorflow/compiler/xla/client/xla_builder.cc |   3 +
 tensorflow/compiler/xla/service/hlo_module.cc |   3 +-
 tensorflow/compiler/xla/service/inliner.cc    |  32 +--
 .../compiler/xla/service/inliner_test.cc      |  30 +++
 6 files changed, 177 insertions(+), 110 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/lib/scatter.cc b/tensorflow/compiler/tf2xla/lib/scatter.cc
index 38dfde165d..2b1c2ced92 100644
--- a/tensorflow/compiler/tf2xla/lib/scatter.cc
+++ b/tensorflow/compiler/tf2xla/lib/scatter.cc
@@ -38,12 +38,10 @@ xla::StatusOr<xla::XlaOp> XlaScatter(
         combiner,
     xla::XlaBuilder* builder) {
   TF_ASSIGN_OR_RETURN(xla::Shape buffer_shape, builder->GetShape(buffer));
-  TF_RETURN_IF_ERROR(builder->GetShape(updates).status());
+  TF_ASSIGN_OR_RETURN(xla::Shape updates_shape, builder->GetShape(updates));
   TF_ASSIGN_OR_RETURN(xla::Shape indices_shape, builder->GetShape(indices));
   absl::Span<const int64> indices_dims =
       xla::AsInt64Slice(indices_shape.dimensions());
-  absl::Span<const int64> buffer_dims =
-      xla::AsInt64Slice(buffer_shape.dimensions());
 
   // If the indices are N-dimensional, the minor dimension of indices contains
   // the indices to update. Otherwise the indices are all scalars.
@@ -81,104 +79,129 @@ xla::StatusOr<xla::XlaOp> XlaScatter(
     }
   }
 
-  // Shape of the non-indexed dimensions of the buffer.
-  std::vector<int64> buffer_shape_post_axes(
-      buffer_dims.begin() + num_index_dims, buffer_dims.end());
-
-  // Flatten the major dimensions of indices and updates into a single dimension
-  // for ease of iteration.
-  std::vector<int64> flat_indices_shape({num_indices});
-  if (indices_are_vectors) {
-    flat_indices_shape.push_back(num_index_dims);
+  // Example of a 1-D scatter that updates two [3,1] tensors in a tensor of
+  // shape [3,3]:
+  // NOTE: ***This case will not be generated by any of the tf.scatter ops.***
+  //
+  //   operand = s32[3,3] parameter(0)
+  //   indices = s32[2] parameter(1)
+  //   updates = s32[3,2] parameter(2)
+  //   scatter = s32[3,3] scatter(operand, indices, updates),
+  //       to_apply=update_computation,
+  //       update_window_dims={0},
+  //       inserted_window_dims={1},
+  //       scatter_dims_to_operand_dims={1},
+  //       index_vector_dim=1
+  //
+  //
+  // Example of a 1-D scatter that updates two [1,3] tensors in a tensor of
+  // shape [3,3]:
+  //
+  //   operand = s32[3,3] parameter(0)
+  //   indices = s32[2] parameter(1)
+  //   updates = s32[2,3] parameter(2)
+  //   scatter = s32[3,3] scatter(operand, indices, updates),
+  //       to_apply=update_computation,
+  //       update_window_dims={1},
+  //       inserted_window_dims={0},
+  //       scatter_dims_to_operand_dims={0},
+  //       index_vector_dim=1
+  //
+  //
+  // Example of an N-D scatter updating slices of shape [1,1,2] in a tensor of
+  // shape [3,3,2]
+  //
+  //   operand = s32[3,3,2] parameter(0)
+  //   indices = s32[2,2] parameter(1)
+  //   updates = s32[2,2] parameter(2)
+  //   scatter = s32[3,3,2] scatter(operand, indices, updates),
+  //       to_apply=update_computation,
+  //       update_window_dims={1},
+  //       inserted_window_dims={0,1},
+  //       scatter_dims_to_operand_dims={0,1},
+  //       index_vector_dim=1
+  //
+  //
+  // Example of a scatter updating slices of shape [] in a tensor of shape [1,1]
+  //
+  //   operand = s32[1,1] parameter(0)
+  //   indices = s32[1] parameter(1)
+  //   updates = s32[1] parameter(2)
+  //   scatter = s32[1,1] scatter(operand, indices, updates),
+  //       to_apply=update_computation,
+  //       update_window_dims={},
+  //       inserted_window_dims={0,1},
+  //       scatter_dims_to_operand_dims={0},
+  //       index_vector_dim=1
+  // Note that updates operand would be broadcasted into [1] in this case.
+  //
+
+  xla::ScatterDimensionNumbers dim_numbers;
+  dim_numbers.set_index_vector_dim(indices_are_vectors
+                                       ? indices_shape.dimensions_size() - 1
+                                       : indices_shape.dimensions_size());
+
+  int64 updates_rank = xla::ShapeUtil::Rank(updates_shape);
+  int64 buffer_rank = xla::ShapeUtil::Rank(buffer_shape);
+  int64 num_window_dims_in_updates = buffer_rank - num_index_dims;
+
+  // If the rank of `updates` is 0 and does not match the expected rank of
+  // updates, broadcast `updates` to the expected shape of updates.
+  auto new_updates = updates;
+  std::vector<int64> expected_updates_dims(indices_dims.begin(),
+                                           indices_dims.end());
+  for (int64 dim = num_index_dims; dim < buffer_rank; ++dim) {
+    expected_updates_dims.push_back(buffer_shape.dimensions(dim));
+  }
+  int64 expected_updates_rank = expected_updates_dims.size();
+  if (updates_rank == 0 && expected_updates_rank != 0) {
+    new_updates = xla::Broadcast(updates, expected_updates_dims);
+    TF_ASSIGN_OR_RETURN(updates_shape, builder->GetShape(new_updates));
+    updates_rank = xla::ShapeUtil::Rank(updates_shape);
   }
 
-  std::vector<int64> flat_updates_shape({num_indices});
-  flat_updates_shape.insert(flat_updates_shape.end(),
-                            buffer_shape_post_axes.begin(),
-                            buffer_shape_post_axes.end());
-
-  // Construct the initial values of the loop-carried Tensors.
-  auto flat_indices = xla::Reshape(indices, flat_indices_shape);
-  auto flat_updates = xla::Reshape(updates, flat_updates_shape);
-  auto init = {flat_indices, flat_updates, buffer};
-
-  // Constructs the loop body. The implementation of scatter is essentially:
-  // for i in range(num_indices):
-  //   index = dynamic-slice(indices, i)
-  //   update = dynamic-slice(updates, i)
-  //   buffer = dynamic-update-slice(buffer, update, index)
-  auto body_fn = [&](xla::XlaOp i, absl::Span<const xla::XlaOp> loop_vars,
-                     xla::XlaBuilder* body_builder) {
-    auto indices = loop_vars[0];
-    auto updates = loop_vars[1];
-    auto buffer = loop_vars[2];
-
-    auto zero_index = xla::ConstantLiteral(
-        body_builder, xla::LiteralUtil::Zero(indices_shape.element_type()));
-
-    // Slice the i-th index from the indices array.
-    xla::XlaOp index;
-    auto indices_offset = xla::Reshape(i, {1});
-    if (indices_are_vectors) {
-      indices_offset = xla::Pad(indices_offset, zero_index,
-                                xla::MakeEdgePaddingConfig({{0, 1}}));
-
-      index = xla::DynamicSlice(indices, indices_offset, {1, num_index_dims});
-      index = xla::Collapse(index, {0, 1});
-    } else {
-      index = xla::DynamicSlice(indices, indices_offset, {1});
+  if (updates_rank > 0) {
+    for (int64 i = (updates_rank - num_window_dims_in_updates);
+         i < updates_rank; ++i) {
+      dim_numbers.add_update_window_dims(i);
     }
+  }
 
-    // Discard updates with negative indices, since some users expect this.
-    auto index_in_range = xla::ReduceAll(
-        xla::Le(zero_index, index), xla::ConstantR0<bool>(body_builder, true),
-        xla::CreateScalarAndComputation(xla::PRED, body_builder));
-
-    // Make the index in bounds to prevent implementation defined behavior.
-    index = xla::Max(index, zero_index);
-    index = xla::Pad(
-        index, zero_index,
-        xla::MakeEdgePaddingConfig({{0, buffer_shape_post_axes.size()}}));
-
-    // Slice the i-th index from the updates array.
-    auto updates_offset = xla::Reshape(i, {1});
-    updates_offset = xla::Pad(
-        updates_offset, zero_index,
-        xla::MakeEdgePaddingConfig({{0, buffer_shape_post_axes.size()}}));
-    std::vector<int64> flat_updates_slice_shape({1});
-    flat_updates_slice_shape.insert(flat_updates_slice_shape.end(),
-                                    buffer_shape_post_axes.begin(),
-                                    buffer_shape_post_axes.end());
-    auto update =
-        xla::DynamicSlice(updates, updates_offset, flat_updates_slice_shape);
-
-    // Unflatten the major (iteration) dimensions of the slice to their
-    // original shape.
-    std::vector<int64> updates_slice_shape(num_index_dims, 1);
-    updates_slice_shape.insert(updates_slice_shape.end(),
-                               buffer_shape_post_axes.begin(),
-                               buffer_shape_post_axes.end());
-    update = xla::Reshape(update, updates_slice_shape);
-
-    // Apply the update to the buffer. If there is a combiner, use it to merge
-    // the current values with the update.
-    auto current_value = xla::DynamicSlice(buffer, index, updates_slice_shape);
+  for (int64 i = 0; i < num_index_dims; ++i) {
+    dim_numbers.add_inserted_window_dims(i);
+    dim_numbers.add_scatter_dims_to_operand_dims(i);
+  }
+
+  // Build the combiner computation.
+  xla::XlaComputation combiner_computation;
+  {
+    xla::XlaBuilder cb("scatter-combiner");
+    auto xla_scalar_shape =
+        xla::ShapeUtil::MakeShape(buffer_shape.element_type(), {});
+    auto p0 = xla::Parameter(&cb, 0, xla_scalar_shape, "p0");
+    auto p1 = xla::Parameter(&cb, 1, xla_scalar_shape, "p1");
     if (combiner) {
-      update = combiner(current_value, update, body_builder);
+      combiner(p0, p1, &cb);
     }
-    // Use the current value instead of the update if the index is out of
-    // bounds.
-    update = xla::Select(index_in_range, update, current_value);
-    // Apply the update.
-    buffer = xla::DynamicUpdateSlice(buffer, update, index);
-
-    return std::vector<xla::XlaOp>{indices, updates, buffer};
-  };
-
-  TF_ASSIGN_OR_RETURN(auto outputs,
-                      XlaForEachIndex(num_indices, indices_shape.element_type(),
-                                      body_fn, init, "scatter", builder));
-  return outputs[2];
+    combiner_computation = cb.Build().ConsumeValueOrDie();
+  }
+
+  VLOG(3) << "Scatter op:";
+  VLOG(3) << "  Input: " << xla::ShapeUtil::HumanString(buffer_shape);
+  VLOG(3) << "  Indices: " << xla::ShapeUtil::HumanString(indices_shape);
+  VLOG(3) << "  Updates: " << xla::ShapeUtil::HumanString(updates_shape);
+  VLOG(3) << "  Scatter Dimension Numbers: ";
+  VLOG(3) << "    index_vector_dim: " << dim_numbers.index_vector_dim();
+  VLOG(3) << "    update_window_dims: ["
+          << absl::StrJoin(dim_numbers.update_window_dims(), ",") << "]";
+  VLOG(3) << "    inserted_window_dims: ["
+          << absl::StrJoin(dim_numbers.inserted_window_dims(), ",") << "]";
+  VLOG(3) << "    scatter_dims_to_operand_dims: ["
+          << absl::StrJoin(dim_numbers.scatter_dims_to_operand_dims(), ",")
+          << "]";
+
+  return xla::Scatter(buffer, indices, new_updates, combiner_computation,
+                      dim_numbers);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/scatter.h b/tensorflow/compiler/tf2xla/lib/scatter.h
index 13a5f1b850..4cf478c4b9 100644
--- a/tensorflow/compiler/tf2xla/lib/scatter.h
+++ b/tensorflow/compiler/tf2xla/lib/scatter.h
@@ -34,7 +34,11 @@ namespace tensorflow {
 // Otherwise, `indices_are_vectors`, then indices are multidimensional and the
 // minor dimension of `indices` represents a vector of indices.
 //
-// If any indices are negative, the corresponding update is discarded.
+// If `updates` is a scalar, then it will be broadcasted into the expected shape
+// of updates.
+//
+// If any part of the update region is out-of-bounds, the corresponding update
+// is discarded.
 //
 // If a `combiner` is provided, updates are combined with the existing values in
 // the buffer using the combiner function. Otherwise, the updates replace the
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index e0ec91dba1..d196252db1 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -208,6 +208,9 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle,
     case HloOpcode::kWhile:
       // TODO(b/32495713): We aren't checking the condition and body
       // computations themselves.
+    case HloOpcode::kScatter:
+      // TODO(b/32495713): We aren't checking the embedded computation in
+      // Scatter.
     case HloOpcode::kSend:
     case HloOpcode::kRecv:
     case HloOpcode::kParameter:
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 7527e35c95..93e04eb3db 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -146,7 +146,8 @@ void HloModule::ReplaceComputations(
         case HloOpcode::kCall:
         case HloOpcode::kMap:
         case HloOpcode::kReduce:
-        case HloOpcode::kReduceWindow: {
+        case HloOpcode::kReduceWindow:
+        case HloOpcode::kScatter: {
           HloComputation* new_arg = tensorflow::gtl::FindWithDefault(
               replacements, instruction->to_apply(), nullptr);
           if (new_arg != nullptr) {
diff --git a/tensorflow/compiler/xla/service/inliner.cc b/tensorflow/compiler/xla/service/inliner.cc
index 5fd779ebf9..50c408f5bb 100644
--- a/tensorflow/compiler/xla/service/inliner.cc
+++ b/tensorflow/compiler/xla/service/inliner.cc
@@ -71,26 +71,23 @@ Status InlinerVisitor::HandleMap(HloInstruction* map) {
   // profitability model for inlining is defined.
   if (hlo_query::AllOperandsAreParameters(root)) {
     if (root.opcode() == HloOpcode::kFusion ||
-        root.opcode() == HloOpcode::kParameter ||
         root.opcode() == HloOpcode::kTrace) {
       // Cloning not supported for these instructions.
       return Status::OK();
     }
     VLOG(10) << "inlining map({X ... Y}, op) => : op(X ... Y) with function "
              << root.ToShortString();
-    // If the input is a constant then the shape of the constant could be
-    // different than the map shape. Hence, a broadcast is needed, else the
-    // cloned operand with new shape and operands work.
-    if (root.opcode() != HloOpcode::kConstant) {
-      std::vector<HloInstruction*> params;
-      for (int64 o = 0; o < root.operands().size(); o++) {
-        params.push_back(map->operands()[root.operand(o)->parameter_number()]);
-      }
-      HloInstruction* placed_instruction = computation_->AddInstruction(
-          root.CloneWithNewOperands(map->shape(), params));
+    if (root.opcode() == HloOpcode::kParameter) {
+      // If the root is a parameter, then use the corresponding operand as the
+      // result of the computation.
       TF_RETURN_IF_ERROR(
-          computation_->ReplaceInstruction(map, placed_instruction));
-    } else {
+          map->ReplaceAllUsesWith(map->operands()[root.parameter_number()]));
+      TF_RETURN_IF_ERROR(computation_->RemoveInstruction(map));
+    } else if (root.opcode() == HloOpcode::kConstant) {
+      // If the input is a constant then the shape of the constant could be
+      // different than the map shape. Hence, a broadcast is needed, else the
+      // cloned operand with new shape and operands work.
+      //
       // The constant is in an embedded computation and needs to be recreated
       // as part of the computation that the broadcast is inserted into.
       HloInstruction* constant = computation_->AddInstruction(root.Clone());
@@ -98,6 +95,15 @@ Status InlinerVisitor::HandleMap(HloInstruction* map) {
           HloInstruction::CreateBroadcast(map->shape(), constant, {}));
       TF_RETURN_IF_ERROR(
           computation_->ReplaceInstruction(map, placed_instruction));
+    } else {
+      std::vector<HloInstruction*> params;
+      for (int64 o = 0; o < root.operands().size(); o++) {
+        params.push_back(map->operands()[root.operand(o)->parameter_number()]);
+      }
+      HloInstruction* placed_instruction = computation_->AddInstruction(
+          root.CloneWithNewOperands(map->shape(), params));
+      TF_RETURN_IF_ERROR(
+          computation_->ReplaceInstruction(map, placed_instruction));
     }
     changed_ = true;
     return Status::OK();
diff --git a/tensorflow/compiler/xla/service/inliner_test.cc b/tensorflow/compiler/xla/service/inliner_test.cc
index 7e967f035c..98e0f2cfd7 100644
--- a/tensorflow/compiler/xla/service/inliner_test.cc
+++ b/tensorflow/compiler/xla/service/inliner_test.cc
@@ -146,6 +146,36 @@ TEST_F(InlinerTest, MapSubtractOppositeOrder) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, expected));
 }
 
+TEST_F(InlinerTest, MapParameter) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+
+  auto param_builder = HloComputation::Builder(TestName());
+  param_builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32, "p0"));
+  param_builder.AddInstruction(HloInstruction::CreateParameter(1, r0f32, "p1"));
+  auto param_f32 = param_builder.Build();
+
+  auto builder = HloComputation::Builder("MapParamFunction");
+  auto lhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1)));
+  auto rhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(4)));
+  builder.AddInstruction(
+      HloInstruction::CreateMap(lhs->shape(), {lhs, rhs}, param_f32.get()));
+
+  auto computation = builder.Build();
+  auto hlo_module = CreateNewVerifiedModule();
+  hlo_module->AddEmbeddedComputation(std::move(param_f32));
+  hlo_module->AddEntryComputation(std::move(computation));
+
+  Inliner inliner;
+  EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie());
+  EXPECT_THAT(hlo_module->entry_computation()->root_instruction(), rhs);
+
+  // Verify execution on CPU.
+  auto result = ExecuteAndTransfer(hlo_module->Clone(), {});
+  auto expected = LiteralUtil::CreateR0<float>(4);
+  EXPECT_TRUE(LiteralTestUtil::Equal(result, expected));
+}
 
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 67e0ccb3e5c1a48d62bcc45201fd70d2420dc4eb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 02:27:59 -0700
Subject: [PATCH 396/570] compat: Update forward compatibility horizon to
 2018-10-04

PiperOrigin-RevId: 215706500
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index d833defb8e..76e08610ba 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 3)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 4)
 
 
 @tf_export("compat.forward_compatible")
-- 
GitLab


From 98ea840dabc0c4e9417ebe9a0fd10c9d471cda51 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 02:41:25 -0700
Subject: [PATCH 397/570] Improve the performance of the ListMemoryScheduler

This CL replaces a std::unordered_map with an absl::flat_hash_map and
removes an unnecessary map lookup. This two change can improve the
performance of the scheduler on large graphs by up to 2x.

PiperOrigin-RevId: 215707921
---
 .../compiler/xla/service/hlo_memory_scheduler.cc       | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
index 55314d0ae9..bf30764488 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
@@ -263,9 +263,8 @@ class ListScheduler {
     };
 
     for (auto* instruction : computation_.instructions()) {
-      // Instruction with no operands or control predecessors will
-      // not be in the map.
-      if (unscheduled_pred_count.count(instruction) == 0) {
+      if (instruction->operands().empty() &&
+          instruction->control_predecessors().empty()) {
         add_to_ready_queue(instruction);
       }
     }
@@ -356,9 +355,8 @@ class ListScheduler {
       buffer_uses_;
 
   // A map containing the count of unscheduled HLOs which using a particular
-  // LogicalBuffer.  We rely on iterator stability in this map, and that the map
-  // entries are std::pair's.
-  std::unordered_map<const LogicalBuffer*, int64> unscheduled_use_count_;
+  // LogicalBuffer.
+  absl::flat_hash_map<const LogicalBuffer*, int64> unscheduled_use_count_;
 
   // Set of instructions which have been scheduled.
   absl::flat_hash_set<const HloInstruction*> scheduled_instructions_;
-- 
GitLab


From 6b538d9ce54e878576131cde0c76e43a893180c2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 03:12:04 -0700
Subject: [PATCH 398/570] Automated rollback of commit
 70a395f9795a48c21bc35cdf1dc44778f73a7bba

PiperOrigin-RevId: 215710849
---
 tensorflow/python/data/kernel_tests/BUILD     |  1 +
 tensorflow/tensorflow.bzl                     | 39 +++++++++++--------
 .../tools/pip_package/pip_smoke_test.py       |  2 +-
 3 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index c7295d6e69..10ec0dbe1c 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -306,6 +306,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
     ],
     tags = [
+        "no_oss",  # TODO(b/116813115): Investigate timeout and re-enable.
         "no_windows_gpu",
     ],
 )
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index cad5de1b0c..dead44c57e 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1798,22 +1798,29 @@ def cuda_py_test(
         flaky = 0,
         xla_enabled = False,
         grpc_enabled = False):
-    test_tags = tags + tf_cuda_tests_tags()
-    tf_py_test(
-        name = name,
-        size = size,
-        srcs = srcs,
-        data = data,
-        main = main,
-        args = args,
-        tags = test_tags,
-        shard_count = shard_count,
-        additional_deps = additional_deps,
-        kernels = kernels,
-        flaky = flaky,
-        xla_enabled = xla_enabled,
-        grpc_enabled = grpc_enabled,
-    )
+    if main == None:
+        main = name + ".py"
+    for config in ["cpu", "gpu"]:
+        test_name = name
+        test_tags = tags
+        if config == "gpu":
+            test_name += "_gpu"
+            test_tags = test_tags + tf_cuda_tests_tags()
+        tf_py_test(
+            name = test_name,
+            size = size,
+            srcs = srcs,
+            data = data,
+            main = main,
+            args = args,
+            tags = test_tags,
+            shard_count = shard_count,
+            additional_deps = additional_deps,
+            kernels = kernels,
+            flaky = flaky,
+            xla_enabled = xla_enabled,
+            grpc_enabled = grpc_enabled,
+        )
 
 register_extension_info(
     extension_name = "cuda_py_test",
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index c6ef82ccdc..e7f9628fa6 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -142,7 +142,7 @@ def main():
 
   missing_dependencies = []
   # File extensions and endings to ignore
-  ignore_extensions = ["_test", "_test.py"]
+  ignore_extensions = ["_test", "_test.py", "_test_gpu", "_test_gpu.py"]
 
   ignored_files = 0
   blacklisted_files = len(BLACKLIST)
-- 
GitLab


From 6cc738da1748e819b9c8ee92dc2f1a7bdb291b50 Mon Sep 17 00:00:00 2001
From: Adria Puigdomenech <adriap@google.com>
Date: Thu, 4 Oct 2018 03:19:46 -0700
Subject: [PATCH 399/570] Make batch_gather work with indices of dtype int64.

PiperOrigin-RevId: 215711383
---
 tensorflow/python/kernel_tests/BUILD               |  1 +
 .../python/kernel_tests/batch_gather_op_test.py    | 13 ++++++++-----
 tensorflow/python/ops/array_ops.py                 | 14 ++++++++++----
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 9303c70c60..e055ef1c1b 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -76,6 +76,7 @@ tf_py_test(
     name = "batch_gather_op_test",
     srcs = ["batch_gather_op_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
diff --git a/tensorflow/python/kernel_tests/batch_gather_op_test.py b/tensorflow/python/kernel_tests/batch_gather_op_test.py
index 7dd347989a..84e93b8136 100644
--- a/tensorflow/python/kernel_tests/batch_gather_op_test.py
+++ b/tensorflow/python/kernel_tests/batch_gather_op_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.framework import constant_op
@@ -29,7 +30,7 @@ _TEST_TYPES = (dtypes.int64, dtypes.float32,
                dtypes.complex64, dtypes.complex128)
 
 
-class GatherTest(test.TestCase):
+class GatherTest(test.TestCase, parameterized.TestCase):
 
   def _buildParams(self, data, dtype):
     data = data.astype(dtype.as_numpy_dtype)
@@ -39,14 +40,15 @@ class GatherTest(test.TestCase):
       return data + 10j * data
     return data
 
-  def testSimpleGather(self):
+  @parameterized.parameters(dtypes.int32, dtypes.int64)
+  def testSimpleGather(self, indices_dtype):
     data = np.array([0, 1, 2, 3, 7, 5, 8, 9, 10, 11, 15, 13])
     indices = [3, 4]
     with self.test_session(use_gpu=True):
       for dtype in _TEST_TYPES:
         params_np = self._buildParams(data, dtype)
         params = constant_op.constant(params_np)
-        indices_tf = constant_op.constant(indices)
+        indices_tf = constant_op.constant(indices, dtype=indices_dtype)
         gather_t = array_ops.batch_gather(params, indices_tf)
         expected_result = np.array([3, 7])
         np_val = self._buildParams(expected_result, dtype)
@@ -54,14 +56,15 @@ class GatherTest(test.TestCase):
         self.assertAllEqual(np_val, gather_val)
         self.assertEqual(np_val.shape, gather_t.get_shape())
 
-  def test2DArray(self):
+  @parameterized.parameters(dtypes.int32, dtypes.int64)
+  def test2DArray(self, indices_dtype):
     data = np.array([[0, 1, 2, 3, 7, 5], [8, 9, 10, 11, 15, 13]])
     indices = [[3], [4]]
     with self.test_session(use_gpu=True):
       for dtype in _TEST_TYPES:
         params_np = self._buildParams(data, dtype)
         params = constant_op.constant(params_np)
-        indices_tf = constant_op.constant(indices)
+        indices_tf = constant_op.constant(indices, dtype=indices_dtype)
         gather_t = array_ops.batch_gather(params, indices_tf)
         expected_result = np.array([[3], [15]])
         np_val = self._buildParams(expected_result, dtype)
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 9f5149d5ac..4be9c532f4 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -2716,16 +2716,22 @@ def batch_gather(params, indices, name=None):
     params = ops.convert_to_tensor(params, name="params")
     indices_shape = shape(indices)
     params_shape = shape(params)
+
     ndims = indices.shape.ndims
     if ndims is None:
       raise ValueError("batch_gather does not allow indices with unknown "
                        "shape.")
     batch_indices = indices
-    accum_dim_value = 1
+    indices_dtype = indices.dtype.base_dtype
+    accum_dim_value = ones((), dtype=indices_dtype)
+    # Use correct type for offset index computation
+    casted_params_shape = gen_math_ops.cast(params_shape, indices_dtype)
     for dim in range(ndims-1, 0, -1):
-      dim_value = params_shape[dim-1]
-      accum_dim_value *= params_shape[dim]
-      dim_indices = gen_math_ops._range(0, dim_value, 1)
+      dim_value = casted_params_shape[dim-1]
+      accum_dim_value *= casted_params_shape[dim]
+      start = zeros((), dtype=indices_dtype)
+      step = ones((), dtype=indices_dtype)
+      dim_indices = gen_math_ops._range(start, dim_value, step)
       dim_indices *= accum_dim_value
       dim_shape = stack([1] * (dim - 1) + [dim_value] + [1] * (ndims - dim),
                         axis=0)
-- 
GitLab


From 9cd6cab4f85f1f35c6532da3fb68839294d44ee4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 03:20:59 -0700
Subject: [PATCH 400/570] Internal change.

PiperOrigin-RevId: 215711454
---
 .../cluster_resolver/python/training/tpu_cluster_resolver.py     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 1056894f18..f4a8e16c99 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -60,6 +60,7 @@ class TPUClusterResolver(ClusterResolver):
     if (self._tpu == compat.as_bytes('') or
         self._tpu == compat.as_bytes('local') or
         self._tpu.startswith(compat.as_bytes('/bns')) or
+        self._tpu.startswith(compat.as_bytes('localhost:')) or
         self._tpu.startswith(compat.as_bytes('grpc://'))):
       return False
     return True
-- 
GitLab


From 28f239fdfa0c94f715fccf0197ab6c3c8df27d28 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Thu, 4 Oct 2018 05:34:55 -0700
Subject: [PATCH 401/570] Implement DataFormatVecPermute for XLA.

Also clear "_kernel" attributes of nodes if they are set to "host".
This is not meaningful when processing the graph for XLA, and it
would prevent finding the registered XLA kernel.

PiperOrigin-RevId: 215722216
---
 tensorflow/compiler/tests/BUILD               | 13 +++
 tensorflow/compiler/tests/permute_test.py     | 80 +++++++++++++++
 tensorflow/compiler/tf2xla/kernels/BUILD      |  1 +
 .../compiler/tf2xla/kernels/permute_op.cc     | 98 +++++++++++++++++++
 tensorflow/compiler/tf2xla/xla_compiler.cc    | 11 +++
 5 files changed, 203 insertions(+)
 create mode 100644 tensorflow/compiler/tests/permute_test.py
 create mode 100644 tensorflow/compiler/tf2xla/kernels/permute_op.cc

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 822fedf121..ee36729fd1 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -1028,6 +1028,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "permute_test",
+    size = "small",
+    srcs = ["permute_test.py"],
+    deps = [
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:nn_ops",
+    ],
+)
+
 tf_xla_py_test(
     name = "xla_device_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/permute_test.py b/tensorflow/compiler/tests/permute_test.py
new file mode 100644
index 0000000000..dbb9274df4
--- /dev/null
+++ b/tensorflow/compiler/tests/permute_test.py
@@ -0,0 +1,80 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the DataFormatVecPermute operator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+class XlaPermuteOpTest(xla_test.XLATestCase):
+
+  def _runPermuteAndCompare(self, x, src_format, dst_format, expected):
+    with self.cached_session() as session:
+      with self.test_scope():
+        placeholder = array_ops.placeholder(dtypes.as_dtype(x.dtype), x.shape)
+        param = {placeholder: x}
+        output = nn_ops.data_format_vec_permute(
+            placeholder, src_format=src_format, dst_format=dst_format)
+      result = session.run(output, param)
+    self.assertAllEqual(result, expected)
+
+  def testNHWCToNCHW(self):
+    x = np.array([7, 4, 9, 3], dtype=np.int32)
+    self._runPermuteAndCompare(x, "NHWC", "NCHW", [7, 3, 4, 9])
+
+  def testNCHWToNHWC(self):
+    x = np.array([7, 4, 9, 3], dtype=np.int32)
+    self._runPermuteAndCompare(x, "NCHW", "NHWC", [7, 9, 3, 4])
+
+  def testNHWCToHWNC(self):
+    x = np.array([7, 4, 9, 3], dtype=np.int32)
+    self._runPermuteAndCompare(x, "NHWC", "HWNC", [4, 9, 7, 3])
+
+  def testHWNCToNHWC(self):
+    x = np.array([7, 4, 9, 3], dtype=np.int32)
+    self._runPermuteAndCompare(x, "HWNC", "NHWC", [9, 7, 4, 3])
+
+  def testNHWCToNCHW2D(self):
+    x = np.array([[7, 4], [9, 3], [4, 5], [5, 1]], dtype=np.int32)
+    self._runPermuteAndCompare(x, "NHWC", "NCHW",
+                               [[7, 4], [5, 1], [9, 3], [4, 5]])
+
+  def testNHWCToHWNC2D(self):
+    x = np.array([[7, 4], [9, 3], [4, 5], [5, 1]], dtype=np.int32)
+    self._runPermuteAndCompare(x, "NHWC", "HWNC",
+                               [[9, 3], [4, 5], [7, 4], [5, 1]])
+
+  def testHWNCToNHWC2D(self):
+    x = np.array([[7, 4], [9, 3], [4, 5], [5, 1]], dtype=np.int32)
+    self._runPermuteAndCompare(x, "HWNC", "NHWC",
+                               [[4, 5], [7, 4], [9, 3], [5, 1]])
+
+  def testNCHWToNHWC2D(self):
+    x = np.array([[7, 4], [9, 3], [4, 5], [5, 1]], dtype=np.int32)
+    self._runPermuteAndCompare(x, "NCHW", "NHWC",
+                               [[7, 4], [4, 5], [5, 1], [9, 3]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 3e823254d3..9a7130f253 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -62,6 +62,7 @@ tf_kernel_library(
         "one_hot_op.cc",
         "pack_op.cc",
         "pad_op.cc",
+        "permute_op.cc",
         "pooling_ops.cc",
         "qr_op.cc",
         "quantize_and_dequantize_op.cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/permute_op.cc b/tensorflow/compiler/tf2xla/kernels/permute_op.cc
new file mode 100644
index 0000000000..0764e5503d
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/permute_op.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+namespace {
+
+class DataFormatVecPermuteOp : public XlaOpKernel {
+ public:
+  explicit DataFormatVecPermuteOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("src_format", &src_format_));
+    OP_REQUIRES(
+        ctx, src_format_.size() == 4,
+        errors::InvalidArgument("Data format should have 4 characters"));
+    TensorFormat data_format;
+    OP_REQUIRES(ctx, FormatFromString(src_format_, &data_format),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dst_format", &dst_format_));
+    OP_REQUIRES(
+        ctx, dst_format_.size() == 4,
+        errors::InvalidArgument("Data format should have 4 characters"));
+    OP_REQUIRES(ctx, FormatFromString(dst_format_, &data_format),
+                errors::InvalidArgument("Invalid data format"));
+  }
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto builder = ctx->builder();
+    const TensorShape input_tensor_shape = ctx->InputShape(0);
+    int input_rank = input_tensor_shape.dims();
+    OP_REQUIRES(ctx, input_rank == 1 || input_rank == 2,
+                errors::InvalidArgument(
+                    "Input must be a vector or matrix, but got shape ",
+                    input_tensor_shape.DebugString()));
+    OP_REQUIRES(
+        ctx, input_tensor_shape.dim_size(0) == 4,
+        errors::InvalidArgument(
+            "First dimension of input must be of size 4, but got shape ",
+            input_tensor_shape.DebugString()));
+    if (input_rank == 2) {
+      OP_REQUIRES(
+          ctx, input_tensor_shape.dim_size(1) == 2,
+          errors::InvalidArgument(
+              "Second dimension of 2D input must be of size 2, but got shape ",
+              input_tensor_shape.DebugString()));
+    }
+    std::vector<int32> dst_indices(4, 0);
+    for (int i = 0; i < 4; ++i) {
+      for (int j = 0; j < 4; ++j) {
+        if (src_format_[i] == dst_format_[j]) {
+          dst_indices[i] = j;
+          break;
+        }
+      }
+    }
+    auto keys = xla::ConstantR1(builder, absl::Span<const int32>(dst_indices));
+    if (input_rank == 2) {
+      keys = xla::BroadcastInDim(
+          keys, xla::ShapeUtil::MakeShape(xla::S32, {4, 2}), {0});
+    }
+    auto sorted = xla::Sort(keys, ctx->Input(0), 0);
+    auto output = xla::GetTupleElement(sorted, 1);
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  string src_format_;
+  string dst_format_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(DataFormatVecPermuteOp);
+};
+
+// TODO(b/115384656): Support DT_INT64.
+REGISTER_XLA_OP(Name("DataFormatVecPermute").TypeConstraint("T", DT_INT32),
+                DataFormatVecPermuteOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index d5094e8ec5..b2c57e8880 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -194,6 +194,17 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
 
   std::unique_ptr<Graph> graph = GetGraph(fbody);
 
+  // Clear the "_kernel" attribute if it is set to "host". This is used to
+  // indicate that a computation should happen on the host instead of the
+  // accelerator, but doesn't make sense in XLA.
+  const char* const kKernelAttr = "_kernel";
+  for (Node* n : graph->nodes()) {
+    string value;
+    if (GetNodeAttrSimple(n->attrs(), kKernelAttr, &value) && value == "host") {
+      n->ClearAttr(kKernelAttr);
+    }
+  }
+
   // _Arg and _Retval nodes don't exist in the stored subgraph for the function;
   // they are added by the function body looked up.  Therefore, they don't have
   // core assignments here.
-- 
GitLab


From 2c9369c8d878c913b5dfcd3c27849bcd3d6af6c9 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 4 Oct 2018 06:00:02 -0700
Subject: [PATCH 402/570] [TF:XLA] Don't expand complex64 tensors during TF/XLA
 lowering, if possible.

PiperOrigin-RevId: 215724324
---
 tensorflow/compiler/tests/nullary_ops_test.py | 43 +++++++++++++------
 .../compiler/tf2xla/kernels/const_op.cc       | 12 ++++++
 2 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/tensorflow/compiler/tests/nullary_ops_test.py b/tensorflow/compiler/tests/nullary_ops_test.py
index f985c5d2d9..38cb2f83ef 100644
--- a/tensorflow/compiler/tests/nullary_ops_test.py
+++ b/tensorflow/compiler/tests/nullary_ops_test.py
@@ -43,18 +43,37 @@ class NullaryOpsTest(xla_test.XLATestCase):
       output.run()
 
   def testConstants(self):
-    constants = [
-        np.float32(42),
-        np.array([], dtype=np.float32),
-        np.array([1, 2], dtype=np.float32),
-        np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32),
-        np.array([[[1, 2], [3, 4], [5, 6]], [[10, 20], [30, 40], [50, 60]]],
-                 dtype=np.float32),
-        np.array([[[]], [[]]], dtype=np.float32),
-        np.array([[[[1]]]], dtype=np.float32),
-    ]
-    for c in constants:
-      self._testNullary(lambda c=c: constant_op.constant(c), expected=c)
+    for dtype in self.numeric_types:
+      constants = [
+          dtype(42),
+          np.array([], dtype=dtype),
+          np.array([1, 2], dtype=dtype),
+          np.array([7, 7, 7, 7, 7], dtype=dtype),
+          np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype),
+          np.array([[[1, 2], [3, 4], [5, 6]], [[10, 20], [30, 40], [50, 60]]],
+                   dtype=dtype),
+          np.array([[[]], [[]]], dtype=dtype),
+          np.array([[[[1]]]], dtype=dtype),
+      ]
+      for c in constants:
+        self._testNullary(lambda c=c: constant_op.constant(c), expected=c)
+
+  def testComplexConstants(self):
+    for dtype in self.complex_types:
+      constants = [
+          dtype(42 + 3j),
+          np.array([], dtype=dtype),
+          np.ones([50], dtype=dtype) * (3 + 4j),
+          np.array([1j, 2 + 1j], dtype=dtype),
+          np.array([[1, 2j, 7j], [4, 5, 6]], dtype=dtype),
+          np.array([[[1, 2], [3, 4 + 6j], [5, 6]],
+                    [[10 + 7j, 20], [30, 40], [50, 60]]],
+                   dtype=dtype),
+          np.array([[[]], [[]]], dtype=dtype),
+          np.array([[[[1 + 3j]]]], dtype=dtype),
+      ]
+      for c in constants:
+        self._testNullary(lambda c=c: constant_op.constant(c), expected=c)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc
index da8cf3fc6f..2628ef8e24 100644
--- a/tensorflow/compiler/tf2xla/kernels/const_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/types.pb.h"
 
 namespace tensorflow {
 namespace {
@@ -76,6 +77,17 @@ class ConstOp : public XlaOpKernel {
             return;
           }
           break;
+        case DT_COMPLEX64:
+          if (proto_.scomplex_val_size() == 2) {
+            ctx->SetOutput(
+                0,
+                xla::Broadcast(xla::ConstantR0<xla::complex64>(
+                                   b, xla::complex64(proto_.scomplex_val(0),
+                                                     proto_.scomplex_val(1))),
+                               shape.dim_sizes()));
+            return;
+          }
+          break;
         case DT_INT32:
           if (proto_.int_val_size() == 1) {
             ctx->SetOutput(
-- 
GitLab


From 82ea80b979768c7fe1daa4b50cf054e5a0968f31 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 06:09:42 -0700
Subject: [PATCH 403/570] Add option in tf.gradients() to return zero tensors
 for unconnected gradients.

tf.gradients currently returns [NONE] when the gradient of unconnected variables
is required. This backwards compatable change adds in the option to have zero
tensors returned that match the dimensions of the input tensor.

PiperOrigin-RevId: 215725488
---
 tensorflow/python/BUILD                       |  4 ++
 tensorflow/python/ops/gradients.py            |  1 +
 tensorflow/python/ops/gradients_impl.py       | 67 +++++++++++++++++--
 tensorflow/python/ops/gradients_test.py       | 34 ++++++++++
 .../tensorflow.-unconnected-gradients.pbtxt   | 12 ++++
 .../tools/api/golden/v1/tensorflow.pbtxt      |  6 +-
 .../tensorflow.-unconnected-gradients.pbtxt   | 12 ++++
 .../tools/api/golden/v2/tensorflow.pbtxt      |  6 +-
 8 files changed, 135 insertions(+), 7 deletions(-)
 create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.-unconnected-gradients.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.-unconnected-gradients.pbtxt

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index fe81254ef7..da3c56db92 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2152,6 +2152,7 @@ py_library(
         ":array_grad",
         ":array_ops",
         ":bitwise_ops",
+        ":check_ops",
         ":cond_v2_impl",
         ":control_flow_grad",
         ":control_flow_ops",
@@ -2172,8 +2173,11 @@ py_library(
         ":random_grad",
         ":resource_variable_ops",
         ":spectral_grad",
+        ":tensor_array_ops",
+        ":tensor_util",
         ":util",
         ":variable_scope",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:tape",
diff --git a/tensorflow/python/ops/gradients.py b/tensorflow/python/ops/gradients.py
index 1dc666e78b..794465b10e 100644
--- a/tensorflow/python/ops/gradients.py
+++ b/tensorflow/python/ops/gradients.py
@@ -25,4 +25,5 @@ from tensorflow.python.ops.custom_gradient import custom_gradient
 from tensorflow.python.ops.gradients_impl import AggregationMethod
 from tensorflow.python.ops.gradients_impl import gradients
 from tensorflow.python.ops.gradients_impl import hessians
+from tensorflow.python.ops.gradients_impl import UnconnectedGradients
 # pylint: enable=unused-import
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 056015d6b6..aac95037dc 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import collections
 import contextlib
+import enum  # pylint: disable=g-bad-import-order
 import sys
 import warnings
 
@@ -537,6 +538,26 @@ def _Consumers(t, func_graphs):
   return consumers
 
 
+@tf_export("UnconnectedGradients")
+class UnconnectedGradients(enum.Enum):
+  """Controls how gradient computation behaves when y does not depend on x.
+
+  The gradient of y with respect to x can be zero in two different ways: there
+  could be no differentiable path in the graph connecting x to y (and so we can
+  statically prove that the gradient is zero) or it could be that runtime values
+  of tensors in a particular execution lead to a gradient of zero (say, if a
+  relu unit happens to not be activated). To allow you to distinguish between
+  these two cases you can choose what value gets returned for the gradient when
+  there is no path in the graph from x to y:
+
+  * `NONE`: Indicates that [None] will be returned if there is no path from x
+    to y
+  * `ZERO`: Indicates that a zero tensor will be returned in the shape of x.
+  """
+  NONE = "none"
+  ZERO = "zero"
+
+
 @tf_export("gradients")
 def gradients(ys,
               xs,
@@ -545,7 +566,8 @@ def gradients(ys,
               colocate_gradients_with_ops=False,
               gate_gradients=False,
               aggregation_method=None,
-              stop_gradients=None):
+              stop_gradients=None,
+              unconnected_gradients=UnconnectedGradients.NONE):
   """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`.
 
   `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
@@ -596,6 +618,23 @@ def gradients(ys,
   All integer tensors are considered constant with respect to all `xs`, as if
   they were included in `stop_gradients`.
 
+  `unconnected_gradients` determines the value returned for each x in xs if it
+  is unconnected in the graph to ys. By default this is None to safeguard
+  against errors. MAthematically these gradients are zero which can be requested
+  using the `'zero'` option. `tf.UnconnectedGradients` provides the
+  following options and behaviors:
+
+  ```python
+  a = tf.ones([1, 2])
+  b = tf.ones([3, 1])
+  g1 = tf.gradients([b], [a], unnconnected_gradients='none')
+  sess.run(g1)  # [None]
+
+  g2 = tf.gradients([b], [a], unconnected_gradients='zero')
+  sess.run(g2)  # [array([[0., 0.]], dtype=float32)]
+  ```
+
+
   Args:
     ys: A `Tensor` or list of tensors to be differentiated.
     xs: A `Tensor` or list of tensors to be used for differentiation.
@@ -611,6 +650,10 @@ def gradients(ys,
       Accepted values are constants defined in the class `AggregationMethod`.
     stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate
       through.
+    unconnected_gradients: Optional. Specifies the gradient value returned when
+      the given input tensors are unconnected. Accepted values are constants
+      defined in the class `tf.UnconnectedGradients` and the default value is
+      `none`.
 
   Returns:
     A list of `sum(dy/dx)` for each x in `xs`.
@@ -627,7 +670,8 @@ def gradients(ys,
   # mutating new ops.
   with ops.get_default_graph()._mutation_lock():  # pylint: disable=protected-access
     return _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
-                            gate_gradients, aggregation_method, stop_gradients)
+                            gate_gradients, aggregation_method, stop_gradients,
+                            unconnected_gradients)
 
 
 def _GradientsHelper(ys,
@@ -638,6 +682,7 @@ def _GradientsHelper(ys,
                      gate_gradients=False,
                      aggregation_method=None,
                      stop_gradients=None,
+                     unconnected_gradients=UnconnectedGradients.NONE,
                      src_graph=None):
   """Implementation of gradients()."""
   if context.executing_eagerly():
@@ -645,6 +690,11 @@ def _GradientsHelper(ys,
                        "is enabled. Use tf.GradientTape instead.")
   if src_graph is None:
     src_graph = ops.get_default_graph()
+  try:
+    unconnected_gradients = UnconnectedGradients(unconnected_gradients)
+  except ValueError:
+    raise ValueError(
+        "Unknown value for unconnected_gradients: %r" % unconnected_gradients)
 
   # If src_graph is a _FuncGraph (i.e. a function body), gather it and all
   # ancestor graphs. This is necessary for correctly handling captured values.
@@ -856,7 +906,7 @@ def _GradientsHelper(ys,
 
   if loop_state:
     loop_state.PostProcessing()
-  return [_GetGrad(grads, x) for x in xs]
+  return [_GetGrad(grads, x, unconnected_gradients) for x in xs]
 
 
 def _HasAnyNotNoneGrads(grads, op):
@@ -924,12 +974,19 @@ def _SetGrad(grads, t, grad):
     op_grads[t.value_index] = grad
 
 
-def _GetGrad(grads, t):
+def _GetGrad(grads, t, unconnected_gradients):
   """Gets gradient for tensor "t"."""
   op = t.op
   op_grads = grads.get(op)
   if not op_grads:
-    return None
+    if unconnected_gradients == UnconnectedGradients.ZERO:
+      return array_ops.zeros_like(t)
+    elif unconnected_gradients == UnconnectedGradients.NONE:
+      return None
+    else:
+      raise ValueError(
+          "Unknown value for unconnected_gradients: %r" % unconnected_gradients)
+
   t_grad = op_grads[t.value_index]
   assert not isinstance(
       t_grad, list), ("gradients list should have been aggregated by now.")
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index 3c9b7a01c7..c93e2493ee 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -350,6 +350,40 @@ class GradientsTest(test_util.TensorFlowTestCase):
       for a, b in zip(npgrad1, npgrad2):
         np.testing.assert_allclose(a, b)
 
+  def testUnconnectedGradientsNoneUnconnectedGradients(self):
+    with ops.Graph().as_default():
+      x = constant(1.0, shape=[2, 2])
+      y = constant(3.0, shape=[3, 1])
+      grad = gradients.gradients(
+          [y], [x], unconnected_gradients="none")
+    self.assertIsNone(grad[0])
+
+  def testUnconnectedGradientsZerosUnconnectedGradients(self):
+    with ops.Graph().as_default():
+      x = constant(1.0, shape=[2, 2])
+      y = constant(3.0, shape=[3, 1])
+      grads = gradients.gradients(
+          [y], [x], unconnected_gradients="zero")
+      with self.cached_session() as sess:
+        self.assertAllEqual([[0.0, 0.0], [0.0, 0.0]], sess.run(grads)[0])
+
+  def testUnconnectedGradientsZeroConnectedGradients(self):
+    with ops.Graph().as_default():
+      x = constant(1.0)
+      y = x * 3.0
+      grad = gradients.gradients(
+          [y], [x], unconnected_gradients="zero")
+      with self.cached_session() as sess:
+        self.assertEquals(3.0, sess.run(grad)[0])
+
+  def testUnknownUnconnectedGradientsValueGiven(self):
+    with ops.Graph().as_default():
+      x = constant(1.0)
+      y = constant(1.0)
+      with self.assertRaisesRegexp(
+          ValueError, "Unknown value for unconnected_gradients: 'nonsense'"):
+        gradients.gradients([y], [x], unconnected_gradients="nonsense")
+
 
 class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-unconnected-gradients.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-unconnected-gradients.pbtxt
new file mode 100644
index 0000000000..c5eb959430
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-unconnected-gradients.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.UnconnectedGradients"
+tf_class {
+  is_instance: "<enum \'UnconnectedGradients\'>"
+  member {
+    name: "NONE"
+    mtype: "<enum \'UnconnectedGradients\'>"
+  }
+  member {
+    name: "ZERO"
+    mtype: "<enum \'UnconnectedGradients\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index a268529c1f..c1cc7322f0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -248,6 +248,10 @@ tf_module {
     name: "TextLineReader"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "UnconnectedGradients"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
   member {
     name: "VERSION"
     mtype: "<type \'str\'>"
@@ -1234,7 +1238,7 @@ tf_module {
   }
   member_method {
     name: "gradients"
-    argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\', \'None\'], "
+    argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\', \'None\', \'UnconnectedGradients.NONE\'], "
   }
   member_method {
     name: "greater"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-unconnected-gradients.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-unconnected-gradients.pbtxt
new file mode 100644
index 0000000000..c5eb959430
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-unconnected-gradients.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.UnconnectedGradients"
+tf_class {
+  is_instance: "<enum \'UnconnectedGradients\'>"
+  member {
+    name: "NONE"
+    mtype: "<enum \'UnconnectedGradients\'>"
+  }
+  member {
+    name: "ZERO"
+    mtype: "<enum \'UnconnectedGradients\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 5b3ea75bce..571abc3b19 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -220,6 +220,10 @@ tf_module {
     name: "TensorShape"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "UnconnectedGradients"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
   member {
     name: "VERSION"
     mtype: "<type \'str\'>"
@@ -1134,7 +1138,7 @@ tf_module {
   }
   member_method {
     name: "gradients"
-    argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\', \'None\'], "
+    argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\', \'None\', \'UnconnectedGradients.NONE\'], "
   }
   member_method {
     name: "greater"
-- 
GitLab


From 7b56d4ff7679ed59e3ea799054c5dcefd0600ab0 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 4 Oct 2018 08:08:22 -0700
Subject: [PATCH 404/570] [TF] Fail fast if there is no CPU kernel during
 constant tensor evaluation. Avoids LOG(ERROR) spam when the Executor is
 unable to find a CPU kernel.

PiperOrigin-RevId: 215738481
---
 .../core/common_runtime/eval_const_tensor.cc   | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tensorflow/core/common_runtime/eval_const_tensor.cc b/tensorflow/core/common_runtime/eval_const_tensor.cc
index c1542f1f57..87749da7af 100644
--- a/tensorflow/core/common_runtime/eval_const_tensor.cc
+++ b/tensorflow/core/common_runtime/eval_const_tensor.cc
@@ -113,6 +113,13 @@ Status TryToInferTensorOutputFromInputShapes(const Edge& edge,
   return Status::OK();
 }
 
+// Returns true if 'node' has a registered CPU kernel.
+bool HasCpuKernel(const Node& node) {
+  return FindKernelDef(DeviceType(DEVICE_CPU), node.def(), /*def=*/nullptr,
+                       /*kernel_class_name=*/nullptr)
+      .ok();
+}
+
 // Extracts the subgraph ending at 'target_node' that is statically computable
 // and inserts into 'out_graph'. If statically computable, 'is_constant_graph'
 // will be set to true.
@@ -136,6 +143,12 @@ Status ExtractConstantSubgraph(
     return Status::OK();
   }
 
+  // Since constant-folding runs on the CPU, do not attempt to constant-fold
+  // operators that have no CPU kernel.
+  if (!HasCpuKernel(target_node)) {
+    return Status::OK();
+  }
+
   // TODO(skyewm): should more of the filtering applied in input nodes below be
   // applied to target_node here?
 
@@ -201,6 +214,11 @@ Status ExtractConstantSubgraph(
       return Status::OK();
     }
 
+    if (!HasCpuKernel(*current_node)) {
+      *is_constant_graph = false;
+      return Status::OK();
+    }
+
     // If there is nothing more to recurse down, see if
     // the generator node is a constant.
     if (current_node->num_inputs() == 0) {
-- 
GitLab


From dcd7dd2d2e1ed7d8c26dd22dbbd2bac269c42e1e Mon Sep 17 00:00:00 2001
From: Alan Chiao <alanchiao@google.com>
Date: Thu, 4 Oct 2018 08:30:22 -0700
Subject: [PATCH 405/570] Sparse output fully connected custom op.

PiperOrigin-RevId: 215741296
---
 tensorflow/contrib/lite/kernels/BUILD         |  18 ++
 .../kernels/sparse_output_fully_connected.cc  | 235 ++++++++++++++++++
 .../sparse_output_fully_connected_test.cc     | 158 ++++++++++++
 3 files changed, 411 insertions(+)
 create mode 100644 tensorflow/contrib/lite/kernels/sparse_output_fully_connected.cc
 create mode 100644 tensorflow/contrib/lite/kernels/sparse_output_fully_connected_test.cc

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index daaf6714cc..95e387814d 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -210,6 +210,7 @@ cc_library(
         "slice.cc",
         "space_to_batch_nd.cc",
         "space_to_depth.cc",
+        "sparse_output_fully_connected.cc",
         "sparse_to_dense.cc",
         "split.cc",
         "squeeze.cc",
@@ -333,6 +334,23 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "sparse_output_fully_connected_test",
+    size = "small",
+    srcs = ["sparse_output_fully_connected_test.cc"],
+    tags = [
+        "no_oss",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
 tf_cc_test(
     name = "activations_test",
     size = "small",
diff --git a/tensorflow/contrib/lite/kernels/sparse_output_fully_connected.cc b/tensorflow/contrib/lite/kernels/sparse_output_fully_connected.cc
new file mode 100644
index 0000000000..843ed0768c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/sparse_output_fully_connected.cc
@@ -0,0 +1,235 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// SparseOutputFullyConnected is a fully connected layer that uses a single
+// row in the weights and bias via a lookup.
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+namespace sparse_output_fully_connected {
+
+// Input tensors of size {n_batch, n_input}
+constexpr int kInputTensor = 0;
+// Auxiliary input tensor of size { 1 }
+constexpr int kInputLookupTensor = 1;
+
+// Weights tensor of size { n_embeddings , n_input }
+constexpr int kWeightsTensor = 2;
+// Bias tensor of size { n_embeddings }
+constexpr int kBiasTensor = 3;
+
+// Output tensor.
+constexpr int kOutputTensor = 0;
+
+// Temporary tensors.
+enum TemporaryTensor {
+  kInputQuantized = 0,
+  kScalingFactors = 1,
+  kNumTemporaryTensors = 2
+};
+
+// Struct to hold op data.
+struct OpData {
+  int scratch_tensor_index;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  context->AddTensors(context, /*tensors_to_add=*/kNumTemporaryTensors,
+                      &data->scratch_tensor_index);
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2);
+  const int n_batch = SizeOfDimension(input, 0);
+  const int n_input = SizeOfDimension(input, 1);
+
+  const TfLiteTensor* lookup = GetInput(context, node, kInputLookupTensor);
+  TF_LITE_ENSURE_EQ(context, lookup->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(lookup), 1);
+  // Only support single lookup.
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(lookup, 0), 1);
+
+  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(weights), 2);
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(weights, 1), n_input);
+
+  const TfLiteTensor* bias = GetInput(context, node, kBiasTensor);
+  TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(weights, 0));
+
+  const bool is_hybrid_op =
+      (weights->type == kTfLiteUInt8 && input->type == kTfLiteFloat32);
+
+  if (is_hybrid_op) {
+    TfLiteIntArrayFree(node->temporaries);
+    node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors);
+
+    // Allocate temporary tensors to store quantized values of input.
+    node->temporaries->data[kInputQuantized] = op_data->scratch_tensor_index;
+    TfLiteTensor* input_quantized =
+        GetTemporary(context, node, /*index=*/kInputQuantized);
+    input_quantized->type = kTfLiteUInt8;
+    input_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
+      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
+                                                       input_quantized_size));
+    }
+
+    // Tell interpreter to allocate temporary tensors to store scaling factors.
+    node->temporaries->data[kScalingFactors] =
+        op_data->scratch_tensor_index + kScalingFactors;
+    TfLiteTensor* scaling_factors =
+        GetTemporary(context, node, /*index=*/kScalingFactors);
+    scaling_factors->type = kTfLiteFloat32;
+    scaling_factors->allocation_type = kTfLiteArenaRw;
+    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+    scaling_factors_size->data[0] = n_batch;
+    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
+                                                       scaling_factors_size));
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalFloat(const TfLiteTensor* input, const TfLiteTensor* lookup,
+                       const TfLiteTensor* weights, const TfLiteTensor* bias,
+                       TfLiteTensor* output) {
+  const int n_batch = SizeOfDimension(input, 0);
+  const int n_input = SizeOfDimension(input, 1);
+
+  const float* input_ptr_batch = input->data.f;
+
+  // Initialize pointer to right row according to lookup value.
+  int32 lookup_index = lookup->data.i32[0];
+  const float* weights_ptr = weights->data.f + lookup_index * n_input;
+
+  // Initialize output to bias.
+  if (bias) {
+    float* bias_ptr = bias->data.f + lookup_index;
+    tensor_utils::VectorBatchVectorAssign(bias_ptr, 1, n_batch, output->data.f);
+  } else {
+    tensor_utils::ZeroVector(output->data.f, n_batch * 1);
+  }
+
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      weights_ptr, /*m_rows=*/1, n_input, input_ptr_batch, n_batch,
+      output->data.f, /*result_stride=*/1);
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalHybrid(const TfLiteTensor* input, const TfLiteTensor* lookup,
+                        const TfLiteTensor* weights, const TfLiteTensor* bias,
+                        TfLiteTensor* scaling_factors,
+                        TfLiteTensor* input_quantized, TfLiteTensor* output) {
+  const int n_batch = SizeOfDimension(input, 0);
+  const int n_input = SizeOfDimension(input, 1);
+
+  const float* input_ptr_batch = input->data.f;
+  // Initialize the pointer to storage for quantized values and
+  // scaling factors.
+  int8_t* quantized_input_ptr_batch =
+      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+  float* scaling_factors_ptr = scaling_factors->data.f;
+
+  // Initialize pointer to right row according to lookup value.
+  int32 lookup_index = lookup->data.i32[0];
+  int8_t* weights_ptr =
+      reinterpret_cast<int8_t*>(weights->data.uint8) + lookup_index * n_input;
+
+  // Initialize output to bias.
+  if (bias) {
+    float* bias_ptr = bias->data.f + lookup_index;
+    tensor_utils::VectorBatchVectorAssign(bias_ptr, 1, n_batch, output->data.f);
+  } else {
+    tensor_utils::ZeroVector(output->data.f, n_batch * 1);
+  }
+
+  if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
+    // Quantize input from float to int8.
+    float unused_min, unused_max;
+    for (int b = 0; b < n_batch; ++b) {
+      const int offset = b * n_input;
+      tensor_utils::SymmetricQuantizeFloats(
+          input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset,
+          &unused_min, &unused_max, &scaling_factors_ptr[b]);
+      scaling_factors_ptr[b] *= weights->params.scale;
+    }
+
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        weights_ptr, /*m_rows=*/1, n_input, quantized_input_ptr_batch,
+        scaling_factors_ptr, n_batch, output->data.f, /*result_stride=*/1);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* lookup = GetInput(context, node, kInputLookupTensor);
+  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* bias = GetInput(context, node, kBiasTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (weights->type) {
+    case kTfLiteFloat32: {
+      return EvalFloat(input, lookup, weights, bias, output);
+    }
+    case kTfLiteUInt8: {
+      TfLiteTensor* input_quantized =
+          GetTemporary(context, node, /*index=*/kInputQuantized);
+      TfLiteTensor* scaling_factors =
+          GetTemporary(context, node, /*index=*/kScalingFactors);
+      return EvalHybrid(input, lookup, weights, bias, scaling_factors,
+                        input_quantized, output);
+    }
+    default:
+      context->ReportError(context, "Type %d is not currently supported.",
+                           weights->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace sparse_output_fully_connected
+
+TfLiteRegistration* Register_SPARSE_OUTPUT_FULLY_CONNECTED() {
+  static TfLiteRegistration r = {sparse_output_fully_connected::Init,
+                                 sparse_output_fully_connected::Free,
+                                 sparse_output_fully_connected::Prepare,
+                                 sparse_output_fully_connected::Eval};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/sparse_output_fully_connected_test.cc b/tensorflow/contrib/lite/kernels/sparse_output_fully_connected_test.cc
new file mode 100644
index 0000000000..365986a5c1
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/sparse_output_fully_connected_test.cc
@@ -0,0 +1,158 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite sparse output fully connected op.
+#include <iomanip>
+#include <random>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+
+namespace tflite {
+
+namespace ops {
+namespace custom {
+
+TfLiteRegistration* Register_SPARSE_OUTPUT_FULLY_CONNECTED();
+
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseSparseOutputFullyConnectedOpModel : public SingleOpModel {
+ public:
+  BaseSparseOutputFullyConnectedOpModel(const TensorData& input,
+                                        const TensorData& weights,
+                                        const TensorData& output = {
+                                            TensorType_FLOAT32}) {
+    input_ = AddInput(input);
+    lookup_ = AddInput({TensorType_INT32, {1}});
+    weights_ = AddInput(weights);
+    int bias_size = GetShape(weights_)[0];
+    bias_ = AddInput({TensorType_FLOAT32, {bias_size}});
+    output_ = AddOutput(output);
+
+    // Create empty (required) options map.
+    flexbuffers::Builder fbb;
+    fbb.Map([&]() {});
+    fbb.Finish();
+
+    SetCustomOp("SPARSE_OUTPUT_FULLY_CONNECTED", fbb.GetBuffer(),
+                Register_SPARSE_OUTPUT_FULLY_CONNECTED);
+    BuildInterpreter({GetShape(input_), GetShape(lookup_), GetShape(weights_),
+                      GetShape(bias_)});
+  }
+
+  void SetInput(const std::vector<float>& data) {
+    PopulateTensor(input_, data);
+  }
+
+  void SetLookup(const std::vector<int32>& f) { PopulateTensor(lookup_, f); }
+
+  void SetBias(const std::vector<float>& f) { PopulateTensor(bias_, f); }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+  int input_;
+  int lookup_;
+  int weights_;
+  int bias_;
+  int output_;
+};
+
+class FloatSparseOutputFullyConnectedOpModel
+    : public BaseSparseOutputFullyConnectedOpModel {
+ public:
+  using BaseSparseOutputFullyConnectedOpModel::
+      BaseSparseOutputFullyConnectedOpModel;
+
+  void SetWeights(const std::vector<float>& f) { PopulateTensor(weights_, f); }
+};
+
+class HybridSparseOutputFullyConnectedOpModel
+    : public BaseSparseOutputFullyConnectedOpModel {
+ public:
+  using BaseSparseOutputFullyConnectedOpModel::
+      BaseSparseOutputFullyConnectedOpModel;
+
+  void SetWeights(const std::vector<float>& f) {
+    SymmetricQuantizeAndPopulate(weights_, f);
+  }
+};
+
+TEST(SparseOutputFullyConnectedOpTest, SimpleTestFloat) {
+  FloatSparseOutputFullyConnectedOpModel m({TensorType_FLOAT32, {1, 5}},
+                                           {TensorType_FLOAT32, {3, 5}},
+                                           {TensorType_FLOAT32, {}});
+
+  m.SetInput({-1.0, 0.0, 1.0, 2.0, 3.0});
+
+  m.SetLookup({2});
+
+  m.SetWeights({
+      -1.0, 0.0, 1.0, 2.0, 3.0,  //
+      0.0, 1.0, 2.0, 3.0, 4.0,   //
+      1.0, 2.0, 3.0, 4.0, 5.0,   //
+  });
+
+  m.SetBias({1.0, 2.0, 3.0});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({28}));
+}
+
+TEST(SparseOutputFullyConnectedOpTest, SimpleTestHybrid) {
+  HybridSparseOutputFullyConnectedOpModel m({TensorType_FLOAT32, {1, 5}},
+                                            {TensorType_UINT8, {3, 5}},
+                                            {TensorType_FLOAT32, {}});
+
+  m.SetInput({-1.0, 0.0, 1.0, 2.0, 3.0});
+
+  m.SetLookup({2});
+
+  m.SetWeights({
+      -1.0, 0.0, 1.0, 2.0, 3.0,  //
+      0.0, 1.0, 2.0, 3.0, 4.0,   //
+      1.0, 2.0, 3.0, 4.0, 5.0,   //
+  });
+
+  m.SetBias({1.0, 2.0, 3.0});
+
+  m.Invoke();
+
+  // We get 28.0552 instead of 28.
+  //
+  // Input -> -42, 0, 42, 85, 127 with scale factor of 127/3.
+  // Looked up weights ->  25, 51, 76, 102, 127 with scale factor of 127/5.
+  //
+  // (-42 * 25 + 0 * 51 + 42 * 76 + 85 * 102 + 127 * 127) * (3*5/127^2) + 3.0
+  // gives us the expected result.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({28}, 0.0553)));
+}
+
+}  // namespace
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
-- 
GitLab


From 80c9eec9b2475630f83a596f77a906c8075f8e6c Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Thu, 4 Oct 2018 08:56:45 -0700
Subject: [PATCH 406/570] Remove CHECKs from HloInstruction constructors. Move
 these checks to RET_CHECKs in the HloVerifier. Added a new visitor class
 InstructionVerifier inside of hlo_verifier.cc for handling these random
 non-result-shape verifications.

PiperOrigin-RevId: 215745043
---
 .../compiler/xla/service/hlo_instructions.cc  |  12 -
 .../compiler/xla/service/hlo_instructions.h   |   1 -
 .../compiler/xla/service/hlo_verifier.cc      | 456 ++++++++++--------
 .../compiler/xla/service/hlo_verifier.h       |  11 -
 4 files changed, 248 insertions(+), 232 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 68d0979f5c..152d8eacdb 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -643,14 +643,6 @@ HloTransposeInstruction::HloTransposeInstruction(
     absl::Span<const int64> dimensions)
     : HloInstruction(HloOpcode::kTranspose, shape),
       dimensions_(dimensions.begin(), dimensions.end()) {
-  CHECK_EQ(shape.dimensions().size(), dimensions.size());
-  CHECK_EQ(shape.dimensions().size(), operand->shape().dimensions().size());
-  CHECK(std::equal(operand->shape().dimensions().begin(),
-                   operand->shape().dimensions().end(),
-                   Permute(dimensions, shape.dimensions()).begin()))
-      << "shape: " << ShapeUtil::HumanString(shape)
-      << ", operand->shape(): " << ShapeUtil::HumanString(shape)
-      << ", dimensions: {" << StrJoin(dimensions, ", ") << "}";
   AppendOperand(operand);
 }
 
@@ -1491,7 +1483,6 @@ HloParameterInstruction::CloneWithNewOperandsImpl(
 HloGetTupleElementInstruction::HloGetTupleElementInstruction(
     const Shape& shape, HloInstruction* operand, int64 index)
     : HloInstruction(HloOpcode::kGetTupleElement, shape), tuple_index_(index) {
-  CHECK(ShapeUtil::IsTuple(operand->shape()));
   AppendOperand(operand);
 }
 
@@ -1613,9 +1604,6 @@ HloOutfeedInstruction::HloOutfeedInstruction(const Shape& outfeed_shape,
     : HloInstruction(HloOpcode::kOutfeed, ShapeUtil::MakeTokenShape()),
       outfeed_shape_(outfeed_shape),
       outfeed_config_(outfeed_config) {
-  CHECK(ShapeUtil::Compatible(operand->shape(), outfeed_shape))
-      << "Outfeed shape " << outfeed_shape
-      << " must be compatible with operand shape " << operand->shape();
   AppendOperand(operand);
   AppendOperand(token_operand);
 }
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index ab168800f6..e169604072 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -896,7 +896,6 @@ class HloOutfeedInstruction : public HloInstruction {
                                  absl::string_view outfeed_config);
   // Returns the shape for the Outfeed instruction.
   const Shape& outfeed_shape() const {
-    TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(outfeed_shape_));
     return outfeed_shape_;
   }
   // Returns the config for the Outfeed instruction.
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index a7727824fe..b5498bb936 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -763,7 +763,136 @@ Status VerifyHloStructure(HloModule* module) {
   return Status::OK();
 }
 
-Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
+namespace {
+
+// Returns true if the given Shape has a TOKEN shape as any subshape.
+bool ShapeContainsToken(const Shape& shape) {
+  bool contains_token = false;
+  ShapeUtil::ForEachSubshape(
+      shape, [&contains_token](const Shape& subshape, const ShapeIndex&) {
+        if (ShapeUtil::IsToken(subshape)) {
+          contains_token = true;
+        }
+      });
+  return contains_token;
+}
+
+// Verifies that all types entering and exiting the entry computation are
+// legal.
+Status VerifyEntryAndExitShapes(const HloModule& module) {
+  // Tokens cannot be passed as entry parameters.
+  // TODO(b/80000000): Remove this constraint.
+  for (int i = 0; i < module.entry_computation()->num_parameters(); ++i) {
+    HloInstruction* param =
+        module.entry_computation()->parameter_instruction(i);
+    if (ShapeContainsToken(param->shape())) {
+      return InternalError(
+          "Entry parameter %d is or contains a token shape: %s", i,
+          ShapeUtil::HumanString(param->shape()));
+    }
+  }
+  return Status::OK();
+}
+
+// Checks if the given two instructions share the same channel id.
+Status CheckSameChannel(const HloInstruction* instr1,
+                        const HloInstruction* instr2) {
+  if (instr1->channel_id() != instr2->channel_id()) {
+    return InternalError(
+        "Expected to have the same channel id, actual channel ids are: %s "
+        "(%d), %s (%d)",
+        instr1->ToString(), instr1->channel_id(), instr2->ToString(),
+        instr2->channel_id());
+  }
+  return Status::OK();
+}
+
+// Checks if the given two instructions have the same is_host_transfer
+// attribute value. Intsructions must be send/recv instructions or their
+// 'done' variant.
+Status CheckSameIsHostTransfer(const HloInstruction* instr1,
+                               const HloInstruction* instr2) {
+  const HloSendRecvInstruction* send_recv1 =
+      DynCast<const HloSendRecvInstruction>(instr1);
+  const HloSendRecvInstruction* send_recv2 =
+      DynCast<const HloSendRecvInstruction>(instr2);
+  TF_RET_CHECK(send_recv1 != nullptr);
+  TF_RET_CHECK(send_recv2 != nullptr);
+  if (send_recv1->is_host_transfer() != send_recv2->is_host_transfer()) {
+    return InternalError(
+        "Expected instructions to have the same is-host-transfer property: "
+        "%s, "
+        "%s ",
+        instr1->ToString(), instr2->ToString());
+  }
+  return Status::OK();
+}
+
+// Checks various invariants of send and recv instructions.
+Status VerifySendsAndRecvs(const HloModule& module) {
+  absl::flat_hash_map<int64, const HloInstruction*> host_channels;
+  // Host send/recv instructions must have their own unique channel.
+  auto check_unique_host_channel = [&](const HloInstruction* instruction) {
+    const HloSendRecvInstruction* sendrecv =
+        DynCast<const HloSendRecvInstruction>(instruction);
+    if (sendrecv->is_host_transfer()) {
+      auto it_inserted =
+          host_channels.insert({sendrecv->channel_id(), sendrecv});
+      if (!it_inserted.second) {
+        return FailedPrecondition(
+            "Channel %d is used for multiple host send/recv instructions: "
+            "%s "
+            "and "
+            "%s",
+            sendrecv->channel_id(), sendrecv->ToString(),
+            it_inserted.first->second->ToString());
+      }
+    }
+
+    return Status::OK();
+  };
+
+  // Send/Recv instruction must have a single user: the corresponding
+  // SendDone/RecvDone. with matching channel.
+  for (const HloComputation* computation : module.computations()) {
+    for (const HloInstruction* instruction : computation->instructions()) {
+      switch (instruction->opcode()) {
+        case HloOpcode::kSend: {
+          TF_RETURN_IF_ERROR(check_unique_host_channel(instruction));
+          TF_RET_CHECK(instruction->users().size() == 1);
+          const HloInstruction* send_done = instruction->users().front();
+          TF_RET_CHECK(send_done->opcode() == HloOpcode::kSendDone);
+          TF_RETURN_IF_ERROR(CheckSameChannel(instruction, send_done));
+          TF_RETURN_IF_ERROR(CheckSameIsHostTransfer(instruction, send_done));
+          break;
+        }
+        case HloOpcode::kRecv: {
+          TF_RETURN_IF_ERROR(check_unique_host_channel(instruction));
+          TF_RET_CHECK(instruction->users().size() == 1);
+          const HloInstruction* recv_done = instruction->users().front();
+          TF_RET_CHECK(recv_done->opcode() == HloOpcode::kRecvDone);
+          TF_RETURN_IF_ERROR(CheckSameChannel(instruction, recv_done));
+          TF_RETURN_IF_ERROR(CheckSameIsHostTransfer(instruction, recv_done));
+          break;
+        }
+        case HloOpcode::kSendDone:
+          TF_RET_CHECK(instruction->operands().size() == 1);
+          TF_RET_CHECK(instruction->operand(0)->opcode() == HloOpcode::kSend);
+          break;
+        case HloOpcode::kRecvDone:
+          TF_RET_CHECK(instruction->operands().size() == 1);
+          TF_RET_CHECK(instruction->operand(0)->opcode() == HloOpcode::kRecv);
+          break;
+        default:
+          break;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// CHECKs various invariants of a fusion instruction.
+Status CheckFusionInstruction(HloInstruction* fusion) {
   // The parent fusion instruction of the fusion computation must be 'fusion'.
   HloComputation* fused_computation = fusion->fused_instructions_computation();
   if (fusion != fused_computation->FusionInstruction()) {
@@ -866,50 +995,32 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
     }
   }
 
+  TF_RET_CHECK(fusion->called_computations() ==
+               absl::Span<HloComputation* const>(
+                   {fusion->fused_instructions_computation()}))
+      << "Fusion HLO calls computations other than the "
+         "fused_instructions_computation: "
+      << fusion->ToString() << " fusion->fused_instructions_computation(): "
+      << fusion->fused_instructions_computation()->ToString()
+      << " fusion->called_computations(): "
+      << ComputationsToString(fusion->called_computations());
+
+  for (const auto& fused : fusion->fused_instructions()) {
+    TF_RET_CHECK(fused->parent() == fusion->fused_instructions_computation())
+        << "Fused HLO was missing a parent: " << fused->ToString()
+        << " parent: " << fused->parent()
+        << " computation: " << fusion->parent();
+  }
+
   // TODO(b/65423525): We'd like to check that all operands are distinct.
   // This is currently disabled due to the invariant being violated by
   // multi-output fusion.
   return Status::OK();
 }
 
-Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) {
-  auto* while_cond = instruction->while_condition();
-  auto* while_body = instruction->while_body();
-  if (while_cond->num_parameters() != 1) {
-    return FailedPrecondition(
-        "While condition must have exactly 1 parameter; had %d : %s",
-        while_cond->num_parameters(), while_cond->ToString());
-  }
-  if (while_body->num_parameters() != 1) {
-    return FailedPrecondition(
-        "While body must have exactly 1 parameter; had %d : %s",
-        while_body->num_parameters(), while_body->ToString());
-  }
-  if (instruction->operand_count() != 1) {
-    return FailedPrecondition(
-        "While loop must have exactly one operand; had %d : %s",
-        instruction->operand_count(), instruction->ToString());
-  }
-  return Status::OK();
-}
-
-Status HloVerifier::CheckConditionalInstruction(HloInstruction* instruction) {
-  if (instruction->true_computation()->num_parameters() != 1) {
-    return FailedPrecondition(
-        "True computation %s of %s must have 1 parameter insted of %d",
-        instruction->true_computation()->name(), instruction->ToString(),
-        instruction->true_computation()->num_parameters());
-  }
-  if (instruction->false_computation()->num_parameters() != 1) {
-    return FailedPrecondition(
-        "False computation %s of %s must have 1 parameter insted of %d",
-        instruction->false_computation()->name(), instruction->ToString(),
-        instruction->false_computation()->num_parameters());
-  }
-  return Status::OK();
-}
-
-Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) {
+// Checks that the non-scalar operand shapes are compatible to the output
+// shape, i.e., that there are no implicit broadcasts of size-one dimensions.
+Status CheckElementwiseInstruction(HloInstruction* instruction) {
   const Shape& out_shape = instruction->shape();
   for (HloInstruction* operand : instruction->operands()) {
     const Shape& operand_shape = operand->shape();
@@ -926,133 +1037,114 @@ Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) {
   return Status::OK();
 }
 
-namespace {
+// Visitor which verifies various fields on the HLO instruction. This class does
+// not check result shape as that is checked in the ShapeVerifier.
+class InstructionVerifier : public DfsHloVisitorWithDefault {
+ public:
+  InstructionVerifier() {}
 
-// Returns true if the given Shape has a TOKEN shape as any subshape.
-bool ShapeContainsToken(const Shape& shape) {
-  bool contains_token = false;
-  ShapeUtil::ForEachSubshape(
-      shape, [&contains_token](const Shape& subshape, const ShapeIndex&) {
-        if (ShapeUtil::IsToken(subshape)) {
-          contains_token = true;
-        }
-      });
-  return contains_token;
-}
+  Status DefaultAction(HloInstruction*) override { return Status::OK(); }
 
-// Verifies that all types entering and exiting the entry computation are
-// legal.
-Status VerifyEntryAndExitShapes(const HloModule& module) {
-  // Tokens cannot be passed as entry parameters.
-  // TODO(b/80000000): Remove this constraint.
-  for (int i = 0; i < module.entry_computation()->num_parameters(); ++i) {
-    HloInstruction* param =
-        module.entry_computation()->parameter_instruction(i);
-    if (ShapeContainsToken(param->shape())) {
-      return InternalError(
-          "Entry parameter %d is or contains a token shape: %s", i,
-          ShapeUtil::HumanString(param->shape()));
-    }
+  Status HandleFusion(HloInstruction* fusion) override {
+    return CheckFusionInstruction(fusion);
   }
-  return Status::OK();
-}
 
-// Checks if the given two instructions share the same channel id.
-Status CheckSameChannel(const HloInstruction* instr1,
-                        const HloInstruction* instr2) {
-  if (instr1->channel_id() != instr2->channel_id()) {
-    return InternalError(
-        "Expected to have the same channel id, actual channel ids are: %s "
-        "(%d), %s (%d)",
-        instr1->ToString(), instr1->channel_id(), instr2->ToString(),
-        instr2->channel_id());
+  Status HandleBroadcast(HloInstruction* broadcast) override {
+    // If you see this failure then someone has confused the difference
+    // between the HLO broadcast op, and the UserComputation broadcast
+    // op. See https://groups.google.com/forum/#!topic/xla-dev/9LqijHmTt_I
+    // or ComputationLowerer::Visit()
+    TF_RET_CHECK(broadcast->dimensions().size() ==
+                 ShapeUtil::Rank(broadcast->operand(0)->shape()))
+        << "Broadcast HLO (" << broadcast->ToShortString()
+        << ") has invalid number of dimensions: "
+        << broadcast->dimensions().size()
+        << " != " << ShapeUtil::Rank(broadcast->operand(0)->shape());
+    return Status::OK();
   }
-  return Status::OK();
-}
 
-// Checks if the given two instructions have the same is_host_transfer
-// attribute value. Intsructions must be send/recv instructions or their
-// 'done' variant.
-Status CheckSameIsHostTransfer(const HloInstruction* instr1,
-                               const HloInstruction* instr2) {
-  const HloSendRecvInstruction* send_recv1 =
-      DynCast<const HloSendRecvInstruction>(instr1);
-  const HloSendRecvInstruction* send_recv2 =
-      DynCast<const HloSendRecvInstruction>(instr2);
-  TF_RET_CHECK(send_recv1 != nullptr);
-  TF_RET_CHECK(send_recv2 != nullptr);
-  if (send_recv1->is_host_transfer() != send_recv2->is_host_transfer()) {
-    return InternalError(
-        "Expected instructions to have the same is-host-transfer property: "
-        "%s, "
-        "%s ",
-        instr1->ToString(), instr2->ToString());
+  Status HandleWhile(HloInstruction* xla_while) override {
+    auto* while_cond = xla_while->while_condition();
+    auto* while_body = xla_while->while_body();
+    if (while_cond->num_parameters() != 1) {
+      return FailedPrecondition(
+          "While condition must have exactly 1 parameter; had %d : %s",
+          while_cond->num_parameters(), while_cond->ToString());
+    }
+    if (while_body->num_parameters() != 1) {
+      return FailedPrecondition(
+          "While body must have exactly 1 parameter; had %d : %s",
+          while_body->num_parameters(), while_body->ToString());
+    }
+    if (xla_while->operand_count() != 1) {
+      return FailedPrecondition(
+          "While loop must have exactly one operand; had %d : %s",
+          xla_while->operand_count(), xla_while->ToString());
+    }
+    return Status::OK();
   }
-  return Status::OK();
-}
 
-// Checks various invariants of send and recv instructions.
-Status VerifySendsAndRecvs(const HloModule& module) {
-  absl::flat_hash_map<int64, const HloInstruction*> host_channels;
-  // Host send/recv instructions must have their own unique channel.
-  auto check_unique_host_channel = [&](const HloInstruction* instruction) {
-    const HloSendRecvInstruction* sendrecv =
-        DynCast<const HloSendRecvInstruction>(instruction);
-    if (sendrecv->is_host_transfer()) {
-      auto it_inserted =
-          host_channels.insert({sendrecv->channel_id(), sendrecv});
-      if (!it_inserted.second) {
-        return FailedPrecondition(
-            "Channel %d is used for multiple host send/recv instructions: "
-            "%s "
-            "and "
-            "%s",
-            sendrecv->channel_id(), sendrecv->ToString(),
-            it_inserted.first->second->ToString());
-      }
+  Status HandleConditional(HloInstruction* conditional) override {
+    if (conditional->true_computation()->num_parameters() != 1) {
+      return FailedPrecondition(
+          "True computation %s of %s must have 1 parameter insted of %d",
+          conditional->true_computation()->name(), conditional->ToString(),
+          conditional->true_computation()->num_parameters());
     }
+    if (conditional->false_computation()->num_parameters() != 1) {
+      return FailedPrecondition(
+          "False computation %s of %s must have 1 parameter insted of %d",
+          conditional->false_computation()->name(), conditional->ToString(),
+          conditional->false_computation()->num_parameters());
+    }
+    return Status::OK();
+  }
+
+  Status HandleElementwiseUnary(HloInstruction* instruction) override {
+    return CheckElementwiseInstruction(instruction);
+  }
+
+  Status HandleElementwiseBinary(HloInstruction* instruction) override {
+    return CheckElementwiseInstruction(instruction);
+  }
 
+  Status HandleGetTupleElement(HloInstruction* gte) override {
+    TF_RET_CHECK(ShapeUtil::IsTuple(gte->operand(0)->shape()));
     return Status::OK();
-  };
+  }
 
-  // Send/Recv instruction must have a single user: the corresponding
-  // SendDone/RecvDone. with matching channel.
-  for (const HloComputation* computation : module.computations()) {
-    for (const HloInstruction* instruction : computation->instructions()) {
-      switch (instruction->opcode()) {
-        case HloOpcode::kSend: {
-          TF_RETURN_IF_ERROR(check_unique_host_channel(instruction));
-          TF_RET_CHECK(instruction->users().size() == 1);
-          const HloInstruction* send_done = instruction->users().front();
-          TF_RET_CHECK(send_done->opcode() == HloOpcode::kSendDone);
-          TF_RETURN_IF_ERROR(CheckSameChannel(instruction, send_done));
-          TF_RETURN_IF_ERROR(CheckSameIsHostTransfer(instruction, send_done));
-          break;
-        }
-        case HloOpcode::kRecv: {
-          TF_RETURN_IF_ERROR(check_unique_host_channel(instruction));
-          TF_RET_CHECK(instruction->users().size() == 1);
-          const HloInstruction* recv_done = instruction->users().front();
-          TF_RET_CHECK(recv_done->opcode() == HloOpcode::kRecvDone);
-          TF_RETURN_IF_ERROR(CheckSameChannel(instruction, recv_done));
-          TF_RETURN_IF_ERROR(CheckSameIsHostTransfer(instruction, recv_done));
-          break;
-        }
-        case HloOpcode::kSendDone:
-          TF_RET_CHECK(instruction->operands().size() == 1);
-          TF_RET_CHECK(instruction->operand(0)->opcode() == HloOpcode::kSend);
-          break;
-        case HloOpcode::kRecvDone:
-          TF_RET_CHECK(instruction->operands().size() == 1);
-          TF_RET_CHECK(instruction->operand(0)->opcode() == HloOpcode::kRecv);
-          break;
-        default:
-          break;
-      }
-    }
+  Status HandleTranspose(HloInstruction* transpose) override {
+    const Shape& shape = transpose->shape();
+    const HloInstruction* operand = transpose->operand(0);
+    TF_RET_CHECK(shape.dimensions().size() == transpose->dimensions().size());
+    TF_RET_CHECK(shape.dimensions().size() ==
+                 transpose->operand(0)->shape().dimensions().size());
+    TF_RET_CHECK(std::equal(
+        operand->shape().dimensions().begin(),
+        operand->shape().dimensions().end(),
+        Permute(transpose->dimensions(), shape.dimensions()).begin()))
+        << "shape: " << shape << ", operand->shape(): " << shape
+        << ", dimensions: {" << absl::StrJoin(transpose->dimensions(), ", ")
+        << "}";
+    return Status::OK();
   }
-  return Status::OK();
-}
+
+  Status Preprocess(HloInstruction* instruction) override {
+    auto previous = instructions_by_name_.find(instruction->name());
+    TF_RET_CHECK(previous == instructions_by_name_.end())
+        << "HLO has name that is not unique within module:\n"
+        << instruction->ToString()
+        << " in computation: " << instruction->parent()->name()
+        << "\nPrevious HLO with same name:\n"
+        << previous->second->ToString()
+        << " in computation: " << previous->second->parent()->name();
+    instructions_by_name_[instruction->name()] = instruction;
+    return Status::OK();
+  }
+
+ private:
+  absl::flat_hash_map<string, const HloInstruction*> instructions_by_name_;
+};
 
 }  // namespace
 
@@ -1061,65 +1153,13 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
   TF_RETURN_IF_ERROR(VerifyHloStructure(module));
   TF_RETURN_IF_ERROR(VerifySendsAndRecvs(*module));
 
-  absl::flat_hash_map<string, const HloInstruction*> instructions;
 
   for (auto* computation : module->computations()) {
-    for (const auto& instruction : computation->instructions()) {
-      TF_RET_CHECK(instruction->parent() == computation);
-      if (instruction->opcode() == HloOpcode::kFusion) {
-        TF_RETURN_IF_ERROR(CheckFusionInstruction(instruction));
-        TF_RET_CHECK(instruction->called_computations() ==
-                     absl::Span<HloComputation* const>(
-                         {instruction->fused_instructions_computation()}))
-            << "Fusion HLO calls computations other than the "
-               "fused_instructions_computation: "
-            << instruction->ToString()
-            << " instruction->fused_instructions_computation(): "
-            << instruction->fused_instructions_computation()->ToString()
-            << " instruction->called_computations(): "
-            << ComputationsToString(instruction->called_computations());
-
-        for (const auto& fused : instruction->fused_instructions()) {
-          TF_RET_CHECK(fused->parent() ==
-                       instruction->fused_instructions_computation())
-              << "Fused HLO was missing a parent: " << fused->ToString()
-              << " parent: " << fused->parent()
-              << " computation: " << computation;
-        }
-      } else if (instruction->opcode() == HloOpcode::kBroadcast) {
-        // If you see this failure then someone has confused the difference
-        // between the HLO broadcast op, and the UserComputation broadcast
-        // op. See https://groups.google.com/forum/#!topic/xla-dev/9LqijHmTt_I
-        // or ComputationLowerer::Visit()
-        TF_RET_CHECK(instruction->dimensions().size() ==
-                     ShapeUtil::Rank(instruction->operand(0)->shape()))
-            << "Broadcast HLO (" << instruction->ToShortString()
-            << ") has invalid number of dimensions: "
-            << instruction->dimensions().size()
-            << " != " << ShapeUtil::Rank(instruction->operand(0)->shape());
-      } else if (instruction->opcode() == HloOpcode::kWhile) {
-        TF_RETURN_IF_ERROR(CheckWhileInstruction(instruction));
-      } else if (instruction->opcode() == HloOpcode::kConditional) {
-        TF_RETURN_IF_ERROR(CheckConditionalInstruction(instruction));
-      } else if (instruction->opcode() !=
-                     HloOpcode::kRng /* Rng operands are always scalar. */
-                 && instruction->IsElementwise()) {
-        TF_RETURN_IF_ERROR(CheckElementwiseInstruction(instruction));
-      }
-
-      auto previous = instructions.find(instruction->name());
-      TF_RET_CHECK(previous == instructions.end())
-          << "HLO has name that is not unique within module:\n"
-          << instruction->ToString()
-          << " in computation: " << computation->name()
-          << "\nPrevious HLO with same name:\n"
-          << previous->second->ToString()
-          << " in computation: " << previous->second->parent()->name();
-      instructions[instruction->name()] = instruction;
-    }
-
     std::unique_ptr<ShapeVerifier> shape_verifier = shape_verifier_factory_();
     TF_RETURN_IF_ERROR(computation->Accept(shape_verifier.get()));
+
+    InstructionVerifier instruction_verifier;
+    TF_RETURN_IF_ERROR(computation->Accept(&instruction_verifier));
   }
 
   TF_RETURN_IF_ERROR(VerifyEntryAndExitShapes(*module));
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 0cde4a31af..6d16586c2c 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -172,17 +172,6 @@ class HloVerifier : public HloModulePass {
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  // CHECKs various invariants of a fusion instruction.
-  Status CheckFusionInstruction(HloInstruction* fusion) const;
-
-  Status CheckWhileInstruction(HloInstruction* instruction);
-
-  Status CheckConditionalInstruction(HloInstruction* instruction);
-
-  // Checks that the non-scalar operand shapes are compatible to the output
-  // shape, i.e., that there are no implicit broadcasts of size-one dimensions.
-  Status CheckElementwiseInstruction(HloInstruction* instruction);
-
   // Creates a ShapeVerifier that checks that shapes match inferred
   // expectations. This is a factory function because ShapeVerifier,
   // being a DfsHloVisitor, is stateful. We want a clean object
-- 
GitLab


From 3302b4c1fcf2ecd3ae3119cddb16d057235ece07 Mon Sep 17 00:00:00 2001
From: Tingbo Lu <tingbopku@gmail.com>
Date: Fri, 5 Oct 2018 00:02:45 +0800
Subject: [PATCH 407/570] Update rnn_cell.py

---
 tensorflow/contrib/rnn/python/ops/rnn_cell.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 59a61af7b3..e8073f8463 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -1110,7 +1110,7 @@ _Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
 class AttentionCellWrapper(rnn_cell_impl.RNNCell):
   """Basic attention cell wrapper.
 
-  Implementation based on https://arxiv.org/abs/1409.0473.
+  Implementation based on https://arxiv.org/abs/1601.06733.
   """
 
   def __init__(self,
-- 
GitLab


From a7e8ad18a61b251ef42c0260dd80a12cea8f268c Mon Sep 17 00:00:00 2001
From: Pete Warden <petewarden@google.com>
Date: Thu, 4 Oct 2018 09:20:31 -0700
Subject: [PATCH 408/570] Experimental interpreter, kernels, and example
 running TensorFlow Lite on a microcontroller

PiperOrigin-RevId: 215748973
---
 .../contrib/lite/experimental/micro/BUILD     |   76 +
 .../contrib/lite/experimental/micro/README.md |  114 ++
 .../lite/experimental/micro/compatibility.h   |   32 +
 .../micro/examples/micro_speech/BUILD         |   28 +
 .../micro_speech/micro_speech_test.cc         |   55 +
 .../micro_speech/tiny_conv_model_data.cc      | 1672 +++++++++++++++++
 .../micro_speech/tiny_conv_model_data.h       |   27 +
 .../lite/experimental/micro/kernels/BUILD     |  107 ++
 .../micro/kernels/all_ops_resolver.cc         |   43 +
 .../micro/kernels/all_ops_resolver.h          |   34 +
 .../micro/kernels/depthwise_conv.cc           |  208 ++
 .../micro/kernels/depthwise_conv_test.cc      |  406 ++++
 .../micro/kernels/fully_connected.cc          |  184 ++
 .../micro/kernels/fully_connected_test.cc     |  643 +++++++
 .../experimental/micro/kernels/softmax.cc     |  213 +++
 .../micro/kernels/softmax_test.cc             |  220 +++
 .../experimental/micro/kernels/test_utils.h   |  170 ++
 .../micro/micro_error_reporter.cc             |   78 +
 .../experimental/micro/micro_error_reporter.h |   34 +
 .../micro/micro_error_reporter_test.cc        |   25 +
 .../experimental/micro/micro_interpreter.cc   |  310 +++
 .../experimental/micro/micro_interpreter.h    |   71 +
 .../micro/micro_interpreter_test.cc           |  197 ++
 .../micro/micro_mutable_op_resolver.cc        |   80 +
 .../micro/micro_mutable_op_resolver.h         |   46 +
 .../micro/micro_mutable_op_resolver_test.cc   |   83 +
 .../micro/simple_tensor_allocator.cc          |  149 ++
 .../micro/simple_tensor_allocator.h           |   51 +
 .../micro/simple_tensor_allocator_test.cc     |  144 ++
 .../lite/experimental/micro/testing/BUILD     |   17 +
 .../micro/testing/Dockerfile.bluepill         |   21 +
 .../experimental/micro/testing/bluepill.resc  |   36 +
 .../experimental/micro/testing/micro_test.bzl |   64 +
 .../experimental/micro/testing/micro_test.h   |  138 ++
 .../micro/testing/test_bluepill_binary.sh     |   54 +
 .../micro/testing/test_linux_binary.sh        |   39 +
 .../experimental/micro/tools/make/Makefile    |  166 ++
 .../micro/tools/make/download_dependencies.sh |   73 +
 .../tools/make/targets/bluepill_makefile.inc  |   65 +
 .../lite/kernels/internal/compatibility.h     |   23 +
 .../contrib/lite/kernels/internal/types.h     |    3 +-
 41 files changed, 6197 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/contrib/lite/experimental/micro/BUILD
 create mode 100644 tensorflow/contrib/lite/experimental/micro/README.md
 create mode 100644 tensorflow/contrib/lite/experimental/micro/compatibility.h
 create mode 100644 tensorflow/contrib/lite/experimental/micro/examples/micro_speech/BUILD
 create mode 100644 tensorflow/contrib/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
 create mode 100644 tensorflow/contrib/lite/experimental/micro/kernels/BUILD
 create mode 100644 tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h
 create mode 100644 tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/kernels/fully_connected.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/kernels/fully_connected_test.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/kernels/softmax.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/kernels/softmax_test.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h
 create mode 100644 tensorflow/contrib/lite/experimental/micro/micro_error_reporter.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h
 create mode 100644 tensorflow/contrib/lite/experimental/micro/micro_error_reporter_test.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/micro_interpreter.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/micro_interpreter.h
 create mode 100644 tensorflow/contrib/lite/experimental/micro/micro_interpreter_test.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h
 create mode 100644 tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver_test.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h
 create mode 100644 tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator_test.cc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/testing/BUILD
 create mode 100644 tensorflow/contrib/lite/experimental/micro/testing/Dockerfile.bluepill
 create mode 100644 tensorflow/contrib/lite/experimental/micro/testing/bluepill.resc
 create mode 100644 tensorflow/contrib/lite/experimental/micro/testing/micro_test.bzl
 create mode 100644 tensorflow/contrib/lite/experimental/micro/testing/micro_test.h
 create mode 100755 tensorflow/contrib/lite/experimental/micro/testing/test_bluepill_binary.sh
 create mode 100755 tensorflow/contrib/lite/experimental/micro/testing/test_linux_binary.sh
 create mode 100644 tensorflow/contrib/lite/experimental/micro/tools/make/Makefile
 create mode 100755 tensorflow/contrib/lite/experimental/micro/tools/make/download_dependencies.sh
 create mode 100644 tensorflow/contrib/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc

diff --git a/tensorflow/contrib/lite/experimental/micro/BUILD b/tensorflow/contrib/lite/experimental/micro/BUILD
new file mode 100644
index 0000000000..df1036bc8b
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/BUILD
@@ -0,0 +1,76 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/contrib/lite/experimental/micro/testing:micro_test.bzl",
+    "tflite_micro_cc_test",
+)
+
+cc_library(
+    name = "micro_framework",
+    srcs = [
+        "micro_error_reporter.cc",
+        "micro_interpreter.cc",
+        "micro_mutable_op_resolver.cc",
+        "simple_tensor_allocator.cc",
+    ],
+    hdrs = [
+        "compatibility.h",
+        "micro_error_reporter.h",
+        "micro_interpreter.h",
+        "micro_mutable_op_resolver.h",
+        "simple_tensor_allocator.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite/c:c_api_internal",
+        "//tensorflow/contrib/lite/core/api",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "micro_error_reporter_test",
+    srcs = [
+        "micro_error_reporter_test.cc",
+    ],
+    deps = [
+        ":micro_framework",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "micro_mutable_op_resolver_test",
+    srcs = [
+        "micro_mutable_op_resolver_test.cc",
+    ],
+    deps = [
+        ":micro_framework",
+        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "micro_interpreter_test",
+    srcs = [
+        "micro_interpreter_test.cc",
+    ],
+    deps = [
+        ":micro_framework",
+        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "simple_tensor_allocator_test",
+    srcs = [
+        "simple_tensor_allocator_test.cc",
+    ],
+    deps = [
+        ":micro_framework",
+        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
+    ],
+)
diff --git a/tensorflow/contrib/lite/experimental/micro/README.md b/tensorflow/contrib/lite/experimental/micro/README.md
new file mode 100644
index 0000000000..414cafde4d
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/README.md
@@ -0,0 +1,114 @@
+# TensorFlow Lite for Microcontrollers
+
+This an experimental port of TensorFlow Lite aimed at micro controllers and other devices with only kilobytes of memory. It doesn't require any operating system support, any standard C or C++ libraries, or dynamic memory allocation, so it's designed to be portable even to 'bare metal' systems. The core runtime fits in 16KB on a Cortex M3, and with enough operators to run a speech keyword detection model, takes up a total of 22KB.
+
+The design goals are for the framework to be:
+
+- **Readable**: We want embedded software engineers to be able to understand what's required to run ML inference without having to study research papers. We've tried to keep the code base small, modular, and have reference implementations of all operations to help with this.
+
+- **Easy to modify**: We know that there are a lot of different platforms and requirements in the embedded world, and we don't expect to cover all of them in one framework. Instead, we're hoping that it can be a good starting point for developers to build on top of to meet their own needs. For example, we tried to make it easy to replace the implementations of key computational operators that are often crucial for performance, without having to touch the data flow and other runtime code. We want it to make more sense to use our workflow to handle things like model import and less-important operations, and customize the parts that matter, rather than having to reimplement everything in your own engine.
+
+- **Well-tested**: If you're modifying code, you need to know if your changes are correct. Having an easy way to test lets you develop much faster. To help there, we've written tests for all the components, and we've made sure that the tests can be run on almost any platform, with no dependencies apart from the ability to log text to a debug console somewhere. We also provide an easy way to run all the tests on-device as part of an automated test framework, and we use qemu/Renode emulation so that tests can be run even without physical devices present.
+
+- **Easy to integrate**: We want to be as open a system as possible, and use the best code available for each platform. To do that, we're going to rely on projects like [CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html), [uTensor](https://github.com/uTensor/uTensor), and other vendor libraries to handle as much performance-critical code as possible. We know that there are an increasing number of options to accelerate neural networks on microcontrollers, so we're aiming to be a good host for deploying those hardware technologies too.
+
+- **Compatible**: We're using the same file schema, interpreter API, and kernel interface as regular TensorFlow Lite, so we leverage the large existing set of tools, documentation, and examples for the project. The biggest barrier to deploying ML models is getting them from a training environment into a form that's easy to run inference on, so we see reusing this rich ecosystem as being crucial to being easily usable. We also hope to integrate this experimental work back into the main codebase in the future.
+
+To meet those goals, we've made some tradeoffs:
+
+- **Simple C++**: To help with readability, our code is written in a modern version of C++, but we generally treat it as a "better C", rather relying on more complex features such as template meta-programming. As mentioned earlier, we avoid any use of dynamic memory allocation (new/delete) or the standard C/C++ libraries, so we believe this should still be fairly portable. It does mean that some older devices with C-only toolchains won't be supported, but we're hoping that the reference operator implementations (which are simple C-like functions) can still be useful in those cases. The interfaces are also designed to be C-only, so it should be possible to integrate the resulting library with pure C projects.
+
+- **Interpreted**: Code generation is a popular pattern for embedded code, because it gives standalone code that's easy to modify and step through, but we've chosen to go with an interpreted approach. In our internal microcontroller work we've found that using an extremely stripped-down interpreter with almost no dependencies gives us a lot of the same advantages, but is easier to maintain. For example, when new updates come out for the underlying library, you can just merge your local modifications in a single step, rather than having to regenerate new code and then patch in any changes you subsequently made. The coarse granularity of the interpreted primitives means that each operation call typically takes hundreds of thousands of instruction cycles at least, so we don't see noticeable performance gains from avoiding what's essentially a single switch statement at the interpreter level to call each operation. We're still working on improving the packaging though, for example we're considering having the ability to snapshot all the source files and headers used for a particular model, being able to compile the code and data together as a library, and then access it through a minimal set of C interface calls which hide the underlying complexity.
+
+- **Flatbuffers**: We represent our models using [the standard flatbuffer schema used by the rest of TensorFlow Lite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/schema/schema.fbs), with the difference that we always keep it in read-only program memory (typically flash) rather than relying on having a file system to read it from. This is a good fit because flatbuffer's serialized format is designed to be mapped into memory without requiring any extra memory allocations or modifications to access it. All of the functions to read model values work directly on the serialized bytes, and large sections of data like weights are directly accessible as sequential C-style arrays of their data type, with no strides or unpacking needed. We do get a lot of value from using flatbuffers, but there is a cost in complexity. The flat buffer library code is all inline [inside the main headers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/schema/schema_generated.h), but it isn't straightforward to inspect their implementations, and the model data structures aren't easy to comprehend from the debugger. The header for the schema itself also has to be periodically updated when new information is added to the file format, though we try to handle that transparently for most developers by checking in a pre-generated version.
+
+- **Code Duplication**: Some of the code in this prototype largely duplicates the logic in other parts of the TensorFlow Lite code base, for example the operator wrappers. We've tried to keep share as much as we can between the two interpreters, but there are some assumptions built into the original runtime that make this difficult. We'll be working on modularizing the main interpreter so that we can move to an entirely shared system.
+
+This initial preview release is designed to get early feedback, and is not intended to be a final product. It only includes enough operations to run a simple keyword recognition model, and the implementations are not optimized. We're hoping this will be a good way to get feedback and collaborate to improve the framework.
+
+## Getting Started
+
+Building requires a Linux or OS X machine.
+
+ - Open a terminal
+ - Download the TensorFlow source with `git clone https://github.com/tensorflow`
+ - Enter the source root directory by running `cd tensorflow`
+ - Download the dependencies by running `tensorflow/contrib/lite/experimental/micro/tools/make/download_dependencies.sh`. This may take a few minutes
+ - Build and test the library with `make -f tensorflow/contrib/lite/experimental/micro/tools/make/Makefile test`
+
+You should see a series of compilation steps, followed by "~~~ALL TESTS PASSED~~~" for the various tests of the code that it will run. If there's an error, you should get an informative message from make about what went wrong.
+
+These tests are all built as simple binaries with few dependencies, so you can run them manually. For example, here's how to run the depthwise convolution test, and its output:
+
+```
+tensorflow/contrib/lite/experimental/micro/tools/make/gen/linux_x86_64/bin/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test
+
+Testing SimpleTest
+Testing SimpleTestQuantized
+Testing SimpleTestRelu
+Testing SimpleTestReluQuantized
+4/4 tests passed
+~ALL TESTS PASSED~~~
+```
+
+Looking at the [depthwise_conv_test.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test.cc) code, you'll see a sequence that looks like this:
+
+```
+...
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(SimpleTest) {
+...
+}
+...
+TF_LITE_MICRO_TESTS_END
+```
+
+These macros work a lot like [the Google test framework](https://github.com/google/googletest), but they don't require any dependencies and just write results to stderr, rather than aborting the program. If all the tests pass, then "~~~ALL TESTS PASSED~~~" is output, and the test harness that runs the binary during the make process knows that everything ran correctly. If there's an error, the lack of the expected string lets the harness know that the test failed.
+
+So, why are we running tests in this complicated way? So far, we've been building binaries that run locally on the Mac OS or Linux machine you're building on, but this approach becomes important when we're targeting simple micro controller devices.
+
+## Building for the "Blue Pill" STM32F103
+
+The goal of this library is to enable machine learning on resource-constrained micro controllers and DSPs, and as part of that we've targeted the ["Blue Pill" STM32F103-compatible development board](https://github.com/google/googletest) as a cheap and popular platform. It only has 20KB of RAM and 64KB of flash, so it's a good device to ensure we can run efficiently on small chips.
+
+It's fairly easy to [buy and wire up a physical board](https://github.com/google/stm32_bare_lib#wiring-up-your-blue-pill), but even if you don't have an actual device, the [Renode project](https://renode.io/) makes it easy to run a faithful emulation on your desktop machine. You'll need [Docker](https://www.docker.com/) installed, but once you have that set up, try running the following command:
+
+`make -f tensorflow/contrib/lite/experimental/micro/tools/make/Makefile TARGET=bluepill test`
+
+You should see a similar set of outputs as you did in the previous section, with the addition of some extra Docker logging messages. These are because we're using Docker to run the Renode micro controller emulation tool, and the tests themselves are being run on a simulated STM32F103 device. The communication channels between an embedded device and the host are quite limited, so the test harness looks at the output of the debug log to see if tests have passed, just as it did in the previous section. This makes it a very flexible way to run cross-platform tests, even when a platform has no operating system facilities, as long as it can output debugging text logs.
+
+To understand what's happening here, try running the same depthwise convolution test, but through the emulated device test harness, with the following command:
+
+```
+tensorflow/contrib/lite/experimental/micro/testing/test_bluepill_binary.sh \
+tensorflow/contrib/lite/experimental/micro/tools/make/gen/bluepill_cortex-m3/bin/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test
+
+```
+
+You should see output that looks something like this:
+
+```
+Sending build context to Docker daemon   21.5kB
+Step 1/2 : FROM antmicro/renode:latest
+ ---> 1b670a243e8f
+Step 2/2 : LABEL maintainer="Pete Warden <petewarden@google.com>"
+ ---> Using cache
+ ---> 3afcd410846d
+Successfully built 3afcd410846d
+Successfully tagged renode_bluepill:latest
+LOGS:
+...
+03:27:32.4340 [INFO] machine-0: Machine started.
+03:27:32.4790 [DEBUG] cpu.uartSemihosting: [+0.22s host +0s virt 0s virt from start] Testing SimpleTest
+03:27:32.4812 [DEBUG] cpu.uartSemihosting: [+2.21ms host +0s virt 0s virt from start]   Testing SimpleTestQuantized
+03:27:32.4833 [DEBUG] cpu.uartSemihosting: [+2.14ms host +0s virt 0s virt from start]   Testing SimpleTestRelu
+03:27:32.4834 [DEBUG] cpu.uartSemihosting: [+0.18ms host +0s virt 0s virt from start]   Testing SimpleTestReluQuantized
+03:27:32.4838 [DEBUG] cpu.uartSemihosting: [+0.4ms host +0s virt 0s virt from start]   4/4 tests passed
+03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+41µs host +0s virt 0s virt from start]   ~~~ALL TESTS PASSED~~~
+03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+5µs host +0s virt 0s virt from start]   
+...
+tensorflow/contrib/lite/experimental/micro/tools/make/gen/bluepill_cortex-m3/bin/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test: PASS
+```
+
+There's a lot of output here, but you should be able to see that the same tests that were covered when we ran locally on the development machine show up in the debug logs here, along with the magic string "~~~ALL TESTS PASSED~~~". This is the exact same code as before, just compiled and run on the STM32F103 rather than your desktop. We hope that the simplicity of this testing approach will help make adding support for new platforms as easy as possible.
diff --git a/tensorflow/contrib/lite/experimental/micro/compatibility.h b/tensorflow/contrib/lite/experimental/micro/compatibility.h
new file mode 100644
index 0000000000..4f0fd9f312
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/compatibility.h
@@ -0,0 +1,32 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_COMPATIBILITY_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_COMPATIBILITY_H_
+
+// C++ will automatically create class-specific delete operators for virtual
+// objects, which by default call the global delete function. For embedded
+// applications we want to avoid this, and won't be calling new/delete on these
+// objects, so we need to override the default implementation with one that does
+// nothing to avoid linking in ::delete().
+// This macro needs to be included in all subclasses of a virtual base class in
+// the private section.
+#ifdef TF_LITE_STATIC_MEMORY
+#define TF_LITE_REMOVE_VIRTUAL_DELETE \
+  void operator delete(void* p) {}
+#else
+#define TF_LITE_REMOVE_VIRTUAL_DELETE
+#endif
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_COMPATIBILITY_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/BUILD b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/BUILD
new file mode 100644
index 0000000000..447c584387
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/BUILD
@@ -0,0 +1,28 @@
+# Description:
+#   TensorFlow Lite microcontroller example.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/contrib/lite/experimental/micro/testing:micro_test.bzl",
+    "tflite_micro_cc_test",
+)
+
+tflite_micro_cc_test(
+    name = "micro_speech_test",
+    srcs = [
+        "micro_speech_test.cc",
+        "tiny_conv_model_data.cc",
+        "tiny_conv_model_data.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
+        "//tensorflow/contrib/lite/experimental/micro/kernels:all_ops_resolver",
+        "//tensorflow/contrib/lite/experimental/micro/kernels:micro_ops",
+        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+    ],
+)
diff --git a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
new file mode 100644
index 0000000000..86cd056a72
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+#include "tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/contrib/lite/experimental/micro/micro_interpreter.h"
+#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/version.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestInvoke) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  if (model->version() != TFLITE_SCHEMA_VERSION) {
+    error_reporter->Report(
+        "Model provided is schema version %d not equal "
+        "to supported version %d.\n",
+        model->version(), TFLITE_SCHEMA_VERSION);
+  }
+  tflite::ops::micro::AllOpsResolver resolver;
+
+  const int tensor_arena_size = 10 * 1024;
+  uint8_t tensor_arena[tensor_arena_size];
+  tflite::SimpleTensorAllocator tensor_allocator(tensor_arena,
+                                                 tensor_arena_size);
+
+  tflite::MicroInterpreter interpreter(model, resolver, &tensor_allocator,
+                                       error_reporter);
+  TfLiteStatus invoke_status = interpreter.Invoke();
+  if (invoke_status != kTfLiteOk) {
+    error_reporter->Report("Invoke failed\n");
+  }
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, invoke_status);
+
+  error_reporter->Report("Ran successfully\n");
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
new file mode 100644
index 0000000000..f1f9e0e219
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
@@ -0,0 +1,1672 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Automatically created from a TensorFlow Lite flatbuffer using the command:
+// xxd -i tiny_conv.tflite > tiny_conv_model_data.cc
+
+#include "tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+
+const unsigned char g_tiny_conv_model_data[] = {
+    0x18, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x0e, 0x00,
+    0x18, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,
+    0x0e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x4d, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0xf4, 0x47, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00,
+    0x54, 0x4f, 0x43, 0x4f, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74,
+    0x65, 0x64, 0x2e, 0x00, 0x09, 0x00, 0x00, 0x00, 0xd4, 0x47, 0x00, 0x00,
+    0x04, 0x03, 0x00, 0x00, 0xfc, 0x02, 0x00, 0x00, 0xf4, 0x02, 0x00, 0x00,
+    0x64, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb8, 0xb3, 0xff, 0xff,
+    0x16, 0xb4, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0xd7, 0x02, 0x00, 0x00, 0x2f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0xb3, 0xff, 0xff,
+    0x46, 0xb4, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+    0xab, 0x00, 0x00, 0x00, 0x1e, 0xff, 0xff, 0xff, 0xed, 0xff, 0xff, 0xff,
+    0x4a, 0x00, 0x00, 0x00, 0x62, 0xb4, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+    0x80, 0x02, 0x00, 0x00, 0xce, 0xad, 0xaf, 0x3c, 0xc8, 0xe9, 0xb0, 0x83,
+    0xa1, 0xbf, 0xb2, 0xb1, 0xab, 0xd0, 0xa7, 0x53, 0xa5, 0xe9, 0xb5, 0xac,
+    0xa2, 0xd3, 0xc4, 0x9e, 0x8b, 0xb2, 0x64, 0xb3, 0x9d, 0xa2, 0xae, 0xa6,
+    0xd5, 0xbe, 0x43, 0x9f, 0x9c, 0x54, 0xb5, 0xa8, 0x49, 0x78, 0x86, 0xa2,
+    0xa3, 0x55, 0x35, 0x96, 0x3d, 0x7f, 0xe2, 0xb5, 0xb0, 0x47, 0x28, 0xa9,
+    0x9d, 0xbb, 0xd6, 0xff, 0xb7, 0x79, 0x63, 0xb5, 0xaf, 0xa7, 0xab, 0x7e,
+    0xbc, 0xc7, 0xa0, 0xc3, 0xb1, 0xb6, 0xb2, 0xa1, 0xc2, 0xbb, 0x79, 0x57,
+    0xbe, 0xc1, 0xb7, 0xb0, 0x6b, 0xb7, 0xa5, 0x75, 0x97, 0xb8, 0xe7, 0xac,
+    0xad, 0x7e, 0xb1, 0x9b, 0xc3, 0xba, 0x6b, 0xa2, 0x7f, 0x58, 0xb9, 0x7a,
+    0x4c, 0x91, 0x74, 0x9e, 0xa7, 0x3d, 0xc2, 0x94, 0x75, 0xa1, 0xa4, 0xac,
+    0xab, 0x45, 0x2e, 0xb4, 0xb6, 0xbf, 0xc1, 0xdb, 0xaf, 0x6c, 0x67, 0xb1,
+    0xa9, 0xa6, 0xa8, 0xca, 0xc2, 0xc4, 0xb9, 0xbf, 0xb4, 0xb9, 0xaa, 0x9d,
+    0x9f, 0xb9, 0xb2, 0x71, 0xb2, 0xca, 0xbe, 0xaf, 0x5f, 0xbc, 0xa0, 0x5b,
+    0xa8, 0xb4, 0xa4, 0xa8, 0xd8, 0x69, 0xb7, 0x8a, 0xbc, 0xb8, 0xaf, 0x9c,
+    0x7c, 0x5d, 0xb3, 0x6b, 0x49, 0x95, 0x64, 0xa0, 0xa2, 0x49, 0xcb, 0x87,
+    0xa5, 0xb5, 0xa1, 0xb2, 0xa3, 0x40, 0x6d, 0x9f, 0xc5, 0xb6, 0xbb, 0xd4,
+    0x9c, 0x6d, 0x69, 0xa9, 0xa8, 0x91, 0xad, 0xb8, 0xd2, 0xc6, 0xaf, 0xb8,
+    0xac, 0xa9, 0xa2, 0xa7, 0x60, 0xa6, 0xa1, 0xc9, 0xb8, 0xd6, 0xcf, 0xb1,
+    0x56, 0xb4, 0xac, 0x40, 0xae, 0xbd, 0xbf, 0xa2, 0x54, 0x72, 0x9b, 0x8c,
+    0xc2, 0xb5, 0xc2, 0x9b, 0x64, 0x6d, 0xb4, 0x62, 0x4e, 0x9b, 0x6c, 0xa6,
+    0x8f, 0x4c, 0xca, 0x95, 0xb6, 0xbf, 0x92, 0xae, 0x9c, 0x49, 0xae, 0xb2,
+    0xc0, 0xb6, 0xbc, 0xd1, 0xa4, 0x7b, 0x64, 0xa0, 0xa6, 0x81, 0xac, 0xa6,
+    0xbd, 0xc8, 0xbc, 0xae, 0xaa, 0x9e, 0x61, 0xb1, 0x57, 0xac, 0xbf, 0xbf,
+    0xbb, 0xe0, 0xa6, 0xae, 0x47, 0xc9, 0xbc, 0x57, 0xb0, 0xb5, 0xc7, 0x98,
+    0xf4, 0x93, 0xb6, 0x70, 0xc3, 0xb3, 0xca, 0xab, 0x77, 0x9a, 0xac, 0x45,
+    0x5c, 0x9e, 0x9a, 0xa9, 0x9b, 0x35, 0xc0, 0x6f, 0xc6, 0xc7, 0x91, 0xb4,
+    0xa8, 0x3c, 0xce, 0xb8, 0xad, 0xb9, 0xb5, 0xdd, 0x9c, 0x6d, 0xbf, 0x91,
+    0xb2, 0x7d, 0xa0, 0xaf, 0x9f, 0xbd, 0xb9, 0xcf, 0x9b, 0x5d, 0x3f, 0xac,
+    0x64, 0xae, 0xaf, 0xb8, 0xbc, 0xb8, 0x86, 0xb5, 0x36, 0xcf, 0xb4, 0xa9,
+    0xad, 0xcd, 0xdb, 0xa4, 0x68, 0xa6, 0xa4, 0x67, 0xc8, 0xb7, 0xe5, 0xa4,
+    0x76, 0xb8, 0xa8, 0x28, 0x6b, 0xa5, 0xba, 0xad, 0x9f, 0x3a, 0xa5, 0x42,
+    0xc5, 0xb0, 0x88, 0xad, 0xa5, 0x4d, 0xea, 0x8a, 0xb8, 0xb5, 0xb3, 0xd9,
+    0xa0, 0x77, 0xbb, 0x92, 0x9e, 0x80, 0xbd, 0xbd, 0x6d, 0xcc, 0xab, 0x99,
+    0x88, 0x58, 0x4d, 0xb0, 0x6c, 0xbc, 0x96, 0xbd, 0xae, 0xab, 0x5b, 0xac,
+    0x2f, 0xc3, 0x9a, 0xbe, 0xac, 0xb3, 0x84, 0x9b, 0xe3, 0xaf, 0x95, 0x6b,
+    0xc2, 0xb5, 0xca, 0xb7, 0x4e, 0xbc, 0x9d, 0x24, 0x75, 0xa9, 0xd2, 0xae,
+    0xa0, 0x2b, 0x90, 0x34, 0xd1, 0xb5, 0x96, 0xae, 0xaa, 0x4d, 0xc1, 0xa3,
+    0xb1, 0xb4, 0xaa, 0xd2, 0x9c, 0x7d, 0xc0, 0x91, 0x91, 0x7a, 0xb8, 0x83,
+    0x44, 0xcb, 0xaf, 0x9b, 0x6b, 0x5b, 0x75, 0xb2, 0x62, 0xb6, 0xaa, 0xcb,
+    0x99, 0xa8, 0x63, 0xae, 0x24, 0xc7, 0x8a, 0xbe, 0xa9, 0xb6, 0xa0, 0xa1,
+    0x41, 0xac, 0x84, 0xb5, 0xb9, 0xb3, 0x9b, 0xad, 0x77, 0xbf, 0xa8, 0x7e,
+    0x82, 0xb9, 0xbe, 0xaa, 0xa3, 0x47, 0x6d, 0xb5, 0xc3, 0xb1, 0xbf, 0xa7,
+    0xb1, 0x57, 0x75, 0xb5, 0xb0, 0xb6, 0xb9, 0xce, 0xa4, 0x86, 0xb0, 0xa4,
+    0x98, 0x80, 0xc5, 0x3e, 0x90, 0xca, 0x9b, 0xa2, 0x5a, 0x50, 0xc5, 0xa5,
+    0xad, 0xc1, 0x9c, 0x91, 0x83, 0x8f, 0x21, 0xab, 0xac, 0xba, 0x70, 0xb4,
+    0xae, 0x85, 0x7e, 0xa7, 0xbd, 0xba, 0x7c, 0xb2, 0xb5, 0xb2, 0x7e, 0xb3,
+    0xc3, 0xcd, 0x82, 0xac, 0x9b, 0xb3, 0xa6, 0xb0, 0xbc, 0x6f, 0x52, 0xb9,
+    0xbf, 0xb1, 0xa6, 0xa4, 0xc1, 0x7a, 0x90, 0xc0, 0xae, 0xab, 0x94, 0xd8,
+    0xab, 0xa4, 0x98, 0xbb, 0x8b, 0x86, 0x94, 0x01, 0xad, 0xe7, 0xb1, 0x9b,
+    0x57, 0x48, 0xc1, 0x88, 0xbf, 0xcc, 0xb4, 0x4b, 0x62, 0x8b, 0x48, 0xa7,
+    0xbe, 0xe1, 0x80, 0xa6, 0xb3, 0x64, 0xaa, 0xa4, 0xcf, 0xba, 0x6d, 0xa6,
+    0xb8, 0xa0, 0x8f, 0xb3, 0xce, 0xc3, 0x87, 0xb2, 0xa0, 0xc0, 0x78, 0xb0,
+    0xb9, 0xaa, 0x40, 0xb8, 0xd8, 0xa3, 0x9a, 0xaa, 0xcc, 0xa2, 0x9f, 0xb9,
+    0xbe, 0xc2, 0x89, 0xd6, 0xc6, 0x9c, 0xa3, 0xc7, 0x94, 0xb6, 0xff, 0xff,
+    0x98, 0xb6, 0xff, 0xff, 0xf6, 0xb6, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+    0xc0, 0x44, 0x00, 0x00, 0x4a, 0x4d, 0x59, 0x60, 0x5a, 0x45, 0x3d, 0x50,
+    0x4a, 0x43, 0x3d, 0x59, 0x3e, 0x49, 0x4a, 0x59, 0x45, 0x44, 0x41, 0x5d,
+    0x50, 0x2f, 0x4e, 0x34, 0x46, 0x48, 0x41, 0x4a, 0x4c, 0x3b, 0x4b, 0x3e,
+    0x49, 0x49, 0x43, 0x4b, 0x3e, 0x49, 0x47, 0x41, 0x3e, 0x4a, 0x46, 0x43,
+    0x41, 0x43, 0x47, 0x49, 0x4a, 0x4c, 0x46, 0x58, 0x3f, 0x4c, 0x4b, 0x4c,
+    0x4d, 0x4b, 0x45, 0x52, 0x45, 0x42, 0x52, 0x52, 0x48, 0x40, 0x46, 0x5f,
+    0x4c, 0x41, 0x47, 0x48, 0x48, 0x4c, 0x43, 0x61, 0x50, 0x4b, 0x49, 0x49,
+    0x46, 0x3f, 0x40, 0x67, 0x40, 0x4d, 0x45, 0x40, 0x40, 0x45, 0x47, 0x56,
+    0x44, 0x3a, 0x4a, 0x4c, 0x52, 0x48, 0x46, 0x50, 0x4b, 0x44, 0x51, 0x45,
+    0x40, 0x45, 0x45, 0x48, 0x4e, 0x4e, 0x43, 0x48, 0x44, 0x4b, 0x45, 0x4a,
+    0x53, 0x45, 0x4a, 0x4b, 0x3f, 0x43, 0x45, 0x53, 0x4d, 0x43, 0x46, 0x3f,
+    0x47, 0x4e, 0x51, 0x50, 0x48, 0x4f, 0x4f, 0x4a, 0x4a, 0x4e, 0x45, 0x4e,
+    0x46, 0x41, 0x4a, 0x46, 0x45, 0x47, 0x45, 0x4b, 0x50, 0x4c, 0x46, 0x45,
+    0x41, 0x47, 0x41, 0x47, 0x46, 0x4f, 0x3f, 0x4f, 0x4a, 0x51, 0x4f, 0x53,
+    0x54, 0x48, 0x51, 0x43, 0x4b, 0x48, 0x4d, 0x46, 0x48, 0x4f, 0x49, 0x44,
+    0x43, 0x53, 0x50, 0x59, 0x56, 0x3d, 0x45, 0x44, 0x48, 0x38, 0x3b, 0x5f,
+    0x39, 0x43, 0x43, 0x52, 0x46, 0x3e, 0x43, 0x58, 0x43, 0x1e, 0x50, 0x3c,
+    0x46, 0x4b, 0x46, 0x50, 0x3c, 0x37, 0x4c, 0x47, 0x47, 0x4b, 0x47, 0x54,
+    0x43, 0x3e, 0x47, 0x4f, 0x4b, 0x41, 0x53, 0x50, 0x42, 0x46, 0x4f, 0x4b,
+    0x4e, 0x3f, 0x49, 0x52, 0x4a, 0x4a, 0x49, 0x53, 0x52, 0x47, 0x52, 0x5a,
+    0x40, 0x42, 0x4d, 0x4b, 0x50, 0x43, 0x49, 0x59, 0x47, 0x4c, 0x4d, 0x50,
+    0x4e, 0x3c, 0x44, 0x61, 0x51, 0x49, 0x49, 0x46, 0x49, 0x47, 0x4b, 0x5a,
+    0x45, 0x4b, 0x43, 0x40, 0x44, 0x52, 0x4d, 0x54, 0x49, 0x47, 0x44, 0x48,
+    0x46, 0x48, 0x3e, 0x40, 0x45, 0x4f, 0x4d, 0x4b, 0x4c, 0x40, 0x3d, 0x40,
+    0x3e, 0x48, 0x50, 0x4e, 0x4c, 0x42, 0x48, 0x4b, 0x3d, 0x48, 0x4b, 0x44,
+    0x52, 0x4b, 0x49, 0x4f, 0x49, 0x3f, 0x47, 0x43, 0x4d, 0x3f, 0x53, 0x4e,
+    0x4a, 0x4f, 0x4e, 0x4e, 0x53, 0x42, 0x46, 0x4c, 0x44, 0x4c, 0x46, 0x51,
+    0x45, 0x48, 0x4a, 0x50, 0x47, 0x41, 0x45, 0x54, 0x4a, 0x44, 0x50, 0x49,
+    0x48, 0x50, 0x51, 0x4b, 0x50, 0x4c, 0x4a, 0x49, 0x43, 0x47, 0x50, 0x4a,
+    0x4d, 0x4c, 0x4e, 0x49, 0x42, 0x50, 0x52, 0x48, 0x45, 0x5a, 0x4e, 0x55,
+    0x51, 0x3d, 0x3d, 0x4d, 0x42, 0x32, 0x36, 0x64, 0x39, 0x4c, 0x41, 0x48,
+    0x44, 0x35, 0x43, 0x56, 0x47, 0x1e, 0x4b, 0x3e, 0x47, 0x3f, 0x43, 0x52,
+    0x51, 0x34, 0x41, 0x4d, 0x3e, 0x41, 0x41, 0x48, 0x3c, 0x4b, 0x45, 0x3b,
+    0x40, 0x43, 0x4c, 0x46, 0x46, 0x47, 0x3e, 0x4f, 0x4b, 0x48, 0x42, 0x47,
+    0x4e, 0x3e, 0x49, 0x47, 0x43, 0x43, 0x4e, 0x52, 0x51, 0x45, 0x3f, 0x54,
+    0x46, 0x44, 0x48, 0x5d, 0x3e, 0x4a, 0x47, 0x52, 0x53, 0x3a, 0x4f, 0x5d,
+    0x41, 0x4c, 0x48, 0x51, 0x43, 0x4b, 0x4b, 0x67, 0x48, 0x4b, 0x45, 0x4d,
+    0x4b, 0x43, 0x4a, 0x54, 0x4c, 0x46, 0x43, 0x4a, 0x4d, 0x43, 0x4c, 0x47,
+    0x4a, 0x48, 0x4d, 0x42, 0x4d, 0x48, 0x3f, 0x43, 0x4c, 0x44, 0x4e, 0x4c,
+    0x40, 0x45, 0x4b, 0x48, 0x47, 0x47, 0x3e, 0x4c, 0x52, 0x41, 0x44, 0x4e,
+    0x4d, 0x44, 0x49, 0x4d, 0x3d, 0x45, 0x48, 0x4f, 0x4c, 0x4a, 0x55, 0x51,
+    0x4d, 0x4c, 0x45, 0x4e, 0x46, 0x45, 0x44, 0x49, 0x4e, 0x44, 0x40, 0x48,
+    0x49, 0x44, 0x53, 0x51, 0x42, 0x41, 0x51, 0x49, 0x51, 0x45, 0x51, 0x3f,
+    0x4b, 0x3f, 0x52, 0x3c, 0x50, 0x4d, 0x4f, 0x4b, 0x44, 0x4f, 0x40, 0x52,
+    0x49, 0x4a, 0x50, 0x3f, 0x3d, 0x54, 0x4c, 0x53, 0x52, 0x45, 0x41, 0x43,
+    0x47, 0x2d, 0x40, 0x63, 0x3a, 0x51, 0x43, 0x4e, 0x40, 0x2b, 0x36, 0x5b,
+    0x4b, 0x12, 0x4d, 0x35, 0x4b, 0x3f, 0x44, 0x4a, 0x46, 0x31, 0x54, 0x48,
+    0x43, 0x42, 0x3d, 0x51, 0x41, 0x45, 0x49, 0x4b, 0x47, 0x49, 0x3d, 0x3e,
+    0x46, 0x3d, 0x4d, 0x48, 0x3d, 0x45, 0x48, 0x4b, 0x49, 0x52, 0x44, 0x4c,
+    0x45, 0x44, 0x45, 0x49, 0x50, 0x48, 0x45, 0x46, 0x45, 0x44, 0x52, 0x55,
+    0x46, 0x45, 0x4b, 0x3d, 0x42, 0x4a, 0x3e, 0x57, 0x48, 0x4b, 0x3c, 0x42,
+    0x4a, 0x46, 0x47, 0x6c, 0x54, 0x4b, 0x41, 0x49, 0x49, 0x50, 0x43, 0x56,
+    0x44, 0x43, 0x4d, 0x3e, 0x44, 0x41, 0x47, 0x40, 0x4a, 0x4b, 0x4d, 0x4d,
+    0x3e, 0x46, 0x45, 0x47, 0x3e, 0x42, 0x4a, 0x45, 0x49, 0x3d, 0x3f, 0x43,
+    0x40, 0x44, 0x47, 0x4a, 0x45, 0x4d, 0x4b, 0x4c, 0x43, 0x40, 0x3d, 0x3e,
+    0x4c, 0x4c, 0x42, 0x4d, 0x48, 0x4d, 0x49, 0x42, 0x51, 0x51, 0x4c, 0x4b,
+    0x53, 0x4f, 0x48, 0x4d, 0x40, 0x46, 0x45, 0x4b, 0x47, 0x47, 0x4b, 0x46,
+    0x54, 0x42, 0x42, 0x46, 0x46, 0x4a, 0x4c, 0x55, 0x3f, 0x3c, 0x52, 0x4b,
+    0x4b, 0x4d, 0x4e, 0x48, 0x53, 0x4c, 0x4b, 0x42, 0x52, 0x54, 0x50, 0x4b,
+    0x40, 0x5f, 0x58, 0x53, 0x50, 0x42, 0x35, 0x48, 0x39, 0x24, 0x3c, 0x5e,
+    0x41, 0x50, 0x3c, 0x51, 0x42, 0x26, 0x42, 0x56, 0x41, 0x0c, 0x3e, 0x3d,
+    0x48, 0x3e, 0x50, 0x4b, 0x3a, 0x2c, 0x43, 0x3d, 0x48, 0x3e, 0x43, 0x48,
+    0x4c, 0x3f, 0x4a, 0x3e, 0x51, 0x4a, 0x4f, 0x40, 0x47, 0x43, 0x50, 0x4c,
+    0x43, 0x4d, 0x3f, 0x45, 0x4d, 0x3e, 0x4c, 0x44, 0x51, 0x47, 0x4b, 0x51,
+    0x45, 0x49, 0x44, 0x3f, 0x46, 0x46, 0x46, 0x57, 0x49, 0x4c, 0x49, 0x4e,
+    0x47, 0x4c, 0x47, 0x5e, 0x43, 0x46, 0x45, 0x4b, 0x52, 0x49, 0x45, 0x5f,
+    0x47, 0x41, 0x46, 0x43, 0x4f, 0x3b, 0x43, 0x51, 0x46, 0x53, 0x4a, 0x4e,
+    0x4b, 0x43, 0x4e, 0x40, 0x48, 0x49, 0x46, 0x3f, 0x48, 0x50, 0x4b, 0x41,
+    0x4a, 0x47, 0x4b, 0x3d, 0x46, 0x49, 0x4b, 0x43, 0x43, 0x42, 0x3e, 0x47,
+    0x47, 0x4a, 0x45, 0x46, 0x51, 0x48, 0x51, 0x4e, 0x3f, 0x50, 0x44, 0x4b,
+    0x4d, 0x4e, 0x44, 0x4d, 0x3d, 0x49, 0x4a, 0x4e, 0x42, 0x51, 0x43, 0x42,
+    0x46, 0x3e, 0x48, 0x4b, 0x4f, 0x50, 0x3d, 0x48, 0x4c, 0x4f, 0x46, 0x44,
+    0x44, 0x48, 0x42, 0x4b, 0x48, 0x41, 0x43, 0x46, 0x4d, 0x49, 0x4f, 0x43,
+    0x41, 0x44, 0x3f, 0x3d, 0x45, 0x4f, 0x45, 0x41, 0x40, 0x58, 0x4f, 0x54,
+    0x5b, 0x4b, 0x3a, 0x47, 0x3d, 0x28, 0x3d, 0x57, 0x3e, 0x51, 0x3f, 0x47,
+    0x3f, 0x2e, 0x3e, 0x54, 0x4e, 0x0b, 0x41, 0x3d, 0x3b, 0x3d, 0x43, 0x47,
+    0x47, 0x28, 0x4d, 0x43, 0x43, 0x3b, 0x4e, 0x4a, 0x4d, 0x42, 0x51, 0x46,
+    0x4f, 0x3d, 0x4c, 0x3a, 0x49, 0x49, 0x4a, 0x43, 0x42, 0x4b, 0x47, 0x42,
+    0x42, 0x49, 0x3f, 0x4d, 0x46, 0x4a, 0x49, 0x4e, 0x42, 0x3c, 0x4a, 0x41,
+    0x4c, 0x40, 0x4d, 0x5a, 0x49, 0x46, 0x51, 0x46, 0x4b, 0x4c, 0x46, 0x62,
+    0x45, 0x42, 0x51, 0x4e, 0x4d, 0x3e, 0x4d, 0x5b, 0x4d, 0x43, 0x45, 0x50,
+    0x4b, 0x40, 0x50, 0x53, 0x4f, 0x4f, 0x51, 0x53, 0x46, 0x41, 0x4e, 0x3a,
+    0x4b, 0x47, 0x3f, 0x3e, 0x4d, 0x48, 0x53, 0x3f, 0x45, 0x42, 0x4c, 0x45,
+    0x55, 0x4c, 0x4b, 0x39, 0x4a, 0x45, 0x48, 0x4d, 0x47, 0x40, 0x48, 0x4f,
+    0x4d, 0x49, 0x3e, 0x41, 0x46, 0x4e, 0x40, 0x49, 0x4b, 0x47, 0x4c, 0x45,
+    0x44, 0x51, 0x4f, 0x4b, 0x48, 0x49, 0x44, 0x41, 0x43, 0x46, 0x51, 0x45,
+    0x40, 0x48, 0x4b, 0x42, 0x44, 0x4f, 0x53, 0x4d, 0x44, 0x46, 0x4e, 0x4c,
+    0x48, 0x50, 0x41, 0x45, 0x42, 0x48, 0x4d, 0x4d, 0x47, 0x45, 0x41, 0x45,
+    0x48, 0x58, 0x4e, 0x46, 0x43, 0x53, 0x57, 0x52, 0x5e, 0x42, 0x45, 0x4e,
+    0x39, 0x24, 0x32, 0x56, 0x47, 0x56, 0x49, 0x52, 0x46, 0x26, 0x3a, 0x51,
+    0x4b, 0x05, 0x3e, 0x43, 0x3f, 0x38, 0x4d, 0x4b, 0x4f, 0x27, 0x51, 0x46,
+    0x47, 0x41, 0x4a, 0x47, 0x4a, 0x3e, 0x44, 0x51, 0x3f, 0x3a, 0x43, 0x46,
+    0x4d, 0x49, 0x46, 0x52, 0x43, 0x48, 0x49, 0x3e, 0x47, 0x46, 0x4a, 0x4d,
+    0x47, 0x46, 0x52, 0x50, 0x44, 0x48, 0x4c, 0x47, 0x45, 0x41, 0x49, 0x5b,
+    0x4d, 0x4b, 0x47, 0x4c, 0x4a, 0x47, 0x45, 0x5b, 0x49, 0x46, 0x52, 0x47,
+    0x47, 0x3d, 0x55, 0x59, 0x40, 0x4b, 0x3e, 0x50, 0x42, 0x43, 0x40, 0x4f,
+    0x48, 0x3f, 0x47, 0x53, 0x4d, 0x44, 0x4e, 0x37, 0x4c, 0x43, 0x51, 0x4d,
+    0x46, 0x4e, 0x40, 0x41, 0x52, 0x44, 0x43, 0x4a, 0x50, 0x48, 0x47, 0x42,
+    0x48, 0x45, 0x50, 0x4d, 0x42, 0x52, 0x44, 0x43, 0x45, 0x43, 0x4c, 0x4d,
+    0x44, 0x51, 0x47, 0x48, 0x51, 0x4f, 0x48, 0x45, 0x49, 0x4a, 0x3e, 0x43,
+    0x4d, 0x4e, 0x4e, 0x46, 0x54, 0x4d, 0x49, 0x4d, 0x47, 0x46, 0x4b, 0x41,
+    0x4a, 0x49, 0x44, 0x45, 0x4d, 0x3e, 0x53, 0x50, 0x47, 0x4d, 0x4e, 0x43,
+    0x4f, 0x45, 0x4e, 0x4a, 0x47, 0x49, 0x4c, 0x4c, 0x4d, 0x54, 0x42, 0x4c,
+    0x43, 0x5d, 0x59, 0x50, 0x5e, 0x4b, 0x44, 0x43, 0x3c, 0x25, 0x31, 0x5b,
+    0x46, 0x5a, 0x50, 0x4d, 0x41, 0x2a, 0x41, 0x4f, 0x44, 0x00, 0x41, 0x3d,
+    0x43, 0x4b, 0x47, 0x45, 0x4e, 0x2e, 0x44, 0x46, 0x53, 0x3d, 0x43, 0x41,
+    0x44, 0x46, 0x49, 0x42, 0x45, 0x4f, 0x4d, 0x3a, 0x43, 0x3c, 0x47, 0x53,
+    0x43, 0x4e, 0x3f, 0x41, 0x4d, 0x50, 0x4b, 0x4c, 0x51, 0x47, 0x53, 0x4f,
+    0x45, 0x4a, 0x44, 0x45, 0x41, 0x46, 0x47, 0x50, 0x51, 0x3f, 0x3e, 0x41,
+    0x48, 0x45, 0x46, 0x5d, 0x45, 0x4a, 0x4c, 0x46, 0x4a, 0x49, 0x50, 0x51,
+    0x51, 0x4c, 0x4f, 0x47, 0x47, 0x42, 0x45, 0x47, 0x4e, 0x48, 0x46, 0x40,
+    0x45, 0x46, 0x4d, 0x3b, 0x4d, 0x52, 0x4c, 0x51, 0x49, 0x51, 0x47, 0x3d,
+    0x4d, 0x42, 0x4f, 0x4e, 0x43, 0x43, 0x45, 0x3a, 0x42, 0x50, 0x4c, 0x4a,
+    0x41, 0x53, 0x4c, 0x45, 0x51, 0x3f, 0x54, 0x43, 0x4b, 0x54, 0x56, 0x4d,
+    0x4f, 0x4a, 0x50, 0x4b, 0x44, 0x45, 0x4f, 0x4f, 0x47, 0x3e, 0x50, 0x4f,
+    0x4b, 0x48, 0x4d, 0x49, 0x55, 0x4d, 0x45, 0x4d, 0x4a, 0x53, 0x43, 0x46,
+    0x4c, 0x45, 0x41, 0x46, 0x49, 0x49, 0x4f, 0x4b, 0x49, 0x50, 0x52, 0x49,
+    0x41, 0x54, 0x44, 0x4c, 0x44, 0x63, 0x4a, 0x49, 0x40, 0x59, 0x52, 0x52,
+    0x59, 0x3f, 0x3e, 0x3e, 0x40, 0x25, 0x3c, 0x5c, 0x4f, 0x57, 0x44, 0x50,
+    0x41, 0x2a, 0x48, 0x4f, 0x43, 0x08, 0x47, 0x43, 0x49, 0x48, 0x4d, 0x49,
+    0x46, 0x2b, 0x48, 0x44, 0x4e, 0x47, 0x47, 0x43, 0x44, 0x3e, 0x4a, 0x52,
+    0x3f, 0x4a, 0x53, 0x42, 0x49, 0x47, 0x4c, 0x50, 0x43, 0x46, 0x46, 0x3c,
+    0x4c, 0x47, 0x4e, 0x4d, 0x42, 0x41, 0x53, 0x52, 0x4f, 0x40, 0x54, 0x50,
+    0x46, 0x43, 0x50, 0x56, 0x51, 0x48, 0x48, 0x48, 0x49, 0x39, 0x47, 0x5e,
+    0x4e, 0x4b, 0x4f, 0x4e, 0x43, 0x45, 0x42, 0x58, 0x4a, 0x3b, 0x48, 0x4d,
+    0x43, 0x3e, 0x4b, 0x43, 0x3c, 0x45, 0x46, 0x4b, 0x42, 0x42, 0x4e, 0x3d,
+    0x4b, 0x4e, 0x51, 0x52, 0x48, 0x3e, 0x4b, 0x3f, 0x4c, 0x4a, 0x4b, 0x4c,
+    0x46, 0x48, 0x3e, 0x48, 0x47, 0x4d, 0x4a, 0x46, 0x49, 0x4d, 0x4a, 0x48,
+    0x50, 0x4b, 0x40, 0x48, 0x4b, 0x52, 0x46, 0x50, 0x4f, 0x3e, 0x42, 0x44,
+    0x44, 0x42, 0x43, 0x49, 0x4f, 0x4f, 0x46, 0x42, 0x4a, 0x54, 0x42, 0x48,
+    0x50, 0x4f, 0x4f, 0x4c, 0x4c, 0x47, 0x52, 0x49, 0x4c, 0x45, 0x4a, 0x4d,
+    0x4a, 0x41, 0x47, 0x4a, 0x4d, 0x4a, 0x4c, 0x46, 0x51, 0x44, 0x4b, 0x49,
+    0x53, 0x5e, 0x45, 0x4a, 0x3b, 0x57, 0x5a, 0x4c, 0x59, 0x43, 0x3e, 0x4a,
+    0x3e, 0x20, 0x36, 0x5d, 0x47, 0x5b, 0x3f, 0x55, 0x3e, 0x24, 0x41, 0x52,
+    0x3f, 0x01, 0x49, 0x41, 0x40, 0x45, 0x42, 0x46, 0x49, 0x2a, 0x47, 0x40,
+    0x44, 0x3f, 0x42, 0x47, 0x4e, 0x42, 0x4b, 0x3d, 0x45, 0x4c, 0x47, 0x3d,
+    0x4c, 0x44, 0x48, 0x43, 0x43, 0x41, 0x4a, 0x3d, 0x48, 0x4b, 0x46, 0x4e,
+    0x4c, 0x45, 0x48, 0x4d, 0x54, 0x4d, 0x3e, 0x46, 0x3e, 0x47, 0x44, 0x4e,
+    0x48, 0x49, 0x53, 0x4b, 0x41, 0x45, 0x4c, 0x57, 0x52, 0x4e, 0x40, 0x48,
+    0x4d, 0x43, 0x44, 0x5a, 0x4a, 0x4c, 0x48, 0x4d, 0x3f, 0x52, 0x41, 0x50,
+    0x4a, 0x47, 0x3e, 0x43, 0x4c, 0x42, 0x48, 0x3e, 0x4f, 0x4b, 0x41, 0x43,
+    0x49, 0x40, 0x43, 0x36, 0x3f, 0x4b, 0x49, 0x49, 0x51, 0x43, 0x48, 0x40,
+    0x4c, 0x51, 0x4d, 0x4a, 0x49, 0x3f, 0x4b, 0x3d, 0x4f, 0x4b, 0x43, 0x4d,
+    0x46, 0x40, 0x46, 0x4d, 0x49, 0x48, 0x4d, 0x4c, 0x52, 0x4c, 0x49, 0x4f,
+    0x53, 0x40, 0x49, 0x53, 0x47, 0x43, 0x4c, 0x45, 0x42, 0x48, 0x42, 0x4e,
+    0x49, 0x43, 0x42, 0x40, 0x4f, 0x46, 0x50, 0x47, 0x51, 0x4a, 0x52, 0x45,
+    0x4c, 0x51, 0x48, 0x47, 0x40, 0x41, 0x52, 0x4f, 0x41, 0x5a, 0x53, 0x47,
+    0x42, 0x5f, 0x55, 0x4f, 0x53, 0x3e, 0x41, 0x49, 0x3d, 0x20, 0x3f, 0x54,
+    0x42, 0x5b, 0x49, 0x4d, 0x3d, 0x22, 0x3e, 0x48, 0x41, 0x01, 0x4c, 0x3d,
+    0x43, 0x4a, 0x46, 0x43, 0x4f, 0x2b, 0x49, 0x46, 0x47, 0x4a, 0x51, 0x3d,
+    0x4b, 0x44, 0x49, 0x41, 0x47, 0x47, 0x45, 0x3a, 0x44, 0x42, 0x40, 0x52,
+    0x46, 0x51, 0x4a, 0x41, 0x4a, 0x52, 0x44, 0x52, 0x4a, 0x40, 0x46, 0x45,
+    0x52, 0x4c, 0x4e, 0x42, 0x42, 0x48, 0x40, 0x4f, 0x4b, 0x4f, 0x51, 0x4c,
+    0x4e, 0x48, 0x4a, 0x5a, 0x46, 0x3d, 0x41, 0x50, 0x52, 0x4c, 0x44, 0x53,
+    0x4b, 0x4d, 0x4f, 0x49, 0x47, 0x4c, 0x48, 0x45, 0x48, 0x4a, 0x44, 0x4e,
+    0x4c, 0x40, 0x4d, 0x35, 0x40, 0x49, 0x4a, 0x51, 0x49, 0x4a, 0x46, 0x36,
+    0x46, 0x47, 0x4a, 0x4c, 0x40, 0x4e, 0x42, 0x38, 0x48, 0x45, 0x42, 0x49,
+    0x54, 0x4c, 0x3f, 0x49, 0x4c, 0x39, 0x47, 0x45, 0x4e, 0x4a, 0x42, 0x44,
+    0x4b, 0x53, 0x43, 0x40, 0x46, 0x51, 0x3d, 0x50, 0x4b, 0x43, 0x4a, 0x4c,
+    0x55, 0x54, 0x4a, 0x43, 0x48, 0x40, 0x44, 0x3f, 0x47, 0x45, 0x3e, 0x41,
+    0x49, 0x44, 0x4d, 0x49, 0x44, 0x41, 0x4a, 0x50, 0x44, 0x49, 0x4d, 0x47,
+    0x4a, 0x49, 0x46, 0x49, 0x40, 0x5b, 0x4d, 0x51, 0x47, 0x57, 0x49, 0x4f,
+    0x56, 0x46, 0x3a, 0x4a, 0x3e, 0x22, 0x36, 0x5c, 0x44, 0x56, 0x46, 0x48,
+    0x3a, 0x2d, 0x4a, 0x48, 0x44, 0x17, 0x41, 0x42, 0x40, 0x3d, 0x4e, 0x45,
+    0x40, 0x26, 0x43, 0x52, 0x41, 0x40, 0x44, 0x4a, 0x48, 0x42, 0x4f, 0x47,
+    0x46, 0x4c, 0x4a, 0x3b, 0x42, 0x3e, 0x3e, 0x49, 0x4e, 0x44, 0x4e, 0x49,
+    0x47, 0x41, 0x47, 0x44, 0x4c, 0x45, 0x4d, 0x49, 0x49, 0x48, 0x55, 0x3d,
+    0x4a, 0x45, 0x50, 0x4f, 0x46, 0x4c, 0x46, 0x45, 0x3c, 0x51, 0x4b, 0x5a,
+    0x46, 0x47, 0x54, 0x41, 0x44, 0x40, 0x4f, 0x53, 0x49, 0x46, 0x46, 0x48,
+    0x44, 0x40, 0x50, 0x49, 0x49, 0x43, 0x50, 0x41, 0x52, 0x4b, 0x46, 0x3e,
+    0x44, 0x44, 0x46, 0x4e, 0x47, 0x48, 0x3e, 0x38, 0x4c, 0x4c, 0x48, 0x43,
+    0x48, 0x3e, 0x50, 0x42, 0x51, 0x50, 0x4a, 0x48, 0x4a, 0x42, 0x44, 0x3d,
+    0x4a, 0x46, 0x46, 0x3d, 0x4e, 0x47, 0x3d, 0x48, 0x4c, 0x46, 0x50, 0x4d,
+    0x49, 0x45, 0x4a, 0x4c, 0x4c, 0x47, 0x4a, 0x42, 0x4a, 0x45, 0x50, 0x52,
+    0x4b, 0x4d, 0x4c, 0x43, 0x42, 0x53, 0x41, 0x45, 0x49, 0x41, 0x4b, 0x4c,
+    0x52, 0x54, 0x4b, 0x41, 0x48, 0x4c, 0x47, 0x4c, 0x41, 0x49, 0x4a, 0x47,
+    0x50, 0x59, 0x4e, 0x45, 0x3c, 0x5d, 0x53, 0x4c, 0x5a, 0x3e, 0x3a, 0x51,
+    0x3a, 0x22, 0x35, 0x59, 0x40, 0x5a, 0x43, 0x46, 0x41, 0x32, 0x44, 0x4b,
+    0x47, 0x04, 0x4c, 0x3a, 0x4a, 0x49, 0x48, 0x3d, 0x45, 0x2b, 0x50, 0x41,
+    0x3e, 0x44, 0x4f, 0x43, 0x4a, 0x3f, 0x48, 0x4b, 0x53, 0x49, 0x4b, 0x38,
+    0x44, 0x40, 0x48, 0x4c, 0x41, 0x3f, 0x47, 0x3e, 0x47, 0x49, 0x45, 0x42,
+    0x43, 0x3e, 0x46, 0x44, 0x53, 0x4d, 0x48, 0x44, 0x45, 0x42, 0x43, 0x53,
+    0x55, 0x49, 0x4d, 0x4b, 0x45, 0x44, 0x47, 0x5f, 0x48, 0x44, 0x4a, 0x48,
+    0x45, 0x4d, 0x4f, 0x5e, 0x4e, 0x46, 0x49, 0x49, 0x4d, 0x49, 0x44, 0x48,
+    0x4d, 0x41, 0x50, 0x48, 0x3d, 0x3f, 0x4d, 0x38, 0x46, 0x4a, 0x50, 0x4a,
+    0x45, 0x3e, 0x43, 0x36, 0x42, 0x48, 0x53, 0x54, 0x49, 0x43, 0x4b, 0x3a,
+    0x45, 0x48, 0x50, 0x45, 0x4a, 0x4c, 0x4a, 0x4d, 0x43, 0x4c, 0x55, 0x4e,
+    0x4c, 0x42, 0x45, 0x52, 0x52, 0x45, 0x46, 0x40, 0x54, 0x4c, 0x3d, 0x4e,
+    0x49, 0x4e, 0x44, 0x47, 0x45, 0x48, 0x4b, 0x50, 0x49, 0x4b, 0x44, 0x4b,
+    0x4f, 0x49, 0x47, 0x47, 0x53, 0x3f, 0x4b, 0x42, 0x45, 0x3e, 0x4d, 0x4d,
+    0x48, 0x51, 0x45, 0x40, 0x43, 0x43, 0x4e, 0x44, 0x51, 0x55, 0x4a, 0x3e,
+    0x45, 0x55, 0x58, 0x50, 0x50, 0x38, 0x44, 0x4f, 0x3b, 0x23, 0x3c, 0x55,
+    0x3c, 0x54, 0x49, 0x42, 0x44, 0x2f, 0x3e, 0x47, 0x42, 0x01, 0x42, 0x37,
+    0x3f, 0x42, 0x45, 0x45, 0x47, 0x2a, 0x52, 0x4b, 0x45, 0x3c, 0x47, 0x44,
+    0x44, 0x40, 0x50, 0x53, 0x48, 0x42, 0x4d, 0x36, 0x50, 0x3d, 0x49, 0x44,
+    0x4f, 0x4c, 0x4a, 0x42, 0x4d, 0x3e, 0x3d, 0x3f, 0x4e, 0x44, 0x4d, 0x4e,
+    0x54, 0x3d, 0x42, 0x46, 0x49, 0x47, 0x4b, 0x53, 0x45, 0x46, 0x47, 0x4a,
+    0x45, 0x3d, 0x4a, 0x5f, 0x51, 0x3e, 0x45, 0x45, 0x44, 0x3a, 0x4d, 0x57,
+    0x45, 0x47, 0x4d, 0x45, 0x4e, 0x4b, 0x51, 0x48, 0x4b, 0x4a, 0x3c, 0x4e,
+    0x51, 0x41, 0x4d, 0x36, 0x47, 0x4a, 0x46, 0x51, 0x4e, 0x4c, 0x52, 0x41,
+    0x55, 0x47, 0x41, 0x47, 0x4d, 0x47, 0x4b, 0x3d, 0x4a, 0x4a, 0x46, 0x49,
+    0x4d, 0x48, 0x46, 0x46, 0x4d, 0x52, 0x52, 0x48, 0x49, 0x3f, 0x4b, 0x4e,
+    0x4c, 0x49, 0x45, 0x47, 0x41, 0x4b, 0x44, 0x48, 0x52, 0x4b, 0x53, 0x44,
+    0x46, 0x4e, 0x44, 0x49, 0x52, 0x50, 0x46, 0x4b, 0x44, 0x43, 0x50, 0x49,
+    0x4a, 0x53, 0x45, 0x49, 0x52, 0x3f, 0x4a, 0x4e, 0x49, 0x4c, 0x4d, 0x4d,
+    0x40, 0x40, 0x3f, 0x4a, 0x47, 0x56, 0x51, 0x43, 0x40, 0x5a, 0x58, 0x52,
+    0x4f, 0x3d, 0x3d, 0x45, 0x38, 0x29, 0x33, 0x59, 0x45, 0x54, 0x3c, 0x42,
+    0x3f, 0x27, 0x3e, 0x49, 0x48, 0x06, 0x4a, 0x3f, 0x41, 0x49, 0x4c, 0x48,
+    0x46, 0x2b, 0x4a, 0x4f, 0x44, 0x46, 0x4c, 0x46, 0x4a, 0x3b, 0x4d, 0x4a,
+    0x40, 0x41, 0x45, 0x38, 0x51, 0x39, 0x46, 0x46, 0x41, 0x51, 0x4e, 0x41,
+    0x49, 0x44, 0x48, 0x4a, 0x4b, 0x46, 0x47, 0x46, 0x4a, 0x4c, 0x47, 0x48,
+    0x3d, 0x42, 0x50, 0x4f, 0x50, 0x4a, 0x4a, 0x48, 0x4a, 0x45, 0x45, 0x61,
+    0x4a, 0x4c, 0x49, 0x3d, 0x4b, 0x4a, 0x4a, 0x5a, 0x48, 0x49, 0x50, 0x4f,
+    0x42, 0x48, 0x3e, 0x44, 0x43, 0x3b, 0x4f, 0x54, 0x4b, 0x4a, 0x47, 0x31,
+    0x4a, 0x49, 0x47, 0x4e, 0x48, 0x48, 0x46, 0x42, 0x4a, 0x45, 0x4c, 0x49,
+    0x4b, 0x4e, 0x53, 0x43, 0x4c, 0x49, 0x4f, 0x4b, 0x46, 0x4c, 0x4b, 0x4e,
+    0x51, 0x4b, 0x49, 0x52, 0x44, 0x55, 0x45, 0x49, 0x4b, 0x4a, 0x50, 0x4c,
+    0x4d, 0x4a, 0x4b, 0x48, 0x41, 0x46, 0x47, 0x43, 0x4b, 0x3f, 0x54, 0x4a,
+    0x46, 0x49, 0x51, 0x48, 0x4e, 0x4a, 0x41, 0x52, 0x52, 0x4e, 0x53, 0x47,
+    0x42, 0x48, 0x43, 0x44, 0x54, 0x51, 0x40, 0x49, 0x4c, 0x48, 0x49, 0x44,
+    0x4c, 0x56, 0x52, 0x49, 0x3d, 0x59, 0x4f, 0x56, 0x56, 0x42, 0x46, 0x45,
+    0x3e, 0x28, 0x3f, 0x5b, 0x3f, 0x5a, 0x4c, 0x42, 0x44, 0x22, 0x3f, 0x46,
+    0x47, 0x0d, 0x3e, 0x41, 0x45, 0x49, 0x4a, 0x3b, 0x45, 0x2d, 0x4d, 0x4a,
+    0x44, 0x43, 0x49, 0x46, 0x4b, 0x47, 0x49, 0x45, 0x4e, 0x40, 0x4c, 0x3c,
+    0x42, 0x3e, 0x4b, 0x50, 0x48, 0x49, 0x4c, 0x42, 0x3c, 0x43, 0x50, 0x43,
+    0x49, 0x4e, 0x4e, 0x43, 0x46, 0x4c, 0x48, 0x4a, 0x43, 0x4c, 0x49, 0x4e,
+    0x47, 0x44, 0x50, 0x4c, 0x4a, 0x48, 0x47, 0x5f, 0x3f, 0x3e, 0x48, 0x4f,
+    0x4f, 0x49, 0x4a, 0x5f, 0x4e, 0x40, 0x4e, 0x48, 0x47, 0x44, 0x40, 0x4d,
+    0x3f, 0x4a, 0x53, 0x45, 0x3e, 0x50, 0x3f, 0x39, 0x50, 0x45, 0x45, 0x4b,
+    0x43, 0x41, 0x46, 0x41, 0x49, 0x47, 0x4b, 0x41, 0x3c, 0x4b, 0x46, 0x3f,
+    0x41, 0x4a, 0x4e, 0x4c, 0x49, 0x4c, 0x3f, 0x44, 0x53, 0x4c, 0x45, 0x49,
+    0x48, 0x4d, 0x48, 0x4a, 0x48, 0x4f, 0x45, 0x4d, 0x48, 0x4c, 0x41, 0x49,
+    0x42, 0x48, 0x53, 0x46, 0x4a, 0x46, 0x4b, 0x4f, 0x4c, 0x52, 0x4c, 0x51,
+    0x41, 0x4d, 0x49, 0x41, 0x49, 0x4f, 0x49, 0x42, 0x4a, 0x48, 0x51, 0x4a,
+    0x44, 0x4d, 0x55, 0x48, 0x47, 0x4d, 0x4d, 0x45, 0x42, 0x60, 0x4a, 0x51,
+    0x42, 0x54, 0x56, 0x56, 0x50, 0x4a, 0x3f, 0x4a, 0x40, 0x25, 0x3a, 0x59,
+    0x46, 0x58, 0x52, 0x46, 0x41, 0x28, 0x3d, 0x3e, 0x45, 0x13, 0x47, 0x41,
+    0x3d, 0x44, 0x48, 0x45, 0x49, 0x26, 0x46, 0x4c, 0x3b, 0x4a, 0x42, 0x47,
+    0x46, 0x41, 0x44, 0x52, 0x50, 0x4a, 0x4f, 0x40, 0x4b, 0x39, 0x42, 0x45,
+    0x4a, 0x4d, 0x4f, 0x3f, 0x42, 0x4f, 0x49, 0x45, 0x42, 0x4a, 0x46, 0x47,
+    0x48, 0x40, 0x4a, 0x46, 0x41, 0x3b, 0x48, 0x55, 0x4b, 0x4e, 0x4e, 0x48,
+    0x4b, 0x44, 0x46, 0x53, 0x48, 0x45, 0x4b, 0x53, 0x49, 0x43, 0x4a, 0x5c,
+    0x46, 0x45, 0x45, 0x49, 0x49, 0x49, 0x4c, 0x43, 0x4e, 0x4a, 0x41, 0x4a,
+    0x42, 0x43, 0x4a, 0x38, 0x44, 0x4a, 0x4b, 0x3f, 0x45, 0x49, 0x45, 0x38,
+    0x43, 0x40, 0x45, 0x4c, 0x47, 0x42, 0x3f, 0x42, 0x3e, 0x4a, 0x43, 0x50,
+    0x4a, 0x4e, 0x4f, 0x47, 0x4d, 0x49, 0x49, 0x47, 0x4a, 0x4d, 0x46, 0x4c,
+    0x4f, 0x3d, 0x52, 0x4a, 0x41, 0x44, 0x4b, 0x50, 0x4c, 0x52, 0x49, 0x50,
+    0x4b, 0x45, 0x49, 0x4d, 0x48, 0x55, 0x50, 0x47, 0x4e, 0x50, 0x4f, 0x48,
+    0x46, 0x4d, 0x4d, 0x41, 0x48, 0x51, 0x4b, 0x4c, 0x47, 0x51, 0x42, 0x42,
+    0x4d, 0x47, 0x43, 0x4c, 0x4c, 0x5a, 0x4e, 0x47, 0x3b, 0x59, 0x51, 0x57,
+    0x4c, 0x40, 0x46, 0x4c, 0x37, 0x2a, 0x35, 0x58, 0x44, 0x5b, 0x4c, 0x44,
+    0x3e, 0x2e, 0x3f, 0x43, 0x46, 0x23, 0x49, 0x3e, 0x41, 0x3f, 0x4b, 0x3e,
+    0x4e, 0x2f, 0x4d, 0x4a, 0x4e, 0x40, 0x4e, 0x41, 0x40, 0x3f, 0x4a, 0x42,
+    0x4d, 0x4c, 0x44, 0x47, 0x4e, 0x44, 0x40, 0x43, 0x4d, 0x49, 0x4f, 0x3d,
+    0x49, 0x3f, 0x51, 0x48, 0x42, 0x4a, 0x49, 0x47, 0x49, 0x46, 0x4a, 0x45,
+    0x45, 0x49, 0x53, 0x4d, 0x4c, 0x4e, 0x44, 0x50, 0x4b, 0x43, 0x4e, 0x5f,
+    0x3c, 0x40, 0x44, 0x46, 0x48, 0x4b, 0x42, 0x62, 0x4e, 0x50, 0x4c, 0x49,
+    0x4a, 0x4f, 0x44, 0x53, 0x42, 0x43, 0x49, 0x48, 0x4b, 0x3c, 0x4a, 0x37,
+    0x4c, 0x41, 0x49, 0x46, 0x46, 0x47, 0x43, 0x40, 0x4d, 0x4d, 0x4a, 0x48,
+    0x50, 0x4b, 0x50, 0x41, 0x44, 0x3e, 0x51, 0x47, 0x44, 0x4a, 0x44, 0x45,
+    0x48, 0x4d, 0x52, 0x4e, 0x44, 0x48, 0x4d, 0x43, 0x42, 0x45, 0x48, 0x52,
+    0x44, 0x42, 0x50, 0x42, 0x4d, 0x45, 0x48, 0x4d, 0x4f, 0x4e, 0x45, 0x49,
+    0x51, 0x48, 0x4f, 0x53, 0x4d, 0x4c, 0x48, 0x50, 0x4e, 0x4d, 0x50, 0x48,
+    0x49, 0x42, 0x4c, 0x42, 0x4b, 0x4b, 0x49, 0x48, 0x48, 0x49, 0x4a, 0x54,
+    0x44, 0x57, 0x4d, 0x4b, 0x3f, 0x56, 0x53, 0x5c, 0x50, 0x4e, 0x46, 0x49,
+    0x40, 0x24, 0x44, 0x58, 0x49, 0x54, 0x48, 0x49, 0x41, 0x22, 0x44, 0x3f,
+    0x48, 0x1c, 0x4d, 0x39, 0x3e, 0x4c, 0x3d, 0x4a, 0x48, 0x2d, 0x48, 0x3e,
+    0x3f, 0x3a, 0x46, 0x4e, 0x44, 0x43, 0x49, 0x51, 0x4d, 0x3c, 0x44, 0x41,
+    0x4e, 0x44, 0x42, 0x4c, 0x45, 0x48, 0x45, 0x46, 0x42, 0x46, 0x47, 0x42,
+    0x4f, 0x45, 0x47, 0x44, 0x48, 0x47, 0x4a, 0x42, 0x4d, 0x48, 0x3e, 0x53,
+    0x47, 0x4b, 0x44, 0x4b, 0x45, 0x4a, 0x50, 0x55, 0x4c, 0x45, 0x48, 0x43,
+    0x53, 0x3d, 0x4e, 0x5f, 0x42, 0x44, 0x4a, 0x4f, 0x3f, 0x48, 0x4e, 0x4b,
+    0x43, 0x48, 0x43, 0x41, 0x4a, 0x4b, 0x51, 0x39, 0x52, 0x46, 0x44, 0x49,
+    0x48, 0x45, 0x4c, 0x40, 0x45, 0x49, 0x51, 0x48, 0x45, 0x42, 0x45, 0x48,
+    0x40, 0x43, 0x3d, 0x47, 0x53, 0x54, 0x4d, 0x4a, 0x4a, 0x47, 0x48, 0x43,
+    0x4c, 0x46, 0x43, 0x4f, 0x49, 0x4c, 0x3f, 0x3d, 0x4b, 0x41, 0x40, 0x48,
+    0x4e, 0x4c, 0x4b, 0x40, 0x4c, 0x43, 0x49, 0x4d, 0x47, 0x4f, 0x47, 0x42,
+    0x47, 0x4a, 0x4d, 0x4f, 0x46, 0x4d, 0x51, 0x49, 0x48, 0x4d, 0x4e, 0x46,
+    0x47, 0x41, 0x44, 0x4d, 0x4b, 0x55, 0x4b, 0x4c, 0x41, 0x5e, 0x50, 0x45,
+    0x40, 0x55, 0x4b, 0x60, 0x55, 0x47, 0x3d, 0x4a, 0x42, 0x22, 0x46, 0x5a,
+    0x47, 0x53, 0x49, 0x44, 0x44, 0x27, 0x41, 0x4f, 0x3e, 0x22, 0x4a, 0x44,
+    0x49, 0x3e, 0x4e, 0x4d, 0x3f, 0x3a, 0x4c, 0x44, 0x4a, 0x44, 0x46, 0x51,
+    0x4f, 0x42, 0x4c, 0x4e, 0x39, 0x4b, 0x42, 0x39, 0x4b, 0x3e, 0x4f, 0x47,
+    0x4a, 0x4f, 0x3f, 0x4d, 0x43, 0x4c, 0x4a, 0x4b, 0x4b, 0x3d, 0x51, 0x46,
+    0x49, 0x4c, 0x47, 0x44, 0x43, 0x3d, 0x3c, 0x54, 0x4a, 0x47, 0x4d, 0x50,
+    0x4a, 0x46, 0x51, 0x62, 0x46, 0x4d, 0x4b, 0x46, 0x49, 0x3c, 0x50, 0x57,
+    0x47, 0x40, 0x3e, 0x4c, 0x4b, 0x3f, 0x55, 0x46, 0x3d, 0x45, 0x42, 0x4e,
+    0x50, 0x49, 0x46, 0x3a, 0x4c, 0x47, 0x4a, 0x49, 0x42, 0x42, 0x4a, 0x44,
+    0x42, 0x40, 0x49, 0x54, 0x46, 0x4b, 0x47, 0x45, 0x51, 0x47, 0x41, 0x42,
+    0x49, 0x50, 0x4e, 0x48, 0x4b, 0x4b, 0x47, 0x4a, 0x47, 0x49, 0x4b, 0x45,
+    0x4b, 0x54, 0x48, 0x54, 0x4b, 0x49, 0x51, 0x4a, 0x4a, 0x40, 0x46, 0x42,
+    0x44, 0x44, 0x4d, 0x4b, 0x47, 0x43, 0x45, 0x41, 0x3e, 0x49, 0x43, 0x51,
+    0x3e, 0x4b, 0x52, 0x46, 0x48, 0x3f, 0x4e, 0x51, 0x51, 0x49, 0x3f, 0x48,
+    0x4c, 0x4c, 0x52, 0x47, 0x43, 0x57, 0x44, 0x42, 0x40, 0x52, 0x50, 0x5d,
+    0x4f, 0x40, 0x42, 0x45, 0x46, 0x26, 0x3c, 0x51, 0x4b, 0x4e, 0x4b, 0x49,
+    0x46, 0x35, 0x49, 0x53, 0x49, 0x2b, 0x4d, 0x3e, 0x50, 0x44, 0x4f, 0x54,
+    0x46, 0x34, 0x49, 0x4d, 0x42, 0x45, 0x44, 0x4b, 0x52, 0x44, 0x52, 0x41,
+    0x4d, 0x4c, 0x52, 0x41, 0x49, 0x3a, 0x4e, 0x49, 0x40, 0x4b, 0x45, 0x4d,
+    0x4b, 0x4a, 0x47, 0x49, 0x45, 0x49, 0x4d, 0x50, 0x3e, 0x47, 0x44, 0x51,
+    0x4c, 0x41, 0x45, 0x50, 0x47, 0x41, 0x4a, 0x52, 0x4b, 0x3d, 0x4b, 0x5b,
+    0x4c, 0x4c, 0x4d, 0x3f, 0x47, 0x44, 0x49, 0x5d, 0x4a, 0x53, 0x44, 0x45,
+    0x45, 0x46, 0x3d, 0x4f, 0x50, 0x3b, 0x44, 0x4e, 0x40, 0x41, 0x4c, 0x3a,
+    0x4a, 0x45, 0x49, 0x48, 0x45, 0x4a, 0x45, 0x36, 0x45, 0x4d, 0x4c, 0x49,
+    0x3f, 0x47, 0x4d, 0x40, 0x53, 0x48, 0x49, 0x4c, 0x47, 0x4f, 0x42, 0x44,
+    0x45, 0x40, 0x4a, 0x4c, 0x49, 0x4f, 0x4b, 0x4d, 0x42, 0x45, 0x3e, 0x4a,
+    0x48, 0x4a, 0x49, 0x50, 0x4c, 0x53, 0x50, 0x45, 0x4b, 0x4c, 0x46, 0x4f,
+    0x44, 0x43, 0x54, 0x50, 0x3f, 0x48, 0x42, 0x4b, 0x43, 0x3f, 0x4d, 0x4c,
+    0x43, 0x49, 0x4a, 0x47, 0x54, 0x4b, 0x4f, 0x4d, 0x44, 0x47, 0x49, 0x4e,
+    0x4e, 0x55, 0x40, 0x46, 0x44, 0x56, 0x4e, 0x65, 0x4f, 0x3f, 0x43, 0x48,
+    0x39, 0x27, 0x43, 0x55, 0x4b, 0x4c, 0x44, 0x46, 0x42, 0x34, 0x44, 0x52,
+    0x43, 0x22, 0x4e, 0x41, 0x49, 0x48, 0x49, 0x51, 0x3b, 0x37, 0x4b, 0x40,
+    0x4f, 0x45, 0x53, 0x4c, 0x47, 0x46, 0x47, 0x4c, 0x3e, 0x44, 0x45, 0x49,
+    0x48, 0x50, 0x45, 0x40, 0x46, 0x4c, 0x47, 0x4d, 0x44, 0x48, 0x49, 0x50,
+    0x4f, 0x4a, 0x46, 0x55, 0x4e, 0x42, 0x4c, 0x4c, 0x50, 0x48, 0x3d, 0x55,
+    0x46, 0x3e, 0x4a, 0x4b, 0x4f, 0x46, 0x46, 0x60, 0x50, 0x3f, 0x55, 0x40,
+    0x42, 0x44, 0x48, 0x63, 0x50, 0x3d, 0x45, 0x4f, 0x4e, 0x41, 0x47, 0x48,
+    0x4a, 0x3c, 0x3d, 0x46, 0x3f, 0x42, 0x43, 0x37, 0x4f, 0x4f, 0x50, 0x47,
+    0x47, 0x4b, 0x52, 0x40, 0x3f, 0x44, 0x4a, 0x40, 0x4d, 0x44, 0x4e, 0x37,
+    0x43, 0x48, 0x47, 0x3f, 0x51, 0x4d, 0x45, 0x42, 0x41, 0x46, 0x3d, 0x53,
+    0x4f, 0x4b, 0x54, 0x45, 0x51, 0x40, 0x4a, 0x4a, 0x48, 0x4f, 0x43, 0x4a,
+    0x4f, 0x4c, 0x4c, 0x4f, 0x48, 0x4c, 0x44, 0x4e, 0x43, 0x46, 0x4f, 0x4a,
+    0x43, 0x41, 0x49, 0x49, 0x47, 0x53, 0x45, 0x49, 0x4e, 0x46, 0x4c, 0x4e,
+    0x3c, 0x49, 0x44, 0x45, 0x4c, 0x42, 0x49, 0x41, 0x48, 0x58, 0x54, 0x4d,
+    0x35, 0x52, 0x4e, 0x5b, 0x4f, 0x40, 0x3e, 0x46, 0x46, 0x36, 0x3d, 0x60,
+    0x4d, 0x49, 0x4a, 0x43, 0x44, 0x36, 0x49, 0x67, 0x4a, 0x2d, 0x4b, 0x40,
+    0x3f, 0x49, 0x43, 0x5f, 0x45, 0x3c, 0x49, 0x4c, 0x4a, 0x43, 0x48, 0x55,
+    0x49, 0x46, 0x49, 0x46, 0x44, 0x4e, 0x42, 0x4e, 0x40, 0x45, 0x42, 0x52,
+    0x4a, 0x40, 0x4a, 0x44, 0x40, 0x45, 0x54, 0x3d, 0x4c, 0x3e, 0x4c, 0x55,
+    0x4d, 0x45, 0x4d, 0x51, 0x4a, 0x4b, 0x44, 0x5b, 0x48, 0x3d, 0x3e, 0x46,
+    0x4f, 0x4d, 0x3f, 0x62, 0x4d, 0x45, 0x3f, 0x47, 0x47, 0x47, 0x44, 0x5b,
+    0x4b, 0x4f, 0x51, 0x4c, 0x4a, 0x47, 0x48, 0x5b, 0x47, 0x40, 0x4a, 0x47,
+    0x42, 0x44, 0x46, 0x46, 0x45, 0x48, 0x4a, 0x3f, 0x40, 0x4f, 0x48, 0x3a,
+    0x49, 0x52, 0x4a, 0x53, 0x43, 0x4c, 0x4b, 0x4a, 0x4a, 0x4a, 0x4e, 0x42,
+    0x4b, 0x46, 0x3d, 0x50, 0x51, 0x4b, 0x4b, 0x4f, 0x50, 0x4c, 0x4f, 0x4c,
+    0x4d, 0x41, 0x41, 0x3c, 0x40, 0x43, 0x54, 0x51, 0x48, 0x3d, 0x48, 0x51,
+    0x42, 0x42, 0x4c, 0x4e, 0x4d, 0x4b, 0x49, 0x43, 0x48, 0x47, 0x4b, 0x49,
+    0x49, 0x4e, 0x4d, 0x46, 0x4c, 0x52, 0x49, 0x49, 0x51, 0x4e, 0x45, 0x47,
+    0x44, 0x47, 0x42, 0x4a, 0x46, 0x59, 0x48, 0x48, 0x4b, 0x4f, 0x4c, 0x5e,
+    0x5c, 0x45, 0x3f, 0x48, 0x3d, 0x3f, 0x37, 0x5a, 0x4b, 0x4b, 0x45, 0x49,
+    0x3e, 0x42, 0x41, 0x6b, 0x49, 0x2d, 0x45, 0x43, 0x47, 0x45, 0x49, 0x61,
+    0x3d, 0x3b, 0x49, 0x43, 0x49, 0x4b, 0x4b, 0x55, 0x4b, 0x47, 0x46, 0x46,
+    0x48, 0x4d, 0x49, 0x4f, 0x4a, 0x4c, 0x42, 0x51, 0x41, 0x44, 0x45, 0x4f,
+    0x4e, 0x44, 0x3f, 0x55, 0x3e, 0x4a, 0x45, 0x50, 0x46, 0x42, 0x41, 0x49,
+    0x49, 0x47, 0x49, 0x61, 0x47, 0x40, 0x41, 0x4e, 0x4d, 0x4b, 0x4a, 0x5e,
+    0x52, 0x49, 0x4b, 0x52, 0x51, 0x55, 0x42, 0x61, 0x53, 0x4c, 0x48, 0x4a,
+    0x4e, 0x48, 0x48, 0x57, 0x4c, 0x40, 0x40, 0x48, 0x45, 0x43, 0x3e, 0x46,
+    0x43, 0x4a, 0x45, 0x45, 0x44, 0x4f, 0x44, 0x40, 0x49, 0x48, 0x4e, 0x49,
+    0x4a, 0x4e, 0x49, 0x51, 0x46, 0x4f, 0x47, 0x44, 0x42, 0x4d, 0x43, 0x4e,
+    0x4f, 0x4d, 0x44, 0x51, 0x47, 0x49, 0x40, 0x57, 0x4b, 0x49, 0x47, 0x4c,
+    0x4d, 0x4d, 0x3e, 0x47, 0x45, 0x41, 0x50, 0x4b, 0x4b, 0x45, 0x42, 0x4e,
+    0x48, 0x47, 0x4e, 0x4b, 0x56, 0x4c, 0x4f, 0x52, 0x51, 0x49, 0x4d, 0x4a,
+    0x4b, 0x52, 0x4d, 0x55, 0x4b, 0x4e, 0x4e, 0x4b, 0x51, 0x57, 0x47, 0x42,
+    0x49, 0x48, 0x56, 0x44, 0x52, 0x56, 0x53, 0x5a, 0x63, 0x53, 0x4c, 0x4c,
+    0x43, 0x56, 0x3c, 0x57, 0x47, 0x47, 0x4d, 0x52, 0x43, 0x48, 0x45, 0x5f,
+    0x45, 0x29, 0x47, 0x45, 0x48, 0x40, 0x41, 0x4b, 0x3f, 0x39, 0x49, 0x4e,
+    0x47, 0x55, 0x42, 0x56, 0x4d, 0x43, 0x48, 0x44, 0x45, 0x53, 0x43, 0x46,
+    0x49, 0x43, 0x49, 0x4a, 0x40, 0x4e, 0x4a, 0x4a, 0x47, 0x43, 0x45, 0x4d,
+    0x4a, 0x47, 0x3f, 0x53, 0x45, 0x43, 0x4b, 0x4c, 0x42, 0x47, 0x47, 0x5f,
+    0x48, 0x48, 0x46, 0x44, 0x50, 0x47, 0x41, 0x64, 0x4e, 0x46, 0x49, 0x4a,
+    0x4d, 0x55, 0x42, 0x55, 0x46, 0x3d, 0x49, 0x43, 0x52, 0x52, 0x47, 0x52,
+    0x4e, 0x46, 0x47, 0x41, 0x49, 0x4d, 0x50, 0x47, 0x42, 0x49, 0x41, 0x42,
+    0x4b, 0x48, 0x49, 0x42, 0x4d, 0x48, 0x51, 0x54, 0x43, 0x56, 0x4c, 0x52,
+    0x53, 0x4d, 0x54, 0x4a, 0x51, 0x50, 0x48, 0x4c, 0x4e, 0x48, 0x4c, 0x4c,
+    0x52, 0x49, 0x4a, 0x4e, 0x4e, 0x41, 0x4f, 0x53, 0x49, 0x52, 0x42, 0x4b,
+    0x50, 0x46, 0x50, 0x4a, 0x53, 0x56, 0x46, 0x4f, 0x4b, 0x49, 0x3d, 0x41,
+    0x4c, 0x52, 0x42, 0x50, 0x4d, 0x45, 0x4e, 0x51, 0x4b, 0x4c, 0x46, 0x42,
+    0x41, 0x4b, 0x40, 0x4a, 0x42, 0x57, 0x4f, 0x43, 0x40, 0x50, 0x4c, 0x51,
+    0x4f, 0x48, 0x3a, 0x4e, 0x51, 0x40, 0x49, 0x66, 0x4b, 0x42, 0x48, 0x3c,
+    0x5b, 0x47, 0x53, 0x40, 0x4a, 0x48, 0x35, 0x44, 0x5f, 0x50, 0x4a, 0x3c,
+    0x41, 0x45, 0x48, 0x3b, 0x42, 0x59, 0x43, 0x4b, 0x48, 0x49, 0x4a, 0x40,
+    0x4f, 0x5c, 0x50, 0x54, 0x53, 0x55, 0x4c, 0x4a, 0x43, 0x46, 0x49, 0x47,
+    0x49, 0x48, 0x4b, 0x43, 0x42, 0x44, 0x42, 0x46, 0x44, 0x3f, 0x4b, 0x42,
+    0x4d, 0x49, 0x41, 0x46, 0x47, 0x51, 0x51, 0x44, 0x4c, 0x54, 0x4e, 0x4b,
+    0x42, 0x52, 0x4e, 0x4c, 0x4b, 0x4a, 0x50, 0x4e, 0x44, 0x4b, 0x4e, 0x4e,
+    0x4f, 0x42, 0x4b, 0x48, 0x46, 0x43, 0x48, 0x54, 0x4b, 0x4e, 0x48, 0x4f,
+    0x4a, 0x4d, 0x43, 0x4e, 0x47, 0x50, 0x4a, 0x44, 0x47, 0x52, 0x46, 0x53,
+    0x4a, 0x40, 0x46, 0x54, 0x50, 0x4a, 0x47, 0x51, 0x49, 0x45, 0x4b, 0x4e,
+    0x4b, 0x46, 0x4c, 0x4c, 0x52, 0x47, 0x45, 0x45, 0x4a, 0x47, 0x4c, 0x52,
+    0x44, 0x51, 0x47, 0x42, 0x47, 0x43, 0x43, 0x49, 0x52, 0x5a, 0x55, 0x3e,
+    0x45, 0x4b, 0x4c, 0x46, 0x4f, 0x4b, 0x45, 0x49, 0x4a, 0x4e, 0x4a, 0x50,
+    0x3e, 0x4e, 0x42, 0x4e, 0x44, 0x55, 0x3d, 0x4a, 0x4d, 0x49, 0x4d, 0x42,
+    0x49, 0x4e, 0x50, 0x44, 0x4b, 0x3c, 0x41, 0x49, 0x51, 0x49, 0x3c, 0x4e,
+    0x4c, 0x39, 0x4c, 0x72, 0x44, 0x4b, 0x49, 0x42, 0x5f, 0x48, 0x4a, 0x48,
+    0x41, 0x4c, 0x43, 0x40, 0x62, 0x5e, 0x47, 0x3c, 0x4a, 0x4c, 0x55, 0x49,
+    0x4b, 0x52, 0x4e, 0x4b, 0x4d, 0x48, 0x4c, 0x3c, 0x3f, 0x4f, 0x4e, 0x48,
+    0x45, 0x55, 0x4a, 0x46, 0x48, 0x3d, 0x45, 0x44, 0x4b, 0x4a, 0x46, 0x3a,
+    0x4e, 0x44, 0x4d, 0x49, 0x49, 0x49, 0x40, 0x3e, 0x40, 0x47, 0x48, 0x43,
+    0x3f, 0x51, 0x46, 0x4c, 0x45, 0x4c, 0x49, 0x44, 0x3e, 0x57, 0x49, 0x4e,
+    0x48, 0x3f, 0x48, 0x47, 0x53, 0x4d, 0x50, 0x51, 0x49, 0x42, 0x45, 0x44,
+    0x49, 0x49, 0x46, 0x4b, 0x45, 0x49, 0x4f, 0x49, 0x46, 0x48, 0x4c, 0x55,
+    0x46, 0x51, 0x48, 0x4a, 0x48, 0x54, 0x4b, 0x5a, 0x4c, 0x47, 0x40, 0x47,
+    0x40, 0x55, 0x50, 0x52, 0x4a, 0x4b, 0x4f, 0x49, 0x4b, 0x50, 0x4b, 0x5b,
+    0x51, 0x53, 0x4f, 0x4e, 0x49, 0x48, 0x44, 0x52, 0x46, 0x4e, 0x47, 0x48,
+    0x44, 0x43, 0x49, 0x55, 0x48, 0x58, 0x4f, 0x46, 0x45, 0x53, 0x45, 0x4a,
+    0x4c, 0x4c, 0x49, 0x46, 0x47, 0x4d, 0x41, 0x4d, 0x4f, 0x59, 0x4a, 0x49,
+    0x46, 0x4e, 0x44, 0x49, 0x4d, 0x48, 0x54, 0x47, 0x48, 0x4e, 0x48, 0x43,
+    0x46, 0x41, 0x46, 0x44, 0x52, 0x46, 0x42, 0x4c, 0x4c, 0x31, 0x4d, 0x6f,
+    0x51, 0x4f, 0x4d, 0x43, 0x5c, 0x48, 0x49, 0x49, 0x46, 0x4c, 0x43, 0x3b,
+    0x5d, 0x63, 0x58, 0x46, 0x49, 0x45, 0x4e, 0x48, 0x49, 0x5d, 0x45, 0x50,
+    0x56, 0x4d, 0x57, 0x37, 0x40, 0x55, 0x43, 0x4b, 0x4e, 0x46, 0x4c, 0x3b,
+    0x3d, 0x4b, 0x49, 0x4b, 0x52, 0x47, 0x4d, 0x34, 0x4c, 0x4c, 0x47, 0x4e,
+    0x4d, 0x4c, 0x3d, 0x3f, 0x4a, 0x49, 0x44, 0x45, 0x4a, 0x54, 0x43, 0x44,
+    0x50, 0x4b, 0x4d, 0x4c, 0x4e, 0x48, 0x46, 0x51, 0x43, 0x48, 0x48, 0x48,
+    0x42, 0x44, 0x4e, 0x48, 0x47, 0x45, 0x48, 0x51, 0x53, 0x4a, 0x4f, 0x58,
+    0x42, 0x4d, 0x48, 0x4f, 0x4c, 0x45, 0x4a, 0x57, 0x4b, 0x43, 0x4d, 0x4b,
+    0x4a, 0x4e, 0x4c, 0x5f, 0x3f, 0x4f, 0x4a, 0x42, 0x4b, 0x48, 0x4d, 0x62,
+    0x4f, 0x4b, 0x50, 0x4c, 0x45, 0x49, 0x44, 0x53, 0x4a, 0x4f, 0x45, 0x56,
+    0x4b, 0x44, 0x41, 0x53, 0x49, 0x48, 0x4d, 0x49, 0x47, 0x4b, 0x46, 0x4c,
+    0x49, 0x4b, 0x4c, 0x54, 0x4f, 0x4b, 0x47, 0x49, 0x44, 0x4a, 0x4e, 0x53,
+    0x4f, 0x49, 0x54, 0x4e, 0x4a, 0x48, 0x42, 0x54, 0x51, 0x46, 0x4b, 0x52,
+    0x45, 0x48, 0x51, 0x4a, 0x40, 0x4a, 0x50, 0x45, 0x4a, 0x46, 0x49, 0x46,
+    0x54, 0x46, 0x42, 0x48, 0x50, 0x36, 0x4a, 0x6b, 0x46, 0x59, 0x51, 0x47,
+    0x5f, 0x4d, 0x43, 0x4d, 0x44, 0x4d, 0x42, 0x3b, 0x65, 0x6a, 0x56, 0x48,
+    0x4d, 0x4c, 0x52, 0x4a, 0x4d, 0x61, 0x52, 0x4b, 0x47, 0x4f, 0x48, 0x49,
+    0x3f, 0x5b, 0x45, 0x51, 0x48, 0x48, 0x4b, 0x3c, 0x3b, 0x4c, 0x54, 0x52,
+    0x4f, 0x51, 0x53, 0x31, 0x47, 0x4c, 0x45, 0x4a, 0x42, 0x4b, 0x47, 0x40,
+    0x41, 0x49, 0x4c, 0x46, 0x4b, 0x53, 0x46, 0x49, 0x44, 0x4b, 0x4e, 0x4b,
+    0x48, 0x51, 0x49, 0x4d, 0x4b, 0x3f, 0x42, 0x44, 0x45, 0x43, 0x46, 0x56,
+    0x42, 0x4b, 0x49, 0x4e, 0x4e, 0x53, 0x42, 0x5c, 0x4b, 0x46, 0x49, 0x46,
+    0x4e, 0x41, 0x42, 0x67, 0x41, 0x49, 0x4d, 0x48, 0x49, 0x4e, 0x3f, 0x61,
+    0x48, 0x4a, 0x40, 0x42, 0x4c, 0x51, 0x50, 0x63, 0x49, 0x44, 0x49, 0x47,
+    0x45, 0x4d, 0x49, 0x61, 0x3f, 0x48, 0x40, 0x41, 0x49, 0x49, 0x45, 0x57,
+    0x45, 0x46, 0x4d, 0x46, 0x4c, 0x4a, 0x4d, 0x4b, 0x43, 0x54, 0x4b, 0x49,
+    0x4c, 0x49, 0x41, 0x49, 0x4b, 0x47, 0x45, 0x4b, 0x44, 0x43, 0x46, 0x3f,
+    0x47, 0x47, 0x43, 0x4c, 0x49, 0x4c, 0x3d, 0x4d, 0x4b, 0x54, 0x4a, 0x4f,
+    0x44, 0x4c, 0x4b, 0x47, 0x4c, 0x45, 0x3d, 0x52, 0x58, 0x4b, 0x45, 0x4e,
+    0x48, 0x39, 0x53, 0x70, 0x4a, 0x5d, 0x4c, 0x4e, 0x5a, 0x4f, 0x46, 0x4b,
+    0x3e, 0x4f, 0x44, 0x3d, 0x66, 0x6b, 0x50, 0x4d, 0x4d, 0x57, 0x52, 0x4a,
+    0x4c, 0x5b, 0x4e, 0x53, 0x4d, 0x54, 0x50, 0x42, 0x3c, 0x5d, 0x4a, 0x4c,
+    0x56, 0x52, 0x50, 0x40, 0x48, 0x4c, 0x4d, 0x49, 0x49, 0x4f, 0x51, 0x38,
+    0x42, 0x49, 0x4d, 0x4f, 0x45, 0x40, 0x4d, 0x41, 0x4b, 0x4a, 0x47, 0x51,
+    0x4b, 0x53, 0x4c, 0x4a, 0x51, 0x4c, 0x42, 0x56, 0x48, 0x4a, 0x47, 0x58,
+    0x49, 0x46, 0x52, 0x4a, 0x45, 0x47, 0x51, 0x54, 0x4f, 0x50, 0x50, 0x53,
+    0x49, 0x4a, 0x4d, 0x56, 0x56, 0x4b, 0x4d, 0x45, 0x40, 0x4d, 0x48, 0x60,
+    0x4e, 0x56, 0x48, 0x4b, 0x47, 0x45, 0x47, 0x62, 0x4e, 0x4f, 0x41, 0x49,
+    0x48, 0x57, 0x44, 0x64, 0x4f, 0x4f, 0x49, 0x44, 0x49, 0x4c, 0x3f, 0x53,
+    0x40, 0x41, 0x4e, 0x4b, 0x4d, 0x54, 0x42, 0x53, 0x4e, 0x41, 0x49, 0x44,
+    0x41, 0x45, 0x4d, 0x4f, 0x47, 0x51, 0x45, 0x4a, 0x42, 0x45, 0x4e, 0x40,
+    0x4b, 0x52, 0x48, 0x47, 0x4e, 0x4f, 0x47, 0x41, 0x48, 0x53, 0x47, 0x47,
+    0x46, 0x42, 0x48, 0x4b, 0x42, 0x4c, 0x49, 0x4c, 0x45, 0x4c, 0x54, 0x45,
+    0x4c, 0x43, 0x4e, 0x49, 0x56, 0x47, 0x45, 0x4f, 0x4d, 0x3a, 0x58, 0x74,
+    0x49, 0x5b, 0x4c, 0x4f, 0x64, 0x4e, 0x45, 0x43, 0x44, 0x5b, 0x43, 0x41,
+    0x63, 0x70, 0x55, 0x45, 0x4a, 0x4a, 0x4d, 0x51, 0x4b, 0x5a, 0x51, 0x57,
+    0x54, 0x5b, 0x55, 0x44, 0x38, 0x57, 0x4e, 0x50, 0x4e, 0x56, 0x57, 0x3a,
+    0x3a, 0x4b, 0x57, 0x4c, 0x51, 0x53, 0x4d, 0x3b, 0x44, 0x43, 0x47, 0x4c,
+    0x48, 0x59, 0x51, 0x41, 0x43, 0x44, 0x51, 0x51, 0x4a, 0x54, 0x51, 0x4b,
+    0x4e, 0x45, 0x51, 0x4a, 0x49, 0x4a, 0x4f, 0x52, 0x4c, 0x3e, 0x4e, 0x55,
+    0x42, 0x46, 0x46, 0x4a, 0x42, 0x52, 0x49, 0x47, 0x4a, 0x56, 0x4f, 0x50,
+    0x46, 0x4f, 0x43, 0x51, 0x53, 0x46, 0x40, 0x60, 0x44, 0x4d, 0x46, 0x54,
+    0x3d, 0x49, 0x43, 0x64, 0x45, 0x4d, 0x50, 0x49, 0x4f, 0x4d, 0x53, 0x60,
+    0x4a, 0x52, 0x49, 0x47, 0x48, 0x5a, 0x48, 0x58, 0x4e, 0x4f, 0x43, 0x4f,
+    0x50, 0x51, 0x41, 0x52, 0x4c, 0x4d, 0x45, 0x42, 0x41, 0x4c, 0x44, 0x54,
+    0x4e, 0x4d, 0x4a, 0x47, 0x40, 0x4a, 0x3e, 0x47, 0x4c, 0x58, 0x46, 0x46,
+    0x55, 0x4c, 0x4d, 0x45, 0x49, 0x51, 0x53, 0x46, 0x46, 0x43, 0x43, 0x48,
+    0x52, 0x3d, 0x4b, 0x4e, 0x49, 0x47, 0x3f, 0x3d, 0x4f, 0x45, 0x44, 0x3f,
+    0x5a, 0x43, 0x4b, 0x4d, 0x51, 0x35, 0x54, 0x76, 0x4f, 0x5e, 0x4c, 0x50,
+    0x5a, 0x51, 0x46, 0x49, 0x44, 0x61, 0x4f, 0x41, 0x67, 0x72, 0x56, 0x4f,
+    0x42, 0x48, 0x4b, 0x52, 0x46, 0x60, 0x50, 0x4e, 0x4a, 0x5b, 0x5f, 0x46,
+    0x31, 0x5b, 0x4a, 0x48, 0x4b, 0x58, 0x51, 0x41, 0x37, 0x4e, 0x4f, 0x55,
+    0x51, 0x5c, 0x4f, 0x42, 0x4b, 0x4e, 0x4f, 0x54, 0x4f, 0x52, 0x43, 0x43,
+    0x48, 0x53, 0x53, 0x41, 0x4b, 0x49, 0x4e, 0x50, 0x46, 0x4c, 0x4f, 0x49,
+    0x42, 0x49, 0x4c, 0x4c, 0x4c, 0x41, 0x4e, 0x48, 0x47, 0x4c, 0x49, 0x53,
+    0x44, 0x46, 0x51, 0x53, 0x45, 0x52, 0x4e, 0x53, 0x50, 0x58, 0x42, 0x45,
+    0x44, 0x42, 0x48, 0x58, 0x4e, 0x4d, 0x54, 0x56, 0x4c, 0x46, 0x4a, 0x58,
+    0x48, 0x4f, 0x47, 0x51, 0x47, 0x4f, 0x4f, 0x5b, 0x41, 0x4e, 0x45, 0x45,
+    0x4a, 0x50, 0x3e, 0x57, 0x48, 0x4e, 0x41, 0x4c, 0x45, 0x51, 0x46, 0x4c,
+    0x46, 0x4f, 0x42, 0x45, 0x4b, 0x4c, 0x49, 0x4c, 0x44, 0x4f, 0x4e, 0x4d,
+    0x48, 0x56, 0x43, 0x48, 0x42, 0x54, 0x48, 0x43, 0x3e, 0x51, 0x43, 0x47,
+    0x47, 0x47, 0x49, 0x4d, 0x46, 0x4e, 0x52, 0x42, 0x48, 0x4e, 0x4c, 0x4a,
+    0x4d, 0x3e, 0x43, 0x40, 0x48, 0x41, 0x47, 0x4f, 0x5e, 0x49, 0x40, 0x4c,
+    0x50, 0x42, 0x56, 0x75, 0x51, 0x5e, 0x51, 0x4e, 0x62, 0x58, 0x49, 0x47,
+    0x51, 0x59, 0x46, 0x46, 0x6c, 0x72, 0x55, 0x44, 0x4c, 0x4a, 0x4d, 0x59,
+    0x53, 0x64, 0x4d, 0x51, 0x55, 0x5e, 0x59, 0x50, 0x30, 0x58, 0x50, 0x4c,
+    0x4c, 0x60, 0x59, 0x42, 0x32, 0x53, 0x50, 0x55, 0x4d, 0x53, 0x59, 0x43,
+    0x3e, 0x49, 0x4f, 0x52, 0x4d, 0x51, 0x47, 0x45, 0x4d, 0x4e, 0x53, 0x4e,
+    0x54, 0x4f, 0x4d, 0x4d, 0x4e, 0x40, 0x47, 0x53, 0x53, 0x49, 0x56, 0x4d,
+    0x4d, 0x3a, 0x4c, 0x4e, 0x45, 0x4a, 0x47, 0x45, 0x53, 0x4a, 0x4e, 0x52,
+    0x4d, 0x4e, 0x48, 0x56, 0x4e, 0x4a, 0x4d, 0x52, 0x49, 0x4e, 0x4e, 0x58,
+    0x47, 0x50, 0x4c, 0x54, 0x49, 0x42, 0x46, 0x54, 0x50, 0x54, 0x54, 0x46,
+    0x40, 0x49, 0x4b, 0x57, 0x4b, 0x59, 0x44, 0x46, 0x52, 0x55, 0x51, 0x55,
+    0x4f, 0x50, 0x4d, 0x4d, 0x48, 0x50, 0x4e, 0x49, 0x4e, 0x42, 0x45, 0x3f,
+    0x4d, 0x4f, 0x51, 0x47, 0x4a, 0x4c, 0x4b, 0x4b, 0x46, 0x4d, 0x44, 0x52,
+    0x4d, 0x44, 0x40, 0x4d, 0x54, 0x46, 0x54, 0x44, 0x4b, 0x46, 0x47, 0x45,
+    0x50, 0x45, 0x45, 0x4b, 0x4c, 0x48, 0x3f, 0x55, 0x4a, 0x45, 0x49, 0x4e,
+    0x40, 0x49, 0x4a, 0x41, 0x56, 0x4b, 0x49, 0x4e, 0x4a, 0x41, 0x50, 0x70,
+    0x56, 0x59, 0x4b, 0x55, 0x58, 0x59, 0x49, 0x47, 0x4a, 0x5a, 0x4c, 0x46,
+    0x62, 0x7b, 0x58, 0x51, 0x44, 0x47, 0x44, 0x57, 0x4f, 0x65, 0x4e, 0x50,
+    0x4d, 0x67, 0x5c, 0x4a, 0x2b, 0x61, 0x48, 0x4b, 0x4b, 0x5d, 0x5c, 0x48,
+    0x39, 0x50, 0x45, 0x4d, 0x53, 0x60, 0x53, 0x46, 0x42, 0x46, 0x50, 0x45,
+    0x4f, 0x4e, 0x46, 0x4a, 0x4d, 0x51, 0x54, 0x47, 0x59, 0x4b, 0x58, 0x4a,
+    0x50, 0x3d, 0x59, 0x48, 0x45, 0x4e, 0x4e, 0x47, 0x4f, 0x47, 0x4d, 0x4b,
+    0x52, 0x42, 0x4c, 0x48, 0x4a, 0x4f, 0x47, 0x43, 0x4e, 0x4c, 0x4d, 0x51,
+    0x49, 0x4f, 0x4c, 0x47, 0x47, 0x48, 0x47, 0x59, 0x4f, 0x4f, 0x53, 0x49,
+    0x4e, 0x4b, 0x4f, 0x5a, 0x50, 0x42, 0x47, 0x50, 0x4a, 0x54, 0x47, 0x5a,
+    0x43, 0x49, 0x47, 0x4e, 0x49, 0x4d, 0x43, 0x54, 0x4c, 0x53, 0x4e, 0x4e,
+    0x42, 0x43, 0x48, 0x46, 0x4f, 0x43, 0x43, 0x45, 0x51, 0x47, 0x4b, 0x4f,
+    0x56, 0x48, 0x48, 0x49, 0x46, 0x45, 0x4d, 0x52, 0x47, 0x4b, 0x46, 0x50,
+    0x3e, 0x4e, 0x4c, 0x43, 0x45, 0x4d, 0x53, 0x43, 0x46, 0x45, 0x44, 0x52,
+    0x45, 0x49, 0x49, 0x51, 0x3d, 0x4a, 0x4d, 0x46, 0x42, 0x41, 0x4e, 0x48,
+    0x5a, 0x49, 0x49, 0x49, 0x4f, 0x3d, 0x56, 0x68, 0x56, 0x67, 0x4b, 0x57,
+    0x5f, 0x5c, 0x40, 0x4a, 0x4a, 0x54, 0x4c, 0x47, 0x64, 0x7a, 0x54, 0x48,
+    0x46, 0x45, 0x46, 0x57, 0x4e, 0x61, 0x4f, 0x50, 0x4d, 0x64, 0x5b, 0x43,
+    0x2d, 0x60, 0x55, 0x51, 0x4c, 0x54, 0x4f, 0x4e, 0x2f, 0x50, 0x4f, 0x52,
+    0x50, 0x61, 0x54, 0x4b, 0x3d, 0x4c, 0x47, 0x51, 0x4a, 0x54, 0x4b, 0x42,
+    0x3b, 0x55, 0x47, 0x50, 0x4f, 0x49, 0x4a, 0x46, 0x43, 0x44, 0x45, 0x47,
+    0x46, 0x4b, 0x4f, 0x46, 0x43, 0x47, 0x4a, 0x4e, 0x51, 0x43, 0x55, 0x47,
+    0x4d, 0x46, 0x4c, 0x4c, 0x49, 0x4d, 0x43, 0x51, 0x47, 0x51, 0x52, 0x4a,
+    0x46, 0x4f, 0x49, 0x52, 0x50, 0x4a, 0x43, 0x53, 0x46, 0x4e, 0x50, 0x54,
+    0x45, 0x3a, 0x4a, 0x4a, 0x4c, 0x50, 0x4b, 0x54, 0x43, 0x4f, 0x4e, 0x45,
+    0x49, 0x4f, 0x46, 0x53, 0x4d, 0x51, 0x52, 0x53, 0x3d, 0x4a, 0x47, 0x4e,
+    0x43, 0x4a, 0x53, 0x48, 0x4a, 0x4c, 0x4a, 0x4a, 0x42, 0x53, 0x3e, 0x43,
+    0x4f, 0x4c, 0x47, 0x48, 0x54, 0x4d, 0x48, 0x48, 0x4e, 0x4c, 0x43, 0x51,
+    0x42, 0x49, 0x44, 0x3e, 0x49, 0x51, 0x4a, 0x4d, 0x4f, 0x49, 0x45, 0x44,
+    0x4e, 0x41, 0x48, 0x4b, 0x4c, 0x49, 0x46, 0x47, 0x5d, 0x4c, 0x4d, 0x50,
+    0x45, 0x40, 0x4e, 0x6a, 0x4f, 0x62, 0x53, 0x50, 0x5c, 0x5e, 0x4a, 0x4c,
+    0x50, 0x56, 0x52, 0x42, 0x60, 0x7e, 0x5b, 0x4b, 0x43, 0x41, 0x4c, 0x56,
+    0x46, 0x5f, 0x4d, 0x49, 0x43, 0x65, 0x5c, 0x4d, 0x2c, 0x61, 0x48, 0x4c,
+    0x44, 0x55, 0x5c, 0x49, 0x37, 0x54, 0x4e, 0x57, 0x52, 0x5c, 0x50, 0x49,
+    0x3e, 0x4d, 0x4f, 0x4f, 0x51, 0x4c, 0x48, 0x43, 0x4a, 0x5a, 0x4d, 0x4b,
+    0x4e, 0x58, 0x54, 0x49, 0x51, 0x42, 0x49, 0x4f, 0x46, 0x45, 0x52, 0x3d,
+    0x4b, 0x4b, 0x43, 0x54, 0x47, 0x47, 0x4c, 0x42, 0x4b, 0x49, 0x45, 0x46,
+    0x46, 0x4a, 0x51, 0x47, 0x47, 0x4f, 0x48, 0x4a, 0x3f, 0x4c, 0x4b, 0x57,
+    0x4a, 0x3f, 0x52, 0x4a, 0x56, 0x52, 0x4b, 0x54, 0x4c, 0x3e, 0x3f, 0x4f,
+    0x4b, 0x50, 0x4c, 0x53, 0x4a, 0x49, 0x46, 0x4e, 0x50, 0x48, 0x4f, 0x4b,
+    0x4a, 0x4e, 0x3e, 0x49, 0x45, 0x42, 0x42, 0x41, 0x47, 0x4b, 0x4f, 0x42,
+    0x49, 0x4c, 0x55, 0x4c, 0x4e, 0x42, 0x47, 0x42, 0x4b, 0x48, 0x46, 0x41,
+    0x46, 0x4e, 0x4d, 0x3f, 0x4f, 0x46, 0x4f, 0x4b, 0x4b, 0x4d, 0x50, 0x3e,
+    0x42, 0x43, 0x44, 0x4a, 0x49, 0x40, 0x4e, 0x43, 0x3e, 0x52, 0x3e, 0x44,
+    0x49, 0x43, 0x4d, 0x44, 0x62, 0x51, 0x42, 0x53, 0x51, 0x40, 0x4c, 0x64,
+    0x4f, 0x63, 0x4e, 0x5c, 0x5b, 0x5c, 0x48, 0x4d, 0x4a, 0x57, 0x4f, 0x42,
+    0x65, 0xfe, 0x5c, 0x4e, 0x47, 0x43, 0x4a, 0x58, 0x4e, 0x5e, 0x48, 0x4c,
+    0x51, 0x5e, 0x60, 0x56, 0x2f, 0x62, 0x54, 0x58, 0x51, 0x52, 0x55, 0x51,
+    0x36, 0x4b, 0x46, 0x51, 0x53, 0x5f, 0x46, 0x4c, 0x37, 0x4d, 0x4a, 0x45,
+    0x4b, 0x3f, 0x41, 0x42, 0x3f, 0x53, 0x4a, 0x48, 0x49, 0x4a, 0x4a, 0x45,
+    0x52, 0x3f, 0x52, 0x52, 0x45, 0x4d, 0x4f, 0x45, 0x46, 0x4a, 0x51, 0x48,
+    0x56, 0x47, 0x50, 0x3e, 0x46, 0x49, 0x4c, 0x51, 0x49, 0x54, 0x45, 0x4f,
+    0x4b, 0x4b, 0x49, 0x46, 0x4b, 0x4d, 0x49, 0x5c, 0x4d, 0x43, 0x47, 0x49,
+    0x48, 0x52, 0x46, 0x50, 0x51, 0x37, 0x50, 0x52, 0x4c, 0x4d, 0x4f, 0x51,
+    0x4f, 0x42, 0x50, 0x47, 0x48, 0x4e, 0x4d, 0x4c, 0x48, 0x48, 0x4a, 0x51,
+    0x49, 0x42, 0x50, 0x4f, 0x43, 0x4e, 0x47, 0x4b, 0x47, 0x4a, 0x44, 0x44,
+    0x4c, 0x51, 0x49, 0x44, 0x45, 0x45, 0x45, 0x48, 0x3f, 0x4a, 0x43, 0x49,
+    0x46, 0x49, 0x4c, 0x4d, 0x45, 0x50, 0x44, 0x45, 0x44, 0x55, 0x4a, 0x45,
+    0x48, 0x47, 0x4c, 0x43, 0x3f, 0x48, 0x42, 0x43, 0x43, 0x43, 0x48, 0x46,
+    0x5c, 0x51, 0x47, 0x51, 0x48, 0x40, 0x54, 0x66, 0x4e, 0x67, 0x4d, 0x5a,
+    0x60, 0x57, 0x47, 0x4d, 0x4d, 0x58, 0x53, 0x46, 0x66, 0x7e, 0x56, 0x48,
+    0x44, 0x4f, 0x49, 0x5c, 0x4a, 0x63, 0x50, 0x4c, 0x49, 0x56, 0x61, 0x50,
+    0x2c, 0x68, 0x4d, 0x51, 0x46, 0x4e, 0x5b, 0x51, 0x2e, 0x53, 0x54, 0x50,
+    0x46, 0x58, 0x44, 0x4f, 0x37, 0x48, 0x55, 0x50, 0x49, 0x49, 0x4e, 0x46,
+    0x43, 0x56, 0x52, 0x4e, 0x50, 0x4b, 0x50, 0x4c, 0x49, 0x40, 0x4d, 0x4f,
+    0x50, 0x41, 0x44, 0x39, 0x4b, 0x4d, 0x4b, 0x41, 0x51, 0x4d, 0x4c, 0x41,
+    0x3f, 0x52, 0x4e, 0x4b, 0x49, 0x53, 0x45, 0x43, 0x4d, 0x4f, 0x44, 0x4d,
+    0x4b, 0x53, 0x50, 0x4e, 0x45, 0x3f, 0x4e, 0x51, 0x50, 0x55, 0x4f, 0x51,
+    0x4d, 0x3d, 0x58, 0x3f, 0x46, 0x50, 0x50, 0x50, 0x56, 0x42, 0x49, 0x49,
+    0x50, 0x4f, 0x42, 0x4b, 0x4c, 0x45, 0x52, 0x41, 0x46, 0x43, 0x4c, 0x4a,
+    0x4c, 0x51, 0x4d, 0x4d, 0x4a, 0x49, 0x54, 0x49, 0x58, 0x53, 0x49, 0x45,
+    0x47, 0x4c, 0x4c, 0x44, 0x4e, 0x51, 0x4c, 0x4c, 0x47, 0x48, 0x4c, 0x4e,
+    0x49, 0x54, 0x4c, 0x51, 0x49, 0x48, 0x47, 0x45, 0x42, 0x49, 0x42, 0x51,
+    0x4e, 0x3f, 0x49, 0x41, 0x50, 0x3e, 0x4d, 0x50, 0x5c, 0x51, 0x4d, 0x56,
+    0x47, 0x48, 0x58, 0x65, 0x51, 0x6b, 0x56, 0x5b, 0x56, 0x55, 0x46, 0x49,
+    0x4b, 0x58, 0x59, 0x4a, 0x68, 0x79, 0x53, 0x46, 0x45, 0x4b, 0x53, 0x5d,
+    0x4b, 0x6f, 0x4e, 0x4f, 0x4c, 0x53, 0x5b, 0x52, 0x30, 0x63, 0x46, 0x57,
+    0x46, 0x50, 0x4b, 0x48, 0x2e, 0x4c, 0x46, 0x48, 0x44, 0x51, 0x46, 0x4a,
+    0x35, 0x55, 0x43, 0x4c, 0x43, 0x4d, 0x4e, 0x3e, 0x47, 0x56, 0x50, 0x4d,
+    0x44, 0x59, 0x4c, 0x51, 0x46, 0x42, 0x4e, 0x43, 0x4c, 0x44, 0x42, 0x3a,
+    0x40, 0x48, 0x46, 0x44, 0x45, 0x4a, 0x46, 0x3a, 0x53, 0x4c, 0x4d, 0x4c,
+    0x4a, 0x4f, 0x53, 0x40, 0x4b, 0x48, 0x54, 0x4b, 0x44, 0x59, 0x41, 0x50,
+    0x4e, 0x50, 0x55, 0x4d, 0x55, 0x41, 0x4a, 0x4f, 0x47, 0x43, 0x4e, 0x50,
+    0x52, 0x4c, 0x50, 0x4d, 0x47, 0x42, 0x4f, 0x4b, 0x47, 0x43, 0x41, 0x4a,
+    0x55, 0x3e, 0x50, 0x4b, 0x41, 0x49, 0x47, 0x49, 0x53, 0x4d, 0x48, 0x4b,
+    0x43, 0x43, 0x51, 0x44, 0x4d, 0x4c, 0x44, 0x50, 0x4d, 0x42, 0x49, 0x4e,
+    0x50, 0x50, 0x4c, 0x49, 0x49, 0x51, 0x46, 0x43, 0x4a, 0x4e, 0x53, 0x47,
+    0x43, 0x46, 0x40, 0x49, 0x47, 0x44, 0x44, 0x4d, 0x4b, 0x4b, 0x51, 0x4b,
+    0x45, 0x49, 0x47, 0x43, 0x56, 0x49, 0x4c, 0x54, 0x50, 0x3c, 0x4c, 0x5e,
+    0x51, 0x67, 0x4f, 0x57, 0x57, 0x53, 0x3e, 0x4e, 0x4e, 0x5e, 0x4b, 0x48,
+    0x5a, 0x78, 0x55, 0x4a, 0x3f, 0x4b, 0x4c, 0x5b, 0x53, 0x64, 0x4d, 0x53,
+    0x49, 0x57, 0x57, 0x58, 0x37, 0x62, 0x4f, 0x56, 0x44, 0x4e, 0x58, 0x4a,
+    0x30, 0x4f, 0x40, 0x4e, 0x47, 0x58, 0x52, 0x50, 0x35, 0x4d, 0x49, 0x52,
+    0x4e, 0x42, 0x46, 0x47, 0x44, 0x57, 0x54, 0x43, 0x4e, 0x56, 0x43, 0x49,
+    0x44, 0x40, 0x44, 0x41, 0x50, 0x49, 0x4b, 0x44, 0x4d, 0x52, 0x49, 0x43,
+    0x52, 0x54, 0x49, 0x3f, 0x49, 0x42, 0x49, 0x4a, 0x43, 0x3e, 0x50, 0x40,
+    0x46, 0x4b, 0x50, 0x4b, 0x53, 0x4b, 0x47, 0x52, 0x51, 0x4b, 0x47, 0x3f,
+    0x46, 0x4b, 0x4c, 0x57, 0x49, 0x47, 0x54, 0x49, 0x50, 0x50, 0x4d, 0x4a,
+    0x42, 0x4e, 0x51, 0x4c, 0x47, 0x47, 0x42, 0x43, 0x54, 0x43, 0x46, 0x47,
+    0x4d, 0x43, 0x54, 0x47, 0x43, 0x58, 0x48, 0x45, 0x4b, 0x46, 0x48, 0x3d,
+    0x47, 0x3f, 0x44, 0x4f, 0x4e, 0x46, 0x41, 0x40, 0x4d, 0x4d, 0x4d, 0x52,
+    0x54, 0x47, 0x4f, 0x51, 0x4f, 0x45, 0x45, 0x48, 0x4b, 0x4d, 0x44, 0x52,
+    0x51, 0x4b, 0x48, 0x4f, 0x49, 0x49, 0x46, 0x50, 0x54, 0x42, 0x44, 0x51,
+    0x58, 0x4e, 0x43, 0x58, 0x55, 0x40, 0x53, 0x5a, 0x51, 0x61, 0x51, 0x60,
+    0x53, 0x57, 0x45, 0x4f, 0x45, 0x5e, 0x51, 0x42, 0x61, 0x7a, 0x55, 0x47,
+    0x41, 0x4b, 0x4a, 0x5b, 0x4c, 0x65, 0x4f, 0x55, 0x46, 0x54, 0x65, 0x59,
+    0x36, 0x61, 0x54, 0x55, 0x48, 0x57, 0x52, 0x4e, 0x24, 0x4b, 0x49, 0x4d,
+    0x43, 0x57, 0x44, 0x51, 0x3b, 0x4f, 0x45, 0x40, 0x47, 0x4a, 0x43, 0x47,
+    0x46, 0x58, 0x50, 0x54, 0x4d, 0x50, 0x44, 0x42, 0x4a, 0x46, 0x4b, 0x4d,
+    0x4f, 0x4f, 0x4d, 0x40, 0x48, 0x4a, 0x53, 0x48, 0x49, 0x48, 0x4d, 0x39,
+    0x47, 0x4e, 0x44, 0x4c, 0x4b, 0x49, 0x44, 0x42, 0x4a, 0x45, 0x46, 0x46,
+    0x53, 0x4d, 0x49, 0x4f, 0x4e, 0x48, 0x50, 0x4a, 0x4c, 0x46, 0x56, 0x4b,
+    0x4b, 0x57, 0x4c, 0x49, 0x4a, 0x4a, 0x43, 0x4e, 0x56, 0x45, 0x50, 0x4c,
+    0x47, 0x55, 0x48, 0x46, 0x4e, 0x46, 0x45, 0x3f, 0x4a, 0x4c, 0x4c, 0x47,
+    0x4a, 0x51, 0x4e, 0x50, 0x40, 0x52, 0x45, 0x45, 0x4b, 0x46, 0x4f, 0x44,
+    0x51, 0x4a, 0x4e, 0x4d, 0x4c, 0x46, 0x42, 0x47, 0x4a, 0x4e, 0x46, 0x42,
+    0x4b, 0x4f, 0x4b, 0x4e, 0x4e, 0x46, 0x42, 0x50, 0x53, 0x51, 0x4f, 0x54,
+    0x45, 0x4f, 0x45, 0x42, 0x4c, 0x45, 0x40, 0x48, 0x59, 0x49, 0x49, 0x53,
+    0x4c, 0x43, 0x4b, 0x57, 0x54, 0x64, 0x4e, 0x5f, 0x5c, 0x59, 0x4b, 0x56,
+    0x49, 0x5d, 0x4f, 0x4b, 0x62, 0x73, 0x54, 0x45, 0x49, 0x50, 0x48, 0x5a,
+    0x50, 0x6d, 0x4a, 0x4e, 0x48, 0x55, 0x5d, 0x57, 0x38, 0x68, 0x52, 0x5a,
+    0x46, 0x56, 0x4c, 0x5a, 0x2e, 0x55, 0x49, 0x4f, 0x4a, 0x57, 0x4f, 0x54,
+    0x41, 0x53, 0x46, 0x43, 0x45, 0x47, 0x53, 0x4a, 0x42, 0x4f, 0x4d, 0x48,
+    0x4c, 0x49, 0x47, 0x48, 0x45, 0x49, 0x48, 0x53, 0x48, 0x52, 0x4a, 0x44,
+    0x4c, 0x49, 0x52, 0x4b, 0x47, 0x51, 0x42, 0x47, 0x49, 0x51, 0x3f, 0x45,
+    0x47, 0x4e, 0x53, 0x33, 0x55, 0x51, 0x55, 0x48, 0x4b, 0x51, 0x56, 0x47,
+    0x43, 0x55, 0x47, 0x42, 0x47, 0x4f, 0x47, 0x51, 0x46, 0x55, 0x4a, 0x4b,
+    0x50, 0x52, 0x4f, 0x43, 0x4b, 0x53, 0x4d, 0x3f, 0x4e, 0x56, 0x50, 0x49,
+    0x4d, 0x47, 0x51, 0x49, 0x4a, 0x52, 0x44, 0x43, 0x4d, 0x4e, 0x41, 0x51,
+    0x4c, 0x4d, 0x47, 0x48, 0x4f, 0x40, 0x50, 0x46, 0x43, 0x4d, 0x4e, 0x50,
+    0x43, 0x47, 0x4e, 0x46, 0x4f, 0x4b, 0x51, 0x4b, 0x4a, 0x57, 0x42, 0x51,
+    0x4c, 0x54, 0x52, 0x42, 0x4c, 0x42, 0x47, 0x54, 0x4a, 0x4a, 0x47, 0x4a,
+    0x3f, 0x46, 0x4e, 0x4c, 0x53, 0x50, 0x47, 0x53, 0x49, 0x44, 0x52, 0x5a,
+    0x4b, 0x65, 0x50, 0x5b, 0x57, 0x59, 0x4a, 0x48, 0x48, 0x5f, 0x55, 0x48,
+    0x5c, 0x78, 0x55, 0x48, 0x4a, 0x4b, 0x49, 0x4c, 0x46, 0x6b, 0x54, 0x57,
+    0x55, 0x4b, 0x59, 0x52, 0x38, 0x5b, 0x57, 0x56, 0x4b, 0x4f, 0x48, 0x4e,
+    0x34, 0x5a, 0x4e, 0x4f, 0x43, 0x4e, 0x4b, 0x4e, 0x36, 0x4d, 0x52, 0x48,
+    0x4d, 0x4c, 0x4c, 0x49, 0x51, 0x54, 0x45, 0x54, 0x4a, 0x4e, 0x52, 0x41,
+    0x4c, 0x45, 0x4a, 0x53, 0x55, 0x4b, 0x50, 0x47, 0x4e, 0x4d, 0x43, 0x51,
+    0x4e, 0x4a, 0x51, 0x46, 0x4e, 0x4d, 0x48, 0x3f, 0x43, 0x52, 0x56, 0x38,
+    0x52, 0x46, 0x43, 0x49, 0x40, 0x49, 0x53, 0x41, 0x47, 0x41, 0x41, 0x42,
+    0x4f, 0x4b, 0x46, 0x4b, 0x4a, 0x57, 0x4a, 0x45, 0x4b, 0x46, 0x47, 0x3c,
+    0x43, 0x46, 0x4f, 0x50, 0x4c, 0x53, 0x4f, 0x41, 0x4a, 0x4a, 0x40, 0x4a,
+    0x3e, 0x4e, 0x4d, 0x41, 0x4a, 0x42, 0x49, 0x4c, 0x51, 0x46, 0x4f, 0x43,
+    0x4b, 0x41, 0x50, 0x48, 0x4a, 0x40, 0x52, 0x45, 0x40, 0x40, 0x46, 0x48,
+    0x48, 0x52, 0x52, 0x41, 0x43, 0x49, 0x49, 0x4c, 0x44, 0x48, 0x50, 0x4a,
+    0x47, 0x48, 0x4c, 0x42, 0x49, 0x48, 0x52, 0x56, 0x4b, 0x41, 0x4e, 0x47,
+    0x52, 0x56, 0x4e, 0x56, 0x4b, 0x38, 0x50, 0x55, 0x5a, 0x63, 0x51, 0x5a,
+    0x54, 0x52, 0x44, 0x45, 0x47, 0x5e, 0x4c, 0x4a, 0x5e, 0x71, 0x56, 0x44,
+    0x4c, 0x4b, 0x4c, 0x4e, 0x49, 0x69, 0x50, 0x53, 0x4d, 0x5c, 0x59, 0x50,
+    0x36, 0x5d, 0x46, 0x5b, 0x51, 0x55, 0x55, 0x51, 0x36, 0x5a, 0x53, 0x56,
+    0x54, 0x4a, 0x55, 0x53, 0x3c, 0x52, 0x4a, 0x45, 0x4c, 0x56, 0x49, 0x46,
+    0x4f, 0x5b, 0x43, 0x4b, 0x49, 0x4c, 0x4b, 0x41, 0x44, 0x4b, 0x47, 0x4b,
+    0x4b, 0x54, 0x4a, 0x4c, 0x49, 0x44, 0x46, 0x46, 0x48, 0x49, 0x47, 0x4a,
+    0x40, 0x4e, 0x47, 0x53, 0x4a, 0x47, 0x4a, 0x3b, 0x48, 0x4b, 0x50, 0x51,
+    0x50, 0x44, 0x4d, 0x49, 0x42, 0x4b, 0x43, 0x48, 0x4a, 0x43, 0x4d, 0x4d,
+    0x49, 0x4d, 0x43, 0x4f, 0x50, 0x49, 0x47, 0x48, 0x48, 0x4f, 0x49, 0x41,
+    0x4c, 0x46, 0x47, 0x3e, 0x51, 0x4d, 0x4e, 0x42, 0x3d, 0x53, 0x4d, 0x3b,
+    0x53, 0x52, 0x4c, 0x4c, 0x43, 0x46, 0x43, 0x3d, 0x53, 0x48, 0x43, 0x4e,
+    0x45, 0x52, 0x4d, 0x4a, 0x44, 0x49, 0x47, 0x4c, 0x4e, 0x4c, 0x4a, 0x4e,
+    0x41, 0x48, 0x4b, 0x44, 0x4d, 0x4a, 0x4d, 0x44, 0x4a, 0x45, 0x4f, 0x52,
+    0x45, 0x3f, 0x4b, 0x48, 0x43, 0x41, 0x3d, 0x53, 0x53, 0x50, 0x4a, 0x56,
+    0x4d, 0x3e, 0x55, 0x4e, 0x56, 0x5e, 0x52, 0x52, 0x54, 0x50, 0x42, 0x4a,
+    0x4d, 0x5f, 0x4f, 0x49, 0x5d, 0x6f, 0x55, 0x4a, 0x47, 0x49, 0x4e, 0x4a,
+    0x43, 0x6e, 0x4e, 0x4f, 0x52, 0x59, 0x62, 0x4b, 0x3e, 0x5c, 0x4c, 0x4e,
+    0x45, 0x52, 0x43, 0x4d, 0x3c, 0x58, 0x52, 0x49, 0x48, 0x55, 0x53, 0x4e,
+    0x3d, 0x4e, 0x4c, 0x4b, 0x4b, 0x50, 0x4a, 0x47, 0x45, 0x62, 0x50, 0x49,
+    0x48, 0x4b, 0x55, 0x45, 0x46, 0x51, 0x41, 0x55, 0x54, 0x55, 0x50, 0x47,
+    0x46, 0x4d, 0x46, 0x4b, 0x41, 0x49, 0x4c, 0x40, 0x45, 0x4f, 0x52, 0x54,
+    0x45, 0x4d, 0x53, 0x3a, 0x4c, 0x55, 0x4e, 0x48, 0x44, 0x45, 0x56, 0x3c,
+    0x48, 0x46, 0x4b, 0x51, 0x53, 0x43, 0x41, 0x49, 0x4c, 0x52, 0x48, 0x42,
+    0x48, 0x3f, 0x4c, 0x38, 0x46, 0x50, 0x4a, 0x44, 0x50, 0x54, 0x4e, 0x38,
+    0x48, 0x42, 0x43, 0x4a, 0x4c, 0x44, 0x47, 0x42, 0x42, 0x46, 0x4a, 0x50,
+    0x47, 0x4b, 0x43, 0x40, 0x44, 0x46, 0x46, 0x4d, 0x50, 0x4a, 0x4e, 0x51,
+    0x44, 0x40, 0x50, 0x43, 0x52, 0x4d, 0x42, 0x4c, 0x50, 0x41, 0x4a, 0x4e,
+    0x45, 0x49, 0x4d, 0x40, 0x46, 0x51, 0x43, 0x4b, 0x48, 0x47, 0x42, 0x55,
+    0x4a, 0x41, 0x4f, 0x49, 0x4f, 0x4e, 0x47, 0x4c, 0x4a, 0x48, 0x50, 0x4e,
+    0x50, 0x57, 0x4e, 0x56, 0x56, 0x4e, 0x44, 0x48, 0x4a, 0x5b, 0x55, 0x49,
+    0x59, 0x67, 0x54, 0x46, 0x4f, 0x41, 0x4d, 0x4e, 0x4a, 0x63, 0x4d, 0x44,
+    0x53, 0x5b, 0x59, 0x4f, 0x43, 0x55, 0x56, 0x4e, 0x55, 0x4c, 0x4b, 0x54,
+    0x3c, 0x56, 0x4d, 0x50, 0x4f, 0x4a, 0x5a, 0x47, 0x48, 0x56, 0x4f, 0x4f,
+    0x50, 0x51, 0x48, 0x4e, 0x4d, 0x50, 0x4e, 0x45, 0x4b, 0x48, 0x4e, 0x44,
+    0x46, 0x4d, 0x43, 0x46, 0x41, 0x59, 0x53, 0x4b, 0x4a, 0x3e, 0x51, 0x47,
+    0x43, 0x48, 0x52, 0x3f, 0x43, 0x50, 0x4b, 0x4f, 0x41, 0x48, 0x43, 0x2e,
+    0x4d, 0x4e, 0x4c, 0x45, 0x45, 0x46, 0x4b, 0x43, 0x46, 0x49, 0x46, 0x4d,
+    0x47, 0x4e, 0x4d, 0x3c, 0x47, 0x4a, 0x52, 0x4e, 0x41, 0x50, 0x43, 0x3a,
+    0x50, 0x47, 0x4a, 0x45, 0x52, 0x4a, 0x4c, 0x3f, 0x42, 0x3d, 0x49, 0x48,
+    0x48, 0x4c, 0x42, 0x3a, 0x40, 0x47, 0x46, 0x4e, 0x44, 0x52, 0x46, 0x44,
+    0x4a, 0x44, 0x43, 0x49, 0x42, 0x45, 0x3f, 0x50, 0x4c, 0x44, 0x48, 0x43,
+    0x47, 0x4a, 0x48, 0x48, 0x3e, 0x45, 0x43, 0x48, 0x4a, 0x48, 0x53, 0x4b,
+    0x50, 0x49, 0x43, 0x4d, 0x53, 0x4f, 0x4b, 0x4b, 0x40, 0x42, 0x50, 0x4d,
+    0x53, 0x4e, 0x44, 0x4d, 0x45, 0x3d, 0x51, 0x51, 0x4f, 0x59, 0x4b, 0x51,
+    0x4a, 0x4e, 0x42, 0x40, 0x49, 0x5b, 0x4b, 0x43, 0x53, 0x60, 0x47, 0x49,
+    0x4a, 0x44, 0x44, 0x48, 0x4b, 0x60, 0x51, 0x3f, 0x4b, 0x5b, 0x4f, 0x4a,
+    0x4a, 0x50, 0x49, 0x46, 0x55, 0x50, 0x4b, 0x4c, 0x40, 0x4e, 0x51, 0x4f,
+    0x4b, 0x51, 0x54, 0x50, 0x48, 0x4e, 0x4a, 0x4f, 0x4d, 0x4e, 0x54, 0x4d,
+    0x41, 0x50, 0x4e, 0x47, 0x47, 0x47, 0x54, 0x3b, 0x51, 0x54, 0x50, 0x49,
+    0x48, 0x4c, 0x4e, 0x47, 0x3f, 0x3c, 0x4c, 0x43, 0x45, 0x42, 0x45, 0x37,
+    0x41, 0x52, 0x49, 0x47, 0x4e, 0x4a, 0x4b, 0x37, 0x48, 0x4d, 0x4e, 0x4a,
+    0x42, 0x56, 0x3d, 0x35, 0x48, 0x42, 0x4b, 0x4a, 0x44, 0x52, 0x40, 0x48,
+    0x4f, 0x49, 0x4f, 0x4c, 0x4d, 0x43, 0x49, 0x38, 0x4b, 0x42, 0x48, 0x42,
+    0x45, 0x45, 0x54, 0x3a, 0x47, 0x47, 0x52, 0x45, 0x4a, 0x48, 0x47, 0x39,
+    0x4d, 0x45, 0x54, 0x4b, 0x4e, 0x4f, 0x4e, 0x38, 0x4a, 0x4b, 0x48, 0x45,
+    0x4e, 0x43, 0x4e, 0x4e, 0x46, 0x4e, 0x4e, 0x50, 0x46, 0x4c, 0x42, 0x45,
+    0x4b, 0x46, 0x47, 0x4d, 0x49, 0x3f, 0x4f, 0x50, 0x46, 0x4a, 0x47, 0x4e,
+    0x4a, 0x3e, 0x50, 0x46, 0x47, 0x40, 0x4f, 0x47, 0x51, 0x4b, 0x43, 0x46,
+    0x4a, 0x42, 0x55, 0x4d, 0x46, 0x63, 0x49, 0x4e, 0x4f, 0x4f, 0x42, 0x45,
+    0x50, 0x57, 0x49, 0x3e, 0x57, 0x63, 0x45, 0x4a, 0x49, 0x50, 0x41, 0x4a,
+    0x48, 0x64, 0x4f, 0x42, 0x47, 0x58, 0x4b, 0x45, 0x43, 0x57, 0x49, 0x58,
+    0x51, 0x51, 0x47, 0x43, 0x51, 0x4b, 0x4a, 0x45, 0x50, 0x54, 0x4d, 0x4d,
+    0x3e, 0x4a, 0x50, 0x40, 0x51, 0x4f, 0x52, 0x48, 0x53, 0x49, 0x44, 0x4b,
+    0x51, 0x4b, 0x50, 0x42, 0x4d, 0x49, 0x4a, 0x46, 0x44, 0x50, 0x47, 0x3f,
+    0x48, 0x47, 0x41, 0x4a, 0x42, 0x52, 0x4a, 0x33, 0x50, 0x50, 0x54, 0x3f,
+    0x44, 0x4e, 0x51, 0x3c, 0x4e, 0x51, 0x48, 0x4b, 0x47, 0x49, 0x3f, 0x3d,
+    0x4e, 0x46, 0x4a, 0x41, 0x40, 0x50, 0x49, 0x40, 0x4a, 0x4b, 0x45, 0x50,
+    0x4e, 0x4d, 0x4b, 0x39, 0x4e, 0x4b, 0x48, 0x3c, 0x47, 0x44, 0x4c, 0x42,
+    0x45, 0x50, 0x3e, 0x54, 0x4d, 0x49, 0x48, 0x3c, 0x45, 0x42, 0x55, 0x4a,
+    0x41, 0x4f, 0x40, 0x3f, 0x47, 0x46, 0x46, 0x44, 0x4f, 0x47, 0x46, 0x44,
+    0x41, 0x40, 0x44, 0x48, 0x3e, 0x3c, 0x46, 0x3e, 0x4a, 0x45, 0x4c, 0x52,
+    0x47, 0x42, 0x47, 0x3f, 0x47, 0x4e, 0x4b, 0x53, 0x4a, 0x3d, 0x4d, 0x47,
+    0x4f, 0x3d, 0x4e, 0x43, 0x4f, 0x46, 0x43, 0x43, 0x46, 0x41, 0x4f, 0x42,
+    0x46, 0x57, 0x4d, 0x51, 0x49, 0x51, 0x4c, 0x44, 0x51, 0x4f, 0x46, 0x44,
+    0x54, 0x5d, 0x4f, 0x40, 0x59, 0x46, 0x53, 0x46, 0x48, 0x54, 0x43, 0x45,
+    0x4d, 0x51, 0x4f, 0x44, 0x44, 0x53, 0x49, 0x4e, 0x48, 0x46, 0x44, 0x4a,
+    0x4a, 0x42, 0x4c, 0x46, 0x54, 0x4f, 0x52, 0x47, 0x46, 0x44, 0x4c, 0x4d,
+    0x4c, 0x47, 0x4d, 0x40, 0x55, 0x58, 0x46, 0x46, 0x3f, 0x3e, 0x47, 0x36,
+    0x3f, 0x4d, 0x4b, 0x4d, 0x4f, 0x4f, 0x48, 0x34, 0x4d, 0x46, 0x46, 0x50,
+    0x50, 0x4b, 0x47, 0x45, 0x4e, 0x49, 0x50, 0x4f, 0x4a, 0x48, 0x4f, 0x39,
+    0x53, 0x4c, 0x4b, 0x56, 0x45, 0x4f, 0x55, 0x3a, 0x40, 0x53, 0x43, 0x4b,
+    0x47, 0x3d, 0x4c, 0x34, 0x4b, 0x4e, 0x4a, 0x4b, 0x4d, 0x49, 0x4e, 0x40,
+    0x4d, 0x48, 0x40, 0x4a, 0x4a, 0x4b, 0x4a, 0x42, 0x4c, 0x52, 0x43, 0x42,
+    0x44, 0x3f, 0x4e, 0x42, 0x44, 0x45, 0x40, 0x3d, 0x4b, 0x45, 0x4a, 0x43,
+    0x4b, 0x4b, 0x4e, 0x46, 0x55, 0x43, 0x44, 0x3f, 0x44, 0x43, 0x4b, 0x4b,
+    0x45, 0x51, 0x48, 0x49, 0x3d, 0x44, 0x4a, 0x4a, 0x50, 0x50, 0x47, 0x44,
+    0x4f, 0x3e, 0x3f, 0x43, 0x4c, 0x46, 0x4a, 0x4e, 0x4c, 0x52, 0x48, 0x4e,
+    0x48, 0x46, 0x45, 0x48, 0x41, 0x4f, 0x51, 0x48, 0x40, 0x4d, 0x4a, 0x4b,
+    0x4c, 0x51, 0x49, 0x50, 0x4e, 0x4b, 0x4a, 0x42, 0x49, 0x54, 0x4e, 0x43,
+    0x52, 0x47, 0x4a, 0x41, 0x42, 0x51, 0x48, 0x4a, 0x46, 0x45, 0x4a, 0x43,
+    0x4e, 0x4f, 0x41, 0x49, 0x4b, 0x42, 0x40, 0x4a, 0x50, 0x41, 0x42, 0x3f,
+    0x49, 0x4a, 0x40, 0x3e, 0x3f, 0x42, 0x4d, 0x51, 0x4e, 0x4e, 0x47, 0x41,
+    0x4e, 0x4e, 0x49, 0x4b, 0x41, 0x45, 0x51, 0x40, 0x45, 0x4c, 0x3f, 0x42,
+    0x4c, 0x45, 0x4d, 0x39, 0x46, 0x52, 0x4a, 0x4e, 0x4c, 0x49, 0x4e, 0x43,
+    0x43, 0x4c, 0x48, 0x46, 0x48, 0x49, 0x50, 0x3a, 0x3f, 0x49, 0x42, 0x4f,
+    0x42, 0x4d, 0x4e, 0x3f, 0x51, 0x4b, 0x4e, 0x4b, 0x51, 0x44, 0x43, 0x4a,
+    0x4a, 0x4c, 0x50, 0x48, 0x45, 0x47, 0x4d, 0x41, 0x47, 0x45, 0x51, 0x41,
+    0x42, 0x48, 0x4c, 0x39, 0x51, 0x45, 0x46, 0x53, 0x4b, 0x50, 0x46, 0x45,
+    0x4b, 0x4d, 0x42, 0x4b, 0x3f, 0x45, 0x4b, 0x4e, 0x50, 0x50, 0x47, 0x4a,
+    0x45, 0x40, 0x4b, 0x43, 0x3f, 0x4a, 0x41, 0x42, 0x51, 0x41, 0x4d, 0x42,
+    0x53, 0x48, 0x48, 0x49, 0x4b, 0x40, 0x42, 0x3d, 0x4f, 0x53, 0x49, 0x46,
+    0x46, 0x43, 0x42, 0x44, 0x46, 0x48, 0x3f, 0x46, 0x31, 0x43, 0x4d, 0x4b,
+    0x48, 0x4d, 0x4c, 0x43, 0x45, 0x53, 0x50, 0x40, 0x4a, 0x48, 0x45, 0x3b,
+    0x4f, 0x4d, 0x53, 0x4c, 0x44, 0x54, 0x50, 0x66, 0x3f, 0x45, 0x4c, 0x4c,
+    0x4a, 0x49, 0x49, 0x4a, 0x40, 0x52, 0x3e, 0x4c, 0x49, 0x40, 0x44, 0x49,
+    0x48, 0x3f, 0x45, 0x5b, 0x49, 0x4b, 0x4c, 0x44, 0x50, 0x4e, 0x4a, 0x4a,
+    0x49, 0x4e, 0x4f, 0x47, 0x46, 0x4b, 0x44, 0x3b, 0x4e, 0x4b, 0x48, 0x46,
+    0x45, 0x45, 0x3d, 0x35, 0x4c, 0x49, 0x54, 0x42, 0x51, 0x46, 0x49, 0x2d,
+    0x43, 0x4a, 0x53, 0x49, 0x49, 0x42, 0x4f, 0x40, 0x4e, 0x50, 0x54, 0x51,
+    0x4b, 0x45, 0x48, 0x35, 0x4d, 0x41, 0x51, 0x40, 0x41, 0x49, 0x4a, 0x3b,
+    0x45, 0x50, 0x48, 0x51, 0x51, 0x4d, 0x4c, 0x36, 0x47, 0x4a, 0x44, 0x45,
+    0x4d, 0x47, 0x43, 0x3a, 0x48, 0x40, 0x42, 0x4f, 0x4f, 0x4f, 0x4f, 0x43,
+    0x4a, 0x41, 0x4b, 0x53, 0x43, 0x46, 0x4f, 0x39, 0x46, 0x4a, 0x4d, 0x53,
+    0x41, 0x44, 0x4e, 0x44, 0x3f, 0x47, 0x4c, 0x4d, 0x4d, 0x43, 0x45, 0x3d,
+    0x43, 0x4b, 0x3e, 0x48, 0x42, 0x4c, 0x47, 0x42, 0x42, 0x50, 0x49, 0x4b,
+    0x43, 0x4e, 0x44, 0x44, 0x4c, 0x3d, 0x4c, 0x47, 0x4e, 0x42, 0x4b, 0x44,
+    0x4b, 0x44, 0x3f, 0x49, 0x33, 0x46, 0x4a, 0x4a, 0x42, 0x57, 0x5e, 0x4a,
+    0x46, 0x4f, 0x55, 0x3c, 0x4a, 0x4b, 0x4c, 0x43, 0x51, 0x59, 0x64, 0x51,
+    0x45, 0x60, 0x4b, 0x65, 0x46, 0x4a, 0x4e, 0x49, 0x41, 0x4b, 0x50, 0x5c,
+    0x48, 0x4b, 0x3e, 0x52, 0x4f, 0x2f, 0x4e, 0x4a, 0x45, 0x53, 0x48, 0x59,
+    0x4c, 0x4e, 0x4a, 0x4d, 0x49, 0x40, 0x52, 0x44, 0x49, 0x46, 0x4e, 0x46,
+    0x42, 0x4b, 0x4a, 0x4b, 0x4b, 0x4b, 0x4f, 0x52, 0x46, 0x50, 0x4d, 0x3d,
+    0x46, 0x4b, 0x4b, 0x40, 0x4d, 0x3f, 0x43, 0x33, 0x4e, 0x53, 0x4b, 0x4a,
+    0x45, 0x48, 0x4c, 0x2e, 0x48, 0x4f, 0x49, 0x42, 0x54, 0x4f, 0x4b, 0x2b,
+    0x55, 0x4e, 0x43, 0x4d, 0x4d, 0x47, 0x42, 0x3e, 0x48, 0x48, 0x4d, 0x54,
+    0x52, 0x4f, 0x43, 0x37, 0x4b, 0x42, 0x4b, 0x4e, 0x49, 0x49, 0x4b, 0x2e,
+    0x45, 0x4e, 0x48, 0x4e, 0x44, 0x49, 0x48, 0x30, 0x4c, 0x4b, 0x3f, 0x42,
+    0x4f, 0x4f, 0x4e, 0x38, 0x4f, 0x42, 0x54, 0x49, 0x41, 0x42, 0x45, 0x3a,
+    0x47, 0x43, 0x43, 0x4b, 0x49, 0x40, 0x4d, 0x38, 0x52, 0x4c, 0x3d, 0x4d,
+    0x43, 0x54, 0x4e, 0x41, 0x4a, 0x47, 0x44, 0x51, 0x47, 0x48, 0x41, 0x47,
+    0x4d, 0x41, 0x46, 0x4c, 0x4d, 0x46, 0x51, 0x4a, 0x49, 0x46, 0x4a, 0x42,
+    0x3a, 0x43, 0x4a, 0x4b, 0x43, 0x4c, 0x68, 0x44, 0x4b, 0x52, 0x50, 0x37,
+    0x4d, 0x4c, 0x57, 0x4c, 0x68, 0x62, 0x64, 0x4a, 0x3e, 0x64, 0x4b, 0x66,
+    0x48, 0x4d, 0x54, 0x57, 0x4b, 0x52, 0x49, 0x5c, 0x4d, 0x55, 0x51, 0x57,
+    0x4c, 0x3a, 0x48, 0x43, 0x3b, 0x43, 0x52, 0x5d, 0x45, 0x4e, 0x51, 0x4d,
+    0x4a, 0x55, 0x4e, 0x4c, 0x44, 0x51, 0x4c, 0x4f, 0x41, 0x4f, 0x4a, 0x43,
+    0x53, 0x48, 0x47, 0x49, 0x46, 0x52, 0x48, 0x3e, 0x4b, 0x4e, 0x4a, 0x50,
+    0x4f, 0x47, 0x3e, 0x2e, 0x4b, 0x51, 0x4a, 0x44, 0x4c, 0x49, 0x4f, 0x26,
+    0x48, 0x4f, 0x44, 0x51, 0x48, 0x3f, 0x4c, 0x30, 0x4e, 0x48, 0x4d, 0x48,
+    0x48, 0x44, 0x4b, 0x2f, 0x50, 0x41, 0x4d, 0x50, 0x52, 0x42, 0x45, 0x33,
+    0x4c, 0x48, 0x48, 0x3d, 0x46, 0x41, 0x43, 0x38, 0x45, 0x4f, 0x48, 0x4b,
+    0x41, 0x49, 0x4c, 0x2f, 0x53, 0x4c, 0x48, 0x4a, 0x47, 0x40, 0x4a, 0x31,
+    0x52, 0x40, 0x49, 0x4c, 0x3f, 0x48, 0x48, 0x39, 0x48, 0x3f, 0x45, 0x43,
+    0x40, 0x48, 0x3c, 0x40, 0x4c, 0x48, 0x48, 0x4d, 0x3e, 0x42, 0x4a, 0x3d,
+    0x4c, 0x45, 0x44, 0x46, 0x44, 0x45, 0x4a, 0x47, 0x52, 0x48, 0x4a, 0x4d,
+    0x3f, 0x49, 0x4c, 0x4c, 0x48, 0x44, 0x4c, 0x44, 0x3d, 0x41, 0x47, 0x45,
+    0x43, 0x4a, 0x5a, 0x3f, 0x48, 0x5d, 0x50, 0x35, 0x47, 0x4f, 0x5b, 0x46,
+    0x6e, 0x50, 0x6d, 0x44, 0x49, 0x6a, 0x53, 0x6b, 0x4b, 0x4b, 0x4f, 0x62,
+    0x45, 0x57, 0x48, 0x5b, 0x40, 0x4b, 0x4f, 0x63, 0x48, 0x3a, 0x4b, 0x42,
+    0x43, 0x53, 0x41, 0x5f, 0x54, 0x3e, 0x4d, 0x43, 0x3d, 0x4c, 0x46, 0x46,
+    0x49, 0x56, 0x4b, 0x45, 0x47, 0x45, 0x4e, 0x4f, 0x4c, 0x4d, 0x4f, 0x47,
+    0x49, 0x4b, 0x51, 0x33, 0x4b, 0x45, 0x4d, 0x41, 0x51, 0x4a, 0x43, 0x2a,
+    0x50, 0x4b, 0x4a, 0x4b, 0x4c, 0x52, 0x4c, 0x3b, 0x45, 0x4c, 0x51, 0x44,
+    0x4c, 0x48, 0x43, 0x35, 0x51, 0x50, 0x48, 0x49, 0x3f, 0x48, 0x3d, 0x3b,
+    0x52, 0x3f, 0x42, 0x4b, 0x49, 0x49, 0x47, 0x38, 0x4a, 0x4a, 0x41, 0x52,
+    0x41, 0x3e, 0x4b, 0x2f, 0x46, 0x4d, 0x49, 0x44, 0x46, 0x3b, 0x47, 0x36,
+    0x46, 0x3f, 0x49, 0x48, 0x47, 0x42, 0x42, 0x35, 0x44, 0x4b, 0x4d, 0x56,
+    0x50, 0x49, 0x43, 0x42, 0x4b, 0x3e, 0x53, 0x44, 0x4a, 0x43, 0x47, 0x38,
+    0x4a, 0x45, 0x4d, 0x3f, 0x46, 0x4a, 0x47, 0x3a, 0x4c, 0x3e, 0x47, 0x45,
+    0x46, 0x4b, 0x45, 0x49, 0x4a, 0x4b, 0x54, 0x49, 0x4a, 0x53, 0x4a, 0x4c,
+    0x45, 0x48, 0x53, 0x42, 0x4b, 0x47, 0x4e, 0x50, 0x3d, 0x51, 0x60, 0x3e,
+    0x53, 0x5d, 0x51, 0x30, 0x45, 0x50, 0x59, 0x4e, 0x62, 0x52, 0x68, 0x51,
+    0x45, 0x6c, 0x4c, 0x64, 0x4d, 0x47, 0x55, 0x61, 0x44, 0x57, 0x44, 0x58,
+    0x44, 0x4a, 0x53, 0x58, 0x47, 0x31, 0x3f, 0x4c, 0x43, 0x45, 0x48, 0x5e,
+    0x41, 0x43, 0x3f, 0x43, 0x51, 0x46, 0x48, 0x4b, 0x4d, 0x5b, 0x45, 0x4b,
+    0x48, 0x46, 0x3f, 0x45, 0x47, 0x45, 0x40, 0x4a, 0x51, 0x51, 0x3d, 0x3f,
+    0x43, 0x45, 0x4d, 0x4a, 0x47, 0x50, 0x49, 0x32, 0x4c, 0x5a, 0x55, 0x4f,
+    0x4c, 0x51, 0x43, 0x37, 0x40, 0x59, 0x49, 0x49, 0x4e, 0x4f, 0x47, 0x34,
+    0x40, 0x4c, 0x4a, 0x41, 0x4a, 0x47, 0x4a, 0x42, 0x4e, 0x4a, 0x48, 0x4e,
+    0x4e, 0x4e, 0x45, 0x39, 0x4e, 0x45, 0x45, 0x4e, 0x4c, 0x48, 0x4a, 0x35,
+    0x45, 0x4c, 0x49, 0x4f, 0x51, 0x43, 0x3c, 0x3a, 0x4a, 0x4a, 0x46, 0x48,
+    0x49, 0x42, 0x4e, 0x2f, 0x42, 0x4e, 0x45, 0x50, 0x51, 0x40, 0x45, 0x32,
+    0x4a, 0x4d, 0x44, 0x4e, 0x48, 0x48, 0x47, 0x2f, 0x48, 0x4b, 0x49, 0x44,
+    0x48, 0x4d, 0x46, 0x3b, 0x46, 0x4a, 0x41, 0x4e, 0x4e, 0x47, 0x54, 0x4b,
+    0x45, 0x49, 0x45, 0x44, 0x45, 0x48, 0x4a, 0x46, 0x55, 0x49, 0x47, 0x49,
+    0x4b, 0x42, 0x48, 0x4f, 0x3f, 0x52, 0x60, 0x39, 0x4b, 0x5e, 0x55, 0x2e,
+    0x48, 0x50, 0x59, 0x4f, 0x68, 0x5f, 0x64, 0x4f, 0x3b, 0x71, 0x50, 0x63,
+    0x4f, 0x50, 0x50, 0x6c, 0x4b, 0x55, 0x47, 0x5b, 0x4c, 0x40, 0x48, 0x59,
+    0x4f, 0x2e, 0x4b, 0x4c, 0x4e, 0x4e, 0x46, 0x61, 0x50, 0x41, 0x4c, 0x4a,
+    0x44, 0x3e, 0x3f, 0x47, 0x4b, 0x4f, 0x47, 0x4b, 0x47, 0x3d, 0x41, 0x49,
+    0x49, 0x3f, 0x4d, 0x44, 0x4a, 0x4d, 0x45, 0x41, 0x4d, 0x43, 0x49, 0x3c,
+    0x49, 0x57, 0x49, 0x3b, 0x49, 0x59, 0x3f, 0x4f, 0x4e, 0x49, 0x4e, 0x46,
+    0x52, 0x4e, 0x4c, 0x54, 0x4a, 0x48, 0x48, 0x3a, 0x44, 0x4a, 0x4f, 0x4a,
+    0x44, 0x4b, 0x43, 0x4d, 0x51, 0x42, 0x53, 0x4d, 0x52, 0x41, 0x4d, 0x43,
+    0x4e, 0x54, 0x4b, 0x42, 0x4b, 0x3f, 0x53, 0x45, 0x3f, 0x4a, 0x45, 0x50,
+    0x3f, 0x4c, 0x4f, 0x43, 0x46, 0x42, 0x4b, 0x4d, 0x4c, 0x3b, 0x48, 0x40,
+    0x4e, 0x4e, 0x49, 0x46, 0x4d, 0x4d, 0x52, 0x40, 0x4e, 0x4f, 0x46, 0x4a,
+    0x40, 0x4b, 0x4c, 0x40, 0x4f, 0x4a, 0x44, 0x41, 0x46, 0x3c, 0x40, 0x3d,
+    0x44, 0x48, 0x4a, 0x50, 0x46, 0x53, 0x46, 0x40, 0x44, 0x3e, 0x47, 0x43,
+    0x48, 0x3d, 0x4e, 0x3e, 0x48, 0x49, 0x4b, 0x49, 0x4c, 0x3e, 0x4c, 0x4a,
+    0x46, 0x4e, 0x62, 0x3c, 0x59, 0x60, 0x51, 0x29, 0x47, 0x52, 0x59, 0x4c,
+    0x67, 0x68, 0x68, 0x4e, 0x3b, 0x72, 0x4d, 0x68, 0x44, 0x4f, 0x53, 0x63,
+    0x47, 0x5a, 0x45, 0x4f, 0x4b, 0x37, 0x43, 0x5b, 0x4b, 0x3d, 0x44, 0x41,
+    0x4a, 0x4b, 0x3c, 0x64, 0x48, 0x38, 0x42, 0x3f, 0x48, 0x46, 0x4b, 0x46,
+    0x46, 0x4f, 0x46, 0x46, 0x44, 0x3c, 0x4b, 0x4f, 0x4d, 0x4a, 0x4b, 0x46,
+    0x4d, 0x4f, 0x4f, 0x3f, 0x3a, 0x4b, 0x55, 0x3c, 0x51, 0x56, 0x4d, 0x42,
+    0x52, 0x5a, 0x3e, 0x4b, 0x54, 0x57, 0x4e, 0x4d, 0x4e, 0x5b, 0x4e, 0x49,
+    0x4e, 0x3c, 0x40, 0x41, 0x40, 0x4d, 0x48, 0x42, 0x49, 0x4e, 0x4f, 0x47,
+    0x47, 0x48, 0x50, 0x49, 0x51, 0x46, 0x44, 0x45, 0x49, 0x46, 0x43, 0x48,
+    0x48, 0x49, 0x4d, 0x4c, 0x45, 0x4f, 0x4c, 0x45, 0x44, 0x40, 0x49, 0x45,
+    0x49, 0x51, 0x4b, 0x4b, 0x50, 0x4b, 0x48, 0x3d, 0x4e, 0x52, 0x4a, 0x47,
+    0x49, 0x41, 0x55, 0x3d, 0x48, 0x4d, 0x49, 0x48, 0x4e, 0x4c, 0x48, 0x3d,
+    0x3f, 0x4c, 0x4e, 0x53, 0x3e, 0x48, 0x4a, 0x3f, 0x54, 0x4d, 0x54, 0x4b,
+    0x47, 0x4e, 0x44, 0x48, 0x49, 0x4b, 0x4c, 0x49, 0x4d, 0x42, 0x52, 0x4b,
+    0x40, 0x3e, 0x54, 0x49, 0x55, 0x45, 0x47, 0x4d, 0x45, 0x5c, 0x60, 0x40,
+    0x57, 0x60, 0x5b, 0x27, 0x4a, 0x5a, 0x64, 0x53, 0x6a, 0x5a, 0x5f, 0x52,
+    0x3a, 0x72, 0x4b, 0x5f, 0x45, 0x56, 0x5f, 0x5f, 0x54, 0x5f, 0x39, 0x52,
+    0x51, 0x3e, 0x3b, 0x5a, 0x44, 0x32, 0x46, 0x50, 0x3a, 0x4f, 0x44, 0x5d,
+    0x4c, 0x41, 0x39, 0x3f, 0x45, 0x46, 0x3b, 0x43, 0x46, 0x51, 0x3c, 0x4c,
+    0x4b, 0x43, 0x4b, 0x51, 0x43, 0x48, 0x4d, 0x43, 0x38, 0x46, 0x46, 0x43,
+    0x44, 0x4a, 0x46, 0x49, 0x48, 0x50, 0x4e, 0x4a, 0x4e, 0x58, 0x4a, 0x49,
+    0x48, 0x4f, 0x4a, 0x49, 0x41, 0x57, 0x51, 0x50, 0x4b, 0x48, 0x47, 0x4b,
+    0x53, 0x3d, 0x4b, 0x4c, 0x4b, 0x4b, 0x55, 0x56, 0x45, 0x49, 0x46, 0x4c,
+    0x45, 0x51, 0x47, 0x50, 0x40, 0x4b, 0x4f, 0x4b, 0x4d, 0x4a, 0x4f, 0x50,
+    0x49, 0x53, 0x50, 0x46, 0x40, 0x48, 0x4a, 0x4a, 0x49, 0x4a, 0x42, 0x45,
+    0x4b, 0x45, 0x42, 0x45, 0x4e, 0x4e, 0x44, 0x41, 0x4b, 0x4a, 0x49, 0x3f,
+    0x41, 0x51, 0x48, 0x4c, 0x40, 0x41, 0x51, 0x42, 0x49, 0x49, 0x48, 0x42,
+    0x48, 0x4c, 0x4b, 0x3c, 0x49, 0x45, 0x42, 0x49, 0x4c, 0x46, 0x45, 0x43,
+    0x43, 0x48, 0x48, 0x41, 0x43, 0x42, 0x4c, 0x4b, 0x40, 0x45, 0x44, 0x46,
+    0x4c, 0x4b, 0x4e, 0x4d, 0x3f, 0x59, 0x55, 0x41, 0x56, 0x5a, 0x51, 0x30,
+    0x49, 0x5a, 0x63, 0x4d, 0x61, 0x5b, 0x64, 0x55, 0x34, 0x7a, 0x4c, 0x62,
+    0x3e, 0x5d, 0x56, 0x60, 0x48, 0x61, 0x3f, 0x54, 0x46, 0x40, 0x42, 0x56,
+    0x52, 0x35, 0x4c, 0x59, 0x45, 0x4c, 0x42, 0x60, 0x49, 0x3f, 0x4c, 0x3c,
+    0x52, 0x36, 0x46, 0x3d, 0x58, 0x4b, 0x41, 0x48, 0x3e, 0x45, 0x4e, 0x54,
+    0x4c, 0x56, 0x47, 0x44, 0x39, 0x4a, 0x4a, 0x4a, 0x46, 0x48, 0x4a, 0x48,
+    0x51, 0x4f, 0x4b, 0x49, 0x45, 0x4b, 0x44, 0x4c, 0x3e, 0x4c, 0x42, 0x59,
+    0x47, 0x55, 0x47, 0x47, 0x41, 0x44, 0x44, 0x4a, 0x44, 0x4b, 0x44, 0x46,
+    0x49, 0x5a, 0x48, 0x5d, 0x4f, 0x4a, 0x47, 0x50, 0x48, 0x4e, 0x44, 0x57,
+    0x49, 0x46, 0x42, 0x4d, 0x3d, 0x4a, 0x4a, 0x58, 0x41, 0x4d, 0x3c, 0x47,
+    0x42, 0x4e, 0x4d, 0x49, 0x44, 0x4b, 0x4c, 0x4b, 0x53, 0x42, 0x4a, 0x46,
+    0x4e, 0x56, 0x4b, 0x47, 0x50, 0x43, 0x4f, 0x48, 0x49, 0x50, 0x48, 0x50,
+    0x42, 0x4c, 0x4e, 0x3c, 0x41, 0x4f, 0x4a, 0x41, 0x44, 0x47, 0x4c, 0x42,
+    0x51, 0x4f, 0x53, 0x46, 0x4c, 0x4b, 0x48, 0x51, 0x47, 0x4b, 0x4c, 0x4d,
+    0x4d, 0x49, 0x3d, 0x44, 0x4b, 0x42, 0x43, 0x49, 0x51, 0x47, 0x4c, 0x4b,
+    0x4a, 0x50, 0x5b, 0x43, 0x5b, 0x68, 0x54, 0x31, 0x4c, 0x5d, 0x5c, 0x54,
+    0x63, 0x5a, 0x61, 0x54, 0x3d, 0x7a, 0x51, 0x5b, 0x40, 0x59, 0x5a, 0x62,
+    0x4c, 0x5e, 0x42, 0x58, 0x49, 0x3c, 0x38, 0x50, 0x54, 0x37, 0x42, 0x51,
+    0x4d, 0x4f, 0x42, 0x68, 0x4a, 0x40, 0x4e, 0x40, 0x3f, 0x3e, 0x3f, 0x40,
+    0x54, 0x52, 0x3e, 0x43, 0x46, 0x4a, 0x48, 0x51, 0x4e, 0x4d, 0x42, 0x47,
+    0x3f, 0x51, 0x47, 0x44, 0x3f, 0x4c, 0x46, 0x47, 0x4f, 0x55, 0x4b, 0x4e,
+    0x4c, 0x51, 0x40, 0x51, 0x47, 0x4a, 0x44, 0x5c, 0x48, 0x54, 0x4b, 0x46,
+    0x49, 0x4b, 0x53, 0x59, 0x43, 0x3e, 0x45, 0x4e, 0x4f, 0x58, 0x4b, 0x64,
+    0x41, 0x4b, 0x45, 0x4a, 0x4c, 0x51, 0x47, 0x57, 0x45, 0x46, 0x43, 0x4f,
+    0x4d, 0x4d, 0x49, 0x58, 0x4b, 0x52, 0x43, 0x4b, 0x45, 0x4c, 0x50, 0x4c,
+    0x4e, 0x4b, 0x40, 0x4c, 0x44, 0x4e, 0x4c, 0x47, 0x41, 0x55, 0x45, 0x4a,
+    0x4c, 0x48, 0x46, 0x41, 0x47, 0x52, 0x44, 0x4f, 0x48, 0x49, 0x4b, 0x47,
+    0x50, 0x4f, 0x42, 0x4a, 0x44, 0x4b, 0x52, 0x43, 0x45, 0x4e, 0x46, 0x49,
+    0x45, 0x52, 0x51, 0x45, 0x44, 0x41, 0x4c, 0x46, 0x4c, 0x4b, 0x44, 0x4d,
+    0x4f, 0x48, 0x44, 0x4d, 0x56, 0x48, 0x50, 0x4f, 0x3b, 0x4e, 0x55, 0x43,
+    0x52, 0x62, 0x57, 0x2c, 0x4d, 0x5e, 0x5e, 0x50, 0x64, 0x5b, 0x6a, 0x55,
+    0x39, 0x7d, 0x4b, 0x5e, 0x43, 0x54, 0x5d, 0x5c, 0x4d, 0x5c, 0x42, 0x51,
+    0x4c, 0x3d, 0x46, 0x51, 0x4c, 0x2a, 0x3e, 0x54, 0x47, 0x48, 0x46, 0x64,
+    0x42, 0x3d, 0x47, 0x3f, 0x42, 0x45, 0x49, 0x3b, 0x59, 0x50, 0x4c, 0x46,
+    0x4d, 0x44, 0x47, 0x4d, 0x4a, 0x50, 0x41, 0x48, 0x43, 0x50, 0x3e, 0x44,
+    0x4b, 0x53, 0x48, 0x49, 0x51, 0x51, 0x4d, 0x57, 0x49, 0x4f, 0x53, 0x50,
+    0x46, 0x4f, 0x41, 0x5d, 0x47, 0x46, 0x49, 0x51, 0x45, 0x41, 0x4a, 0x56,
+    0x4f, 0x4e, 0x4d, 0x4a, 0x3e, 0x55, 0x47, 0x65, 0x48, 0x51, 0x4d, 0x4e,
+    0x46, 0x43, 0x48, 0x5b, 0x48, 0x4f, 0x4f, 0x48, 0x4b, 0x4d, 0x4e, 0x5c,
+    0x4f, 0x4c, 0x54, 0x48, 0x4a, 0x4d, 0x4e, 0x4e, 0x44, 0x48, 0x43, 0x52,
+    0x41, 0x52, 0x48, 0x4f, 0x46, 0x4f, 0x51, 0x41, 0x44, 0x45, 0x41, 0x4b,
+    0x43, 0x4e, 0x4e, 0x42, 0x48, 0x41, 0x45, 0x43, 0x44, 0x43, 0x4c, 0x4c,
+    0x51, 0x54, 0x4c, 0x32, 0x46, 0x52, 0x4e, 0x49, 0x40, 0x4d, 0x43, 0x4f,
+    0x4a, 0x4d, 0x4d, 0x49, 0x46, 0x4c, 0x41, 0x4d, 0x41, 0x3a, 0x50, 0x4c,
+    0x5a, 0x4e, 0x49, 0x53, 0x4d, 0x53, 0x53, 0x3d, 0x52, 0x64, 0x55, 0x2a,
+    0x47, 0x5d, 0x61, 0x51, 0x5b, 0x5d, 0x66, 0x52, 0x3f, 0xfd, 0x55, 0x5a,
+    0x4b, 0x54, 0x5b, 0x60, 0x49, 0x5d, 0x43, 0x57, 0x47, 0x41, 0x45, 0x5e,
+    0x4c, 0x28, 0x3e, 0x40, 0x49, 0x4e, 0x40, 0x69, 0x4a, 0x44, 0x45, 0x43,
+    0x45, 0x3d, 0x39, 0x40, 0x4c, 0x53, 0x4b, 0x3d, 0x4e, 0x43, 0x48, 0x55,
+    0x4d, 0x50, 0x4d, 0x49, 0x4f, 0x48, 0x3e, 0x46, 0x47, 0x56, 0x40, 0x48,
+    0x46, 0x53, 0x50, 0x5d, 0x43, 0x54, 0x49, 0x47, 0x49, 0x4c, 0x48, 0x5d,
+    0x49, 0x51, 0x50, 0x3d, 0x41, 0x47, 0x48, 0x64, 0x4b, 0x44, 0x49, 0x41,
+    0x54, 0x48, 0x3d, 0x6b, 0x4c, 0x5a, 0x48, 0x4e, 0x40, 0x4c, 0x52, 0x5f,
+    0x54, 0x4a, 0x3f, 0x48, 0x43, 0x43, 0x44, 0x66, 0x49, 0x47, 0x43, 0x46,
+    0x47, 0x54, 0x42, 0x54, 0x4b, 0x4e, 0x49, 0x49, 0x49, 0x4b, 0x52, 0x4f,
+    0x43, 0x46, 0x4b, 0x49, 0x54, 0x4b, 0x40, 0x48, 0x47, 0x4a, 0x46, 0x47,
+    0x44, 0x47, 0x4c, 0x37, 0x3f, 0x49, 0x45, 0x44, 0x50, 0x49, 0x44, 0x36,
+    0x4d, 0x40, 0x45, 0x49, 0x53, 0x55, 0x44, 0x42, 0x47, 0x48, 0x46, 0x40,
+    0x4f, 0x4c, 0x41, 0x42, 0x52, 0x3a, 0x43, 0x46, 0x55, 0x51, 0x4e, 0x4f,
+    0x48, 0x51, 0x55, 0x48, 0x52, 0x66, 0x4e, 0x33, 0x49, 0x5b, 0x5f, 0x4b,
+    0x5f, 0x5b, 0x66, 0x52, 0x41, 0x7c, 0x4a, 0x59, 0x47, 0x59, 0x58, 0x67,
+    0x49, 0x5e, 0x44, 0x57, 0x49, 0x4c, 0x43, 0x56, 0x41, 0x27, 0x4c, 0x44,
+    0x51, 0x44, 0x42, 0x65, 0x49, 0x44, 0x40, 0x3d, 0x4d, 0x3e, 0x4c, 0x3c,
+    0x4f, 0x4b, 0x45, 0x44, 0x4d, 0x48, 0x47, 0x54, 0x4d, 0x4e, 0x44, 0x42,
+    0x47, 0x44, 0x3d, 0x49, 0x4e, 0x50, 0x49, 0x45, 0x58, 0x4a, 0x54, 0x5c,
+    0x41, 0x49, 0x4f, 0x42, 0x44, 0x4f, 0x4a, 0x62, 0x48, 0x50, 0x48, 0x43,
+    0x51, 0x53, 0x47, 0x6c, 0x40, 0x46, 0x3d, 0x46, 0x4a, 0x50, 0x43, 0x69,
+    0x49, 0x4f, 0x4a, 0x4c, 0x49, 0x46, 0x43, 0x6a, 0x48, 0x50, 0x49, 0x48,
+    0x48, 0x51, 0x4b, 0x65, 0x42, 0x4b, 0x4d, 0x48, 0x44, 0x4e, 0x49, 0x60,
+    0x44, 0x52, 0x42, 0x42, 0x47, 0x48, 0x4b, 0x51, 0x50, 0x4b, 0x3c, 0x4d,
+    0x4c, 0x44, 0x48, 0x55, 0x51, 0x4c, 0x55, 0x4e, 0x52, 0x4c, 0x4b, 0x39,
+    0x48, 0x42, 0x49, 0x49, 0x49, 0x50, 0x49, 0x32, 0x4e, 0x4b, 0x45, 0x4f,
+    0x42, 0x4b, 0x47, 0x50, 0x48, 0x45, 0x54, 0x49, 0x4c, 0x46, 0x40, 0x46,
+    0x43, 0x3d, 0x51, 0x44, 0x53, 0x4f, 0x54, 0x55, 0x43, 0x4f, 0x5b, 0x47,
+    0x53, 0x6c, 0x57, 0x2e, 0x50, 0x55, 0x5a, 0x4d, 0x57, 0x5d, 0x70, 0x50,
+    0x3f, 0x79, 0x4a, 0x5a, 0x4c, 0x58, 0x59, 0x63, 0x45, 0x69, 0x48, 0x58,
+    0x42, 0x4b, 0x43, 0x5c, 0x46, 0x28, 0x48, 0x49, 0x4c, 0x3f, 0x45, 0x58,
+    0x45, 0x44, 0x47, 0x40, 0x4c, 0x42, 0x3e, 0x37, 0x45, 0x54, 0x48, 0x3b,
+    0x4e, 0x48, 0x43, 0x4a, 0x50, 0x4a, 0x49, 0x46, 0x4c, 0x54, 0x3f, 0x4b,
+    0x4e, 0x56, 0x48, 0x49, 0x49, 0x4c, 0x51, 0x5f, 0x4d, 0x4b, 0x43, 0x4d,
+    0x47, 0x51, 0x43, 0x59, 0x45, 0x4e, 0x4f, 0x45, 0x44, 0x54, 0x44, 0x6d,
+    0x47, 0x51, 0x43, 0x4e, 0x4c, 0x4f, 0x43, 0x6d, 0x48, 0x53, 0x4b, 0x47,
+    0x49, 0x48, 0x46, 0x6a, 0x51, 0x4c, 0x4d, 0x45, 0x4e, 0x47, 0x46, 0x62,
+    0x4a, 0x54, 0x51, 0x4c, 0x47, 0x4d, 0x4a, 0x61, 0x3d, 0x50, 0x4c, 0x4c,
+    0x45, 0x3f, 0x3e, 0x54, 0x3d, 0x53, 0x48, 0x47, 0x52, 0x4b, 0x47, 0x51,
+    0x4f, 0x45, 0x4b, 0x4a, 0x4c, 0x46, 0x44, 0x37, 0x42, 0x50, 0x49, 0x4f,
+    0x51, 0x41, 0x44, 0x38, 0x54, 0x40, 0x51, 0x52, 0x3e, 0x43, 0x44, 0x47,
+    0x49, 0x4b, 0x4b, 0x46, 0x53, 0x54, 0x55, 0x4b, 0x4a, 0x37, 0x43, 0x4a,
+    0x51, 0x47, 0x51, 0x54, 0x43, 0x46, 0x56, 0x3d, 0x54, 0x66, 0x4f, 0x30,
+    0x45, 0x52, 0x5a, 0x43, 0x5c, 0x65, 0x5d, 0x52, 0x32, 0x77, 0x53, 0x5f,
+    0x4a, 0x5a, 0x4f, 0x5e, 0x4e, 0x61, 0x4b, 0x5b, 0x4a, 0x53, 0x3e, 0x61,
+    0x47, 0x24, 0x3e, 0x48, 0x4d, 0x43, 0x40, 0x53, 0x4e, 0x41, 0x43, 0x3d,
+    0x50, 0x49, 0x41, 0x3a, 0x4e, 0x4b, 0x48, 0x49, 0x48, 0x49, 0x46, 0x50,
+    0x4f, 0x4b, 0x47, 0x4b, 0x48, 0x52, 0x3e, 0x4d, 0x4d, 0x59, 0x4c, 0x3e,
+    0x52, 0x49, 0x4f, 0x5e, 0x54, 0x59, 0x47, 0x4d, 0x40, 0x4c, 0x4b, 0x64,
+    0x42, 0x4c, 0x53, 0x46, 0x4e, 0x50, 0x46, 0x6a, 0x41, 0x59, 0x44, 0x4b,
+    0x4f, 0x44, 0x52, 0x6c, 0x54, 0x4e, 0x46, 0x48, 0x42, 0x3d, 0x44, 0x67,
+    0x44, 0x4f, 0x47, 0x54, 0x4c, 0x4f, 0x43, 0x61, 0x4c, 0x54, 0x4f, 0x43,
+    0x49, 0x40, 0x4a, 0x5f, 0x4a, 0x52, 0x47, 0x43, 0x4c, 0x43, 0x49, 0x53,
+    0x4c, 0x4b, 0x43, 0x3d, 0x4e, 0x45, 0x49, 0x50, 0x44, 0x53, 0x4f, 0x48,
+    0x4b, 0x46, 0x44, 0x3c, 0x50, 0x42, 0x43, 0x40, 0x47, 0x43, 0x42, 0x34,
+    0x47, 0x42, 0x3f, 0x4a, 0x48, 0x42, 0x48, 0x4c, 0x42, 0x4c, 0x4e, 0x47,
+    0x48, 0x47, 0x51, 0x51, 0x4d, 0x3d, 0x3e, 0x4b, 0x54, 0x4c, 0x4c, 0x59,
+    0x4f, 0x50, 0x57, 0x3c, 0x54, 0x62, 0x54, 0x35, 0x3d, 0x5a, 0x5b, 0x47,
+    0x59, 0x63, 0x66, 0x4d, 0x3c, 0x79, 0x50, 0x5f, 0x45, 0x58, 0x4e, 0x5d,
+    0x48, 0x61, 0x43, 0x54, 0x47, 0x54, 0x4d, 0x54, 0x4b, 0x25, 0x41, 0x44,
+    0x4c, 0x4a, 0x3b, 0x52, 0x47, 0x3c, 0x45, 0x3c, 0x53, 0x44, 0x44, 0x40,
+    0x50, 0x4c, 0x45, 0x3a, 0x4c, 0x51, 0x44, 0x49, 0x4d, 0x52, 0x4d, 0x4b,
+    0x45, 0x52, 0x3d, 0x50, 0x4a, 0x58, 0x4a, 0x47, 0x4d, 0x47, 0x4e, 0x52,
+    0x4f, 0x4d, 0x4f, 0x49, 0x52, 0x52, 0x4c, 0x5e, 0x47, 0x4d, 0x46, 0x4d,
+    0x4c, 0x48, 0x50, 0x70, 0x41, 0x4a, 0x48, 0x3d, 0x45, 0x48, 0x45, 0x74,
+    0x47, 0x4c, 0x43, 0x4f, 0x4a, 0x4a, 0x40, 0x68, 0x52, 0x49, 0x3e, 0x3e,
+    0x4e, 0x4b, 0x4b, 0x69, 0x42, 0x4f, 0x45, 0x47, 0x3f, 0x45, 0x46, 0x56,
+    0x45, 0x4a, 0x47, 0x44, 0x52, 0x4b, 0x53, 0x4e, 0x4e, 0x46, 0x45, 0x40,
+    0x47, 0x4b, 0x53, 0x52, 0x53, 0x51, 0x4f, 0x46, 0x42, 0x43, 0x50, 0x3e,
+    0x48, 0x4e, 0x41, 0x53, 0x4d, 0x48, 0x48, 0x33, 0x40, 0x43, 0x4b, 0x42,
+    0x52, 0x4c, 0x42, 0x4e, 0x41, 0x4e, 0x4f, 0x50, 0x43, 0x49, 0x4d, 0x47,
+    0x4a, 0x3a, 0x3f, 0x51, 0x51, 0x44, 0x4e, 0x54, 0x40, 0x55, 0x59, 0x3c,
+    0x57, 0x67, 0x4e, 0x2e, 0x4c, 0x5b, 0x5b, 0x51, 0x58, 0x63, 0x62, 0x52,
+    0x3c, 0x72, 0x51, 0x5a, 0x4e, 0x53, 0x4a, 0x5c, 0x51, 0x69, 0x42, 0x51,
+    0x48, 0x54, 0x48, 0x57, 0x3e, 0x37, 0x3f, 0x4d, 0x4d, 0x4a, 0x35, 0x57,
+    0x4e, 0x40, 0x45, 0x4a, 0x45, 0x4e, 0x49, 0x40, 0x49, 0x53, 0x51, 0x44,
+    0x4a, 0x50, 0x4b, 0x4b, 0x50, 0x4f, 0x3e, 0x44, 0x45, 0x44, 0x4c, 0x51,
+    0x47, 0x51, 0x46, 0x42, 0x48, 0x50, 0x49, 0x4d, 0x43, 0x54, 0x52, 0x4d,
+    0x4e, 0x4f, 0x3f, 0x63, 0x54, 0x57, 0x41, 0x44, 0x4e, 0x50, 0x4e, 0x66,
+    0x41, 0x53, 0x4b, 0x4d, 0x4e, 0x4f, 0x43, 0x6d, 0x4e, 0x51, 0x49, 0x4f,
+    0x49, 0x4a, 0x4a, 0x6c, 0x4b, 0x4f, 0x3d, 0x47, 0x4d, 0x51, 0x3c, 0x66,
+    0x4b, 0x56, 0x3e, 0x4c, 0x41, 0x46, 0x45, 0x68, 0x47, 0x4b, 0x4a, 0x54,
+    0x53, 0x48, 0x51, 0x59, 0x45, 0x43, 0x50, 0x45, 0x4f, 0x45, 0x42, 0x55,
+    0x48, 0x52, 0x4c, 0x46, 0x52, 0x49, 0x47, 0x3d, 0x55, 0x48, 0x52, 0x52,
+    0x40, 0x4e, 0x47, 0x31, 0x45, 0x4f, 0x42, 0x4a, 0x4e, 0x50, 0x42, 0x4a,
+    0x49, 0x57, 0x46, 0x4b, 0x45, 0x4e, 0x4d, 0x46, 0x47, 0x43, 0x50, 0x4e,
+    0x4f, 0x4c, 0x53, 0x55, 0x45, 0x51, 0x5b, 0x3a, 0x52, 0x64, 0x54, 0x2d,
+    0x42, 0x59, 0x59, 0x45, 0x59, 0x67, 0x69, 0x53, 0x3f, 0x78, 0x50, 0x60,
+    0x4c, 0x4c, 0x5b, 0x53, 0x45, 0x63, 0x49, 0x63, 0x51, 0x4c, 0x41, 0x4e,
+    0x4b, 0x37, 0x45, 0x4e, 0x48, 0x4c, 0x39, 0x55, 0x44, 0x37, 0x3c, 0x49,
+    0x44, 0x56, 0x3e, 0x40, 0x4d, 0x45, 0x4c, 0x43, 0x42, 0x41, 0x40, 0x42,
+    0x57, 0x4f, 0x43, 0x3f, 0x52, 0x53, 0x51, 0x4b, 0x4b, 0x55, 0x46, 0x40,
+    0x49, 0x45, 0x40, 0x4f, 0x47, 0x58, 0x4b, 0x53, 0x4e, 0x52, 0x54, 0x5e,
+    0x4b, 0x51, 0x50, 0x44, 0x50, 0x4b, 0x4f, 0x70, 0x49, 0x4f, 0x4c, 0x50,
+    0x45, 0x56, 0x4b, 0x6b, 0x49, 0x52, 0x4a, 0x3f, 0x44, 0x4b, 0x48, 0x72,
+    0x4c, 0x47, 0x4e, 0x43, 0x46, 0x4c, 0x4f, 0x61, 0x4a, 0x52, 0x52, 0x46,
+    0x4a, 0x4d, 0x46, 0x65, 0x48, 0x4e, 0x4d, 0x4e, 0x46, 0x4e, 0x53, 0x59,
+    0x43, 0x49, 0x43, 0x47, 0x45, 0x47, 0x53, 0x50, 0x3e, 0x4d, 0x41, 0x46,
+    0x4c, 0x4a, 0x4c, 0x35, 0x3f, 0x4f, 0x50, 0x48, 0x47, 0x4d, 0x4c, 0x32,
+    0x45, 0x53, 0x43, 0x4d, 0x4e, 0x4a, 0x3e, 0x4b, 0x55, 0x4f, 0x53, 0x4c,
+    0x4a, 0x4d, 0x48, 0x53, 0x4f, 0x3a, 0x47, 0x4b, 0x4e, 0x4e, 0x51, 0x59,
+    0x41, 0x50, 0x57, 0x38, 0x5d, 0x63, 0x59, 0x2b, 0x45, 0x53, 0x5a, 0x4e,
+    0x5c, 0x60, 0x5e, 0x4c, 0x41, 0x6f, 0x53, 0x5c, 0x48, 0x53, 0x56, 0x54,
+    0x4b, 0x62, 0x46, 0x63, 0x47, 0x4e, 0x40, 0x51, 0x43, 0x36, 0x44, 0x42,
+    0x46, 0x51, 0x41, 0x54, 0x4e, 0x36, 0x40, 0x4b, 0x55, 0x49, 0x40, 0x3f,
+    0x4b, 0x42, 0x4a, 0x4a, 0x48, 0x47, 0x40, 0x43, 0x4d, 0x4f, 0x55, 0x3f,
+    0x53, 0x42, 0x4d, 0x56, 0x49, 0x51, 0x4f, 0x41, 0x3b, 0x48, 0x43, 0x4e,
+    0x4b, 0x5c, 0x4f, 0x45, 0x4a, 0x4c, 0x46, 0x66, 0x43, 0x45, 0x46, 0x48,
+    0x4f, 0x4e, 0x40, 0x71, 0x4b, 0x4e, 0x3e, 0x42, 0x4d, 0x52, 0x42, 0x71,
+    0x4c, 0x54, 0x4f, 0x3f, 0x4c, 0x43, 0x4a, 0x73, 0x48, 0x48, 0x4c, 0x4b,
+    0x4c, 0x4d, 0x40, 0x72, 0x3e, 0x51, 0x49, 0x48, 0x52, 0x53, 0x45, 0x65,
+    0x52, 0x4e, 0x4f, 0x44, 0x4c, 0x43, 0x4a, 0x5e, 0x3e, 0x56, 0x46, 0x55,
+    0x55, 0x43, 0x49, 0x51, 0x4f, 0x52, 0x49, 0x4d, 0x46, 0x47, 0x49, 0x3e,
+    0x51, 0x49, 0x41, 0x53, 0x42, 0x47, 0x46, 0x3b, 0x4d, 0x4e, 0x48, 0x44,
+    0x42, 0x48, 0x4c, 0x47, 0x42, 0x4e, 0x4a, 0x3e, 0x44, 0x54, 0x4a, 0x4d,
+    0x49, 0x41, 0x41, 0x53, 0x52, 0x4c, 0x4c, 0x56, 0x49, 0x4a, 0x5a, 0x3f,
+    0x5b, 0x5c, 0x59, 0x2f, 0x49, 0x52, 0x5a, 0x4e, 0x5a, 0x61, 0x67, 0x4c,
+    0x41, 0x6f, 0x5a, 0x5a, 0x40, 0x5a, 0x54, 0x4e, 0x49, 0x66, 0x45, 0x5a,
+    0x4a, 0x45, 0x44, 0x4b, 0x44, 0x36, 0x41, 0x4c, 0x45, 0x44, 0x3d, 0x51,
+    0x3f, 0x35, 0x3c, 0x46, 0x53, 0x5c, 0x3f, 0x3e, 0x50, 0x43, 0x46, 0x4b,
+    0x40, 0x54, 0x41, 0x47, 0x4b, 0x51, 0x41, 0x46, 0x4a, 0x4d, 0x51, 0x52,
+    0x43, 0x58, 0x45, 0x46, 0x4e, 0x46, 0x4a, 0x4b, 0x44, 0x54, 0x4c, 0x4c,
+    0x43, 0x59, 0x48, 0x61, 0x4e, 0x4f, 0x4d, 0x4d, 0x4a, 0x52, 0x4c, 0x6e,
+    0x49, 0x57, 0x48, 0x4d, 0x46, 0x46, 0x4d, 0x72, 0x4a, 0x4e, 0x47, 0x44,
+    0x49, 0x4f, 0x48, 0x73, 0x42, 0x40, 0x4d, 0x44, 0x4d, 0x57, 0x3e, 0x69,
+    0x50, 0x52, 0x4c, 0x55, 0x46, 0x4c, 0x44, 0x5f, 0x4b, 0x4d, 0x55, 0x4c,
+    0x48, 0x49, 0x4a, 0x5e, 0x47, 0x4b, 0x45, 0x53, 0x55, 0x53, 0x4d, 0x53,
+    0x47, 0x5c, 0x45, 0x4e, 0x4e, 0x52, 0x4c, 0x39, 0x4b, 0x4c, 0x49, 0x46,
+    0x4a, 0x4e, 0x4b, 0x33, 0x46, 0x47, 0x52, 0x41, 0x49, 0x4b, 0x4c, 0x48,
+    0x51, 0x53, 0x44, 0x4c, 0x4a, 0x45, 0x46, 0x49, 0x49, 0x4b, 0x50, 0x47,
+    0x4d, 0x4b, 0x4c, 0x4f, 0x44, 0x45, 0x58, 0x3c, 0x56, 0x5a, 0x56, 0x23,
+    0x4f, 0x4d, 0x5c, 0x4e, 0x59, 0x5a, 0x65, 0x43, 0x45, 0x66, 0x54, 0x5f,
+    0x45, 0x5e, 0x54, 0x4f, 0x48, 0x5f, 0x44, 0x59, 0x48, 0x46, 0x47, 0x49,
+    0x4d, 0x3c, 0x49, 0x54, 0x3e, 0x48, 0x43, 0x5b, 0x4a, 0x35, 0x41, 0x43,
+    0x4b, 0x55, 0x43, 0x38, 0x46, 0x42, 0x4a, 0x4e, 0x54, 0x4b, 0x4d, 0x46,
+    0x43, 0x4e, 0x44, 0x47, 0x56, 0x4c, 0x51, 0x57, 0x41, 0x4d, 0x43, 0x41,
+    0x51, 0x47, 0x41, 0x51, 0x51, 0x4f, 0x46, 0x50, 0x52, 0x4e, 0x4d, 0x60,
+    0x41, 0x49, 0x46, 0x50, 0x48, 0x56, 0x42, 0x6d, 0x40, 0x45, 0x44, 0x55,
+    0x40, 0x4e, 0x40, 0x7c, 0x47, 0x5a, 0x44, 0x44, 0x45, 0x56, 0x55, 0x71,
+    0x47, 0x4b, 0x4b, 0x45, 0x4f, 0x54, 0x4c, 0x73, 0x48, 0x55, 0x44, 0x4d,
+    0x4a, 0x47, 0x49, 0x5e, 0x4d, 0x52, 0x4e, 0x4c, 0x48, 0x52, 0x48, 0x58,
+    0x4c, 0x5a, 0x49, 0x4b, 0x53, 0x46, 0x4d, 0x4b, 0x48, 0x53, 0x41, 0x49,
+    0x4a, 0x56, 0x51, 0x3a, 0x4c, 0x4e, 0x4f, 0x51, 0x4c, 0x59, 0x47, 0x45,
+    0x4f, 0x50, 0x4a, 0x4f, 0x4d, 0x3f, 0x44, 0x4e, 0x42, 0x4a, 0x4a, 0x43,
+    0x46, 0x4e, 0x4c, 0x4f, 0x47, 0x47, 0x4c, 0x4b, 0x52, 0x50, 0x50, 0x4b,
+    0x42, 0x45, 0x54, 0x44, 0x54, 0x59, 0x4c, 0x2b, 0x4d, 0x4c, 0x55, 0x4e,
+    0x5c, 0x5b, 0x5a, 0x42, 0x47, 0x5e, 0x56, 0x59, 0x47, 0x65, 0x55, 0x4c,
+    0x4c, 0x59, 0x42, 0x5a, 0x4e, 0x46, 0x4e, 0x4b, 0x53, 0x46, 0x49, 0x56,
+    0x48, 0x58, 0x4b, 0x4f, 0x45, 0x38, 0x40, 0x44, 0x49, 0x51, 0x4a, 0x3b,
+    0x53, 0x40, 0x40, 0x48, 0x51, 0x49, 0x44, 0x46, 0x52, 0x4b, 0x4e, 0x45,
+    0x48, 0x5a, 0x4e, 0x57, 0x44, 0x53, 0x49, 0x40, 0x4c, 0x47, 0x41, 0x4f,
+    0x49, 0x55, 0x46, 0x50, 0x57, 0x5b, 0x48, 0x66, 0x50, 0x49, 0x51, 0x55,
+    0x55, 0x4f, 0x47, 0x72, 0x49, 0x4f, 0x41, 0x4c, 0x49, 0x42, 0x48, 0x75,
+    0x4a, 0x55, 0x45, 0x4a, 0x41, 0x51, 0x41, 0x70, 0x47, 0x49, 0x42, 0x52,
+    0x4f, 0x47, 0x46, 0x63, 0x4f, 0x53, 0x46, 0x4f, 0x49, 0x53, 0x52, 0x63,
+    0x4c, 0x59, 0x46, 0x41, 0x49, 0x51, 0x3e, 0x53, 0x45, 0x52, 0x51, 0x40,
+    0x4f, 0x4c, 0x41, 0x4c, 0x47, 0x4a, 0x46, 0x47, 0x53, 0x47, 0x48, 0x39,
+    0x53, 0x4b, 0x46, 0x4b, 0x50, 0x4c, 0x41, 0x40, 0x48, 0x4e, 0x49, 0x4e,
+    0x44, 0x53, 0x44, 0x4e, 0x53, 0x49, 0x49, 0x4e, 0x46, 0x3f, 0x45, 0x42,
+    0x4c, 0x47, 0x42, 0x4e, 0x49, 0x4a, 0x49, 0x44, 0x51, 0x48, 0x57, 0x4c,
+    0x4d, 0x60, 0x4e, 0x2d, 0x46, 0x4d, 0x58, 0x53, 0x5c, 0x56, 0x5e, 0x41,
+    0x3e, 0x66, 0x53, 0x5b, 0x49, 0x59, 0x5a, 0x55, 0x4e, 0x59, 0x46, 0x4a,
+    0x44, 0x42, 0x45, 0x3d, 0x4d, 0x45, 0x44, 0x4f, 0x4d, 0x53, 0x42, 0x5a,
+    0x43, 0x3c, 0x48, 0x4f, 0x44, 0x59, 0x3f, 0x33, 0x45, 0x48, 0x43, 0x45,
+    0x4d, 0x56, 0x48, 0x44, 0x3e, 0x48, 0x46, 0x4d, 0x44, 0x53, 0x46, 0x4e,
+    0x45, 0x52, 0x40, 0x46, 0x4c, 0x50, 0x4e, 0x4b, 0x4d, 0x46, 0x48, 0x46,
+    0x50, 0x52, 0x4e, 0x57, 0x3f, 0x4a, 0x49, 0x50, 0x53, 0x4e, 0x41, 0x66,
+    0x49, 0x4f, 0x40, 0x4b, 0x50, 0x4c, 0x4a, 0x70, 0x42, 0x51, 0x41, 0x4c,
+    0x50, 0x4f, 0x46, 0x60, 0x45, 0x47, 0x54, 0x4c, 0x49, 0x59, 0x52, 0x61,
+    0x4a, 0x53, 0x52, 0x4f, 0x4b, 0x4c, 0x46, 0x56, 0x4b, 0x54, 0x4f, 0x47,
+    0x53, 0x49, 0x4f, 0x50, 0x4a, 0x54, 0x45, 0x4e, 0x47, 0x48, 0x47, 0x42,
+    0x49, 0x44, 0x46, 0x46, 0x55, 0x4c, 0x4f, 0x36, 0x4c, 0x49, 0x3f, 0x4e,
+    0x45, 0x4b, 0x4b, 0x36, 0x48, 0x4f, 0x4b, 0x50, 0x45, 0x47, 0x49, 0x3f,
+    0x50, 0x4b, 0x52, 0x48, 0x4c, 0x41, 0x49, 0x43, 0x4e, 0x3c, 0x43, 0x45,
+    0x3e, 0x45, 0x48, 0x44, 0x4d, 0x48, 0x56, 0x47, 0x4b, 0x54, 0x52, 0x2b,
+    0x4d, 0x4e, 0x57, 0x4f, 0x57, 0x4f, 0x56, 0x43, 0x48, 0x5f, 0x4c, 0x51,
+    0x4d, 0x58, 0x4f, 0x4e, 0x50, 0x50, 0x48, 0x4a, 0x4d, 0x3f, 0x47, 0x40,
+    0x4b, 0x4a, 0x4e, 0x4b, 0x4a, 0x58, 0x42, 0x49, 0x3f, 0x42, 0x3d, 0x4d,
+    0x46, 0x53, 0x45, 0x3e, 0x4e, 0x49, 0x4f, 0x4a, 0x47, 0x46, 0x40, 0x3e,
+    0x4c, 0x4d, 0x4d, 0x45, 0x4a, 0x56, 0x40, 0x4a, 0x47, 0x57, 0x4f, 0x48,
+    0x4f, 0x48, 0x47, 0x49, 0x4e, 0x52, 0x50, 0x48, 0x42, 0x52, 0x43, 0x5a,
+    0x49, 0x42, 0x4f, 0x4f, 0x51, 0x51, 0x50, 0x5c, 0x4b, 0x43, 0x4b, 0x48,
+    0x50, 0x51, 0x4b, 0x6d, 0x53, 0x4e, 0x44, 0x4c, 0x4c, 0x51, 0x46, 0x5b,
+    0x44, 0x48, 0x4d, 0x4c, 0x46, 0x4f, 0x54, 0x54, 0x4e, 0x54, 0x42, 0x4e,
+    0x4c, 0x49, 0x49, 0x58, 0x49, 0x53, 0x53, 0x4a, 0x4e, 0x4b, 0x47, 0x53,
+    0x43, 0x55, 0x46, 0x51, 0x3d, 0x3d, 0x4c, 0x47, 0x4e, 0x51, 0x47, 0x48,
+    0x4b, 0x4c, 0x42, 0x3b, 0x43, 0x4f, 0x44, 0x4d, 0x54, 0x4b, 0x4a, 0x47,
+    0x4c, 0x42, 0x4b, 0x43, 0x41, 0x4e, 0x4d, 0x50, 0x45, 0x46, 0x41, 0x4a,
+    0x49, 0x49, 0x54, 0x47, 0x4c, 0x4b, 0x50, 0x4e, 0x3f, 0x43, 0x40, 0x41,
+    0x44, 0x54, 0x51, 0x47, 0x4c, 0x4b, 0x4f, 0x34, 0x4d, 0x4c, 0x4f, 0x49,
+    0x56, 0x4e, 0x4b, 0x3e, 0x48, 0x53, 0x4e, 0x56, 0x49, 0x4e, 0x4c, 0x40,
+    0x55, 0x4a, 0x46, 0x4f, 0x48, 0x4a, 0x55, 0x41, 0x55, 0x3d, 0x47, 0x51,
+    0x50, 0x51, 0x45, 0x51, 0x4b, 0x4e, 0x4a, 0x4f, 0x4b, 0x45, 0x42, 0x3c,
+    0x4e, 0x46, 0x47, 0x49, 0x4a, 0x4c, 0x48, 0x41, 0x4f, 0x4a, 0x44, 0x45,
+    0x4e, 0x4e, 0x43, 0x41, 0x4c, 0x47, 0x48, 0x49, 0x4c, 0x48, 0x4f, 0x4a,
+    0x4f, 0x4a, 0x4b, 0x45, 0x42, 0x40, 0x52, 0x55, 0x4f, 0x49, 0x44, 0x54,
+    0x49, 0x48, 0x51, 0x4d, 0x44, 0x4a, 0x4d, 0x49, 0x4e, 0x4e, 0x51, 0x5d,
+    0x42, 0x4d, 0x49, 0x3f, 0x48, 0x58, 0x40, 0x5e, 0x48, 0x4f, 0x49, 0x53,
+    0x45, 0x47, 0x4f, 0x53, 0x4d, 0x4f, 0x4d, 0x4d, 0x46, 0x55, 0x43, 0x51,
+    0x4f, 0x51, 0x4a, 0x4e, 0x49, 0x42, 0x49, 0x50, 0x47, 0x4d, 0x42, 0x47,
+    0x46, 0x50, 0x55, 0x47, 0x4d, 0x47, 0x3e, 0x51, 0x4d, 0x43, 0x44, 0x39,
+    0x4e, 0x4b, 0x41, 0x48, 0x52, 0x53, 0x4d, 0x39, 0x4d, 0x51, 0x4c, 0x46,
+    0x4e, 0x47, 0x49, 0x41, 0x45, 0x4a, 0x4a, 0x45, 0x50, 0x4a, 0x40, 0x48,
+    0x43, 0x47, 0x44, 0x50, 0x4d, 0x47, 0x4a, 0x47, 0x45, 0x57, 0x41, 0x34,
+    0x51, 0x40, 0x45, 0x44, 0x3c, 0x47, 0x46, 0x47, 0x44, 0x48, 0x42, 0x40,
+    0x37, 0x53, 0x4a, 0x43, 0x49, 0x4b, 0x43, 0x44, 0x4f, 0x4f, 0x48, 0x48,
+    0x53, 0x49, 0x4b, 0x48, 0x4e, 0x4c, 0x42, 0x45, 0x4c, 0x4a, 0x4a, 0x46,
+    0x47, 0x57, 0x3e, 0x46, 0x46, 0x45, 0x4a, 0x43, 0x46, 0x49, 0x43, 0x52,
+    0x3e, 0x48, 0x4a, 0x4b, 0x47, 0x47, 0x48, 0x4a, 0x4b, 0x4b, 0x4e, 0x44,
+    0x42, 0x44, 0x50, 0x41, 0x49, 0x49, 0x4d, 0x4b, 0x44, 0x46, 0x4a, 0x52,
+    0x4d, 0x47, 0x49, 0x4b, 0x4d, 0x49, 0x41, 0x48, 0x4b, 0x3f, 0x45, 0x4f,
+    0x51, 0x41, 0x55, 0x42, 0x49, 0x4b, 0x4b, 0x51, 0x4f, 0x4f, 0x42, 0x4e,
+    0x4e, 0x4a, 0x52, 0x41, 0x4f, 0x42, 0x48, 0x3d, 0x4a, 0x44, 0x50, 0x4b,
+    0x49, 0x45, 0x51, 0x46, 0x51, 0x44, 0x4d, 0x47, 0x4a, 0x4a, 0x4d, 0x49,
+    0x4d, 0x48, 0x4d, 0x4f, 0x4d, 0x44, 0x48, 0x4e, 0x4a, 0x4b, 0x40, 0x4f,
+    0x47, 0x3a, 0x41, 0x47, 0x4a, 0x4a, 0x4a, 0x48, 0x42, 0x41, 0x4d, 0x56,
+    0x3f, 0x52, 0x4d, 0x4c, 0x44, 0x48, 0x47, 0x4e, 0x51, 0x4c, 0x49, 0x47,
+    0x44, 0x4c, 0x4b, 0x47, 0x48, 0x46, 0x47, 0x4f, 0x43, 0x41, 0x3e, 0x47,
+    0x53, 0x4a, 0x46, 0x42, 0x46, 0x61, 0x43, 0x30, 0x4e, 0x52, 0x43, 0x45,
+    0x32, 0x4a, 0x45, 0x48, 0x51, 0x3e, 0x44, 0x3b, 0x3a, 0x63, 0x4c, 0x46,
+    0x4c, 0x49, 0x3d, 0x41, 0x52, 0x53, 0x43, 0x43, 0x45, 0x3d, 0x48, 0x40,
+    0x4b, 0x4a, 0x49, 0x48, 0x4d, 0x49, 0x4b, 0x4c, 0x3f, 0x4e, 0x4b, 0x47,
+    0x45, 0x4d, 0x3f, 0x4d, 0x43, 0x50, 0x48, 0x4b, 0x54, 0x3e, 0x44, 0x4e,
+    0x3e, 0x4c, 0x43, 0x4b, 0x4c, 0x4b, 0x3e, 0x49, 0x50, 0x52, 0x4a, 0x4a,
+    0x50, 0x50, 0x43, 0x4e, 0x49, 0x48, 0x51, 0x50, 0x47, 0x3d, 0x45, 0x4b,
+    0x47, 0x46, 0x4d, 0x4c, 0x45, 0x4d, 0x4a, 0x4d, 0x42, 0x4d, 0x47, 0x4f,
+    0x40, 0x43, 0x46, 0x51, 0x47, 0x4b, 0x43, 0x49, 0x49, 0x50, 0x4b, 0x4b,
+    0x46, 0x4a, 0x4c, 0x48, 0x49, 0x47, 0x4b, 0x56, 0x55, 0x4f, 0x49, 0x4f,
+    0x4f, 0x4e, 0x4b, 0x49, 0x4a, 0x4a, 0x49, 0x47, 0x44, 0x4b, 0x47, 0x50,
+    0x46, 0x4c, 0x46, 0x4c, 0x4b, 0x4e, 0x49, 0x57, 0x4d, 0x3e, 0x46, 0x47,
+    0x50, 0x45, 0x4f, 0x52, 0x3e, 0x4d, 0x49, 0x4a, 0x40, 0x49, 0x4f, 0x5c,
+    0x3e, 0x4a, 0x47, 0x45, 0x47, 0x41, 0x44, 0x3f, 0x4b, 0x4a, 0x52, 0x43,
+    0x41, 0x43, 0x43, 0x47, 0x55, 0x49, 0x42, 0x4c, 0x58, 0x4b, 0x42, 0x48,
+    0x4b, 0x5a, 0x36, 0x33, 0x53, 0x57, 0x4d, 0x4a, 0x37, 0x4c, 0x3e, 0x48,
+    0x43, 0x46, 0x39, 0x3c, 0x34, 0x65, 0x47, 0x3d, 0x47, 0x42, 0x3c, 0x3e,
+    0x45, 0x5b, 0x44, 0x3e, 0x45, 0x43, 0x46, 0x43, 0x59, 0x4e, 0x48, 0x46,
+    0x43, 0x3f, 0x46, 0x47, 0x4e, 0x53, 0x50, 0x4b, 0x4a, 0x3f, 0x4a, 0x54,
+    0x4c, 0x4a, 0x43, 0x50, 0x4c, 0x42, 0x4d, 0x55, 0x4d, 0x51, 0x51, 0x46,
+    0x49, 0x41, 0x50, 0x44, 0x4a, 0x4b, 0x4b, 0x43, 0x4b, 0x4e, 0x47, 0x4b,
+    0x3e, 0x4e, 0x44, 0x4d, 0x49, 0x41, 0x49, 0x44, 0x50, 0x4d, 0x45, 0x4e,
+    0x4b, 0x50, 0x45, 0x4c, 0x46, 0x4a, 0x46, 0x42, 0x50, 0x45, 0x48, 0x53,
+    0x4d, 0x44, 0x42, 0x50, 0x4c, 0x49, 0x45, 0x55, 0x4d, 0x42, 0x43, 0x41,
+    0x4c, 0x41, 0x4e, 0x4d, 0x42, 0x4e, 0x3f, 0x44, 0x4d, 0x4c, 0x4b, 0x4a,
+    0x47, 0x47, 0x4e, 0x54, 0x43, 0x40, 0x41, 0x55, 0x49, 0x49, 0x4e, 0x49,
+    0x52, 0x4e, 0x46, 0x58, 0x4b, 0x3d, 0x4a, 0x44, 0x4e, 0x47, 0x53, 0x58,
+    0x47, 0x42, 0x52, 0x46, 0x49, 0x4b, 0x47, 0x5a, 0x4c, 0x46, 0x46, 0x49,
+    0x4b, 0x4d, 0x3d, 0x48, 0x40, 0x54, 0x48, 0x4c, 0x4c, 0x44, 0x4c, 0x46,
+    0x47, 0x4b, 0x4d, 0x44, 0x5a, 0x4a, 0x3e, 0x46, 0x48, 0x53, 0x39, 0x30,
+    0x51, 0x60, 0x4d, 0x47, 0x35, 0x4f, 0x45, 0x45, 0x4a, 0x4b, 0x42, 0x3f,
+    0x38, 0x6c, 0x3d, 0x40, 0x44, 0x48, 0x3a, 0x3b, 0x46, 0x5e, 0x45, 0x3b,
+    0x47, 0x47, 0x45, 0x42, 0x53, 0x55, 0x44, 0x45, 0x46, 0x43, 0x48, 0x48,
+    0x52, 0x5d, 0x3e, 0x41, 0x53, 0x42, 0x48, 0x55, 0x49, 0x4d, 0x4a, 0x46,
+    0x52, 0x46, 0x51, 0x48, 0x44, 0x46, 0x48, 0x41, 0x49, 0x49, 0x49, 0x49,
+    0x41, 0x4d, 0x40, 0x4f, 0x45, 0x46, 0x45, 0x3f, 0x53, 0x40, 0x46, 0x43,
+    0x47, 0x4d, 0x50, 0x4c, 0x55, 0x48, 0x45, 0x47, 0x4f, 0x46, 0x42, 0x4d,
+    0x41, 0x48, 0x46, 0x4e, 0x42, 0x48, 0x48, 0x45, 0x41, 0x45, 0x48, 0x4a,
+    0x40, 0x49, 0x43, 0x4b, 0x48, 0x4a, 0x4c, 0x45, 0x4b, 0x48, 0x48, 0x4f,
+    0x40, 0x4b, 0x4a, 0x44, 0x50, 0x4a, 0x43, 0x50, 0x4c, 0x44, 0x46, 0x4c,
+    0x42, 0x44, 0x4e, 0x55, 0x47, 0x49, 0x48, 0x47, 0x52, 0x4e, 0x44, 0x59,
+    0x4e, 0x44, 0x4a, 0x48, 0x49, 0x4a, 0x42, 0x4e, 0x3e, 0x39, 0x51, 0x45,
+    0x4d, 0x49, 0x4f, 0x54, 0x51, 0x4b, 0x50, 0x44, 0x53, 0x4f, 0x4d, 0x48,
+    0x42, 0x45, 0x4e, 0x40, 0x4a, 0x48, 0x43, 0x48, 0x52, 0x54, 0x4d, 0x49,
+    0x5f, 0x53, 0x46, 0x4e, 0x3f, 0x5a, 0x36, 0x31, 0x52, 0x60, 0x4b, 0x4a,
+    0x32, 0x51, 0x40, 0x44, 0x46, 0x52, 0x44, 0x41, 0x3a, 0x6e, 0x41, 0x3e,
+    0x47, 0x3e, 0x3a, 0x2a, 0x44, 0x5a, 0x40, 0x3c, 0x4d, 0x48, 0x46, 0x3b,
+    0x5e, 0x58, 0x4d, 0x47, 0x51, 0x3a, 0x4b, 0x48, 0x5b, 0x5a, 0x54, 0x43,
+    0x50, 0x4c, 0x54, 0x54, 0x49, 0x47, 0x4f, 0x48, 0x50, 0x40, 0x4f, 0x4a,
+    0x42, 0x42, 0x3c, 0x41, 0x43, 0x4e, 0x53, 0x49, 0x4b, 0x4d, 0x49, 0x41,
+    0x4c, 0x3e, 0x40, 0x49, 0x40, 0x44, 0x49, 0x4f, 0x50, 0x4a, 0x42, 0x3a,
+    0x49, 0x4b, 0x47, 0x50, 0x49, 0x41, 0x52, 0x46, 0x3d, 0x44, 0x46, 0x43,
+    0x4b, 0x4b, 0x4d, 0x4b, 0x4e, 0x40, 0x45, 0x43, 0x48, 0x44, 0x55, 0x51,
+    0x4a, 0x46, 0x4e, 0x40, 0x53, 0x4a, 0x45, 0x41, 0x48, 0x48, 0x45, 0x4e,
+    0x4a, 0x48, 0x40, 0x4c, 0x54, 0x44, 0x42, 0x4d, 0x49, 0x43, 0x45, 0x4c,
+    0x43, 0x4f, 0x46, 0x3f, 0x46, 0x4f, 0x4b, 0x59, 0x46, 0x49, 0x54, 0x47,
+    0x49, 0x46, 0x45, 0x53, 0x4a, 0x49, 0x54, 0x45, 0x41, 0x45, 0x4c, 0x5e,
+    0x50, 0x3d, 0x4d, 0x49, 0x55, 0x4b, 0x49, 0x47, 0x4c, 0x4f, 0x43, 0x3d,
+    0x41, 0x4b, 0x43, 0x46, 0x4f, 0x4a, 0x4c, 0x54, 0x5e, 0x4e, 0x40, 0x4d,
+    0x3d, 0x59, 0x40, 0x28, 0x54, 0x5f, 0x4d, 0x4b, 0x36, 0x51, 0x3a, 0x47,
+    0x4a, 0x55, 0x42, 0x43, 0x3b, 0x72, 0x3b, 0x3d, 0x51, 0x42, 0x3f, 0x2d,
+    0x4b, 0x5a, 0x48, 0x44, 0x49, 0x49, 0x3d, 0x39, 0x56, 0x55, 0x46, 0x46,
+    0x4b, 0x43, 0x40, 0x4a, 0x52, 0x56, 0x4d, 0x45, 0x4b, 0x48, 0x40, 0x5a,
+    0x4e, 0x3a, 0x53, 0x48, 0x4c, 0x44, 0x49, 0x4e, 0x42, 0x47, 0x46, 0x40,
+    0x51, 0x42, 0x50, 0x4b, 0x43, 0x53, 0x44, 0x44, 0x46, 0x4c, 0x4c, 0x3c,
+    0x42, 0x45, 0x42, 0x45, 0x44, 0x4b, 0x52, 0x3d, 0x47, 0x4b, 0x4c, 0x4e,
+    0x52, 0x4a, 0x4e, 0x41, 0x3f, 0x46, 0x43, 0x54, 0x44, 0x53, 0x4e, 0x48,
+    0x40, 0x41, 0x4f, 0x45, 0x43, 0x3c, 0x52, 0x49, 0x40, 0x44, 0x4a, 0x3f,
+    0x4d, 0x4c, 0x4f, 0x47, 0x44, 0x47, 0x55, 0x47, 0x50, 0x4d, 0x4a, 0x4c,
+    0x50, 0x48, 0x47, 0x55, 0x4b, 0x4a, 0x52, 0x49, 0x3d, 0x3f, 0x4f, 0x51,
+    0x48, 0x4e, 0x42, 0x4e, 0x42, 0x48, 0x4e, 0x49, 0x4a, 0x50, 0x45, 0x54,
+    0x41, 0x43, 0x45, 0x4d, 0x48, 0x48, 0x48, 0x51, 0x53, 0x3e, 0x55, 0x44,
+    0x52, 0x56, 0x44, 0x4d, 0x4e, 0x48, 0x4b, 0x43, 0x48, 0x53, 0x48, 0x44,
+    0x49, 0x45, 0x4e, 0x50, 0x5d, 0x4a, 0x45, 0x4c, 0x45, 0x55, 0x43, 0x2e,
+    0x59, 0x60, 0x4e, 0x4d, 0x32, 0x53, 0x3e, 0x3f, 0x40, 0x63, 0x41, 0x48,
+    0x38, 0x73, 0x38, 0x46, 0x50, 0x3e, 0x3c, 0x23, 0x48, 0x61, 0x45, 0x3c,
+    0x41, 0x41, 0x36, 0x3b, 0x58, 0x56, 0x4a, 0x40, 0x4f, 0x44, 0x45, 0x4c,
+    0x5a, 0x56, 0x47, 0x3f, 0x4d, 0x4b, 0x46, 0x5d, 0x52, 0x47, 0x45, 0x4c,
+    0x4a, 0x52, 0x4f, 0x4f, 0x4f, 0x43, 0x4f, 0x47, 0x43, 0x46, 0x3c, 0x4c,
+    0x46, 0x55, 0x40, 0x53, 0x43, 0x3e, 0x42, 0x35, 0x51, 0x41, 0x42, 0x3f,
+    0x45, 0x3d, 0x41, 0x31, 0x4e, 0x47, 0x48, 0x42, 0x41, 0x45, 0x43, 0x38,
+    0x42, 0x40, 0x4a, 0x47, 0x4e, 0x43, 0x40, 0x43, 0x48, 0x49, 0x45, 0x4f,
+    0x44, 0x42, 0x4d, 0x42, 0x42, 0x3f, 0x46, 0x52, 0x3c, 0x3c, 0x47, 0x43,
+    0x46, 0x47, 0x45, 0x40, 0x4c, 0x44, 0x43, 0x4a, 0x4b, 0x4d, 0x4e, 0x46,
+    0x51, 0x45, 0x47, 0x4b, 0x45, 0x50, 0x40, 0x42, 0x4c, 0x4c, 0x4c, 0x4f,
+    0x44, 0x3c, 0x49, 0x3c, 0x3f, 0x45, 0x3f, 0x5c, 0x42, 0x3e, 0x4b, 0x4e,
+    0x50, 0x45, 0x42, 0x5c, 0x4c, 0x48, 0x50, 0x52, 0x50, 0x47, 0x4b, 0x44,
+    0x3d, 0x50, 0x55, 0x4c, 0x48, 0x3f, 0x4b, 0x44, 0x4a, 0x51, 0x42, 0x4c,
+    0x60, 0x51, 0x41, 0x4b, 0x46, 0x5c, 0x42, 0x2c, 0x55, 0x61, 0x50, 0x52,
+    0x37, 0x5a, 0x3f, 0x43, 0x43, 0x58, 0x3a, 0x4d, 0x3e, 0x72, 0x35, 0x3f,
+    0x58, 0x41, 0x40, 0x1f, 0x55, 0x63, 0x3f, 0x49, 0x41, 0x3e, 0x35, 0x41,
+    0x65, 0x54, 0x42, 0x45, 0x45, 0x3c, 0x44, 0x45, 0x59, 0x5a, 0x4d, 0x41,
+    0x51, 0x46, 0x49, 0x59, 0x4c, 0x41, 0x42, 0x44, 0x4a, 0x45, 0x3f, 0x4a,
+    0x4a, 0x44, 0x48, 0x48, 0x52, 0x40, 0x4a, 0x4a, 0x4d, 0x54, 0x44, 0x48,
+    0x54, 0x46, 0x49, 0x3b, 0x42, 0x4a, 0x4e, 0x46, 0x4a, 0x45, 0x4f, 0x30,
+    0x46, 0x41, 0x47, 0x46, 0x4b, 0x47, 0x46, 0x38, 0x4c, 0x3a, 0x4b, 0x46,
+    0x52, 0x48, 0x4f, 0x3e, 0x48, 0x4a, 0x48, 0x4b, 0x44, 0x45, 0x4a, 0x46,
+    0x3f, 0x4f, 0x40, 0x44, 0x43, 0x43, 0x4b, 0x39, 0x46, 0x43, 0x49, 0x49,
+    0x49, 0x4a, 0x44, 0x48, 0x4c, 0x41, 0x4d, 0x52, 0x4c, 0x4a, 0x46, 0x3d,
+    0x41, 0x4b, 0x41, 0x48, 0x45, 0x3b, 0x51, 0x54, 0x4a, 0x39, 0x4d, 0x41,
+    0x54, 0x46, 0x4c, 0x53, 0x48, 0x3e, 0x4a, 0x3d, 0x41, 0x52, 0x54, 0x63,
+    0x44, 0x4d, 0x4a, 0x43, 0x52, 0x4b, 0x52, 0x52, 0x4e, 0x41, 0x48, 0x42,
+    0x48, 0x4d, 0x49, 0x45, 0x51, 0x48, 0x3e, 0x47, 0x5a, 0x52, 0x4a, 0x4e,
+    0x3e, 0x59, 0x3c, 0x2e, 0x5c, 0x5b, 0x4c, 0x56, 0x30, 0x59, 0x3a, 0x48,
+    0x3d, 0x5c, 0x44, 0x49, 0x40, 0x7c, 0x3a, 0x48, 0x54, 0x40, 0x41, 0x28,
+    0x4d, 0x64, 0x46, 0x47, 0x49, 0x40, 0x30, 0x3a, 0x5f, 0x5b, 0x42, 0x37,
+    0x49, 0x45, 0x40, 0x43, 0x5b, 0x54, 0x48, 0x4d, 0x4a, 0x47, 0x51, 0x58,
+    0x4b, 0x3c, 0x4d, 0x46, 0x4b, 0x52, 0x4c, 0x58, 0x53, 0x46, 0x42, 0x45,
+    0x4c, 0x4a, 0x4d, 0x4e, 0x52, 0x4d, 0x46, 0x44, 0x46, 0x3f, 0x46, 0x34,
+    0x4f, 0x42, 0x44, 0x46, 0x44, 0x50, 0x47, 0x30, 0x44, 0x3c, 0x42, 0x46,
+    0x4f, 0x4a, 0x52, 0x30, 0x55, 0x4f, 0x45, 0x4a, 0x48, 0x4c, 0x4e, 0x35,
+    0x4e, 0x3c, 0x45, 0x4a, 0x45, 0x4a, 0x44, 0x3c, 0x4e, 0x4a, 0x51, 0x44,
+    0x49, 0x40, 0x4a, 0x40, 0x41, 0x44, 0x4f, 0x4c, 0x43, 0x45, 0x4b, 0x43,
+    0x3e, 0x3e, 0x4c, 0x44, 0x48, 0x48, 0x42, 0x42, 0x4d, 0x43, 0x50, 0x4d,
+    0x49, 0x3c, 0x45, 0x4f, 0x4c, 0x46, 0x4b, 0x48, 0x4d, 0x4d, 0x49, 0x55,
+    0x49, 0x3b, 0x40, 0x44, 0x4a, 0x4b, 0x4e, 0x5e, 0x43, 0x47, 0x45, 0x43,
+    0x4d, 0x4d, 0x49, 0x46, 0x4a, 0x44, 0x4e, 0x3e, 0x52, 0x41, 0x47, 0x47,
+    0x4a, 0x50, 0x48, 0x43, 0x5d, 0x4f, 0x49, 0x48, 0x43, 0x4f, 0x45, 0x3e,
+    0x5a, 0x69, 0x4d, 0x5a, 0x3a, 0x5d, 0x3a, 0x48, 0x42, 0x55, 0x3e, 0x48,
+    0x48, 0x7b, 0x37, 0x40, 0x57, 0x45, 0x48, 0x24, 0x50, 0x61, 0x4c, 0x4a,
+    0x44, 0x41, 0x34, 0x38, 0x65, 0x5b, 0x4f, 0x3c, 0x4d, 0x3a, 0x4a, 0x4c,
+    0x66, 0x55, 0x50, 0x47, 0x4d, 0x46, 0x47, 0x58, 0x4c, 0x48, 0x48, 0x48,
+    0x4e, 0x59, 0x4f, 0x4b, 0x45, 0x45, 0x4b, 0x54, 0x46, 0x51, 0x4f, 0x44,
+    0x42, 0x55, 0x48, 0x44, 0x48, 0x41, 0x53, 0x2e, 0x4d, 0x45, 0x44, 0x54,
+    0x4a, 0x44, 0x53, 0x34, 0x4c, 0x46, 0x47, 0x3f, 0x4c, 0x4b, 0x47, 0x36,
+    0x47, 0x41, 0x43, 0x40, 0x51, 0x46, 0x45, 0x33, 0x46, 0x3e, 0x47, 0x50,
+    0x3f, 0x48, 0x48, 0x37, 0x41, 0x41, 0x42, 0x3e, 0x45, 0x3d, 0x49, 0x3e,
+    0x4f, 0x42, 0x49, 0x4a, 0x46, 0x46, 0x48, 0x44, 0x49, 0x45, 0x46, 0x4a,
+    0x4a, 0x47, 0x48, 0x43, 0x44, 0x45, 0x3f, 0x4c, 0x4c, 0x49, 0x4d, 0x51,
+    0x4a, 0x4a, 0x49, 0x4c, 0x42, 0x4d, 0x4b, 0x4b, 0x4a, 0x42, 0x47, 0x4d,
+    0x3e, 0x4b, 0x47, 0x5c, 0x49, 0x3d, 0x4e, 0x41, 0x44, 0x49, 0x3e, 0x3e,
+    0x4b, 0x47, 0x4e, 0x45, 0x44, 0x4a, 0x4d, 0x4a, 0x4f, 0x46, 0x45, 0x52,
+    0x60, 0x53, 0x49, 0x50, 0x3d, 0x4f, 0x43, 0x3d, 0x52, 0x64, 0x52, 0x58,
+    0x39, 0x5f, 0x36, 0x4c, 0x45, 0x57, 0x42, 0x4b, 0x3f, 0x80, 0x34, 0x47,
+    0x58, 0x41, 0x45, 0x1b, 0x4b, 0x5e, 0x4c, 0x40, 0x44, 0x42, 0x39, 0x3a,
+    0x5e, 0x5b, 0x4b, 0x3a, 0x4b, 0x3f, 0x45, 0x3e, 0x69, 0x57, 0x4b, 0x45,
+    0x4b, 0x3f, 0x45, 0x55, 0x49, 0x49, 0x48, 0x47, 0x41, 0x4f, 0x42, 0x53,
+    0x49, 0x40, 0x42, 0x3e, 0x49, 0x47, 0x53, 0x47, 0x45, 0x51, 0x4a, 0x44,
+    0x44, 0x45, 0x4e, 0x2a, 0x45, 0x42, 0x4a, 0x4b, 0x46, 0x4d, 0x41, 0x30,
+    0x3d, 0x43, 0x3f, 0x48, 0x49, 0x44, 0x4d, 0x2e, 0x48, 0x4a, 0x4c, 0x51,
+    0x50, 0x46, 0x3e, 0x2c, 0x4d, 0x3f, 0x47, 0x46, 0x3c, 0x40, 0x4c, 0x38,
+    0x4f, 0x46, 0x47, 0x53, 0x3b, 0x3c, 0x4e, 0x3e, 0x49, 0x40, 0x43, 0x4c,
+    0x4d, 0x48, 0x45, 0x3c, 0x4d, 0x4c, 0x4d, 0x45, 0x3f, 0x49, 0x4a, 0x43,
+    0x4d, 0x41, 0x4b, 0x50, 0x4e, 0x46, 0x50, 0x44, 0x49, 0x44, 0x4e, 0x42,
+    0x4a, 0x43, 0x4c, 0x4c, 0x49, 0x49, 0x44, 0x4e, 0x4b, 0x3f, 0x4b, 0x5d,
+    0x41, 0x49, 0x4b, 0x46, 0x4e, 0x48, 0x45, 0x51, 0x4d, 0x45, 0x46, 0x45,
+    0x4b, 0x4e, 0x3c, 0x4d, 0x3d, 0x41, 0x47, 0x47, 0x64, 0x54, 0x41, 0x55,
+    0x47, 0x56, 0x44, 0x3b, 0x53, 0x66, 0x4f, 0x5e, 0x40, 0x5d, 0x38, 0x4a,
+    0x41, 0x59, 0x42, 0x48, 0x47, 0xff, 0x36, 0x49, 0x59, 0x41, 0x43, 0x1d,
+    0x4d, 0x5e, 0x44, 0x44, 0x50, 0x3f, 0x39, 0x40, 0x68, 0x5e, 0x4a, 0x41,
+    0x52, 0x41, 0x43, 0x41, 0x68, 0x51, 0x45, 0x48, 0x4c, 0x46, 0x4a, 0x5e,
+    0x4e, 0x40, 0x4d, 0x41, 0x41, 0x5c, 0x3f, 0x4e, 0x4c, 0x37, 0x48, 0x40,
+    0x46, 0x47, 0x4f, 0x43, 0x53, 0x52, 0x3d, 0x44, 0x47, 0x44, 0x3d, 0x34,
+    0x44, 0x42, 0x4a, 0x43, 0x4d, 0x3f, 0x53, 0x2e, 0x42, 0x47, 0x43, 0x4d,
+    0x45, 0x45, 0x47, 0x31, 0x4d, 0x39, 0x41, 0x4a, 0x4a, 0x4d, 0x4b, 0x35,
+    0x47, 0x4e, 0x4c, 0x40, 0x4a, 0x44, 0x44, 0x36, 0x3e, 0x49, 0x3f, 0x45,
+    0x46, 0x43, 0x4e, 0x3c, 0x4d, 0x47, 0x4c, 0x48, 0x4a, 0x4b, 0x48, 0x39,
+    0x46, 0x50, 0x4a, 0x4f, 0x46, 0x41, 0x44, 0x4a, 0x41, 0x4f, 0x4c, 0x4e,
+    0x55, 0x46, 0x43, 0x46, 0x4a, 0x48, 0x4e, 0x46, 0x42, 0x40, 0x4f, 0x56,
+    0x4c, 0x45, 0x4b, 0x46, 0x4a, 0x47, 0x42, 0x5e, 0x49, 0x4e, 0x46, 0x43,
+    0x4e, 0x42, 0x45, 0x48, 0x47, 0x48, 0x4f, 0x45, 0x47, 0x51, 0x4b, 0x4c,
+    0x51, 0x39, 0x4d, 0x48, 0x60, 0x57, 0x49, 0x52, 0x3d, 0x57, 0x46, 0x3d,
+    0x53, 0x68, 0x4b, 0x60, 0x40, 0x5a, 0x41, 0x4b, 0x46, 0x56, 0x46, 0x4c,
+    0x49, 0x7e, 0x2f, 0x48, 0x51, 0x42, 0x40, 0x20, 0x4b, 0x62, 0x4d, 0x41,
+    0x4f, 0x43, 0x3d, 0x35, 0x63, 0x63, 0x46, 0x3e, 0x4e, 0x47, 0x40, 0x40,
+    0x60, 0x52, 0x4c, 0x46, 0x49, 0x48, 0x4f, 0x56, 0x51, 0x47, 0x52, 0x4e,
+    0x4b, 0x59, 0x55, 0x4f, 0x48, 0x3d, 0x48, 0x4a, 0x4d, 0x50, 0x47, 0x47,
+    0x51, 0x52, 0x4d, 0x51, 0x45, 0x45, 0x47, 0x2d, 0x4d, 0x41, 0x43, 0x49,
+    0x4d, 0x40, 0x4a, 0x2f, 0x4f, 0x43, 0x46, 0x4a, 0x3e, 0x4a, 0x4a, 0x2b,
+    0x49, 0x4c, 0x4c, 0x3e, 0x41, 0x4c, 0x4a, 0x2b, 0x40, 0x44, 0x46, 0x4a,
+    0x40, 0x44, 0x42, 0x38, 0x52, 0x42, 0x46, 0x51, 0x53, 0x4e, 0x45, 0x31,
+    0x45, 0x47, 0x4f, 0x46, 0x49, 0x43, 0x45, 0x3b, 0x4b, 0x4b, 0x4b, 0x4c,
+    0x43, 0x4a, 0x4c, 0x43, 0x4e, 0x40, 0x52, 0x44, 0x48, 0x49, 0x47, 0x4b,
+    0x4e, 0x3d, 0x4e, 0x44, 0x48, 0x4d, 0x4f, 0x4f, 0x50, 0x36, 0x47, 0x41,
+    0x4a, 0x44, 0x45, 0x56, 0x4f, 0x4c, 0x50, 0x4b, 0x45, 0x3e, 0x45, 0x4e,
+    0x45, 0x45, 0x43, 0x40, 0x47, 0x4e, 0x45, 0x3e, 0x4a, 0x3f, 0x49, 0x50,
+    0x62, 0x55, 0x48, 0x56, 0x3e, 0x57, 0x4f, 0x3b, 0x55, 0x6c, 0x50, 0x5c,
+    0x3d, 0x54, 0x3d, 0x46, 0x43, 0x59, 0x3e, 0x51, 0x4d, 0x7b, 0x33, 0x47,
+    0x52, 0x43, 0x3f, 0x25, 0x4a, 0x6f, 0x49, 0x3e, 0x50, 0x40, 0x41, 0x30,
+    0x5e, 0x5c, 0x4a, 0x43, 0x4d, 0x42, 0x46, 0x3b, 0x63, 0x53, 0x4f, 0x43,
+    0x58, 0x48, 0x4b, 0x59, 0x50, 0x4e, 0x4b, 0x51, 0x4a, 0x55, 0x44, 0x46,
+    0x4c, 0x3d, 0x4c, 0x52, 0x44, 0x52, 0x4c, 0x41, 0x4f, 0x44, 0x4a, 0x47,
+    0x4e, 0x48, 0x49, 0x2e, 0x3e, 0x45, 0x4c, 0x48, 0x41, 0x47, 0x4d, 0x2e,
+    0x40, 0x4b, 0x4c, 0x42, 0x4d, 0x40, 0x4e, 0x2e, 0x43, 0x45, 0x4b, 0x43,
+    0x3e, 0x49, 0x55, 0x35, 0x43, 0x42, 0x42, 0x40, 0x4e, 0x46, 0x44, 0x37,
+    0x49, 0x41, 0x3f, 0x52, 0x47, 0x4b, 0x43, 0x33, 0x4b, 0x47, 0x4b, 0x4c,
+    0x4d, 0x4b, 0x3f, 0x42, 0x44, 0x40, 0x49, 0x41, 0x42, 0x49, 0x4b, 0x46,
+    0x4e, 0x4e, 0x47, 0x4e, 0x48, 0x48, 0x4b, 0x46, 0x51, 0x4b, 0x46, 0x4d,
+    0x47, 0x4f, 0x3e, 0x51, 0x46, 0x4e, 0x46, 0x4b, 0x47, 0x48, 0x4e, 0x55,
+    0x4c, 0x3d, 0x47, 0x51, 0x42, 0x45, 0x4f, 0x42, 0x52, 0x50, 0x44, 0x4c,
+    0x44, 0x44, 0x43, 0x4d, 0x40, 0x42, 0x4d, 0x4b, 0x5d, 0x4e, 0x47, 0x54,
+    0x47, 0x51, 0x43, 0x39, 0x58, 0x66, 0x4e, 0x5a, 0x41, 0x52, 0x36, 0x47,
+    0x45, 0x5f, 0x34, 0x50, 0x46, 0x79, 0x30, 0x48, 0x50, 0x45, 0x32, 0x22,
+    0x54, 0x64, 0x49, 0x46, 0x45, 0x3c, 0x42, 0x36, 0x65, 0x5c, 0x48, 0x3a,
+    0x4d, 0x4b, 0x47, 0x3e, 0x63, 0x56, 0x4a, 0x48, 0x51, 0x42, 0x4f, 0x5e,
+    0x4c, 0x44, 0x4b, 0x4c, 0x3d, 0x5a, 0x43, 0x4d, 0x42, 0x40, 0x4f, 0x4d,
+    0x3f, 0x3e, 0x46, 0x40, 0x49, 0x42, 0x49, 0x40, 0x49, 0x4c, 0x4a, 0x2e,
+    0x4b, 0x3f, 0x53, 0x4b, 0x48, 0x49, 0x3e, 0x34, 0x47, 0x4a, 0x4b, 0x46,
+    0x3b, 0x49, 0x46, 0x34, 0x4b, 0x48, 0x4c, 0x49, 0x49, 0x43, 0x4f, 0x2e,
+    0x44, 0x46, 0x48, 0x50, 0x46, 0x4e, 0x4a, 0x37, 0x4b, 0x4c, 0x4a, 0x50,
+    0x45, 0x4a, 0x48, 0x3b, 0x48, 0x44, 0x48, 0x4a, 0x41, 0x44, 0x52, 0x3f,
+    0x4c, 0x46, 0x4a, 0x45, 0x46, 0x49, 0x49, 0x36, 0x53, 0x3e, 0x48, 0x47,
+    0x3f, 0x42, 0x41, 0x4c, 0x42, 0x4a, 0x52, 0x46, 0x49, 0x3f, 0x48, 0x5a,
+    0x43, 0x42, 0x3d, 0x43, 0x4f, 0x44, 0x43, 0x65, 0x41, 0x41, 0x44, 0x4b,
+    0x50, 0x44, 0x53, 0x49, 0x41, 0x45, 0x4a, 0x4d, 0x40, 0x45, 0x4a, 0x4e,
+    0x50, 0x40, 0x51, 0x40, 0x5e, 0x50, 0x43, 0x5c, 0x47, 0x5a, 0x44, 0x4c,
+    0x54, 0x64, 0x4f, 0x63, 0x39, 0x58, 0x3c, 0x4a, 0x42, 0x5e, 0x3c, 0x4a,
+    0x48, 0x7b, 0x34, 0x4c, 0x4f, 0x44, 0x30, 0x24, 0x50, 0x65, 0x47, 0x39,
+    0x46, 0x3e, 0x3f, 0x33, 0x65, 0x5a, 0x44, 0x38, 0x50, 0x47, 0x4b, 0x3e,
+    0x5b, 0x53, 0x4a, 0x4d, 0x51, 0x40, 0x47, 0x59, 0x51, 0x42, 0x4f, 0x50,
+    0x45, 0x57, 0x46, 0x50, 0x3f, 0x3c, 0x4c, 0x4f, 0x46, 0x41, 0x4a, 0x3e,
+    0x4d, 0x45, 0x51, 0x48, 0x4e, 0x44, 0x4e, 0x35, 0x44, 0x3f, 0x44, 0x48,
+    0x3c, 0x4c, 0x49, 0x2c, 0x4a, 0x46, 0x48, 0x44, 0x4b, 0x42, 0x4b, 0x2f,
+    0x4e, 0x50, 0x4c, 0x4d, 0x44, 0x46, 0x3f, 0x39, 0x4d, 0x47, 0x45, 0x41,
+    0x42, 0x47, 0x4a, 0x3a, 0x40, 0x3e, 0x4a, 0x51, 0x3f, 0x47, 0x44, 0x37,
+    0x47, 0x4e, 0x47, 0x52, 0x45, 0x42, 0x4a, 0x3d, 0x43, 0x4d, 0x4d, 0x47,
+    0x48, 0x43, 0x44, 0x44, 0x47, 0x4e, 0x52, 0x4b, 0x4e, 0x50, 0x42, 0x47,
+    0x4b, 0x4b, 0x4e, 0x4c, 0x4e, 0x47, 0x50, 0x56, 0x46, 0x47, 0x4d, 0x49,
+    0x4d, 0x46, 0x49, 0x5f, 0x49, 0x42, 0x4d, 0x44, 0x40, 0x4b, 0x52, 0x45,
+    0x46, 0x4a, 0x4b, 0x49, 0x47, 0x4b, 0x42, 0x45, 0x42, 0x44, 0x46, 0x4c,
+    0x62, 0x4a, 0x44, 0x53, 0x43, 0x5a, 0x48, 0x49, 0x59, 0x68, 0x46, 0x61,
+    0x40, 0x5a, 0x3a, 0x4d, 0x45, 0x5e, 0x33, 0x4f, 0x4e, 0x74, 0x3e, 0x3e,
+    0x5a, 0x4b, 0x34, 0x31, 0x52, 0x6c, 0x44, 0x39, 0x4c, 0x3b, 0x39, 0x3a,
+    0x63, 0x65, 0x4b, 0x40, 0x50, 0x4d, 0x53, 0x4a, 0x69, 0x56, 0x54, 0x45,
+    0x4c, 0x4c, 0x50, 0x5b, 0x4d, 0x4f, 0x3d, 0x4b, 0x44, 0x47, 0x43, 0x47,
+    0x49, 0x3c, 0x49, 0x41, 0x41, 0x3f, 0x47, 0x43, 0x48, 0x47, 0x4c, 0x43,
+    0x4a, 0x40, 0x4d, 0x32, 0x4b, 0x4d, 0x44, 0x48, 0x46, 0x44, 0x50, 0x2f,
+    0x4e, 0x49, 0x53, 0x4b, 0x52, 0x47, 0x4b, 0x2b, 0x48, 0x4b, 0x4a, 0x4c,
+    0x4d, 0x4c, 0x43, 0x37, 0x48, 0x3c, 0x4b, 0x42, 0x51, 0x3f, 0x45, 0x3c,
+    0x49, 0x40, 0x42, 0x43, 0x4d, 0x4c, 0x3f, 0x3f, 0x4d, 0x43, 0x45, 0x42,
+    0x48, 0x42, 0x48, 0x39, 0x51, 0x4e, 0x46, 0x4f, 0x3e, 0x4c, 0x45, 0x3e,
+    0x3f, 0x3f, 0x43, 0x41, 0x4b, 0x4b, 0x43, 0x4d, 0x44, 0x3b, 0x48, 0x45,
+    0x3c, 0x4a, 0x48, 0x5b, 0x3c, 0x4b, 0x4c, 0x44, 0x46, 0x3e, 0x45, 0x57,
+    0x43, 0x42, 0x51, 0x4a, 0x46, 0x47, 0x43, 0x49, 0x42, 0x43, 0x50, 0x4e,
+    0x4e, 0x44, 0x41, 0x4e, 0x4e, 0x41, 0x48, 0x47, 0x5c, 0x53, 0x44, 0x54,
+    0x44, 0x5b, 0x45, 0x46, 0x55, 0x67, 0x4d, 0x5d, 0x40, 0x5a, 0x43, 0x4b,
+    0x43, 0x60, 0x3c, 0x4b, 0x41, 0x79, 0x41, 0x41, 0x58, 0x48, 0x40, 0x3b,
+    0x4f, 0x6c, 0x46, 0x3f, 0x53, 0x3a, 0x3d, 0x36, 0x5a, 0x57, 0x44, 0x41,
+    0x4c, 0x47, 0x4e, 0x48, 0x62, 0x60, 0x4a, 0x46, 0x51, 0x3e, 0x52, 0x5f,
+    0x4b, 0x46, 0x48, 0x4c, 0x4c, 0x55, 0x43, 0x46, 0x49, 0x3e, 0x41, 0x40,
+    0x4d, 0x47, 0x46, 0x3b, 0x51, 0x3a, 0x4a, 0x45, 0x50, 0x47, 0x51, 0x38,
+    0x44, 0x41, 0x40, 0x4b, 0x4d, 0x44, 0x4d, 0x28, 0x47, 0x3e, 0x44, 0x40,
+    0x49, 0x49, 0x40, 0x3c, 0x44, 0x4c, 0x48, 0x51, 0x46, 0x3e, 0x47, 0x2a,
+    0x41, 0x44, 0x49, 0x4c, 0x4e, 0x4e, 0x42, 0x3c, 0x49, 0x42, 0x43, 0x45,
+    0x4e, 0x4d, 0x50, 0x39, 0x42, 0x43, 0x48, 0x41, 0x3f, 0x40, 0x4e, 0x3a,
+    0x44, 0x3d, 0x49, 0x4d, 0x47, 0x45, 0x4b, 0x42, 0x4c, 0x4d, 0x3f, 0x3f,
+    0x4e, 0x4d, 0x4d, 0x4d, 0x4d, 0x45, 0x47, 0x43, 0x4c, 0x46, 0x47, 0x57,
+    0x4b, 0x42, 0x4d, 0x46, 0x4b, 0x4b, 0x43, 0x58, 0x48, 0x49, 0x4d, 0x47,
+    0x43, 0x49, 0x4b, 0x48, 0x46, 0x4f, 0x4f, 0x42, 0x4a, 0x43, 0x49, 0x4e,
+    0x4a, 0x47, 0x4c, 0x48, 0x5a, 0x57, 0x4a, 0x58, 0x49, 0x4f, 0x45, 0x47,
+    0x63, 0x66, 0x4d, 0x5e, 0x4b, 0x51, 0x45, 0x4a, 0x43, 0x5d, 0x33, 0x4b,
+    0x4e, 0x70, 0x42, 0x39, 0x57, 0x4a, 0x40, 0x3a, 0x51, 0x68, 0x45, 0x45,
+    0x4c, 0x44, 0x3a, 0x3a, 0x4f, 0x62, 0x49, 0x45, 0x53, 0x4c, 0x4e, 0x41,
+    0x63, 0x5e, 0x44, 0x44, 0x47, 0x43, 0x47, 0x59, 0x4c, 0x4b, 0x4c, 0x49,
+    0x3e, 0x43, 0x4c, 0x46, 0x4c, 0x38, 0x47, 0x46, 0x46, 0x47, 0x40, 0x44,
+    0x51, 0x3e, 0x40, 0x47, 0x3f, 0x45, 0x48, 0x2a, 0x42, 0x3e, 0x43, 0x46,
+    0x50, 0x4c, 0x4a, 0x2c, 0x49, 0x4b, 0x48, 0x48, 0x40, 0x4a, 0x4a, 0x37,
+    0x4e, 0x42, 0x4f, 0x4c, 0x41, 0x43, 0x45, 0x38, 0x4e, 0x3d, 0x41, 0x47,
+    0x42, 0x42, 0x43, 0x3b, 0x4a, 0x40, 0x48, 0x4a, 0x53, 0x44, 0x4d, 0x35,
+    0x51, 0x3c, 0x4e, 0x4e, 0x3e, 0x3f, 0x4b, 0x3c, 0x3e, 0x47, 0x41, 0x48,
+    0x40, 0x46, 0x4e, 0x44, 0x49, 0x42, 0x49, 0x44, 0x4b, 0x46, 0x46, 0x43,
+    0x4c, 0x4b, 0x49, 0x4d, 0x3d, 0x47, 0x43, 0x5c, 0x4a, 0x42, 0x47, 0x4e,
+    0x47, 0x40, 0x4c, 0x55, 0x3f, 0x45, 0x46, 0x49, 0x46, 0x48, 0x49, 0x4d,
+    0x4c, 0x41, 0x49, 0x40, 0x4a, 0x44, 0x42, 0x49, 0x52, 0x41, 0x49, 0x4a,
+    0x5c, 0x53, 0x47, 0x58, 0x49, 0x55, 0x4a, 0x4a, 0x62, 0x61, 0x4b, 0x57,
+    0x3c, 0x50, 0x42, 0x4c, 0x49, 0x5f, 0x3f, 0x4a, 0x42, 0x70, 0x40, 0x40,
+    0x4f, 0x46, 0x43, 0x43, 0x4d, 0x6c, 0x41, 0x3e, 0x4e, 0x49, 0x43, 0x38,
+    0x50, 0x57, 0x43, 0x39, 0x4a, 0x4f, 0x51, 0x3e, 0x5c, 0x57, 0x46, 0x49,
+    0x41, 0x40, 0x42, 0x4f, 0x4c, 0x45, 0x46, 0x4a, 0x4c, 0x4b, 0x43, 0x42,
+    0x4c, 0x3c, 0x47, 0x47, 0x4f, 0x44, 0x45, 0x3a, 0x4d, 0x3d, 0x4d, 0x3f,
+    0x46, 0x4f, 0x41, 0x37, 0x46, 0x45, 0x54, 0x47, 0x4e, 0x46, 0x47, 0x23,
+    0x48, 0x4e, 0x4a, 0x47, 0x45, 0x45, 0x4e, 0x33, 0x49, 0x4a, 0x4d, 0x4e,
+    0x49, 0x46, 0x49, 0x36, 0x48, 0x44, 0x53, 0x44, 0x4a, 0x45, 0x4a, 0x37,
+    0x45, 0x36, 0x4b, 0x4e, 0x50, 0x3f, 0x49, 0x38, 0x40, 0x43, 0x46, 0x4c,
+    0x43, 0x46, 0x4a, 0x3f, 0x45, 0x3d, 0x44, 0x47, 0x44, 0x42, 0x4a, 0x45,
+    0x47, 0x43, 0x4d, 0x4d, 0x44, 0x44, 0x4f, 0x4a, 0x4a, 0x41, 0x50, 0x50,
+    0x4b, 0x44, 0x54, 0x5c, 0x4b, 0x3a, 0x46, 0x4a, 0x4a, 0x43, 0x48, 0x5c,
+    0x4b, 0x43, 0x47, 0x3d, 0x3e, 0x54, 0x42, 0x47, 0x42, 0x4f, 0x4b, 0x4b,
+    0x46, 0x46, 0x46, 0x42, 0x42, 0x4b, 0x48, 0x45, 0x51, 0x4e, 0x49, 0x4d,
+    0x43, 0x56, 0x45, 0x40, 0x5a, 0x58, 0x4c, 0x55, 0x40, 0x4b, 0x4c, 0x51,
+    0x42, 0x59, 0x43, 0x46, 0x46, 0x69, 0x43, 0x3c, 0x54, 0x47, 0x3d, 0x41,
+    0x52, 0x64, 0x44, 0x38, 0x4f, 0x49, 0x3a, 0x3a, 0x55, 0x54, 0x45, 0x3e,
+    0x49, 0x44, 0x4e, 0x3f, 0x57, 0x50, 0x47, 0x43, 0x45, 0x48, 0x53, 0x5b,
+    0x53, 0x4d, 0x48, 0x4e, 0x48, 0x3a, 0x3e, 0x46, 0x42, 0x36, 0x50, 0x4d,
+    0x49, 0x4b, 0x4b, 0x45, 0x4c, 0x44, 0x50, 0x47, 0x3e, 0x49, 0x50, 0x37,
+    0x4c, 0x4b, 0x4a, 0x54, 0x4e, 0x43, 0x40, 0x25, 0x46, 0x42, 0x52, 0x3d,
+    0x44, 0x45, 0x51, 0x2e, 0x4a, 0x3d, 0x46, 0x46, 0x4c, 0x42, 0x48, 0x34,
+    0x44, 0x44, 0x44, 0x4c, 0x4f, 0x4b, 0x42, 0x3d, 0x45, 0x40, 0x47, 0x49,
+    0x43, 0x41, 0x3e, 0x39, 0x47, 0x4b, 0x50, 0x4a, 0x46, 0x47, 0x4e, 0x3b,
+    0x4e, 0x3e, 0x49, 0x4a, 0x50, 0x40, 0x43, 0x49, 0x48, 0x3c, 0x4f, 0x45,
+    0x4a, 0x41, 0x42, 0x48, 0x4b, 0x46, 0x4a, 0x50, 0x40, 0x49, 0x44, 0x54,
+    0x45, 0x45, 0x4a, 0x4b, 0x51, 0x51, 0x48, 0x53, 0x50, 0x3f, 0x50, 0x46,
+    0x44, 0x45, 0x51, 0x43, 0x4f, 0x3e, 0x41, 0x41, 0x46, 0x45, 0x45, 0x4c,
+    0x54, 0x3c, 0x4a, 0x4c, 0x5a, 0x4f, 0x46, 0x4b, 0x47, 0x4a, 0x43, 0x4c,
+    0x56, 0x5a, 0x4a, 0x53, 0x4c, 0x49, 0x46, 0x4c, 0x45, 0x59, 0x40, 0x4b,
+    0x48, 0x60, 0x3d, 0x42, 0x52, 0x3f, 0x42, 0x3d, 0x52, 0x5f, 0x46, 0x42,
+    0x4b, 0x4e, 0x4a, 0x3d, 0x52, 0x55, 0x53, 0x37, 0x47, 0x3e, 0x4a, 0x42,
+    0x51, 0x54, 0x48, 0x48, 0x4b, 0x48, 0x3e, 0x52, 0x41, 0x4e, 0x4c, 0x4f,
+    0x43, 0x3b, 0x4b, 0x4b, 0x4c, 0x40, 0x48, 0x49, 0x4d, 0x3a, 0x45, 0x3c,
+    0x53, 0x44, 0x48, 0x4d, 0x4b, 0x49, 0x46, 0x3c, 0x4d, 0x40, 0x51, 0x3f,
+    0x4c, 0x45, 0x44, 0x2f, 0x49, 0x51, 0x3f, 0x4d, 0x3e, 0x4e, 0x3c, 0x30,
+    0x3d, 0x48, 0x4f, 0x3f, 0x45, 0x45, 0x46, 0x3b, 0x4c, 0x46, 0x4d, 0x50,
+    0x4c, 0x3d, 0x41, 0x37, 0x3e, 0x3e, 0x4f, 0x4b, 0x4d, 0x4f, 0x45, 0x45,
+    0x4a, 0x47, 0x4a, 0x44, 0x43, 0x46, 0x51, 0x41, 0x4e, 0x39, 0x44, 0x4a,
+    0x4e, 0x49, 0x4a, 0x42, 0x49, 0x4b, 0x4e, 0x48, 0x49, 0x4a, 0x45, 0x4a,
+    0x45, 0x41, 0x4a, 0x4b, 0x42, 0x41, 0x48, 0x4a, 0x44, 0x3a, 0x46, 0x49,
+    0x54, 0x45, 0x44, 0x60, 0x4a, 0x4e, 0x45, 0x4a, 0x4a, 0x45, 0x4b, 0x49,
+    0x42, 0x44, 0x46, 0x50, 0x4b, 0x4b, 0x4e, 0x45, 0x48, 0x3e, 0x55, 0x42,
+    0x51, 0x49, 0x49, 0x44, 0x4e, 0x54, 0x53, 0x49, 0x4c, 0x63, 0x48, 0x5a,
+    0x50, 0x4b, 0x45, 0x49, 0x43, 0x57, 0x4c, 0x3f, 0x4d, 0x67, 0x3f, 0x47,
+    0x53, 0x49, 0x43, 0x44, 0x49, 0x61, 0x50, 0x47, 0x49, 0x49, 0x4a, 0x42,
+    0x4a, 0x51, 0x46, 0x43, 0x3f, 0x34, 0x40, 0x3a, 0x45, 0x54, 0x4c, 0x55,
+    0x40, 0x3c, 0x4a, 0x4d, 0x3e, 0x4d, 0x48, 0x51, 0x4c, 0x3e, 0x4c, 0x4f,
+    0x50, 0x47, 0x4d, 0x49, 0x4d, 0x4e, 0x45, 0x43, 0x41, 0x41, 0x40, 0x47,
+    0x43, 0x4a, 0x4a, 0x3c, 0x4c, 0x3d, 0x4e, 0x43, 0x41, 0x42, 0x4a, 0x30,
+    0x45, 0x4c, 0x45, 0x55, 0x46, 0x39, 0x43, 0x39, 0x45, 0x47, 0x48, 0x53,
+    0x4a, 0x48, 0x43, 0x38, 0x4f, 0x51, 0x4d, 0x4c, 0x41, 0x46, 0x40, 0x3d,
+    0x43, 0x4b, 0x40, 0x46, 0x47, 0x50, 0x4a, 0x43, 0x50, 0x4e, 0x45, 0x4f,
+    0x4d, 0x44, 0x4d, 0x3f, 0x4e, 0x48, 0x4a, 0x49, 0x44, 0x3d, 0x4a, 0x44,
+    0x40, 0x45, 0x49, 0x40, 0x4a, 0x44, 0x4f, 0x4a, 0x43, 0x4a, 0x4e, 0x52,
+    0x4d, 0x50, 0x48, 0x4c, 0x43, 0x45, 0x4d, 0x54, 0x4a, 0x49, 0x4c, 0x58,
+    0x4c, 0x48, 0x4c, 0x44, 0x4b, 0x4e, 0x52, 0x44, 0x49, 0x44, 0x47, 0x4e,
+    0x4b, 0x45, 0x49, 0x3e, 0x4c, 0x3b, 0x53, 0x3f, 0x51, 0x41, 0x3f, 0x44,
+    0x43, 0x4a, 0x4b, 0x43, 0x53, 0x57, 0x50, 0x53, 0x4f, 0x4b, 0x48, 0x51,
+    0x47, 0x49, 0x46, 0x4d, 0x4d, 0x5e, 0x44, 0x46, 0x56, 0x3d, 0x3c, 0x3e,
+    0x47, 0x55, 0x54, 0x46, 0x42, 0x49, 0x4f, 0x43, 0x48, 0x54, 0x51, 0x40,
+    0x44, 0x44, 0x47, 0x45, 0x4b, 0x59, 0x4d, 0x47, 0x40, 0x39, 0x48, 0x54,
+    0x43, 0x45, 0x44, 0x42, 0x4c, 0x3c, 0x4d, 0x42, 0x4b, 0x45, 0x42, 0x48,
+    0x51, 0x44, 0x45, 0x3f, 0x3d, 0x49, 0x4b, 0x4a, 0x41, 0x43, 0x4f, 0x3f,
+    0x51, 0x4b, 0x44, 0x46, 0x46, 0x44, 0x53, 0x3d, 0x47, 0x47, 0x43, 0x4b,
+    0x41, 0x43, 0x3c, 0x3b, 0x49, 0x47, 0x47, 0x49, 0x4b, 0x3d, 0x43, 0x43,
+    0x4b, 0x47, 0x45, 0x4e, 0x42, 0x4a, 0x4c, 0x3e, 0x51, 0x3e, 0x46, 0x44,
+    0x46, 0x43, 0x42, 0x42, 0x47, 0x4d, 0x51, 0x4b, 0x49, 0x44, 0x4d, 0x40,
+    0x50, 0x43, 0x41, 0x4c, 0x42, 0x49, 0x49, 0x4c, 0x42, 0x50, 0x48, 0x3f,
+    0x46, 0x42, 0x48, 0x57, 0x49, 0x4d, 0x47, 0x4e, 0x48, 0x4b, 0x46, 0x50,
+    0x47, 0x45, 0x52, 0x45, 0x4b, 0x48, 0x40, 0x5b, 0x4e, 0x43, 0x51, 0x48,
+    0x48, 0x4a, 0x4a, 0x4a, 0x52, 0x51, 0x4c, 0x4b, 0x42, 0x55, 0x4d, 0x46,
+    0x50, 0x40, 0x4a, 0x50, 0x51, 0x3e, 0x42, 0x4c, 0x43, 0x46, 0x4d, 0x46,
+    0x46, 0x4d, 0x4d, 0x52, 0x4e, 0x44, 0x45, 0x47, 0x49, 0x4c, 0x41, 0x44,
+    0x4d, 0x54, 0x4c, 0x4a, 0x54, 0x3e, 0x44, 0x43, 0x53, 0x55, 0x4b, 0x4a,
+    0x47, 0x47, 0x4f, 0x46, 0x4f, 0x4b, 0x51, 0x3f, 0x41, 0x4c, 0x43, 0x46,
+    0x55, 0x51, 0x40, 0x4b, 0x4f, 0x40, 0x47, 0x50, 0x4e, 0x4a, 0x46, 0x4e,
+    0x42, 0x4d, 0x48, 0x49, 0x48, 0x4a, 0x4a, 0x43, 0x49, 0x48, 0x44, 0x3b,
+    0x51, 0x46, 0x3d, 0x43, 0x47, 0x4a, 0x4f, 0x42, 0x4a, 0x50, 0x4f, 0x41,
+    0x45, 0x45, 0x43, 0x3c, 0x4c, 0x4c, 0x46, 0x4b, 0x3e, 0x44, 0x4b, 0x3a,
+    0x45, 0x50, 0x42, 0x48, 0x46, 0x47, 0x44, 0x3a, 0x53, 0x46, 0x4e, 0x4f,
+    0x43, 0x40, 0x46, 0x48, 0x4e, 0x45, 0x3f, 0x47, 0x48, 0x3f, 0x44, 0x4f,
+    0x44, 0x47, 0x4e, 0x47, 0x47, 0x49, 0x42, 0x43, 0x3f, 0x49, 0x4a, 0x53,
+    0x53, 0x4a, 0x4e, 0x4a, 0x49, 0x4d, 0x49, 0x41, 0x48, 0x4d, 0x4d, 0x4e,
+    0x4b, 0x45, 0x4d, 0x4a, 0x46, 0x4a, 0x46, 0x51, 0x4b, 0x47, 0x49, 0x45,
+    0x49, 0x49, 0x4b, 0x5c, 0x48, 0x42, 0x51, 0x4c, 0x41, 0x3f, 0x4c, 0x42,
+    0x4f, 0x45, 0x4b, 0x4a, 0x52, 0x48, 0x53, 0x4f, 0x40, 0x47, 0x41, 0x47,
+    0x68, 0xfb, 0xff, 0xff, 0x4c, 0xfc, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xe8, 0x03, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00,
+    0x58, 0x01, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0xd8, 0x00, 0x00, 0x00,
+    0x38, 0x02, 0x00, 0x00, 0x9c, 0x02, 0x00, 0x00, 0xa0, 0x01, 0x00, 0x00,
+    0x14, 0x03, 0x00, 0x00, 0xfe, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x10, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x19, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x52, 0x65, 0x6c, 0x75, 0x00, 0x00, 0x00, 0x00,
+    0xcc, 0xfc, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x17, 0xbf, 0xd2, 0x3f, 0x01, 0x00, 0x00, 0x00, 0x58, 0xec, 0xd1, 0x43,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6e, 0xfd, 0xff, 0xff,
+    0x00, 0x00, 0x00, 0x02, 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x43, 0x6f, 0x6e, 0x76,
+    0x32, 0x44, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x34, 0xff, 0xff, 0xff,
+    0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0xf5, 0xf7, 0x84, 0x3a, 0xc2, 0xfd, 0xff, 0xff,
+    0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
+    0x1c, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68,
+    0x61, 0x70, 0x65, 0x5f, 0x31, 0x00, 0x00, 0x00, 0x94, 0xfd, 0xff, 0xff,
+    0x30, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x80, 0x3f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x43,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3a, 0xfe, 0xff, 0xff,
+    0x00, 0x00, 0x00, 0x02, 0x10, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x4d, 0x61, 0x74, 0x4d,
+    0x75, 0x6c, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x0c, 0x00, 0x0c, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0xc5, 0x01, 0x2a, 0x3b, 0x96, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+    0x44, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x0a, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x25, 0x00, 0x00, 0x00, 0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x73, 0x5f,
+    0x71, 0x75, 0x61, 0x6e, 0x74, 0x2f, 0x46, 0x61, 0x6b, 0x65, 0x51, 0x75,
+    0x61, 0x6e, 0x74, 0x57, 0x69, 0x74, 0x68, 0x4d, 0x69, 0x6e, 0x4d, 0x61,
+    0x78, 0x56, 0x61, 0x72, 0x73, 0x00, 0x00, 0x00, 0x84, 0xfe, 0xff, 0xff,
+    0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xab, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xf5, 0xf7, 0x84, 0x3a,
+    0x01, 0x00, 0x00, 0x00, 0x6e, 0x88, 0xae, 0x3d, 0x01, 0x00, 0x00, 0x00,
+    0xd4, 0x97, 0x30, 0xbe, 0x26, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x10, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x61, 0x64, 0x64, 0x5f,
+    0x31, 0x00, 0x00, 0x00, 0xec, 0xfe, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00,
+    0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x2f, 0xad, 0x18, 0x40, 0x01, 0x00, 0x00, 0x00,
+    0x02, 0x38, 0xa2, 0x43, 0x01, 0x00, 0x00, 0x00, 0x02, 0xf1, 0x8d, 0xc3,
+    0x8e, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x0e, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x5f, 0x73,
+    0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x00, 0x00, 0x5c, 0xff, 0xff, 0xff,
+    0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3b,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x3f, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00,
+    0x07, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x30, 0x11, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00,
+    0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x73, 0x5f, 0x71, 0x75, 0x61, 0x6e,
+    0x74, 0x5f, 0x31, 0x2f, 0x46, 0x61, 0x6b, 0x65, 0x51, 0x75, 0x61, 0x6e,
+    0x74, 0x57, 0x69, 0x74, 0x68, 0x4d, 0x69, 0x6e, 0x4d, 0x61, 0x78, 0x56,
+    0x61, 0x72, 0x73, 0x2f, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73,
+    0x65, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00, 0x04, 0x00, 0x08, 0x00,
+    0x0c, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00,
+    0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x31, 0x83, 0xce, 0x3a, 0x01, 0x00, 0x00, 0x00,
+    0x4d, 0x97, 0x92, 0x3e, 0x01, 0x00, 0x00, 0x00, 0x84, 0x75, 0xec, 0xbd,
+    0x03, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09,
+    0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00,
+    0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x14, 0x00, 0x1c, 0x00,
+    0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08,
+    0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+    0x28, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x14, 0x00, 0x18, 0x00,
+    0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x10, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x14, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
+    0x10, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00,
+    0x1c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+    0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x10, 0x00,
+    0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x0c, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x03, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0xfa, 0xff, 0xff, 0xff, 0x00, 0x19, 0x06, 0x00,
+    0x06, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x09, 0x06, 0x00,
+    0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04};
+const int g_tiny_conv_model_data_len = 19800;
diff --git a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
new file mode 100644
index 0000000000..2953cc852d
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This is a standard TensorFlow Lite model file that has been converted into a
+// C data array, so it can be easily compiled into a binary for devices that
+// don't have a file system. It was created using the command:
+// xxd -i tiny_conv.tflite > tiny_conv_model_data.cc
+
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
+
+extern const unsigned char g_tiny_conv_model_data[];
+extern const int g_tiny_conv_model_data_len;
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/BUILD b/tensorflow/contrib/lite/experimental/micro/kernels/BUILD
new file mode 100644
index 0000000000..a012f950e6
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/kernels/BUILD
@@ -0,0 +1,107 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
+load(
+    "//tensorflow/contrib/lite/experimental/micro/testing:micro_test.bzl",
+    "tflite_micro_cc_test",
+)
+
+cc_library(
+    name = "micro_ops",
+    srcs = [
+        "depthwise_conv.cc",
+        "fully_connected.cc",
+        "softmax.cc",
+    ],
+    hdrs = [
+    ],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/contrib/lite/c:c_api_internal",
+        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
+        "//tensorflow/contrib/lite/kernels:kernel_util",
+        "//tensorflow/contrib/lite/kernels:op_macros",
+        "//tensorflow/contrib/lite/kernels:padding",
+        "//tensorflow/contrib/lite/kernels/internal:quantization_util",
+        "//tensorflow/contrib/lite/kernels/internal:reference_base",
+        "//tensorflow/contrib/lite/kernels/internal:tensor",
+    ],
+)
+
+cc_library(
+    name = "all_ops_resolver",
+    srcs = [
+        "all_ops_resolver.cc",
+    ],
+    hdrs = [
+        "all_ops_resolver.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":micro_ops",
+        "//tensorflow/contrib/lite/c:c_api_internal",
+        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "test_utils",
+    srcs = [
+    ],
+    hdrs = [
+        "test_utils.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/contrib/lite/c:c_api_internal",
+        "//tensorflow/contrib/lite/core/api",
+        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
+        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "depthwise_conv_test",
+    srcs = [
+        "depthwise_conv_test.cc",
+    ],
+    deps = [
+        ":all_ops_resolver",
+        ":test_utils",
+        "//tensorflow/contrib/lite/c:c_api_internal",
+        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
+        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "fully_connected_test",
+    srcs = [
+        "fully_connected_test.cc",
+    ],
+    deps = [
+        ":all_ops_resolver",
+        ":test_utils",
+        "//tensorflow/contrib/lite/c:c_api_internal",
+        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
+        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "softmax_test",
+    srcs = [
+        "softmax_test.cc",
+    ],
+    deps = [
+        ":all_ops_resolver",
+        ":test_utils",
+        "//tensorflow/contrib/lite/c:c_api_internal",
+        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
+        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
+    ],
+)
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.cc b/tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.cc
new file mode 100644
index 0000000000..bd0a37badb
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+
+TfLiteRegistration* Register_DEPTHWISE_CONV_2D();
+TfLiteRegistration* Micro_Register_DEPTHWISE_CONV_2D() {
+  return Register_DEPTHWISE_CONV_2D();
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED();
+TfLiteRegistration* Micro_Register_FULLY_CONNECTED() {
+  return Register_FULLY_CONNECTED();
+}
+
+TfLiteRegistration* Register_SOFTMAX();
+TfLiteRegistration* Micro_Register_SOFTMAX() { return Register_SOFTMAX(); }
+
+AllOpsResolver::AllOpsResolver() {
+  AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D,
+             Micro_Register_DEPTHWISE_CONV_2D());
+  AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Micro_Register_FULLY_CONNECTED(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_SOFTMAX, Micro_Register_SOFTMAX());
+}
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h b/tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h
new file mode 100644
index 0000000000..f836064a3f
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_KERNELS_ALL_OPS_RESOLVER_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_KERNELS_ALL_OPS_RESOLVER_H_
+
+#include "tensorflow/contrib/lite/experimental/micro/compatibility.h"
+#include "tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+
+class AllOpsResolver : public MicroMutableOpResolver {
+ public:
+  AllOpsResolver();
+
+ private:
+  TF_LITE_REMOVE_VIRTUAL_DELETE
+};
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_KERNELS_ALL_OPS_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv.cc
new file mode 100644
index 0000000000..4f17263181
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv.cc
@@ -0,0 +1,208 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/padding.h"
+
+#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+namespace depthwise_conv {
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kFilterTensor = 1;
+constexpr int kBiasTensor = 2;
+constexpr int kOutputTensor = 0;
+
+struct OpData {
+  TfLitePaddingValues padding;
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+};
+
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteDepthwiseConvParams* params, int width,
+                             int height, int filter_width, int filter_height,
+                             int out_width, int out_height,
+                             const TfLiteType data_type, OpData* data) {
+  data->padding.height = ComputePadding(params->stride_height, 1, height,
+                                        filter_height, out_height);
+  data->padding.width =
+      ComputePadding(params->stride_width, 1, width, filter_width, out_width);
+
+  // Note that quantized inference requires that all tensors have their
+  // parameters set. This is usually done during quantized training.
+  if (data_type != kTfLiteFloat32) {
+    const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+    const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+    const TfLiteTensor* bias =
+        GetOptionalInputTensor(context, node, kBiasTensor);
+    TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, input, filter, bias, output, &real_multiplier));
+    int exponent;
+    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
+    data->output_shift = -exponent;
+    CalculateActivationRangeUint8(params->activation, output,
+                                  &data->output_activation_min,
+                                  &data->output_activation_max);
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteDepthwiseConvParams* params, OpData* data,
+               const TfLiteTensor* input, const TfLiteTensor* filter,
+               const TfLiteTensor* bias, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRange(params->activation, &output_activation_min,
+                           &output_activation_max);
+
+  tflite::DepthwiseParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = 1;
+  op_params.dilation_height_factor = 1;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  tflite::reference_ops::DepthwiseConv(
+      op_params, GetTensorShape(input), GetTensorData<float>(input),
+      GetTensorShape(filter), GetTensorData<float>(filter),
+      GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
+      GetTensorData<float>(output));
+}
+
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteDepthwiseConvParams* params, OpData* data,
+                   const TfLiteTensor* input, const TfLiteTensor* filter,
+                   const TfLiteTensor* bias, TfLiteTensor* output) {
+  const int32_t input_offset = -input->params.zero_point;
+  const int32_t filter_offset = -filter->params.zero_point;
+  const int32_t output_offset = output->params.zero_point;
+
+  tflite::DepthwiseParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = 1;
+  op_params.dilation_height_factor = 1;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = data->output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = -data->output_shift;
+
+  tflite::reference_ops::DepthwiseConv(
+      op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+      GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+      GetTensorShape(bias), GetTensorData<int32_t>(bias),
+      GetTensorShape(output), GetTensorData<uint8_t>(output));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+  const TfLiteTensor* bias =
+      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
+
+  const TfLiteType data_type = input->type;
+  int width = SizeOfDimension(input, 2);
+  int height = SizeOfDimension(input, 1);
+  int filter_width = SizeOfDimension(filter, 2);
+  int filter_height = SizeOfDimension(filter, 1);
+  int out_width = ComputeOutSize(params->padding, width, filter_width,
+                                 params->stride_width);
+  int out_height = ComputeOutSize(params->padding, height, filter_height,
+                                  params->stride_height);
+  OpData local_data_object;
+  OpData* data = &local_data_object;
+  TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
+                                        filter_width, filter_height, out_width,
+                                        out_height, data_type, data));
+
+  // TODO(aselle): Consider whether float conv and quantized conv should be
+  // separate ops to avoid dispatch overhead here.
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      EvalFloat(context, node, params, data, input, filter, bias, output);
+      break;
+    case kTfLiteUInt8:
+      EvalQuantized(context, node, params, data, input, filter, bias, output);
+      break;
+    default:
+      context->ReportError(context, "Type %d not currently supported.",
+                           input->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace depthwise_conv
+
+TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
+  static TfLiteRegistration r = {depthwise_conv::Init, depthwise_conv::Free,
+                                 depthwise_conv::Prepare, depthwise_conv::Eval};
+  return &r;
+}
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test.cc b/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test.cc
new file mode 100644
index 0000000000..169899c471
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test.cc
@@ -0,0 +1,406 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h"
+#include "tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h"
+#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
+
+namespace tflite {
+namespace testing {
+namespace {
+
+void TestDepthwiseConvFloat(std::initializer_list<int> input_dims_data,
+                            std::initializer_list<float> input_data,
+                            std::initializer_list<int> filter_dims_data,
+                            std::initializer_list<float> filter_data,
+                            std::initializer_list<int> bias_dims_data,
+                            std::initializer_list<float> bias_data,
+                            std::initializer_list<float> expected_output_data,
+                            std::initializer_list<int> output_dims_data,
+                            TfLiteFusedActivation activation,
+                            float* output_data) {
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* filter_dims = IntArrayFromInitializer(filter_dims_data);
+  TfLiteIntArray* bias_dims = IntArrayFromInitializer(bias_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+
+  constexpr int inputs_size = 3;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateFloatTensor(input_data, input_dims, "input_tensor"),
+      CreateFloatTensor(filter_data, filter_dims, "filter_tensor"),
+      CreateFloatTensor(bias_data, bias_dims, "bias_tensor"),
+      CreateFloatTensor(output_data, output_dims, "output_tensor"),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  int input_depth = input_dims->data[3];
+  int output_depth = filter_dims->data[3];
+  int depth_mul = output_depth / input_depth;
+  TfLiteDepthwiseConvParams builtin_data = {
+      kTfLitePaddingValid, 1, 1, depth_mul, activation,
+  };
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+  int inputs_array_data[] = {3, 0, 1, 2};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 3};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
+                              1e-5f);
+  }
+}
+
+void TestDepthwiseConvQuantized(
+    std::initializer_list<int> input_dims_data,
+    std::initializer_list<uint8_t> input_data, float input_min, float input_max,
+    std::initializer_list<int> filter_dims_data,
+    std::initializer_list<uint8_t> filter_data, float filter_min,
+    float filter_max, std::initializer_list<int> bias_dims_data,
+    std::initializer_list<int32_t> bias_data, float bias_min, float bias_max,
+    std::initializer_list<uint8_t> expected_output_data,
+    std::initializer_list<int> output_dims_data, float output_min,
+    float output_max, TfLiteFusedActivation activation, uint8_t* output_data) {
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* filter_dims = IntArrayFromInitializer(filter_dims_data);
+  TfLiteIntArray* bias_dims = IntArrayFromInitializer(bias_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+
+  constexpr int inputs_size = 3;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
+                            input_max),
+      CreateQuantizedTensor(filter_data, filter_dims, "filter_tensor",
+                            filter_min, filter_max),
+      CreateQuantized32Tensor(bias_data, bias_dims, "bias_tensor", bias_min,
+                              bias_max),
+      CreateQuantizedTensor(output_data, output_dims, "output_tensor",
+                            output_min, output_max),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  int input_depth = input_dims->data[3];
+  int output_depth = filter_dims->data[3];
+  int depth_mul = output_depth / input_depth;
+  TfLiteDepthwiseConvParams builtin_data = {
+      kTfLitePaddingValid, 1, 1, depth_mul, activation,
+  };
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+
+  int inputs_array_data[] = {3, 0, 1, 2};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 3};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]);
+  }
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(SimpleTest) {
+  const int output_dims_count = 8;
+  float output_data[output_dims_count];
+  tflite::testing::TestDepthwiseConvFloat(  //
+      {4, 1, 3, 2, 2},                      // Input shape.
+      {
+          1, 2, 7, 8,    // Input values.
+          3, 4, 9, 10,   //
+          5, 6, 11, 12,  //
+      },
+      {4, 1, 2, 2, 4},  // Filters shape.
+      {
+          1, 2, 3, 4,        // Filters values.
+          -9, 10, -11, 12,   //
+          5, 6, 7, 8,        //
+          13, -14, 15, -16,  //
+      },
+      {1, 4},  // Bias shape.
+      {
+          1, 2, 3, 4,  // Bias values.
+      },
+      {
+          71, -34, 99, -20,  // Expected results.
+          91, -26, 127, -4,  //
+      },
+      {4, 1, 2, 1, 4},  // Output shape.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestQuantized) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -63.5f;
+  const float input_max = 64.0f;
+  const float filter_min = -63.5f;
+  const float filter_max = 64.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 64.0f * (1 << 24);
+  const float output_min = -127.0f;
+  const float output_max = 128.0f;
+  const int output_dims_count = 8;
+  uint8_t output_data[output_dims_count];
+
+  tflite::testing::TestDepthwiseConvQuantized(  //
+      {4, 1, 3, 2, 2},                          // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),
+          F2Q(2, input_min, input_max),
+          F2Q(7, input_min, input_max),
+          F2Q(8, input_min, input_max),
+          F2Q(3, input_min, input_max),
+          F2Q(4, input_min, input_max),
+          F2Q(9, input_min, input_max),
+          F2Q(10, input_min, input_max),
+          F2Q(5, input_min, input_max),
+          F2Q(6, input_min, input_max),
+          F2Q(11, input_min, input_max),
+          F2Q(12, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {4, 1, 2, 2, 4},       // Filter shape.
+      {
+          // Filter values.
+          F2Q(1, filter_min, filter_max),
+          F2Q(2, filter_min, filter_max),
+          F2Q(3, filter_min, filter_max),
+          F2Q(4, filter_min, filter_max),
+          F2Q(-9, filter_min, filter_max),
+          F2Q(10, filter_min, filter_max),
+          F2Q(-11, filter_min, filter_max),
+          F2Q(12, filter_min, filter_max),
+          F2Q(5, filter_min, filter_max),
+          F2Q(6, filter_min, filter_max),
+          F2Q(7, filter_min, filter_max),
+          F2Q(8, filter_min, filter_max),
+          F2Q(13, filter_min, filter_max),
+          F2Q(-14, filter_min, filter_max),
+          F2Q(15, filter_min, filter_max),
+          F2Q(-16, filter_min, filter_max),
+      },
+      filter_min, filter_max,  // Filter quantization range.
+      {1, 4},                  // Bias shape.
+      {
+          // Bias values.
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+          F2Q32(4, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(71, output_min, output_max),
+          F2Q(-34, output_min, output_max),
+          F2Q(99, output_min, output_max),
+          F2Q(-20, output_min, output_max),
+          F2Q(91, output_min, output_max),
+          F2Q(-26, output_min, output_max),
+          F2Q(127, output_min, output_max),
+          F2Q(-4, output_min, output_max),
+      },
+      {4, 1, 2, 1, 4},         // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestRelu) {
+  const int output_dims_count = 8;
+  float output_data[output_dims_count];
+  tflite::testing::TestDepthwiseConvFloat(  //
+      {4, 1, 3, 2, 2},                      // Input shape.
+      {
+          1, 2, 7, 8,    // Input values.
+          3, 4, 9, 10,   //
+          5, 6, 11, 12,  //
+      },
+      {4, 1, 2, 2, 4},  // Filters shape.
+      {
+          1, 2, 3, 4,        // Filters values.
+          -9, 10, -11, 12,   //
+          5, 6, 7, 8,        //
+          13, -14, 15, -16,  //
+      },
+      {1, 4},  // Bias shape.
+      {
+          1, 2, 3, 4,  // Bias values.
+      },
+      {
+          71, 0, 99, 0,   // Expected results.
+          91, 0, 127, 0,  //
+      },
+      {4, 1, 2, 1, 4},  // Output shape.
+      kTfLiteActRelu, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestReluQuantized) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -63.5f;
+  const float input_max = 64.0f;
+  const float filter_min = -63.5f;
+  const float filter_max = 64.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 64.0f * (1 << 24);
+  const float output_min = -127.0f;
+  const float output_max = 128.0f;
+  const int output_dims_count = 8;
+  uint8_t output_data[output_dims_count];
+
+  tflite::testing::TestDepthwiseConvQuantized(  //
+      {4, 1, 3, 2, 2},                          // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),
+          F2Q(2, input_min, input_max),
+          F2Q(7, input_min, input_max),
+          F2Q(8, input_min, input_max),
+          F2Q(3, input_min, input_max),
+          F2Q(4, input_min, input_max),
+          F2Q(9, input_min, input_max),
+          F2Q(10, input_min, input_max),
+          F2Q(5, input_min, input_max),
+          F2Q(6, input_min, input_max),
+          F2Q(11, input_min, input_max),
+          F2Q(12, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {4, 1, 2, 2, 4},       // Filter shape.
+      {
+          // Filter values.
+          F2Q(1, filter_min, filter_max),
+          F2Q(2, filter_min, filter_max),
+          F2Q(3, filter_min, filter_max),
+          F2Q(4, filter_min, filter_max),
+          F2Q(-9, filter_min, filter_max),
+          F2Q(10, filter_min, filter_max),
+          F2Q(-11, filter_min, filter_max),
+          F2Q(12, filter_min, filter_max),
+          F2Q(5, filter_min, filter_max),
+          F2Q(6, filter_min, filter_max),
+          F2Q(7, filter_min, filter_max),
+          F2Q(8, filter_min, filter_max),
+          F2Q(13, filter_min, filter_max),
+          F2Q(-14, filter_min, filter_max),
+          F2Q(15, filter_min, filter_max),
+          F2Q(-16, filter_min, filter_max),
+      },
+      filter_min, filter_max,  // Filter quantization range.
+      {1, 4},                  // Bias shape.
+      {
+          // Bias values.
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+          F2Q32(4, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(71, output_min, output_max),
+          F2Q(0, output_min, output_max),
+          F2Q(99, output_min, output_max),
+          F2Q(0, output_min, output_max),
+          F2Q(91, output_min, output_max),
+          F2Q(0, output_min, output_max),
+          F2Q(127, output_min, output_max),
+          F2Q(0, output_min, output_max),
+      },
+      {4, 1, 2, 1, 4},         // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActRelu, output_data);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/fully_connected.cc b/tensorflow/contrib/lite/experimental/micro/kernels/fully_connected.cc
new file mode 100644
index 0000000000..1e9e54cafb
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/kernels/fully_connected.cc
@@ -0,0 +1,184 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h"
+#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+namespace fully_connected {
+namespace {
+
+struct OpData {
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  // The index of the temporary tensor where the quantized inputs are cached.
+  int input_quantized_index;
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kWeightsTensor = 1;
+constexpr int kBiasTensor = 2;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus CalculateOpData(TfLiteContext* context,
+                             TfLiteFullyConnectedParams* params,
+                             TfLiteType data_type, const TfLiteTensor* input,
+                             const TfLiteTensor* filter,
+                             const TfLiteTensor* bias, TfLiteTensor* output,
+                             OpData* data) {
+  TfLiteStatus status = kTfLiteOk;
+  if (data_type != kTfLiteFloat32) {
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, input, filter, bias, output, &real_multiplier));
+    int exponent;
+    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
+    data->output_shift = -exponent;
+    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
+        context, params->activation, output, &data->output_activation_min,
+        &data->output_activation_max));
+  }
+  return status;
+}
+
+}  // namespace
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                           TfLiteFullyConnectedParams* params, OpData* data,
+                           const TfLiteTensor* input,
+                           const TfLiteTensor* filter, const TfLiteTensor* bias,
+                           TfLiteTensor* output) {
+  const int32_t input_offset = -input->params.zero_point;
+  const int32_t filter_offset = -filter->params.zero_point;
+  const int32_t output_offset = output->params.zero_point;
+
+  tflite::FullyConnectedParams op_params;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = data->output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = -data->output_shift;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+
+#define TF_LITE_FULLY_CONNECTED(output_data_type)                      \
+  reference_ops::FullyConnected(                                       \
+      op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
+      GetTensorShape(filter), GetTensorData<uint8_t>(filter),          \
+      GetTensorShape(bias), GetTensorData<int32_t>(bias),              \
+      GetTensorShape(output), GetTensorData<output_data_type>(output), \
+      nullptr)
+  switch (output->type) {
+    case kTfLiteUInt8:
+      TF_LITE_FULLY_CONNECTED(uint8_t);
+      break;
+    case kTfLiteInt16:
+      TF_LITE_FULLY_CONNECTED(int16_t);
+      break;
+    default:
+      context->ReportError(
+          context,
+          "Quantized FullyConnected expects output data type uint8 or int16");
+      return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
+                       TfLiteFullyConnectedParams* params, OpData* data,
+                       const TfLiteTensor* input, const TfLiteTensor* filter,
+                       const TfLiteTensor* bias, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRange(params->activation, &output_activation_min,
+                           &output_activation_max);
+  tflite::FullyConnectedParams op_params;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+  tflite::reference_ops::FullyConnected(
+      op_params, GetTensorShape(input), GetTensorData<float>(input),
+      GetTensorShape(filter), GetTensorData<float>(filter),
+      GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
+      GetTensorData<float>(output));
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TfLiteType data_type = input->type;
+  OpData local_data_object;
+  OpData* data = &local_data_object;
+  TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input,
+                                        filter, bias, output, data));
+
+  switch (filter->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      return EvalFloat(context, node, params, data, input, filter, bias,
+                       output);
+    case kTfLiteUInt8:
+      return EvalQuantized(context, node, params, data, input, filter, bias,
+                           output);
+
+    default:
+      context->ReportError(context, "Type %d not currently supported.",
+                           filter->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace fully_connected
+
+TfLiteRegistration* Register_FULLY_CONNECTED() {
+  static TfLiteRegistration r = {fully_connected::Init, fully_connected::Free,
+                                 fully_connected::Prepare,
+                                 fully_connected::Eval};
+  return &r;
+}
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/experimental/micro/kernels/fully_connected_test.cc
new file mode 100644
index 0000000000..b42bf4c3bc
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/kernels/fully_connected_test.cc
@@ -0,0 +1,643 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h"
+#include "tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h"
+#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
+
+namespace tflite {
+namespace testing {
+namespace {
+
+void TestFullyConnectedFloat(std::initializer_list<int> input_dims_data,
+                             std::initializer_list<float> input_data,
+                             std::initializer_list<int> weights_dims_data,
+                             std::initializer_list<float> weights_data,
+                             std::initializer_list<int> bias_dims_data,
+                             std::initializer_list<float> bias_data,
+                             std::initializer_list<float> expected_output_data,
+                             std::initializer_list<int> output_dims_data,
+                             TfLiteFusedActivation activation,
+                             float* output_data) {
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* weights_dims = IntArrayFromInitializer(weights_dims_data);
+  TfLiteIntArray* bias_dims = IntArrayFromInitializer(bias_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+
+  constexpr int inputs_size = 3;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateFloatTensor(input_data, input_dims, "input_tensor"),
+      CreateFloatTensor(weights_data, weights_dims, "weights_tensor"),
+      CreateFloatTensor(bias_data, bias_dims, "bias_tensor"),
+      CreateFloatTensor(output_data, output_dims, "output_tensor"),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  TfLiteFullyConnectedParams builtin_data = {
+      activation,
+      kTfLiteFullyConnectedWeightsFormatDefault,
+  };
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+  int inputs_array_data[] = {3, 0, 1, 2};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 3};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
+                              1e-5f);
+  }
+}
+
+void TestFullyConnectedQuantized(
+    std::initializer_list<int> input_dims_data,
+    std::initializer_list<uint8_t> input_data, float input_min, float input_max,
+    std::initializer_list<int> weights_dims_data,
+    std::initializer_list<uint8_t> weights_data, float weights_min,
+    float weights_max, std::initializer_list<int> bias_dims_data,
+    std::initializer_list<int32_t> bias_data, float bias_min, float bias_max,
+    std::initializer_list<uint8_t> expected_output_data,
+    std::initializer_list<int> output_dims_data, float output_min,
+    float output_max, TfLiteFusedActivation activation, uint8_t* output_data) {
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* weights_dims = IntArrayFromInitializer(weights_dims_data);
+  TfLiteIntArray* bias_dims = IntArrayFromInitializer(bias_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+
+  constexpr int inputs_size = 3;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
+                            input_max),
+      CreateQuantizedTensor(weights_data, weights_dims, "weights_tensor",
+                            weights_min, weights_max),
+      CreateQuantized32Tensor(bias_data, bias_dims, "bias_tensor", bias_min,
+                              bias_max),
+      CreateQuantizedTensor(output_data, output_dims, "output_tensor",
+                            output_min, output_max),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  TfLiteFullyConnectedParams builtin_data = {
+      activation,
+      kTfLiteFullyConnectedWeightsFormatDefault,
+  };
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+
+  int inputs_array_data[] = {3, 0, 1, 2};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 3};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]);
+  }
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(SimpleTest) {
+  const int output_dims_count = 6;
+  float output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedFloat(  //
+      {2, 2, 10},                            // Input shape.
+      {
+          1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+          1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+      },
+      {2, 3, 10},  // Weights shape.
+      {
+          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+      },
+      {1, 3},  // Bias shape.
+      {
+          1, 2, 3,  // Bias values.
+      },
+      {
+          24, 25, 26, 58, 59, 60,  // Expected results.
+      },
+      {2, 2, 3},  // Output shape.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTest2) {
+  const int output_dims_count = 6;
+  float output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedFloat(  //
+      {2, 2, 2},                             // Input shape.
+      {
+          1, 2,  // b = 0
+          2, 1,  // b = 1
+      },
+      {2, 1, 2},  // Weights shape.
+      {
+          2, 4,  // u = 0
+      },
+      {1, 1},  // Bias shape.
+      {
+          1,  // Bias values.
+      },
+      {
+          11, 9,  // Expected results.
+      },
+      {2, 2, 1},  // Output shape.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestRelu) {
+  const int output_dims_count = 6;
+  float output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedFloat(  //
+      {2, 2, 10},                            // Input shape.
+      {
+          1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+          1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+      },
+      {2, 3, 10},  // Weights shape.
+      {
+          1,  2,  3,  4,  5,  6,  7,  8,  9,  10,   // u = 0
+          -1, -2, -3, -4, -5, -6, -7, -8, -9, -10,  // u = 1
+          1,  2,  3,  4,  5,  6,  7,  8,  9,  10,   // u = 2
+      },
+      {1, 3},  // Bias shape.
+      {
+          1, -2, 3,  // Bias values.
+      },
+      {
+          24, 0, 26, 58, 0, 60,  // Expected results.
+      },
+      {2, 2, 3},  // Output shape.
+      kTfLiteActRelu, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestQuantized) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -63.5f;
+  const float input_max = 64.0f;
+  const float weights_min = -63.5f;
+  const float weights_max = 64.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 64.0f * (1 << 24);
+  const float output_min = -127.0f;
+  const float output_max = 128.0f;
+  const int output_dims_count = 6;
+  uint8_t output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedQuantized(  //
+      {2, 2, 10},                                // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
+          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
+          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {2, 3, 10},            // Weights shape.
+      {
+          // Weight values.
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+      },
+      weights_min, weights_max,  // Weights quantization range.
+      {1, 3},                    // Bias shape.
+      {
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(24, output_min, output_max),
+          F2Q(25, output_min, output_max),
+          F2Q(26, output_min, output_max),
+          F2Q(58, output_min, output_max),
+          F2Q(59, output_min, output_max),
+          F2Q(60, output_min, output_max),
+      },
+      {2, 2, 3},               // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestQuantizedRelu) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -63.5f;
+  const float input_max = 64.0f;
+  const float weights_min = -63.5f;
+  const float weights_max = 64.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 64.0f * (1 << 24);
+  const float output_min = -127.0f;
+  const float output_max = 128.0f;
+  const int output_dims_count = 6;
+  uint8_t output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedQuantized(  //
+      {2, 2, 10},                                // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
+          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
+          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {2, 3, 10},            // Weights shape.
+      {
+          // Weight values.
+          F2Q(1, weights_min, weights_max),  F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max),  F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max),  F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max),  F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max),  F2Q(10, weights_min, weights_max),
+          F2Q(-1, weights_min, weights_max), F2Q(-2, weights_min, weights_max),
+          F2Q(-3, weights_min, weights_max), F2Q(-4, weights_min, weights_max),
+          F2Q(-5, weights_min, weights_max), F2Q(-6, weights_min, weights_max),
+          F2Q(-7, weights_min, weights_max), F2Q(-8, weights_min, weights_max),
+          F2Q(-9, weights_min, weights_max), F2Q(-10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max),  F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max),  F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max),  F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max),  F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max),  F2Q(10, weights_min, weights_max),
+      },
+      weights_min, weights_max,  // Weights quantization range.
+      {1, 3},                    // Bias shape.
+      {
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(0, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(24, output_min, output_max),
+          F2Q(0, output_min, output_max),
+          F2Q(26, output_min, output_max),
+          F2Q(58, output_min, output_max),
+          F2Q(0, output_min, output_max),
+          F2Q(60, output_min, output_max),
+      },
+      {2, 2, 3},               // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActRelu, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestQuantizedOutputMultiplierGreaterThan1) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -127.0f;
+  const float input_max = 128.0f;
+  const float weights_min = -127.0f;
+  const float weights_max = 128.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 256.0f * (1 << 24);
+  const float output_min = -63.5f;
+  const float output_max = 64.0f;
+  const int output_dims_count = 6;
+  uint8_t output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedQuantized(  //
+      {2, 2, 10},                                // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
+          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
+          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {2, 3, 10},            // Weights shape.
+      {
+          // Weight values.
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+      },
+      weights_min, weights_max,  // Weights quantization range.
+      {1, 3},                    // Bias shape.
+      {
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(24, output_min, output_max),
+          F2Q(25, output_min, output_max),
+          F2Q(26, output_min, output_max),
+          F2Q(58, output_min, output_max),
+          F2Q(59, output_min, output_max),
+          F2Q(60, output_min, output_max),
+      },
+      {2, 2, 3},               // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTest4DInput) {
+  const int output_dims_count = 6;
+  float output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedFloat(  //
+      {4, 1, 1, 5, 1},                       // Input shape.
+      {
+          1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+          1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+      },
+      {2, 3, 10},  // Weights shape.
+      {
+          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+      },
+      {1, 3},  // Bias shape.
+      {
+          1, 2, 3,  // Bias values.
+      },
+      {
+          24, 25, 26, 58, 59, 60,  // Expected results.
+      },
+      {2, 2, 3},  // Output shape.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTest4DInputQuantized) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -63.5f;
+  const float input_max = 64.0f;
+  const float weights_min = -63.5f;
+  const float weights_max = 64.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 64.0f * (1 << 24);
+  const float output_min = -127.0f;
+  const float output_max = 128.0f;
+  const int output_dims_count = 6;
+  uint8_t output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedQuantized(  //
+      {4, 1, 1, 5, 1},                           // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
+          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
+          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {2, 3, 10},            // Weights shape.
+      {
+          // Weight values.
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+      },
+      weights_min, weights_max,  // Weights quantization range.
+      {1, 3},                    // Bias shape.
+      {
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(24, output_min, output_max),
+          F2Q(25, output_min, output_max),
+          F2Q(26, output_min, output_max),
+          F2Q(58, output_min, output_max),
+          F2Q(59, output_min, output_max),
+          F2Q(60, output_min, output_max),
+      },
+      {2, 2, 3},               // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedOutputMultiplierGreaterThan1) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -127.0f;
+  const float input_max = 128.0f;
+  const float weights_min = -127.0f;
+  const float weights_max = 128.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 256.0f * (1 << 24);
+  const float output_min = -63.5f;
+  const float output_max = 64.0f;
+  const int output_dims_count = 6;
+  uint8_t output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedQuantized(  //
+      {4, 1, 1, 5, 1},                           // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
+          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
+          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {2, 3, 10},            // Weights shape.
+      {
+          // Weight values.
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+      },
+      weights_min, weights_max,  // Weights quantization range.
+      {1, 3},                    // Bias shape.
+      {
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(24, output_min, output_max),
+          F2Q(25, output_min, output_max),
+          F2Q(26, output_min, output_max),
+          F2Q(58, output_min, output_max),
+          F2Q(59, output_min, output_max),
+          F2Q(60, output_min, output_max),
+      },
+      {2, 2, 3},               // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/softmax.cc b/tensorflow/contrib/lite/experimental/micro/kernels/softmax.cc
new file mode 100644
index 0000000000..a4019a067c
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/kernels/softmax.cc
@@ -0,0 +1,213 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/kernels/internal/reference/softmax.h"
+#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+namespace activations {
+namespace {
+
+struct OpData {
+  int32_t input_multiplier = 0;
+  int input_left_shift = 0;
+  int32_t input_range_radius = 0;
+  int diff_min = 0;
+};
+
+TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context,
+                                    const TfLiteTensor* input,
+                                    TfLiteTensor* output,
+                                    const TfLiteSoftmaxParams* params,
+                                    OpData* data) {
+  if (input->type == kTfLiteUInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+    TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
+
+    static const int kScaledDiffIntegerBits = 5;
+
+    tflite::PreprocessSoftmaxScaling(
+        params->beta, input->params.scale, kScaledDiffIntegerBits,
+        &data->input_multiplier, &data->input_left_shift);
+    data->diff_min = -1.0 * tflite::CalculateInputRadius(
+                                kScaledDiffIntegerBits, data->input_left_shift);
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+// Takes a 1D tensor and performs softmax along it.
+void Softmax1DFloat(const TfLiteTensor* input, TfLiteTensor* output,
+                    TfLiteSoftmaxParams* params) {
+  const int input_size = input->dims->data[0];
+  tflite::reference_ops::Softmax(input->data.f, input_size, 1, params->beta,
+                                 output->data.f);
+}
+
+// Takes a 2D tensor and perform softmax along the last dimension.
+void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output,
+                    TfLiteSoftmaxParams* params) {
+  const int batch_size = input->dims->data[0];
+  const int input_size = input->dims->data[1];
+  tflite::reference_ops::Softmax(input->data.f, input_size, batch_size,
+                                 params->beta, output->data.f);
+}
+
+void Softmax1DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
+                        TfLiteSoftmaxParams* params, OpData* data) {
+  // TODO(ahentz): this is arguably a dirty trick. Since the implementation
+  // always traverses the last dimension of a 4D tensor, we will pretend our 1D
+  // tensor is 4D in a special way. We will convert a (Y) shape into a (1,
+  // 1, 1, Y) shape.
+  const int input_size = input->dims->data[0];
+  const int32_t shape_data[4] = {1, 1, 1, input_size};
+  RuntimeShape shape(4, shape_data);
+  SoftmaxParams op_params;
+  op_params.input_multiplier = data->input_multiplier;
+  op_params.input_left_shift = data->input_left_shift;
+  op_params.diff_min = data->diff_min;
+  tflite::reference_ops::Softmax(op_params, shape,
+                                 GetTensorData<uint8_t>(input), shape,
+                                 GetTensorData<uint8_t>(output));
+}
+
+void Softmax2DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
+                        TfLiteSoftmaxParams* params, OpData* data) {
+  // TODO(ahentz): this is arguably a dirty trick. Since the implementation
+  // always traverses the last dimension of a 4D tensor, we will pretend our 2D
+  // tensor is 4D in a special way. We will convert a (X, Y) shape into a (X,
+  // 1, 1, Y) shape.
+  const int batch_size = input->dims->data[0];
+  const int input_size = input->dims->data[1];
+  const int32_t shape_data[4] = {batch_size, 1, 1, input_size};
+  RuntimeShape shape(4, shape_data);
+  SoftmaxParams op_params;
+  op_params.input_multiplier = data->input_multiplier;
+  op_params.input_left_shift = data->input_left_shift;
+  op_params.diff_min = data->diff_min;
+  tflite::reference_ops::Softmax(op_params, shape,
+                                 GetTensorData<uint8_t>(input), shape,
+                                 GetTensorData<uint8_t>(output));
+}
+
+// Takes a 4D tensor and perform softmax along the forth dimension.
+void Softmax4DFloat(const TfLiteTensor* input, TfLiteTensor* output,
+                    TfLiteSoftmaxParams* params) {
+  SoftmaxParams op_params;
+  op_params.beta = params->beta;
+  tflite::reference_ops::Softmax(
+      op_params, GetTensorShape(input), GetTensorData<float>(input),
+      GetTensorShape(output), GetTensorData<float>(output));
+}
+
+void Softmax4DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
+                        TfLiteSoftmaxParams* params, OpData* data) {
+  SoftmaxParams op_params;
+  op_params.input_multiplier = data->input_multiplier;
+  op_params.input_left_shift = data->input_left_shift;
+  op_params.diff_min = data->diff_min;
+  tflite::reference_ops::Softmax(
+      op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+      GetTensorShape(output), GetTensorData<uint8_t>(output));
+}
+
+TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteSoftmaxParams*>(node->builtin_data);
+
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+
+  OpData local_data_object;
+  OpData* data = &local_data_object;
+  TF_LITE_ENSURE_STATUS(
+      CalculateSoftmaxOpData(context, input, output, params, data));
+
+  // TODO(ahentz): consider an implementation that works for many (all?)
+  // dimensions.
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      if (NumDimensions(input) == 1) {
+        Softmax1DFloat(input, output, params);
+        return kTfLiteOk;
+      }
+      if (NumDimensions(input) == 2) {
+        Softmax2DFloat(input, output, params);
+        return kTfLiteOk;
+      }
+      if (NumDimensions(input) == 4) {
+        Softmax4DFloat(input, output, params);
+        return kTfLiteOk;
+      }
+      context->ReportError(
+          context, "Only 1D, 2D and 4D tensors supported currently, got %dD.",
+          NumDimensions(input));
+      return kTfLiteError;
+    }
+    case kTfLiteUInt8: {
+      if (NumDimensions(input) == 1) {
+        Softmax1DQuantized(input, output, params, data);
+        return kTfLiteOk;
+      }
+      if (NumDimensions(input) == 2) {
+        Softmax2DQuantized(input, output, params, data);
+        return kTfLiteOk;
+      }
+      if (NumDimensions(input) == 4) {
+        Softmax4DQuantized(input, output, params, data);
+        return kTfLiteOk;
+      }
+      context->ReportError(
+          context, "Only 2D and 4D tensors supported currently, got %dD.",
+          NumDimensions(input));
+      return kTfLiteError;
+    }
+    default:
+      context->ReportError(
+          context, "Only float32 and uint8_t supported currently, got %d.",
+          input->type);
+      return kTfLiteError;
+  }
+}
+}  // namespace activations
+
+TfLiteRegistration* Register_SOFTMAX() {
+  static TfLiteRegistration r = {activations::Init, activations::Free,
+                                 activations::SoftmaxPrepare,
+                                 activations::SoftmaxEval};
+  return &r;
+}
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/softmax_test.cc b/tensorflow/contrib/lite/experimental/micro/kernels/softmax_test.cc
new file mode 100644
index 0000000000..df7d87d623
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/kernels/softmax_test.cc
@@ -0,0 +1,220 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h"
+#include "tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h"
+#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
+
+namespace tflite {
+namespace testing {
+namespace {
+
+void TestSoftmaxFloat(std::initializer_list<int> input_dims_data,
+                      std::initializer_list<float> input_data,
+                      std::initializer_list<float> expected_output_data,
+                      std::initializer_list<int> output_dims_data,
+                      float* output_data) {
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+
+  constexpr int inputs_size = 2;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateFloatTensor(input_data, input_dims, "input_tensor"),
+      CreateFloatTensor(output_data, output_dims, "output_tensor"),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_SOFTMAX, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  TfLiteSoftmaxParams builtin_data = {1.0f};
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+  int inputs_array_data[] = {1, 0};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 1};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
+                              1e-5f);
+  }
+}
+
+void TestSoftmaxQuantized(std::initializer_list<int> input_dims_data,
+                          std::initializer_list<uint8_t> input_data,
+                          float input_min, float input_max,
+                          std::initializer_list<uint8_t> expected_output_data,
+                          std::initializer_list<int> output_dims_data,
+                          float output_min, float output_max,
+                          uint8_t* output_data) {
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+
+  constexpr int inputs_size = 1;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
+                            input_max),
+      CreateQuantizedTensor(output_data, output_dims, "output_tensor",
+                            output_min, output_max),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_SOFTMAX, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  TfLiteSoftmaxParams builtin_data = {1.0f};
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+
+  int inputs_array_data[] = {1, 0};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 1};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]);
+  }
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(SimpleTest) {
+  const int output_dims_count = 6;
+  float output_data[output_dims_count];
+  tflite::testing::TestSoftmaxFloat(  //
+      {2, 2, 5},                      // Input shape.
+      {
+          1.0, 2.0, 3.0, 4.0, 5.0,       // b = 0
+          -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 0
+      },
+      {
+          // Expected results.
+          0.011656231,
+          0.031684921,
+          0.086128544,
+          0.234121657,
+          0.636408647,
+          0.636408647,
+          0.234121657,
+          0.086128544,
+          0.031684921,
+          0.011656231,
+      },
+      {2, 2, 3},  // Output shape.
+      output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestQuantized) {
+  using tflite::testing::F2Q;
+
+  const float input_min = -63.5f;
+  const float input_max = 64.0f;
+  const float output_min = 0.0f;
+  const float output_max = (255.0f / 256.0f);
+  const int output_dims_count = 6;
+  uint8_t output_data[output_dims_count];
+  tflite::testing::TestSoftmaxQuantized(  //
+      {2, 1, 5},                          // Input shape.
+      {
+          F2Q(1.0, input_min, input_max),
+          F2Q(2.0, input_min, input_max),
+          F2Q(3.0, input_min, input_max),
+          F2Q(4.0, input_min, input_max),
+          F2Q(5.0, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantized range.
+      {
+          // Expected results.
+          F2Q(0.011656231, output_min, output_max),
+          F2Q(0.031684921, output_min, output_max),
+          F2Q(0.086128544, output_min, output_max),
+          F2Q(0.234121657, output_min, output_max),
+          F2Q(0.636408647, output_min, output_max),
+      },
+      {2, 1, 3},               // Output shape.
+      output_min, output_max,  // Output quantized range.
+      output_data);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h b/tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h
new file mode 100644
index 0000000000..789a48ece8
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h
@@ -0,0 +1,170 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
+
+#include <cstdarg>
+#include <initializer_list>
+#include <limits>
+
+#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#include "tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h"
+#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
+
+namespace tflite {
+namespace testing {
+
+// How many elements are in the array with this shape.
+inline int ElementCount(const TfLiteIntArray& dims) {
+  int result = 1;
+  for (int i = 0; i < dims.size; ++i) {
+    result *= dims.data[i];
+  }
+  return result;
+}
+
+// Wrapper to forward kernel errors to the interpreter's error reporter.
+inline void ReportOpError(struct TfLiteContext* context, const char* format,
+                          ...) {
+  ErrorReporter* error_reporter = static_cast<ErrorReporter*>(context->impl_);
+  va_list args;
+  va_start(args, format);
+  error_reporter->Report(format, args);
+  va_end(args);
+}
+
+// Derives the quantization scaling factor from a min and max range.
+template <typename T>
+inline float ScaleFromMinMax(const float min, const float max) {
+  return (max - min) / ((std::numeric_limits<T>::max() * 1.0) -
+                        std::numeric_limits<T>::min());
+}
+
+// Derives the quantization zero point from a min and max range.
+template <typename T>
+inline int ZeroPointFromMinMax(const float min, const float max) {
+  return static_cast<int>((-min / ScaleFromMinMax<T>(min, max)) + 0.5f);
+}
+
+// Converts a float value into an unsigned eight-bit quantized value.
+inline uint8_t F2Q(const float value, const float min, const float max) {
+  int32_t result = ZeroPointFromMinMax<uint8_t>(min, max) +
+                   (value / ScaleFromMinMax<uint8_t>(min, max)) + 0.5f;
+  if (result < 0) {
+    result = 0;
+  }
+  if (result > 256) {
+    result = 256;
+  }
+  return result;
+}
+
+// Converts a float value into a signed thirty-two-bit quantized value.
+inline uint8_t F2Q32(const float value, const float min, const float max) {
+  return static_cast<int32_t>((value - ZeroPointFromMinMax<int32_t>(min, max)) /
+                              ScaleFromMinMax<int32_t>(min, max));
+}
+
+inline void PopulateContext(TfLiteTensor* tensors, int tensors_size,
+                            TfLiteContext* context) {
+  context->tensors_size = tensors_size;
+  context->tensors = tensors;
+  context->impl_ = static_cast<void*>(micro_test::reporter);
+  context->GetExecutionPlan = nullptr;
+  context->ResizeTensor = nullptr;
+  context->ReportError = ReportOpError;
+  context->AddTensors = nullptr;
+  context->GetNodeAndRegistration = nullptr;
+  context->ReplaceSubgraphsWithDelegateKernels = nullptr;
+  context->recommended_num_threads = 1;
+  context->GetExternalContext = nullptr;
+  context->SetExternalContext = nullptr;
+}
+
+inline TfLiteIntArray* IntArrayFromInts(const int* int_array) {
+  return const_cast<TfLiteIntArray*>(
+      reinterpret_cast<const TfLiteIntArray*>(int_array));
+}
+
+inline TfLiteIntArray* IntArrayFromInitializer(
+    std::initializer_list<int> int_initializer) {
+  return IntArrayFromInts(int_initializer.begin());
+}
+
+inline TfLiteTensor CreateFloatTensor(const float* data, TfLiteIntArray* dims,
+                                      const char* name) {
+  const size_t bytes = ElementCount(*dims) * sizeof(float);
+  return {
+      kTfLiteFloat32, {const_cast<int*>(reinterpret_cast<const int*>(data))},
+      dims,           {},
+      kTfLiteMemNone, bytes,
+      nullptr,        name};
+}
+
+inline TfLiteTensor CreateFloatTensor(std::initializer_list<float> data,
+                                      TfLiteIntArray* dims, const char* name) {
+  return CreateFloatTensor(data.begin(), dims, name);
+}
+
+inline TfLiteTensor CreateQuantizedTensor(const uint8_t* data,
+                                          TfLiteIntArray* dims,
+                                          const char* name, float min,
+                                          float max) {
+  const size_t bytes = ElementCount(*dims) * sizeof(uint8_t);
+  const TfLiteQuantizationParams q_params = {
+      ScaleFromMinMax<uint8_t>(min, max),
+      ZeroPointFromMinMax<uint8_t>(min, max)};
+  return {
+      kTfLiteUInt8,   {const_cast<int*>(reinterpret_cast<const int*>(data))},
+      dims,           q_params,
+      kTfLiteMemNone, bytes,
+      nullptr,        name};
+}
+
+inline TfLiteTensor CreateQuantizedTensor(std::initializer_list<uint8_t> data,
+                                          TfLiteIntArray* dims,
+                                          const char* name, float min,
+                                          float max) {
+  return CreateQuantizedTensor(data.begin(), dims, name, min, max);
+}
+
+inline TfLiteTensor CreateQuantized32Tensor(const int32_t* data,
+                                            TfLiteIntArray* dims,
+                                            const char* name, float min,
+                                            float max) {
+  const size_t bytes = ElementCount(*dims) * sizeof(int32_t);
+  const TfLiteQuantizationParams q_params = {
+      ScaleFromMinMax<int32_t>(min, max),
+      ZeroPointFromMinMax<int32_t>(min, max)};
+  return {
+      kTfLiteUInt8,   {const_cast<int*>(reinterpret_cast<const int*>(data))},
+      dims,           q_params,
+      kTfLiteMemNone, bytes,
+      nullptr,        name};
+}
+
+inline TfLiteTensor CreateQuantized32Tensor(std::initializer_list<int32_t> data,
+                                            TfLiteIntArray* dims,
+                                            const char* name, float min,
+                                            float max) {
+  return CreateQuantized32Tensor(data.begin(), dims, name, min, max);
+}
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_error_reporter.cc b/tensorflow/contrib/lite/experimental/micro/micro_error_reporter.cc
new file mode 100644
index 0000000000..99dd883661
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/micro_error_reporter.cc
@@ -0,0 +1,78 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h"
+
+#ifdef TF_LITE_MCU_DEBUG_LOG
+#include <debug_log.h>
+#else  // TF_LITE_MCU_DEBUG_LOG
+#include <cstdint>
+#include <cstdio>
+void DebugLog(const char* s) { fprintf(stderr, "%s", s); }
+void DebugLogInt32(int32_t i) { fprintf(stderr, "%d", i); }
+void DebugLogUInt32(uint32_t i) { fprintf(stderr, "%d", i); }
+void DebugLogHex(uint32_t i) { fprintf(stderr, "0x%8x", i); }
+void DebugLogFloat(float i) { fprintf(stderr, "%f", i); }
+#endif  // TF_LITE_MCU_DEBUG_LOG
+
+namespace tflite {
+namespace {
+void DebugLogPrintf(const char* format, va_list args) {
+  const int output_cache_size = 64;
+  char output_cache[output_cache_size + 1];
+  int output_cache_index = 0;
+  const char* current = format;
+  while (*current != 0) {
+    if (*current == '%') {
+      const char next = *(current + 1);
+      if ((next == 'd') || (next == 's')) {
+        current += 1;
+        if (output_cache_index > 0) {
+          output_cache[output_cache_index] = 0;
+          DebugLog(output_cache);
+          output_cache_index = 0;
+        }
+        if (next == 'd') {
+          DebugLogInt32(va_arg(args, int));
+        } else if (next == 's') {
+          DebugLog(va_arg(args, char*));
+        }
+      }
+    } else {
+      output_cache[output_cache_index] = *current;
+      output_cache_index += 1;
+    }
+    if (output_cache_index >= output_cache_size) {
+      output_cache[output_cache_index] = 0;
+      DebugLog(output_cache);
+      output_cache_index = 0;
+    }
+    current += 1;
+  }
+  if (output_cache_index > 0) {
+    output_cache[output_cache_index] = 0;
+    DebugLog(output_cache);
+    output_cache_index = 0;
+  }
+  DebugLog("\n");
+}
+}  // namespace
+
+int MicroErrorReporter::Report(const char* format, va_list args) {
+  DebugLogPrintf(format, args);
+  return 0;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h b/tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h
new file mode 100644
index 0000000000..33e54f7990
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_ERROR_REPORTER_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_ERROR_REPORTER_H_
+
+#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#include "tensorflow/contrib/lite/experimental/micro/compatibility.h"
+
+namespace tflite {
+
+class MicroErrorReporter : public ErrorReporter {
+ public:
+  ~MicroErrorReporter() {}
+  int Report(const char* format, va_list args) override;
+
+ private:
+  TF_LITE_REMOVE_VIRTUAL_DELETE
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_ERROR_REPORTER_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_error_reporter_test.cc b/tensorflow/contrib/lite/experimental/micro/micro_error_reporter_test.cc
new file mode 100644
index 0000000000..ef3c32050c
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/micro_error_reporter_test.cc
@@ -0,0 +1,25 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h"
+
+int main(int argc, char** argv) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+  error_reporter->Report("Number: %d", 42);
+  error_reporter->Report("Badly-formed format string %");
+  error_reporter->Report("Another % badly-formed %% format string");
+  error_reporter->Report("~~~%s~~~", "ALL TESTS PASSED");
+}
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_interpreter.cc b/tensorflow/contrib/lite/experimental/micro/micro_interpreter.cc
new file mode 100644
index 0000000000..0f38991bb0
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/micro_interpreter.cc
@@ -0,0 +1,310 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/experimental/micro/micro_interpreter.h"
+
+#include "tensorflow/contrib/lite/core/api/flatbuffer_conversions.h"
+#include "tensorflow/contrib/lite/experimental/micro/compatibility.h"
+
+namespace tflite {
+namespace {
+const int kStackDataAllocatorSize = 128;
+class StackDataAllocator : public BuiltinDataAllocator {
+ public:
+  void* Allocate(size_t size) override {
+    if (size > kStackDataAllocatorSize) {
+      return nullptr;
+    } else {
+      return data_;
+    }
+  }
+  void Deallocate(void* data) override {
+    // Do nothing.
+  }
+
+ private:
+  uint8_t data_[kStackDataAllocatorSize];
+
+  TF_LITE_REMOVE_VIRTUAL_DELETE
+};
+
+const char* OpNameFromRegistration(const TfLiteRegistration* registration) {
+  if (registration->builtin_code == BuiltinOperator_CUSTOM) {
+    return registration->custom_name;
+  } else {
+    return EnumNameBuiltinOperator(BuiltinOperator(registration->builtin_code));
+  }
+}
+
+void ReportOpError(struct TfLiteContext* context, const char* format, ...) {
+  MicroInterpreter* interpreter =
+      static_cast<MicroInterpreter*>(context->impl_);
+  va_list args;
+  va_start(args, format);
+  interpreter->error_reporter()->Report(format, args);
+  va_end(args);
+}
+
+}  // namespace
+
+MicroInterpreter::MicroInterpreter(const Model* model,
+                                   const OpResolver& op_resolver,
+                                   SimpleTensorAllocator* tensor_allocator,
+                                   ErrorReporter* error_reporter)
+    : model_(model),
+      op_resolver_(op_resolver),
+      tensor_allocator_(tensor_allocator),
+      error_reporter_(error_reporter),
+      initialization_status_(kTfLiteOk) {
+  const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers =
+      model->buffers();
+  auto* subgraphs = model->subgraphs();
+  if (subgraphs->size() != 1) {
+    error_reporter->Report("Only 1 subgraph is currently supported.\n");
+    initialization_status_ = kTfLiteError;
+    return;
+  }
+  subgraph_ = (*subgraphs)[0];
+  tensors_ = subgraph_->tensors();
+  operators_ = subgraph_->operators();
+
+  context_.tensors_size = tensors_->Length();
+  context_.tensors =
+      reinterpret_cast<TfLiteTensor*>(tensor_allocator_->AllocateMemory(
+          sizeof(TfLiteTensor) * context_.tensors_size));
+  for (int i = 0; i < subgraph_->inputs()->Length(); ++i) {
+    const int tensor_index = subgraph_->inputs()->Get(i);
+    const auto* tensor = tensors_->Get(tensor_index);
+    initialization_status_ = tensor_allocator_->AllocateTensor(
+        *tensor, 0, operators_->Length(), buffers, error_reporter,
+        &context_.tensors[tensor_index]);
+    if (initialization_status_ != kTfLiteOk) {
+      return;
+    }
+  }
+
+  int* first_created = reinterpret_cast<int*>(
+      tensor_allocator_->AllocateMemory(sizeof(int) * tensors_->Length()));
+  int* last_used = reinterpret_cast<int*>(
+      tensor_allocator_->AllocateMemory(sizeof(int) * tensors_->Length()));
+  for (int i = 0; i < tensors_->Length(); ++i) {
+    first_created[i] = -1;
+    last_used[i] = -1;
+  }
+
+  for (int i = (operators_->Length() - 1); i >= 0; --i) {
+    const auto* op = operators_->Get(i);
+    for (int n = 0; n < op->inputs()->Length(); ++n) {
+      const int tensor_index = op->inputs()->Get(n);
+      if ((last_used[tensor_index] == -1) || (last_used[tensor_index] < i)) {
+        last_used[tensor_index] = i;
+      }
+    }
+    for (int n = 0; n < op->outputs()->Length(); ++n) {
+      const int tensor_index = op->outputs()->Get(n);
+      const int create_before = i;
+      int destroy_after = last_used[tensor_index];
+      if (destroy_after == -1) {
+        destroy_after = operators_->Length();
+      }
+      const auto* tensor = tensors_->Get(tensor_index);
+      if (!tensor->is_variable()) {
+        initialization_status_ = tensor_allocator_->AllocateTensor(
+            *tensor, create_before, destroy_after, buffers, error_reporter,
+            &context_.tensors[tensor_index]);
+        if (initialization_status_ != kTfLiteOk) {
+          return;
+        }
+        first_created[tensor_index] = i;
+      }
+    }
+  }
+
+  for (int i = 0; i < tensors_->Length(); ++i) {
+    const auto* tensor = tensors_->Get(i);
+    const bool is_read_only = (first_created[i] == -1) && (last_used[i] != -1);
+    if (tensor->is_variable() || is_read_only) {
+      initialization_status_ = tensor_allocator_->AllocateTensor(
+          *tensor, 0, operators_->Length(), buffers, error_reporter,
+          &context_.tensors[i]);
+      if (initialization_status_ != kTfLiteOk) {
+        return;
+      }
+    }
+  }
+  context_.impl_ = static_cast<void*>(this);
+  context_.GetExecutionPlan = nullptr;
+  context_.ResizeTensor = nullptr;
+  context_.ReportError = ReportOpError;
+  context_.AddTensors = nullptr;
+  context_.GetNodeAndRegistration = nullptr;
+  context_.ReplaceSubgraphsWithDelegateKernels = nullptr;
+  context_.recommended_num_threads = 1;
+  context_.GetExternalContext = nullptr;
+  context_.SetExternalContext = nullptr;
+}
+
+TfLiteStatus MicroInterpreter::Invoke() {
+  if (initialization_status_ != kTfLiteOk) {
+    error_reporter_->Report("Invoke() called after initialization failed\n");
+    return kTfLiteError;
+  }
+  TfLiteStatus status = kTfLiteOk;
+  auto opcodes = model_->operator_codes();
+  for (int i = 0; i < operators_->Length(); ++i) {
+    const auto* op = operators_->Get(i);
+    int index = op->opcode_index();
+    if (index < 0 || index >= opcodes->size()) {
+      error_reporter_->Report("Missing registration for opcode_index %d\n",
+                              index);
+      return kTfLiteError;
+    }
+    auto opcode = (*opcodes)[index];
+    const TfLiteRegistration* registration = nullptr;
+    status = GetRegistrationFromOpCode(opcode, op_resolver_, error_reporter_,
+                                       &registration);
+    if (status != kTfLiteOk) {
+      return status;
+    }
+    if (registration == nullptr) {
+      error_reporter_->Report("Skipping op for opcode_index %d\n", index);
+      return kTfLiteError;
+    }
+    BuiltinOperator op_type =
+        static_cast<BuiltinOperator>(registration->builtin_code);
+
+    if (op_type != BuiltinOperator_CUSTOM && op->custom_options()) {
+      error_reporter_->Report(
+          "Found builtin operator %s with custom options.\n",
+          EnumNameBuiltinOperator(op_type));
+    }
+    StackDataAllocator stack_data_allocator;
+    const char* custom_data = nullptr;
+    size_t custom_data_size = 0;
+    unsigned char* builtin_data = nullptr;
+    if (op->custom_options()) {
+      custom_data = reinterpret_cast<const char*>(op->custom_options()->data());
+      custom_data_size = op->custom_options()->size();
+    } else {
+      TF_LITE_ENSURE_STATUS(ParseOpData(op, op_type, error_reporter_,
+                                        &stack_data_allocator,
+                                        (void**)(&builtin_data)));
+    }
+
+    const char* init_data;
+    size_t init_data_size;
+    if (registration->builtin_code == BuiltinOperator_CUSTOM) {
+      init_data = custom_data;
+      init_data_size = custom_data_size;
+    } else {
+      init_data = reinterpret_cast<const char*>(builtin_data);
+      init_data_size = 0;
+    }
+    void* user_data = nullptr;
+    if (registration->init) {
+      user_data = registration->init(&context_, init_data, init_data_size);
+    }
+
+    const int kMaxInputs = 16;
+    int inputs_data[kMaxInputs + 1];
+    TfLiteIntArray* inputs_array =
+        reinterpret_cast<TfLiteIntArray*>(inputs_data);
+    if (op->inputs()->Length() >= kMaxInputs) {
+      error_reporter_->Report("Too many inputs (%d)\n", op->inputs()->Length());
+      return kTfLiteError;
+    }
+    inputs_array->size = op->inputs()->Length();
+    for (int n = 0; n < op->inputs()->Length(); ++n) {
+      inputs_array->data[n] = op->inputs()->Get(n);
+    }
+
+    const int kMaxOutputs = 16;
+    int outputs_data[kMaxOutputs + 1];
+    TfLiteIntArray* outputs_array =
+        reinterpret_cast<TfLiteIntArray*>(outputs_data);
+    if (op->outputs()->Length() >= kMaxOutputs) {
+      error_reporter_->Report("Too many outputs (%d)\n",
+                              op->outputs()->Length());
+      return kTfLiteError;
+    }
+    outputs_array->size = op->outputs()->Length();
+    for (int n = 0; n < op->outputs()->Length(); ++n) {
+      outputs_array->data[n] = op->outputs()->Get(n);
+    }
+
+    const int kMaxTemporaries = 16;
+    int temporaries_data[kMaxTemporaries + 1];
+    TfLiteIntArray* temporaries_array =
+        reinterpret_cast<TfLiteIntArray*>(temporaries_data);
+    temporaries_array->size = 0;
+
+    TfLiteNode node;
+    node.inputs = inputs_array;
+    node.outputs = outputs_array;
+    node.temporaries = temporaries_array;
+    node.user_data = user_data;
+    node.builtin_data = reinterpret_cast<void*>(builtin_data);
+    node.custom_initial_data = custom_data;
+    node.custom_initial_data_size = custom_data_size;
+    node.delegate = nullptr;
+    if (registration->prepare) {
+      TfLiteStatus prepare_status = registration->prepare(&context_, &node);
+      if (prepare_status != kTfLiteOk) {
+        error_reporter_->Report(
+            "Node %s (number %d) failed to prepare with status %d",
+            OpNameFromRegistration(registration), i, prepare_status);
+        return kTfLiteError;
+      }
+    }
+
+    if (registration->invoke) {
+      TfLiteStatus invoke_status = registration->invoke(&context_, &node);
+      if (invoke_status != kTfLiteOk) {
+        error_reporter_->Report(
+            "Node %s (number %d) failed to invoke with status %d",
+            OpNameFromRegistration(registration), i, invoke_status);
+        return kTfLiteError;
+      }
+    }
+
+    if (registration->free) {
+      registration->free(&context_, user_data);
+    }
+  }
+  return status;
+}
+
+TfLiteTensor* MicroInterpreter::input(int index) {
+  const flatbuffers::Vector<int32_t>* inputs = subgraph_->inputs();
+  const size_t length = inputs->Length();
+  if ((index < 0) || (index >= length)) {
+    error_reporter_->Report("Input index %d out of range (length is %d)", index,
+                            length);
+    return nullptr;
+  }
+  return &(context_.tensors[inputs->Get(index)]);
+}
+
+TfLiteTensor* MicroInterpreter::output(int index) {
+  const flatbuffers::Vector<int32_t>* outputs = subgraph_->outputs();
+  const size_t length = outputs->Length();
+  if ((index < 0) || (index >= outputs->Length())) {
+    error_reporter_->Report("Output index %d out of range (length is %d)",
+                            index, length);
+    return nullptr;
+  }
+  return &(context_.tensors[outputs->Get(index)]);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_interpreter.h b/tensorflow/contrib/lite/experimental/micro/micro_interpreter.h
new file mode 100644
index 0000000000..a88514cde8
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/micro_interpreter.h
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_INTERPRETER_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_INTERPRETER_H_
+
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#include "tensorflow/contrib/lite/core/api/op_resolver.h"
+#include "tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+
+namespace tflite {
+
+class MicroInterpreter {
+ public:
+  // The lifetime of the model, op resolver, allocator, and error reporter must
+  // be at least as long as that of the interpreter object, since the
+  // interpreter may need to access them at any time. This means that you should
+  // usually create them with the same scope as each other, for example having
+  // them all allocated on the stack as local variables through a top-level
+  // function.
+  // The interpreter doesn't do any deallocation of any of the pointed-to
+  // objects, ownership remains with the caller.
+  MicroInterpreter(const Model* model, const OpResolver& op_resolver,
+                   SimpleTensorAllocator* tensor_allocator,
+                   ErrorReporter* error_reporter);
+
+  TfLiteStatus Invoke();
+
+  size_t tensors_size() const { return context_.tensors_size; }
+  TfLiteTensor* tensor(int tensor_index);
+
+  TfLiteTensor* input(int index);
+  size_t inputs_size() const { return subgraph_->inputs()->Length(); }
+
+  TfLiteTensor* output(int index);
+  size_t outputs_size() const { return subgraph_->outputs()->Length(); }
+
+  TfLiteStatus initialization_status() const { return initialization_status_; }
+
+  ErrorReporter* error_reporter() { return error_reporter_; }
+
+ private:
+  const Model* model_;
+  const OpResolver& op_resolver_;
+  SimpleTensorAllocator* tensor_allocator_;
+  ErrorReporter* error_reporter_;
+
+  TfLiteStatus initialization_status_;
+  const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors_;
+  const flatbuffers::Vector<flatbuffers::Offset<Operator>>* operators_;
+  TfLiteContext context_;
+
+  const SubGraph* subgraph_;
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_INTERPRETER_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_interpreter_test.cc b/tensorflow/contrib/lite/experimental/micro/micro_interpreter_test.cc
new file mode 100644
index 0000000000..251e5f7203
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/micro_interpreter_test.cc
@@ -0,0 +1,197 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/experimental/micro/micro_interpreter.h"
+
+#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
+
+namespace tflite {
+namespace {
+void* MockInit(TfLiteContext* context, const char* buffer, size_t length) {
+  // Do nothing.
+  return nullptr;
+}
+
+void MockFree(TfLiteContext* context, void* buffer) {
+  // Do nothing.
+}
+
+TfLiteStatus MockPrepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus MockInvoke(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
+  const int32_t* input_data = input->data.i32;
+  const TfLiteTensor* weight = &context->tensors[node->inputs->data[1]];
+  const uint8_t* weight_data = weight->data.uint8;
+  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+  int32_t* output_data = output->data.i32;
+  output_data[0] = input_data[0] + weight_data[0];
+  return kTfLiteOk;
+}
+
+class MockOpResolver : public OpResolver {
+ public:
+  const TfLiteRegistration* FindOp(BuiltinOperator op,
+                                   int version) const override {
+    return nullptr;
+  }
+  const TfLiteRegistration* FindOp(const char* op, int version) const override {
+    if (strcmp(op, "mock_custom") == 0) {
+      static TfLiteRegistration r = {MockInit, MockFree, MockPrepare,
+                                     MockInvoke};
+      return &r;
+    } else {
+      return nullptr;
+    }
+  }
+};
+
+class StackAllocator : public flatbuffers::Allocator {
+ public:
+  StackAllocator() : data_(data_backing_), data_size_(0) {}
+
+  uint8_t* allocate(size_t size) override {
+    if ((data_size_ + size) > kStackAllocatorSize) {
+      // TODO(petewarden): Add error reporting beyond returning null!
+      return nullptr;
+    }
+    uint8_t* result = data_;
+    data_ += size;
+    data_size_ += size;
+    return result;
+  }
+
+  void deallocate(uint8_t* p, size_t) override {}
+
+  static StackAllocator& instance() {
+    // Avoid using true dynamic memory allocation to be portable to bare metal.
+    static char inst_memory[sizeof(StackAllocator)];
+    static StackAllocator* inst = new (inst_memory) StackAllocator;
+    return *inst;
+  }
+
+  static constexpr int kStackAllocatorSize = 4096;
+
+ private:
+  uint8_t data_backing_[kStackAllocatorSize];
+  uint8_t* data_;
+  int data_size_;
+};
+
+const Model* BuildMockModel() {
+  using flatbuffers::Offset;
+  flatbuffers::FlatBufferBuilder builder(StackAllocator::kStackAllocatorSize,
+                                         &StackAllocator::instance());
+  constexpr size_t buffer_data_size = 1;
+  const uint8_t buffer_data[buffer_data_size] = {21};
+  constexpr size_t buffers_size = 2;
+  const Offset<Buffer> buffers[buffers_size] = {
+      CreateBuffer(builder),
+      CreateBuffer(builder,
+                   builder.CreateVector(buffer_data, buffer_data_size))};
+  constexpr size_t tensor_shape_size = 1;
+  const int32_t tensor_shape[tensor_shape_size] = {1};
+  constexpr size_t tensors_size = 3;
+  const Offset<Tensor> tensors[tensors_size] = {
+      CreateTensor(builder,
+                   builder.CreateVector(tensor_shape, tensor_shape_size),
+                   TensorType_INT32, 0,
+                   builder.CreateString("test_input_tensor"), 0, false),
+      CreateTensor(builder,
+                   builder.CreateVector(tensor_shape, tensor_shape_size),
+                   TensorType_UINT8, 1,
+                   builder.CreateString("test_weight_tensor"), 0, false),
+      CreateTensor(builder,
+                   builder.CreateVector(tensor_shape, tensor_shape_size),
+                   TensorType_INT32, 0,
+                   builder.CreateString("test_output_tensor"), 0, false),
+  };
+  constexpr size_t inputs_size = 1;
+  const int32_t inputs[inputs_size] = {0};
+  constexpr size_t outputs_size = 1;
+  const int32_t outputs[outputs_size] = {2};
+  constexpr size_t operator_inputs_size = 2;
+  const int32_t operator_inputs[operator_inputs_size] = {0, 1};
+  constexpr size_t operator_outputs_size = 1;
+  const int32_t operator_outputs[operator_outputs_size] = {2};
+  constexpr size_t operators_size = 1;
+  const Offset<Operator> operators[operators_size] = {CreateOperator(
+      builder, 0, builder.CreateVector(operator_inputs, operator_inputs_size),
+      builder.CreateVector(operator_outputs, operator_outputs_size),
+      BuiltinOptions_NONE)};
+  constexpr size_t subgraphs_size = 1;
+  const Offset<SubGraph> subgraphs[subgraphs_size] = {
+      CreateSubGraph(builder, builder.CreateVector(tensors, tensors_size),
+                     builder.CreateVector(inputs, inputs_size),
+                     builder.CreateVector(outputs, outputs_size),
+                     builder.CreateVector(operators, operators_size),
+                     builder.CreateString("test_subgraph"))};
+  constexpr size_t operator_codes_size = 1;
+  const Offset<OperatorCode> operator_codes[operator_codes_size] = {
+      CreateOperatorCodeDirect(builder, BuiltinOperator_CUSTOM, "mock_custom",
+                               0)};
+  const Offset<Model> model_offset = CreateModel(
+      builder, 0, builder.CreateVector(operator_codes, operator_codes_size),
+      builder.CreateVector(subgraphs, subgraphs_size),
+      builder.CreateString("test_model"),
+      builder.CreateVector(buffers, buffers_size));
+  FinishModelBuffer(builder, model_offset);
+  void* model_pointer = builder.GetBufferPointer();
+  const Model* model = flatbuffers::GetRoot<Model>(model_pointer);
+  return model;
+}
+
+}  // namespace
+}  // namespace tflite
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestInterpreter) {
+  const tflite::Model* model = tflite::BuildMockModel();
+  TF_LITE_MICRO_EXPECT_NE(nullptr, model);
+  tflite::MockOpResolver mock_resolver;
+  constexpr size_t allocator_buffer_size = 1024;
+  uint8_t allocator_buffer[allocator_buffer_size];
+  tflite::SimpleTensorAllocator simple_tensor_allocator(allocator_buffer,
+                                                        allocator_buffer_size);
+  tflite::MicroInterpreter interpreter(
+      model, mock_resolver, &simple_tensor_allocator, micro_test::reporter);
+  TF_LITE_MICRO_EXPECT_EQ(1, interpreter.inputs_size());
+  TF_LITE_MICRO_EXPECT_EQ(1, interpreter.outputs_size());
+
+  TfLiteTensor* input = interpreter.input(0);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, input);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, input->type);
+  TF_LITE_MICRO_EXPECT_EQ(1, input->dims->size);
+  TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]);
+  TF_LITE_MICRO_EXPECT_EQ(4, input->bytes);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, input->data.i32);
+  input->data.i32[0] = 21;
+
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, interpreter.Invoke());
+
+  TfLiteTensor* output = interpreter.output(0);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, output);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, output->type);
+  TF_LITE_MICRO_EXPECT_EQ(1, output->dims->size);
+  TF_LITE_MICRO_EXPECT_EQ(1, output->dims->data[0]);
+  TF_LITE_MICRO_EXPECT_EQ(4, output->bytes);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, output->data.i32);
+  TF_LITE_MICRO_EXPECT_EQ(42, output->data.i32[0]);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.cc b/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.cc
new file mode 100644
index 0000000000..40c21c6448
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.cc
@@ -0,0 +1,80 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h"
+
+namespace tflite {
+
+const TfLiteRegistration* MicroMutableOpResolver::FindOp(
+    tflite::BuiltinOperator op, int version) const {
+  for (int i = 0; i < registrations_len_; ++i) {
+    const TfLiteRegistration& registration = registrations_[i];
+    if ((registration.builtin_code == op) &&
+        (registration.version == version)) {
+      return &registration;
+    }
+  }
+  return nullptr;
+}
+
+const TfLiteRegistration* MicroMutableOpResolver::FindOp(const char* op,
+                                                         int version) const {
+  for (int i = 0; i < registrations_len_; ++i) {
+    const TfLiteRegistration& registration = registrations_[i];
+    if ((registration.builtin_code == -1) &&
+        (strcmp(registration.custom_name, op) == 0) &&
+        (registration.version == version)) {
+      return &registration;
+    }
+  }
+  return nullptr;
+}
+
+void MicroMutableOpResolver::AddBuiltin(tflite::BuiltinOperator op,
+                                        TfLiteRegistration* registration,
+                                        int min_version, int max_version) {
+  for (int version = min_version; version <= max_version; ++version) {
+    if (registrations_len_ >= TFLITE_REGISTRATIONS_MAX) {
+      // TODO(petewarden) - Add error reporting hooks so we can report this!
+      return;
+    }
+    TfLiteRegistration* new_registration = &registrations_[registrations_len_];
+    registrations_len_ += 1;
+
+    *new_registration = *registration;
+    new_registration->builtin_code = op;
+    new_registration->version = version;
+  }
+}
+
+void MicroMutableOpResolver::AddCustom(const char* name,
+                                       TfLiteRegistration* registration,
+                                       int min_version, int max_version) {
+  for (int version = min_version; version <= max_version; ++version) {
+    if (registrations_len_ >= TFLITE_REGISTRATIONS_MAX) {
+      // TODO(petewarden) - Add error reporting hooks so we can report this!
+      return;
+    }
+    TfLiteRegistration* new_registration = &registrations_[registrations_len_];
+    registrations_len_ += 1;
+
+    *new_registration = *registration;
+    new_registration->builtin_code = -1;
+    new_registration->custom_name = name;
+    new_registration->version = version;
+  }
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h b/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h
new file mode 100644
index 0000000000..f3750a2484
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_
+
+#include "tensorflow/contrib/lite/core/api/op_resolver.h"
+#include "tensorflow/contrib/lite/experimental/micro/compatibility.h"
+
+#ifndef TFLITE_REGISTRATIONS_MAX
+#define TFLITE_REGISTRATIONS_MAX (128)
+#endif
+
+namespace tflite {
+
+class MicroMutableOpResolver : public OpResolver {
+ public:
+  const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
+                                   int version) const override;
+  const TfLiteRegistration* FindOp(const char* op, int version) const override;
+  void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration,
+                  int min_version = 1, int max_version = 1);
+  void AddCustom(const char* name, TfLiteRegistration* registration,
+                 int min_version = 1, int max_version = 1);
+
+ private:
+  TfLiteRegistration registrations_[TFLITE_REGISTRATIONS_MAX];
+  int registrations_len_ = 0;
+
+  TF_LITE_REMOVE_VIRTUAL_DELETE
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver_test.cc b/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver_test.cc
new file mode 100644
index 0000000000..5420a33e87
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver_test.cc
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h"
+
+#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
+
+namespace tflite {
+namespace {
+void* MockInit(TfLiteContext* context, const char* buffer, size_t length) {
+  // Do nothing.
+  return nullptr;
+}
+
+void MockFree(TfLiteContext* context, void* buffer) {
+  // Do nothing.
+}
+
+TfLiteStatus MockPrepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus MockInvoke(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+}  // namespace
+}  // namespace tflite
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestOperations) {
+  using tflite::BuiltinOperator_CONV_2D;
+  using tflite::BuiltinOperator_RELU;
+  using tflite::MicroMutableOpResolver;
+  using tflite::OpResolver;
+
+  static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree,
+                                 tflite::MockPrepare, tflite::MockInvoke};
+
+  MicroMutableOpResolver micro_mutable_op_resolver;
+  micro_mutable_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r, 0, 2);
+  micro_mutable_op_resolver.AddCustom("mock_custom", &r, 0, 3);
+  OpResolver* resolver = &micro_mutable_op_resolver;
+
+  const TfLiteRegistration* registration =
+      resolver->FindOp(BuiltinOperator_CONV_2D, 0);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+  TF_LITE_MICRO_EXPECT_EQ(nullptr, registration->init(nullptr, nullptr, 0));
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(nullptr, nullptr));
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr));
+
+  registration = resolver->FindOp(BuiltinOperator_CONV_2D, 10);
+  TF_LITE_MICRO_EXPECT_EQ(nullptr, registration);
+
+  registration = resolver->FindOp(BuiltinOperator_RELU, 0);
+  TF_LITE_MICRO_EXPECT_EQ(nullptr, registration);
+
+  registration = resolver->FindOp("mock_custom", 0);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+  TF_LITE_MICRO_EXPECT_EQ(nullptr, registration->init(nullptr, nullptr, 0));
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(nullptr, nullptr));
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr));
+
+  registration = resolver->FindOp("mock_custom", 10);
+  TF_LITE_MICRO_EXPECT_EQ(nullptr, registration);
+
+  registration = resolver->FindOp("nonexistent_custom", 0);
+  TF_LITE_MICRO_EXPECT_EQ(nullptr, registration);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.cc b/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.cc
new file mode 100644
index 0000000000..8c090a20a5
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.cc
@@ -0,0 +1,149 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h"
+
+#include "tensorflow/contrib/lite/core/api/flatbuffer_conversions.h"
+
+namespace tflite {
+namespace {
+
+TfLiteStatus TfLiteTypeSizeOf(TfLiteType type, size_t* size,
+                              ErrorReporter* reporter) {
+  switch (type) {
+    case kTfLiteFloat32:
+      *size = sizeof(float);
+      break;
+    case kTfLiteInt16:
+      *size = sizeof(int16_t);
+      break;
+    case kTfLiteInt32:
+      *size = sizeof(int32_t);
+      break;
+    case kTfLiteUInt8:
+      *size = sizeof(uint8_t);
+      break;
+    case kTfLiteInt64:
+      *size = sizeof(int64_t);
+      break;
+    case kTfLiteBool:
+      *size = sizeof(bool);
+      break;
+    case kTfLiteComplex64:
+      *size = sizeof(float) * 2;
+      break;
+    default:
+      reporter->Report(
+          "Only float32, int16, int32, int64, uint8, bool, complex64 "
+          "supported currently.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus BytesRequired(const tflite::Tensor& flatbuffer_tensor,
+                           size_t dims_size, size_t* bytes,
+                           ErrorReporter* error_reporter) {
+  TfLiteType tf_lite_type;
+  TF_LITE_ENSURE_STATUS(ConvertTensorType(flatbuffer_tensor.type(),
+                                          &tf_lite_type, error_reporter));
+  size_t type_size;
+  TF_LITE_ENSURE_STATUS(
+      TfLiteTypeSizeOf(tf_lite_type, &type_size, error_reporter));
+  *bytes = dims_size * type_size;
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TfLiteStatus SimpleTensorAllocator::AllocateTensor(
+    const tflite::Tensor& flatbuffer_tensor, int create_before,
+    int destroy_after,
+    const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
+    ErrorReporter* error_reporter, TfLiteTensor* result) {
+  TF_LITE_ENSURE_STATUS(ConvertTensorType(flatbuffer_tensor.type(),
+                                          &result->type, error_reporter));
+  result->is_variable = flatbuffer_tensor.is_variable();
+
+  result->data.raw = nullptr;
+  result->bytes = 0;
+  if (auto* buffer = (*buffers)[flatbuffer_tensor.buffer()]) {
+    if (auto* array = buffer->data()) {
+      if (size_t array_size = array->size()) {
+        result->data.raw =
+            const_cast<char*>(reinterpret_cast<const char*>(array->data()));
+        TF_LITE_ENSURE_STATUS(BytesRequired(flatbuffer_tensor, array_size,
+                                            &result->bytes, error_reporter));
+      }
+    }
+  }
+  if (result->data.raw) {
+    result->allocation_type = kTfLiteMmapRo;
+  } else {
+    int data_size = 1;
+    for (int n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) {
+      data_size *= flatbuffer_tensor.shape()->Get(n);
+    }
+    TF_LITE_ENSURE_STATUS(BytesRequired(flatbuffer_tensor, data_size,
+                                        &result->bytes, error_reporter));
+    result->data.raw = reinterpret_cast<char*>(AllocateMemory(result->bytes));
+    if (result->data.raw == nullptr) {
+      const char* tensor_name = flatbuffer_tensor.name()->c_str();
+      if (tensor_name == nullptr) {
+        tensor_name = "<None>";
+      }
+      error_reporter->Report(
+          "Couldn't allocate memory for tensor '%s', wanted %d bytes but only "
+          "%d were available",
+          tensor_name, result->bytes, (data_size_max_ - data_size_));
+      return kTfLiteError;
+    }
+    result->allocation_type = kTfLiteArenaRw;
+  }
+  result->dims = reinterpret_cast<TfLiteIntArray*>(
+      AllocateMemory(sizeof(int) * (flatbuffer_tensor.shape()->Length() + 1)));
+  result->dims->size = flatbuffer_tensor.shape()->Length();
+  for (int n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) {
+    result->dims->data[n] = flatbuffer_tensor.shape()->Get(n);
+  }
+  if (flatbuffer_tensor.quantization()) {
+    result->params.scale = flatbuffer_tensor.quantization()->scale()->Get(0);
+    result->params.zero_point =
+        flatbuffer_tensor.quantization()->zero_point()->Get(0);
+  }
+  result->allocation = nullptr;
+  if (flatbuffer_tensor.name()) {
+    result->name = flatbuffer_tensor.name()->c_str();
+  } else {
+    result->name = "<No name>";
+  }
+  result->delegate = nullptr;
+  result->buffer_handle = 0;
+  result->data_is_stale = false;
+  return kTfLiteOk;
+}
+
+uint8_t* SimpleTensorAllocator::AllocateMemory(size_t size) {
+  if ((data_size_ + size) > data_size_max_) {
+    // TODO(petewarden): Add error reporting beyond returning null!
+    return nullptr;
+  }
+  uint8_t* result = data_;
+  data_ += size;
+  data_size_ += size;
+  return result;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h b/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h
new file mode 100644
index 0000000000..4f16a9d0e5
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_SIMPLE_TENSOR_ALLOCATOR_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_SIMPLE_TENSOR_ALLOCATOR_H_
+
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+
+namespace tflite {
+
+// TODO(petewarden): This allocator never frees up or reuses  any memory, even
+// though we have enough information about lifetimes of the tensors to do so.
+// This makes it pretty wasteful, so we should use a more intelligent method.
+class SimpleTensorAllocator {
+ public:
+  SimpleTensorAllocator(uint8_t* buffer, int buffer_size)
+      : data_size_(0), data_size_max_(buffer_size), data_(buffer) {}
+
+  TfLiteStatus AllocateTensor(
+      const tflite::Tensor& flatbuffer_tensor, int create_before,
+      int destroy_after,
+      const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
+      ErrorReporter* error_reporter, TfLiteTensor* result);
+
+  uint8_t* AllocateMemory(size_t size);
+
+  int GetDataSize() const { return data_size_; }
+
+ private:
+  int data_size_;
+  int data_size_max_;
+  uint8_t* data_;
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_SIMPLE_TENSOR_ALLOCATOR_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator_test.cc b/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator_test.cc
new file mode 100644
index 0000000000..c835427243
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator_test.cc
@@ -0,0 +1,144 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/experimental/micro/micro_interpreter.h"
+
+#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
+
+namespace tflite {
+namespace {
+class StackAllocator : public flatbuffers::Allocator {
+ public:
+  StackAllocator() : data_(data_backing_), data_size_(0) {}
+
+  uint8_t* allocate(size_t size) override {
+    if ((data_size_ + size) > kStackAllocatorSize) {
+      // TODO(petewarden): Add error reporting beyond returning null!
+      return nullptr;
+    }
+    uint8_t* result = data_;
+    data_ += size;
+    data_size_ += size;
+    return result;
+  }
+
+  void deallocate(uint8_t* p, size_t) override {}
+
+  static StackAllocator& instance() {
+    // Avoid using true dynamic memory allocation to be portable to bare metal.
+    static char inst_memory[sizeof(StackAllocator)];
+    static StackAllocator* inst = new (inst_memory) StackAllocator;
+    return *inst;
+  }
+
+  static constexpr int kStackAllocatorSize = 4096;
+
+ private:
+  uint8_t data_backing_[kStackAllocatorSize];
+  uint8_t* data_;
+  int data_size_;
+};
+
+flatbuffers::FlatBufferBuilder* BuilderInstance() {
+  static char inst_memory[sizeof(flatbuffers::FlatBufferBuilder)];
+  static flatbuffers::FlatBufferBuilder* inst =
+      new (inst_memory) flatbuffers::FlatBufferBuilder(
+          StackAllocator::kStackAllocatorSize, &StackAllocator::instance());
+  return inst;
+}
+
+const Tensor* Create1dTensor(int size) {
+  using flatbuffers::Offset;
+  flatbuffers::FlatBufferBuilder* builder = BuilderInstance();
+  constexpr size_t tensor_shape_size = 1;
+  const int32_t tensor_shape[tensor_shape_size] = {size};
+  const Offset<Tensor> tensor_offset = CreateTensor(
+      *builder, builder->CreateVector(tensor_shape, tensor_shape_size),
+      TensorType_INT32, 0, builder->CreateString("test_tensor"), 0, false);
+  builder->Finish(tensor_offset);
+  void* tensor_pointer = builder->GetBufferPointer();
+  const Tensor* tensor = flatbuffers::GetRoot<Tensor>(tensor_pointer);
+  return tensor;
+}
+
+const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* CreateBuffers() {
+  using flatbuffers::Offset;
+  flatbuffers::FlatBufferBuilder* builder = BuilderInstance();
+  constexpr size_t buffers_size = 1;
+  const Offset<Buffer> buffers[buffers_size] = {
+      CreateBuffer(*builder),
+  };
+  const flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Buffer>>>
+      buffers_offset = builder->CreateVector(buffers, buffers_size);
+  builder->Finish(buffers_offset);
+  void* buffers_pointer = builder->GetBufferPointer();
+  const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* result =
+      flatbuffers::GetRoot<flatbuffers::Vector<flatbuffers::Offset<Buffer>>>(
+          buffers_pointer);
+  return result;
+}
+
+}  // namespace
+}  // namespace tflite
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestAllocateTensor) {
+  constexpr size_t arena_size = 1024;
+  uint8_t arena[arena_size];
+  tflite::SimpleTensorAllocator allocator(arena, arena_size);
+
+  const tflite::Tensor* tensor = tflite::Create1dTensor(100);
+  const flatbuffers::Vector<flatbuffers::Offset<tflite::Buffer>>* buffers =
+      tflite::CreateBuffers();
+
+  TfLiteTensor allocated_tensor;
+  TF_LITE_MICRO_EXPECT_EQ(
+      kTfLiteOk,
+      allocator.AllocateTensor(*tensor, 0, 1, buffers, micro_test::reporter,
+                               &allocated_tensor));
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, allocated_tensor.type);
+  TF_LITE_MICRO_EXPECT_EQ(1, allocated_tensor.dims->size);
+  TF_LITE_MICRO_EXPECT_EQ(100, allocated_tensor.dims->data[0]);
+  TF_LITE_MICRO_EXPECT_EQ(400, allocated_tensor.bytes);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, allocated_tensor.data.i32);
+}
+
+TF_LITE_MICRO_TEST(TestTooLarge) {
+  constexpr size_t arena_size = 1024;
+  uint8_t arena[arena_size];
+  tflite::SimpleTensorAllocator allocator(arena, arena_size);
+
+  const tflite::Tensor* tensor = tflite::Create1dTensor(10000);
+  const flatbuffers::Vector<flatbuffers::Offset<tflite::Buffer>>* buffers =
+      tflite::CreateBuffers();
+
+  TfLiteTensor allocated_tensor;
+  TF_LITE_MICRO_EXPECT_NE(
+      kTfLiteOk,
+      allocator.AllocateTensor(*tensor, 0, 1, buffers, micro_test::reporter,
+                               &allocated_tensor));
+}
+
+TF_LITE_MICRO_TEST(TestJustFits) {
+  constexpr size_t arena_size = 1024;
+  uint8_t arena[arena_size];
+  tflite::SimpleTensorAllocator allocator(arena, arena_size);
+
+  uint8_t* result = allocator.AllocateMemory(arena_size);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, result);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/BUILD b/tensorflow/contrib/lite/experimental/micro/testing/BUILD
new file mode 100644
index 0000000000..0d23be5712
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/testing/BUILD
@@ -0,0 +1,17 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["test_linux_binary.sh"])
+
+cc_library(
+    name = "micro_test",
+    hdrs = [
+        "micro_test.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
+    ],
+)
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/Dockerfile.bluepill b/tensorflow/contrib/lite/experimental/micro/testing/Dockerfile.bluepill
new file mode 100644
index 0000000000..7d6d81af0f
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/testing/Dockerfile.bluepill
@@ -0,0 +1,21 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# This docker configuration file lets you emulate a Blue Pill board
+# on an x86 desktop or laptop, which can be useful for debugging and
+# automated testing.
+FROM antmicro/renode:latest
+
+LABEL maintainer="Pete Warden <petewarden@google.com>"
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/bluepill.resc b/tensorflow/contrib/lite/experimental/micro/testing/bluepill.resc
new file mode 100644
index 0000000000..9333dc42bf
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/testing/bluepill.resc
@@ -0,0 +1,36 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+using sysbus
+
+mach create
+machine LoadPlatformDescription @platforms/cpus/stm32f103.repl
+
+# These lines are needed to show the results of DebugLog calls in the output.
+machine LoadPlatformDescriptionFromString "uartSemihosting: UART.SemihostingUart @ cpu"
+showAnalyzer cpu.uartSemihosting Antmicro.Renode.Analyzers.LoggingUartAnalyzer
+
+logFile @/tmp/renode_bluepill_log.txt
+
+macro reset
+"""
+    sysbus LoadELF $bin
+"""
+
+runMacro $reset
+
+emulation RunFor @1
+
+quit
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/micro_test.bzl b/tensorflow/contrib/lite/experimental/micro/testing/micro_test.bzl
new file mode 100644
index 0000000000..91e349cb24
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/testing/micro_test.bzl
@@ -0,0 +1,64 @@
+"""Rules for simple testing without dependencies by parsing output logs."""
+
+def tflite_micro_cc_test(
+        name,
+        expected_in_logs = "~~~ALL TESTS PASSED~~~",
+        srcs = [],
+        includes = [],
+        defines = [],
+        copts = [],
+        nocopts = "",
+        linkopts = [],
+        deps = [],
+        visibility = None):
+    """Tests a C/C++ binary without testing framework  dependencies`.
+
+    Runs a C++ binary, and tests that the output logs contain the
+    expected value. This is a deliberately spartan way of testing, to match
+    what's available when testing microcontroller binaries.
+
+    Args:
+      name: a unique name for this rule.
+      expected_in_logs: A regular expression that is required to be
+                        present in the binary's logs for the test to pass.
+      srcs: sources to compile (C, C++, ld scripts).
+      includes: include paths to add to this rule and its dependents.
+      defines: list of `VAR` or `VAR=VAL` to pass to CPP for this rule and
+               its dependents.
+      copts: gcc compilation flags for this rule only.
+      nocopts: list of gcc compilation flags to remove for this rule
+               only. No regexp like for `cc_library`.
+      linkopts: `gcc` flags to add to the linking phase. For "pure" ld flags,
+                prefix them with the `-Wl,` prefix here.
+      deps: dependencies. only `tflite_bare_metal_cc_library()` dependencies
+            allowed.
+      visibility: visibility.
+    """
+    native.cc_binary(
+        name = name + "_binary",
+        srcs = srcs,
+        includes = includes,
+        defines = defines,
+        copts = copts,
+        nocopts = nocopts,
+        linkopts = linkopts,
+        deps = deps,
+        visibility = visibility,
+    )
+    native.sh_test(
+        name = name,
+        size = "medium",
+        srcs = [
+            "//tensorflow/contrib/lite/experimental/micro/testing:test_linux_binary.sh",
+        ],
+        args = [
+            native.package_name() + "/" + name + "_binary",
+            "'" + expected_in_logs + "'",
+        ],
+        data = [
+            name + "_binary",
+            # Internal test dependency placeholder
+        ],
+        deps = [
+        ],
+    )
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/micro_test.h b/tensorflow/contrib/lite/experimental/micro/testing/micro_test.h
new file mode 100644
index 0000000000..104509c9dc
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/testing/micro_test.h
@@ -0,0 +1,138 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// An ultra-lightweight testing framework designed for use with microcontroller
+// applications. Its only dependency is on TensorFlow Lite's ErrorReporter
+// interface, where log messages are output. This is designed to be usable even
+// when no standard C or C++ libraries are available, and without any dynamic
+// memory allocation or reliance on global constructors.
+//
+// To build a test, you use syntax similar to gunit, but with some extra
+// decoration to create a hidden 'main' function containing each of the tests to
+// be run. Your code should look something like:
+// ----------------------------------------------------------------------------
+// #include "path/to/this/header"
+//
+// TF_LITE_MICRO_TESTS_BEGIN
+//
+// TF_LITE_MICRO_TEST(SomeTest) {
+//   TF_LITE_LOG_EXPECT_EQ(true, true);
+// }
+//
+// TF_LITE_MICRO_TESTS_END
+// ----------------------------------------------------------------------------
+// If you compile this for your platform, you'll get a normal binary that you
+// should be able to run. Executing it will output logging information like this
+// to stderr (or whatever equivalent is available and written to by
+// ErrorReporter):
+// ----------------------------------------------------------------------------
+// Testing SomeTest
+// 1/1 tests passed
+// ~~~ALL TESTS PASSED~~~
+// ----------------------------------------------------------------------------
+// This is designed to be human-readable, so you can just run tests manually,
+// but the string "~~~ALL TESTS PASSED~~~" should only appear if all of the
+// tests do pass. This makes it possible to integrate with automated test
+// systems by scanning the output logs and looking for that magic value.
+//
+// This framework is intended to be a rudimentary alternative to no testing at
+// all on systems that struggle to run more conventional approaches, so use with
+// caution!
+
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_TESTING_MICRO_TEST_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_TESTING_MICRO_TEST_H_
+
+#include "tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h"
+
+namespace micro_test {
+extern int tests_passed;
+extern int tests_failed;
+extern bool is_test_complete;
+extern bool did_test_fail;
+extern tflite::ErrorReporter* reporter;
+}  // namespace micro_test
+
+#define TF_LITE_MICRO_TESTS_BEGIN              \
+  namespace micro_test {                       \
+  int tests_passed;                            \
+  int tests_failed;                            \
+  bool is_test_complete;                       \
+  bool did_test_fail;                          \
+  tflite::ErrorReporter* reporter;             \
+  }                                            \
+                                               \
+  int main(int argc, char** argv) {            \
+    micro_test::tests_passed = 0;              \
+    micro_test::tests_failed = 0;              \
+    tflite::MicroErrorReporter error_reporter; \
+    micro_test::reporter = &error_reporter;
+
+#define TF_LITE_MICRO_TESTS_END                                \
+  micro_test::reporter->Report(                                \
+      "%d/%d tests passed", micro_test::tests_passed,          \
+      (micro_test::tests_failed + micro_test::tests_passed));  \
+  if (micro_test::tests_failed == 0) {                         \
+    micro_test::reporter->Report("~~~ALL TESTS PASSED~~~\n");  \
+  } else {                                                     \
+    micro_test::reporter->Report("~~~SOME TESTS FAILED~~~\n"); \
+  }                                                            \
+  }
+
+// TODO(petewarden): I'm going to hell for what I'm doing to this poor for loop.
+#define TF_LITE_MICRO_TEST(name)                                           \
+  micro_test::reporter->Report("Testing %s", #name);                       \
+  for (micro_test::is_test_complete = false,                               \
+      micro_test::did_test_fail = false;                                   \
+       !micro_test::is_test_complete; micro_test::is_test_complete = true, \
+      micro_test::tests_passed += (micro_test::did_test_fail) ? 0 : 1,     \
+      micro_test::tests_failed += (micro_test::did_test_fail) ? 1 : 0)
+
+#define TF_LITE_MICRO_EXPECT(x)                                                \
+  do {                                                                         \
+    if (!(x)) {                                                                \
+      micro_test::reporter->Report(#x " failed at %s:%d", __FILE__, __LINE__); \
+      micro_test::did_test_fail = true;                                        \
+    }                                                                          \
+  } while (false)
+
+#define TF_LITE_MICRO_EXPECT_EQ(x, y)                                         \
+  do {                                                                        \
+    if ((x) != (y)) {                                                         \
+      micro_test::reporter->Report(#x " == " #y " failed at %s:%d", __FILE__, \
+                                   __LINE__);                                 \
+      micro_test::did_test_fail = true;                                       \
+    }                                                                         \
+  } while (false)
+
+#define TF_LITE_MICRO_EXPECT_NE(x, y)                                         \
+  do {                                                                        \
+    if ((x) == (y)) {                                                         \
+      micro_test::reporter->Report(#x " != " #y " failed at %s:%d", __FILE__, \
+                                   __LINE__);                                 \
+      micro_test::did_test_fail = true;                                       \
+    }                                                                         \
+  } while (false)
+
+#define TF_LITE_MICRO_EXPECT_NEAR(x, y, epsilon)                      \
+  do {                                                                \
+    auto delta = ((x) > (y)) ? ((x) - (y)) : ((y) - (x));             \
+    if (delta > epsilon) {                                            \
+      micro_test::reporter->Report(#x " near " #y " failed at %s:%d", \
+                                   __FILE__, __LINE__);               \
+      micro_test::did_test_fail = true;                               \
+    }                                                                 \
+  } while (false)
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_TESTING_MICRO_TEST_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/test_bluepill_binary.sh b/tensorflow/contrib/lite/experimental/micro/testing/test_bluepill_binary.sh
new file mode 100755
index 0000000000..07742a8262
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/testing/test_bluepill_binary.sh
@@ -0,0 +1,54 @@
+#!/bin/bash -e
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Tests a 'bluepill' STM32F103 ELF by parsing the log output of Renode emulation.
+#
+# First argument is the ELF location.
+# Second argument is a regular expression that's required to be in the output logs
+# for the test to pass.
+
+declare -r ROOT_DIR=`pwd`
+declare -r TEST_TMPDIR=/tmp/test_bluepill_binary/
+declare -r MICRO_LOG_PATH=${TEST_TMPDIR}
+declare -r MICRO_LOG_FILENAME=${MICRO_LOG_PATH}/logs.txt
+mkdir -p ${MICRO_LOG_PATH}
+
+docker build -t renode_bluepill \
+  -f ${ROOT_DIR}/tensorflow/contrib/lite/experimental/micro/testing/Dockerfile.bluepill \
+  ${ROOT_DIR}/tensorflow/contrib/lite/experimental/micro/testing/
+
+docker run \
+  --log-driver=none -a stdout -a stderr \
+  -v ${ROOT_DIR}:/workspace \
+  -v /tmp:/tmp \
+  -it renode_bluepill \
+  /bin/bash -c "renode -P 5000 --disable-xwt -e '
+\$bin?=@/workspace/$1
+s @/workspace/tensorflow/contrib/lite/experimental/micro/testing/bluepill.resc
+' 2>&1 >${MICRO_LOG_FILENAME}"
+
+echo "LOGS:"
+cat ${MICRO_LOG_FILENAME}
+
+if grep -q "$2" ${MICRO_LOG_FILENAME}
+then
+  echo "$1: PASS"
+  exit 0
+else
+  echo "$1: FAIL - '$2' not found in logs."
+  exit 1
+fi
+
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/test_linux_binary.sh b/tensorflow/contrib/lite/experimental/micro/testing/test_linux_binary.sh
new file mode 100755
index 0000000000..24131a6d2d
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/testing/test_linux_binary.sh
@@ -0,0 +1,39 @@
+#!/bin/bash -e
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Tests a Linux binary by parsing the log output.
+#
+# First argument is the binary location.
+# Second argument is a regular expression that's required to be in the output logs
+# for the test to pass.
+
+declare -r ROOT_DIR=`pwd`
+declare -r TEST_TMPDIR=/tmp/test_bluepill_binary/
+declare -r MICRO_LOG_PATH=${TEST_TMPDIR}/$1
+declare -r MICRO_LOG_FILENAME=${MICRO_LOG_PATH}/logs.txt
+mkdir -p ${MICRO_LOG_PATH}
+
+$1 2>&1 | tee ${MICRO_LOG_FILENAME}
+
+if grep -q "$2" ${MICRO_LOG_FILENAME}
+then
+  echo "$1: PASS"
+  exit 0
+else
+  echo "$1: FAIL - '$2' not found in logs."
+  exit 1
+fi
+
diff --git a/tensorflow/contrib/lite/experimental/micro/tools/make/Makefile b/tensorflow/contrib/lite/experimental/micro/tools/make/Makefile
new file mode 100644
index 0000000000..880bb4763c
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/tools/make/Makefile
@@ -0,0 +1,166 @@
+MAKEFILE_DIR := tensorflow/contrib/lite/experimental/micro/tools/make
+
+# Try to figure out the host system
+HOST_OS :=
+ifeq ($(OS),Windows_NT)
+	HOST_OS = windows
+else
+	UNAME_S := $(shell uname -s)
+	ifeq ($(UNAME_S),Linux)
+		HOST_OS := linux
+	endif
+	ifeq ($(UNAME_S),Darwin)
+		HOST_OS := osx
+	endif
+endif
+
+HOST_ARCH := $(shell if [[ $(shell uname -m) =~ i[345678]86 ]]; then echo x86_32; else echo $(shell uname -m); fi)
+
+# Override these on the make command line to target a specific architecture. For example:
+# make -f tensorflow/contrib/lite/Makefile TARGET=rpi TARGET_ARCH=armv7l
+TARGET := $(HOST_OS)
+TARGET_ARCH := $(HOST_ARCH)
+
+INCLUDES := \
+-I. \
+-I$(MAKEFILE_DIR)/../../../../../ \
+-I$(MAKEFILE_DIR)/../../../../../../ \
+-I$(MAKEFILE_DIR)/downloads/ \
+-I$(MAKEFILE_DIR)/downloads/gemmlowp \
+-I$(MAKEFILE_DIR)/downloads/flatbuffers/include \
+-I$(OBJDIR)
+# This is at the end so any globally-installed frameworks like protobuf don't
+# override local versions in the source tree.
+INCLUDES += -I/usr/local/include
+
+TEST_SCRIPT := tensorflow/contrib/lite/experimental/micro/testing/test_linux_binary.sh
+
+MICROLITE_LIBS := -lm
+
+# There are no rules for compiling objects for the host system (since we don't
+# generate things like the protobuf compiler that require that), so all of
+# these settings are for the target compiler.
+CXXFLAGS := -O3 -DNDEBUG
+CXXFLAGS += --std=c++11 -g -DTF_LITE_STATIC_MEMORY
+CCFLAGS := -DNDEBUG -g -DTF_LITE_STATIC_MEMORY
+LDOPTS := -L/usr/local/lib
+ARFLAGS := -r
+TARGET_TOOLCHAIN_PREFIX :=
+CC_PREFIX :=
+
+# This library is the main target for this makefile. It will contain a minimal
+# runtime that can be linked in to other programs.
+MICROLITE_LIB_NAME := libtensorflow-microlite.a
+
+# Test binary for the microcontroller speech model.
+MICRO_SPEECH_TEST_SRCS := \
+tensorflow/contrib/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc \
+tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
+
+MICROLITE_TEST_SRCS := \
+$(wildcard tensorflow/contrib/lite/experimental/micro/*test.cc) \
+$(wildcard tensorflow/contrib/lite/experimental/micro/kernels/*test.cc)
+
+MICROLITE_CC_BASE_SRCS := \
+$(wildcard tensorflow/contrib/lite/experimental/micro/*.cc) \
+$(wildcard tensorflow/contrib/lite/experimental/micro/kernels/*.cc) \
+tensorflow/contrib/lite/c/c_api_internal.c \
+tensorflow/contrib/lite/core/api/error_reporter.cc \
+tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc \
+tensorflow/contrib/lite/core/api/op_resolver.cc \
+tensorflow/contrib/lite/kernels/kernel_util.cc \
+tensorflow/contrib/lite/kernels/internal/quantization_util.cc
+MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SRCS))
+
+# These target-specific makefiles should modify or replace options like
+# CXXFLAGS or LIBS to work for a specific targetted architecture. All logic
+# based on platforms or architectures should happen within these files, to
+# keep this main makefile focused on the sources and dependencies.
+include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc)
+
+ALL_SRCS := \
+	$(MICRO_SPEECH_TEST_SRCS) \
+	$(MICROLITE_CC_SRCS) \
+	$(MICROLITE_TEST_SRCS)
+
+# Where compiled objects are stored.
+GENDIR := $(MAKEFILE_DIR)/gen/$(TARGET)_$(TARGET_ARCH)/
+OBJDIR := $(GENDIR)obj/
+BINDIR := $(GENDIR)bin/
+LIBDIR := $(GENDIR)lib/
+
+MICROLITE_LIB_PATH := $(LIBDIR)$(MICROLITE_LIB_NAME)
+
+MICRO_SPEECH_TEST_BINARY := $(BINDIR)micro_speech_test
+
+CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++
+CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc
+AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar
+
+MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_TEST_SRCS))))
+
+MICROLITE_LIB_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICROLITE_CC_SRCS))))
+
+MICROLITE_TEST_TARGETS := $(addprefix $(BINDIR), \
+$(patsubst %_test.cc,%.test_target,$(MICROLITE_TEST_SRCS)))
+
+# For normal manually-created TensorFlow C++ source files.
+$(OBJDIR)%.o: %.cc
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+
+# For normal manually-created TensorFlow C source files.
+$(OBJDIR)%.o: %.c
+	@mkdir -p $(dir $@)
+	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
+
+# The target that's compiled if there's no command-line arguments.
+all: $(MICROLITE_LIB_PATH) $(MICRO_SPEECH_TEST_BINARY)
+
+microlite: $(MICROLITE_LIB_PATH)
+
+# Hack for generating schema file bypassing flatbuffer parsing
+tensorflow/contrib/lite/schema/schema_generated.h:
+	@cp -u tensorflow/contrib/lite/schema/schema_generated.h.OPENSOURCE tensorflow/contrib/lite/schema/schema_generated.h
+
+# Gathers together all the objects we've compiled into a single '.a' archive.
+$(MICROLITE_LIB_PATH): tensorflow/contrib/lite/schema/schema_generated.h $(MICROLITE_LIB_OBJS)
+	@mkdir -p $(dir $@)
+	$(AR) $(ARFLAGS) $(MICROLITE_LIB_PATH) $(MICROLITE_LIB_OBJS)
+
+$(MICRO_SPEECH_TEST_BINARY): $(MICRO_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(MICRO_SPEECH_TEST_BINARY) $(MICRO_SPEECH_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+
+micro_speech_test: $(MICRO_SPEECH_TEST_BINARY)
+micro_speech_test_bin: $(MICRO_SPEECH_TEST_BINARY).bin
+
+test_micro_speech: $(MICRO_SPEECH_TEST_BINARY)
+	$(TEST_SCRIPT) $(MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+$(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $@ $< \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+
+$(BINDIR)%.test_target: $(BINDIR)%_test
+	$(TEST_SCRIPT) $< '~~~ALL TESTS PASSED~~~'
+
+$(info $(MICROLITE_TEST_TARGETS))
+
+test: test_micro_speech $(MICROLITE_TEST_TARGETS)
+
+# Gets rid of all generated files.
+clean:
+	rm -rf $(MAKEFILE_DIR)/gen
+
+$(DEPDIR)/%.d: ;
+.PRECIOUS: $(DEPDIR)/%.d
+.PRECIOUS: $(BINDIR)%_test
+
+-include $(patsubst %,$(DEPDIR)/%.d,$(basename $(ALL_SRCS)))
diff --git a/tensorflow/contrib/lite/experimental/micro/tools/make/download_dependencies.sh b/tensorflow/contrib/lite/experimental/micro/tools/make/download_dependencies.sh
new file mode 100755
index 0000000000..4c2ff8545d
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/tools/make/download_dependencies.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR/../../../../../../.."
+
+DOWNLOADS_DIR=tensorflow/contrib/lite/experimental/micro/tools/make/downloads
+BZL_FILE_PATH=tensorflow/workspace.bzl
+
+# Ensure it is being run from repo root
+if [ ! -f $BZL_FILE_PATH ]; then
+  echo "Could not find ${BZL_FILE_PATH}":
+  echo "Likely you are not running this from the root directory of the repository.";
+  exit 1;
+fi
+
+GEMMLOWP_URL="https://github.com/google/gemmlowp/archive/719139ce755a0f31cbf1c37f7f98adcc7fc9f425.zip"
+FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/1f5eae5d6a135ff6811724f6c57f911d1f46bb15.tar.gz"
+CMSIS_URL="https://github.com/ARM-software/CMSIS_5/archive/5.4.0.zip"
+STM32_BARE_LIB_URL="https://github.com/google/stm32_bare_lib/archive/50e0da307a2821bb54af1f57b969e6b76cb89d32.zip"
+
+download_and_extract() {
+  local usage="Usage: download_and_extract URL DIR"
+  local url="${1:?${usage}}"
+  local dir="${2:?${usage}}"
+  echo "downloading ${url}" >&2
+  mkdir -p "${dir}"
+  if [[ "${url}" == *gz ]]; then
+    curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
+  elif [[ "${url}" == *zip ]]; then
+    tempdir=$(mktemp -d)
+    tempdir2=$(mktemp -d)
+
+    curl -L ${url} > ${tempdir}/zipped.zip
+    unzip ${tempdir}/zipped.zip -d ${tempdir2}
+
+    # If the zip file contains nested directories, extract the files from the
+    # inner directory.
+    if ls ${tempdir2}/*/* 1> /dev/null 2>&1; then
+      # unzip has no strip components, so unzip to a temp dir, and move the
+      # files we want from the tempdir to destination.
+      cp -R ${tempdir2}/*/* ${dir}/
+    else
+      cp -R ${tempdir2}/* ${dir}/
+    fi
+    rm -rf ${tempdir2} ${tempdir}
+  fi
+
+  # Delete any potential BUILD files, which would interfere with Bazel builds.
+  find "${dir}" -type f -name '*BUILD' -delete
+}
+
+download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
+download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
+download_and_extract "${CMSIS_URL}" "${DOWNLOADS_DIR}/cmsis"
+download_and_extract "${STM32_BARE_LIB_URL}" "${DOWNLOADS_DIR}/stm32_bare_lib"
+
+echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/contrib/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc b/tensorflow/contrib/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
new file mode 100644
index 0000000000..022a8422dc
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
@@ -0,0 +1,65 @@
+# Settings for Blue Pill platforms.
+ifeq ($(TARGET), bluepill)
+  TARGET_ARCH := cortex-m3
+  TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
+
+  PLATFORM_FLAGS = \
+    -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
+    -DTF_LITE_STATIC_MEMORY \
+    -DTF_LITE_MCU_DEBUG_LOG \
+    -fno-rtti \
+    -fmessage-length=0 \
+    -fno-exceptions \
+    -fno-unwind-tables \
+    -fno-builtin \
+    -ffunction-sections \
+    -fdata-sections \
+    -funsigned-char \
+    -MMD \
+    -mcpu=cortex-m3 \
+    -mthumb \
+    -std=gnu++11 \
+    -Wvla \
+    -Wall \
+    -Wextra \
+    -Wno-unused-parameter \
+    -Wno-missing-field-initializers \
+    -Wno-write-strings \
+    -Wno-sign-compare \
+    -fno-delete-null-pointer-checks \
+    -fomit-frame-pointer \
+    -fpermissive \
+    -nostdlib \
+    -g \
+    -Os
+  CXXFLAGS += $(PLATFORM_FLAGS)
+  CCFLAGS += $(PLATFORM_FLAGS)
+  LDFLAGS += \
+    -T $(MAKEFILE_DIR)/downloads/stm32_bare_lib/stm32_linker_layout.lds \
+    -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref \
+    -Wl,--gc-sections
+	BUILD_TYPE := micro
+  MICROLITE_LIBS := \
+    -lm
+  INCLUDES += \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
+    -I$(MAKEFILE_DIR)/downloads/stm32_bare_lib/include
+  MICROLITE_CC_SRCS += \
+    $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.c) \
+    $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.cc)
+    TEST_SCRIPT := tensorflow/contrib/lite/experimental/micro/testing/test_bluepill_binary.sh
+  # These are tests that don't currently work on the blue pill.
+  EXCLUDED_TESTS := \
+    tensorflow/contrib/lite/experimental/micro/micro_interpreter_test.cc \
+    tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator_test.cc
+  MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS))
+
+# These are microcontroller-specific rules for converting the ELF output
+# of the linker into a binary image that can be loaded directly.
+OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
+
+$(BINDIR)/%.bin: $(BINDIR)/%
+	@mkdir -p $(dir $@)
+	$(OBJCOPY) $< $@ -O binary
+
+endif
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
index b87cf2b60d..7c176e0fa1 100644
--- a/tensorflow/contrib/lite/kernels/internal/compatibility.h
+++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h
@@ -84,4 +84,27 @@ using uint16 = std::uint16_t;
 using int32 = std::int32_t;
 using uint32 = std::uint32_t;
 
+// TFLITE_DEPRECATED()
+//
+// Duplicated from absl/base/macros.h to avoid pulling in that library.
+// Marks a deprecated class, struct, enum, function, method and variable
+// declarations. The macro argument is used as a custom diagnostic message (e.g.
+// suggestion of a better alternative).
+//
+// Example:
+//
+//   class TFLITE_DEPRECATED("Use Bar instead") Foo {...};
+//   TFLITE_DEPRECATED("Use Baz instead") void Bar() {...}
+//
+// Every usage of a deprecated entity will trigger a warning when compiled with
+// clang's `-Wdeprecated-declarations` option. This option is turned off by
+// default, but the warnings will be reported by clang-tidy.
+#if defined(__clang__) && __cplusplus >= 201103L
+#define TFLITE_DEPRECATED(message) __attribute__((deprecated(message)))
+#endif
+
+#ifndef TFLITE_DEPRECATED
+#define TFLITE_DEPRECATED(message)
+#endif
+
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index b39347758a..64a39dd2a2 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -18,7 +18,6 @@ limitations under the License.
 #include <algorithm>
 #include <cstring>
 
-#include "absl/base/macros.h"
 #include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
 
 namespace tflite {
@@ -441,7 +440,7 @@ inline int FlatSize(const Dims<N>& dims) {
   return flat_size;
 }
 
-ABSL_DEPRECATED("Prefer FlatSize.")
+TFLITE_DEPRECATED("Prefer FlatSize.")
 inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
   return FlatSize(dims);
 }
-- 
GitLab


From ac22e1583aed390d78d2e87a4bf8a6ec39400ec4 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 4 Oct 2018 09:21:05 -0700
Subject: [PATCH 409/570] Gracefully disallow updating resource variables with
 invalid shapes.

During graph construction, the shape function for AssignAddVariableOp etc.
would raise an error when the value being "assign add"ed to the variable
has an incompatible shape.

With eager execution, no such validation was being made which triggerred
an assertion failure in eigen:
https://github.com/eigenteam/eigen-git-mirror/blob/7d97e1cbbe4424fda39e31c88def7c0863897640/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h#L479

This change prevents that assertion failure.

PiperOrigin-RevId: 215749071
---
 tensorflow/core/kernels/resource_variable_ops.cc         | 6 ++++++
 .../python/kernel_tests/resource_variable_ops_test.py    | 9 ++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 23d76986bf..678d675c4a 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -426,6 +426,12 @@ class AssignUpdateVariableOp : public OpKernel {
     // ADD if value's refcount was 1.
     mutex_lock ml(*variable->mu());
     Tensor* var_tensor = variable->tensor();
+    OP_REQUIRES(context, var_tensor->shape().IsSameSize(value.shape()),
+                errors::InvalidArgument("Cannot update variable with shape ",
+                                        var_tensor->shape().DebugString(),
+                                        " using a Tensor with shape ",
+                                        value.shape().DebugString(),
+                                        ", shapes must be equal."));
     OP_REQUIRES_OK(context,
                    PrepareToUpdateVariable<Device, T>(context, var_tensor));
     functor::DenseUpdate<Device, T, Op> update_functor;
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 1365d4b240..a9fd93e9f8 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -142,7 +142,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v = resource_variable_ops.ResourceVariable(1.0)
     ops.reset_default_graph()
     v.assign(2.0)  # Note: this fails if we run convert_to_tensor on not the
-                   # variable graph.
+    # variable graph.
 
   def testFetchHandle(self):
     with self.cached_session():
@@ -908,6 +908,13 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(Exception, r"shape.*2.*3"):
       state_ops.scatter_update(v, [0, 1], [0, 1, 2])
 
+  @test_util.run_in_graph_and_eager_modes
+  def testAssignIncompatibleShape(self):
+    v = resource_variable_ops.ResourceVariable([0, 1, 2, 3])
+    self.evaluate(v.initializer)
+    with self.assertRaisesRegexp(Exception, r"hapes must be equal"):
+      self.assertAllEqual(self.evaluate(v.assign_add(1)), [1, 2, 3, 4])
+
 
 class _MixedPrecisionVariableTest(test_util.TensorFlowTestCase):
 
-- 
GitLab


From 1fb84c2e41c454939a02a69093cb214673eab343 Mon Sep 17 00:00:00 2001
From: Rachel Lim <rachelim@google.com>
Date: Thu, 4 Oct 2018 09:26:19 -0700
Subject: [PATCH 410/570] Add ability to vectorize nodes that do not derive
 from function arguments. (This indirectly handles "Const" outputs
 automagically, since they are always unstacked.)

PiperOrigin-RevId: 215749824
---
 .../core/grappler/optimizers/data/BUILD       |   1 +
 .../optimizers/data/map_vectorization.cc      |   2 +-
 .../optimizers/data/vectorization_utils.cc    | 247 ++++++++++++++++-
 .../data/vectorization_utils_test.cc          | 251 ++++++++++++++++++
 .../optimization/map_vectorization_test.py    |   4 +
 5 files changed, 492 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index 755af3361e..ee7c14e3ab 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -524,6 +524,7 @@ cc_library(
     deps = [
         ":function_utils",
         ":graph_utils",
+        "//tensorflow/cc:ops",
         "@com_google_absl//absl/strings",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
index 9328a7ca99..ba521e79bc 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
@@ -44,7 +44,7 @@ FunctionDef* CreateMapDefunWrapper(const NodeDef& map_node,
   // Function inputs and outputs are the same as original, just
   // with different shapes.
   *vectorized_func->mutable_signature() = orig_func.signature();
-  graph_utils::SetUniqueGraphFunctionName("vectorized_function", library,
+  graph_utils::SetUniqueGraphFunctionName("naively_vectorized_fn", library,
                                           vectorized_func);
 
   // Add MapDefun node
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
index 2d6cf562b1..344c420902 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/data/vectorization_utils.h"
-#include <memory>
 #include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
 
 #include "absl/strings/str_join.h"
+#include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/device_base.h"
@@ -28,13 +28,13 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/lib/strings/scanner.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -45,6 +45,22 @@ namespace {
 // Describes a tensor with its operation Node and output position
 typedef std::pair<Node*, int> TensorDesc;
 
+// Equivalent to python Pfor's WrappedTensor struct
+struct WrappedTensor {
+  TensorDesc tensor;
+
+  // Whether the tensor is stacked, i.e. represents the results of applying
+  // the operation on all slices of the input, where each row i of the
+  // tensor corresponds to the op's output on slice i of the input. False
+  // if the tensor is not stacked, i.e. represents the result of the op on
+  // a single slice of the input, where the result does not vary between
+  // slices.
+  bool stacked;
+
+  WrappedTensor(TensorDesc&& tensor, bool stacked)
+      : tensor(std::move(tensor)), stacked(stacked) {}
+};
+
 const char* const kRetValOp = "_Retval";
 
 void ReplaceEdgeSources(const TensorDesc& old_src, const TensorDesc& new_src,
@@ -132,7 +148,8 @@ class Vectorization {
                    const NodeDef& map_defun_node, FunctionDef** result);
 
  private:
-  // Converts FunctionDefs to Graphs.
+  // Converts FunctionDefs to Graphs and adds mappings from
+  // arg nodes and unstacked nodes to the corresponding nodes in outer_scope_.
   Status Initialize(const FunctionDef& outer_scope,
                     const NodeDef& map_defun_node);
 
@@ -162,9 +179,30 @@ class Vectorization {
   //    the conversion map.
   Status AddConversionMapping(Node* op_node);
 
-  // Maps a tensor to the corresponding vectorized tensor. For example,
-  // {"Cast" Node*, 0} -> {"Vectorize/Cast" Node*, 0}
-  std::map<TensorDesc, TensorDesc> conversion_map_;
+  // Given a tensor t in `unstacked`, stacks it by doing the equivalent of
+  // tf.tile(tf.expand_dims(t, 0), [n, 1, 1, ...]) where n is dimension 0 of
+  // inputs to `map_defun_node_`. This stacked tensor will be compatible with
+  // the expected output shape of `map_defun_node_`.
+  // This is equivalent to the _stack function in python Pfor.
+  Status StackTensor(WrappedTensor* unstacked, TensorDesc* result);
+
+  // Recursively looks for unstacked nodes in the `map_defun_fn_` graph by
+  // doing a depth-first search from the ret nodes. Lifts nodes that are
+  // unstacked (i.e. don't derive from arg nodes) into `outer_scope_` directly
+  // and add mappings to `conversion_map_`.
+  Status AddUnstackedNodeMappings();
+
+  // Recursive helper for `AddUnstackedNodeMappings`, returns true if tensor
+  // is unstacked.
+  bool AddUnstackedNodeMappingsHelper(TensorDesc&& tensor, Status* status);
+
+  // Add mappings from `map_defun_fn_` arg nodes to `map_defun_node_` input
+  // nodes to `conversion_map_`.
+  Status AddArgNodeMappings();
+
+  // Maps a tensor to the corresponding WrappedTensor. For example,
+  // {"Cast" Node*, 0} -> WrappedTensor({"Vectorize/Cast" Node*, 0}, true)
+  std::map<TensorDesc, WrappedTensor> conversion_map_;
 
   // Unconvertible ret nodes
   std::set<Node*> unconvertible_;
@@ -180,6 +218,10 @@ class Vectorization {
   std::unique_ptr<Graph> outer_scope_;
   std::unique_ptr<FunctionBody> map_defun_fn_;
   Node* map_defun_node_ = nullptr;  // Owned by `outer_scope`
+
+  // Caches the loop_len_node_ needed for tiling unstacked output. This
+  // corresponds to a vector with one element.
+  Node* loop_len_node_ = nullptr;  // Owned by `outer_scope`
   Status status_;
 };
 
@@ -224,7 +266,7 @@ Status Vectorization::AddConversionMapping(Node* op_node) {
 
   // Add output mappings.
   for (size_t i = 0; i < op_node->num_outputs(); ++i) {
-    conversion_map_.insert({{op_node, i}, std::move(output_ports[i])});
+    conversion_map_.insert({{op_node, i}, {std::move(output_ports[i]), true}});
   }
 
   return Status::OK();
@@ -242,10 +284,22 @@ Status Vectorization::ConvertOutput(int output_position) {
   if (auto found = gtl::FindOrNull(conversion_map_, output)) {
     // It's possible the output already has a mapping, if it comes from a node
     // that has already been converted.
-    converted_output = *found;
+    if (found->stacked) {
+      converted_output = found->tensor;
+    } else {
+      // Some outputs may be unstacked if they don't derive from arg nodes
+      // (for example, if a function returns a constant). For these, we
+      // have to add extra nodes to tile it in the 0th dimension.
+      TF_RETURN_IF_ERROR(StackTensor(found, &converted_output));
+    }
   } else {
+    // Note: All unstacked nodes are converted ahead of time in `Initialize`,
+    // and here we assume that all op vectorizers create only stacked outputs.
+    // This may not hold in the future, as more vectorizers are added that
+    // may actually create unstacked outputs. For example, see the `Shape`
+    // converter in third_party/tensorflow/python/ops/parallel_for/pfor.py
     TF_RETURN_IF_ERROR(AddConversionMapping(output.first));
-    converted_output = conversion_map_.at(output);
+    converted_output = conversion_map_.at(output).tensor;
   }
 
   ReplaceEdgeSources({map_defun_node_, output_position}, converted_output,
@@ -297,6 +351,7 @@ void Vectorization::VectorizeHelper() {
     map_defun_node_->AddAttr("output_types", map_defun_fn_->ret_types);
   }
 }
+
 Status Vectorization::Initialize(const FunctionDef& outer_scope,
                                  const NodeDef& map_defun_node) {
   // Convert outer_scope and map_defun_fn to FunctionBodys so we can
@@ -337,16 +392,184 @@ Status Vectorization::Initialize(const FunctionDef& outer_scope,
   }
   map_defun_node_ = outer_scope_->FindNodeId(node_id);
 
-  // Add mappings from map_defun_fn_ arg nodes to map_defun_node_ input nodes to
-  // the conversion map
+  TF_RETURN_IF_ERROR(AddArgNodeMappings());
+
+  TF_RETURN_IF_ERROR(AddUnstackedNodeMappings());
+  loop_len_node_ = nullptr;
+
+  return Status::OK();
+}
+
+// TODO(rachelim): It might be profitable to use the C++ API for this instead of
+// NodeBuilder
+Status Vectorization::StackTensor(WrappedTensor* unstacked,
+                                  TensorDesc* result) {
+  // Note that all these nodes are necessary as the size of the batch may not be
+  // constant.
+  if (unstacked->stacked) {
+    return errors::Internal("Can only stack unstacked tensor.");
+  }
+
+  Graph* g = outer_scope_.get();
+  auto node_builder = [](StringPiece op) {
+    return NodeBuilder(strings::StrCat("vectorized/stack/", op), op);
+  };
+
+  auto make_const = [&node_builder](const Input::Initializer& val, Graph* graph,
+                                    Node** result) {
+    TF_RETURN_IF_ERROR(val.status);
+    return node_builder("Const")
+        .Attr("value", val.tensor)
+        .Attr("dtype", val.tensor.dtype())
+        .Finalize(graph, result);
+  };
+
+  // If loop_len_node_ hasn't been created yet, add the node and cache it.
+  if (loop_len_node_ == nullptr) {
+    Node* input_node;
+    TF_RETURN_IF_ERROR(map_defun_node_->input_node(0, &input_node));
+
+    Node* shape_node;
+    TF_RETURN_IF_ERROR(
+        node_builder("Shape").Input(input_node).Finalize(g, &shape_node));
+
+    Node* const_vec_0;
+    TF_RETURN_IF_ERROR(make_const({0}, g, &const_vec_0));
+    Node* const_vec_1;
+    TF_RETURN_IF_ERROR(make_const({1}, g, &const_vec_1));
+
+    Node* strided_slice_node;
+    TF_RETURN_IF_ERROR(node_builder("StridedSlice")
+                           .Input(shape_node)   // input
+                           .Input(const_vec_0)  // begin
+                           .Input(const_vec_1)  // end
+                           .Input(const_vec_1)  // strides
+                           .Finalize(g, &strided_slice_node));
+
+    // Produces a vector of length 1
+    TF_RETURN_IF_ERROR(node_builder("Reshape")
+                           .Input(strided_slice_node)  // tensor
+                           .Input(const_vec_1)         // shape
+                           .Finalize(g, &loop_len_node_));
+  }
+
+  Node* ones_shape;
+  TF_RETURN_IF_ERROR(node_builder("Shape")
+                         .Input(unstacked->tensor.first)  // input
+                         .Finalize(g, &ones_shape));
+
+  Node* ones;
+  TF_RETURN_IF_ERROR(
+      node_builder("OnesLike").Input(ones_shape).Finalize(g, &ones));
+
+  Node* const_0;
+  TF_RETURN_IF_ERROR(make_const(0, g, &const_0));
+
+  Node* multiples;
+  TF_RETURN_IF_ERROR(node_builder("Concat")
+                         .Input(const_0)                           // concat_dim
+                         .Input({{loop_len_node_, 0}, {ones, 0}})  // values
+                         .Finalize(g, &multiples));
+
+  Node* expand_dims;
+  TF_RETURN_IF_ERROR(node_builder("ExpandDims")
+                         .Input(unstacked->tensor.first)  // input
+                         .Input(const_0)                  // dim
+                         .Finalize(g, &expand_dims));
+
+  TF_RETURN_IF_ERROR(node_builder("Tile")
+                         .Input(expand_dims)  // input
+                         .Input(multiples)    // multiples
+                         .Finalize(g, &result->first));
+  result->second = 0;
+  return Status::OK();
+}
+
+Status Vectorization::AddArgNodeMappings() {
   for (auto arg_node : map_defun_fn_->arg_nodes) {
     Node* input_node;
     TF_RETURN_IF_ERROR(map_defun_node_->input_node(
         arg_node->attrs().Find("index")->i(), &input_node));
 
-    conversion_map_.insert({{arg_node, 0}, {input_node, 0}});
+    conversion_map_.insert({{arg_node, 0}, {{input_node, 0}, true}});
+
+    // Control inputs
+    conversion_map_.insert({{arg_node, Graph::kControlSlot},
+                            {{input_node, Graph::kControlSlot}, true}});
   }
+  return Status::OK();
+}
 
+bool Vectorization::AddUnstackedNodeMappingsHelper(TensorDesc&& tensor,
+                                                   Status* status) {
+  if (auto found = gtl::FindOrNull(conversion_map_, tensor)) {
+    return !found->stacked;
+  }
+
+  if (tensor.first->op_def().is_stateful()) {
+    // We don't lift stateful nodes directly out of the MapDefun, since they may
+    // have to be executed N times.
+    return false;
+  }
+
+  bool is_unstacked = true;
+  for (auto edge : tensor.first->in_edges()) {
+    // Ignore Source nodes. Note that these are also ignored in the
+    // GraphToFunctionDef conversion.
+    if (edge->src()->IsSource()) continue;
+
+    // A node is unstacked if all of its inputs are unstacked
+    is_unstacked &= AddUnstackedNodeMappingsHelper(
+        {edge->src(), edge->src_output()}, status);
+  }
+
+  if (!is_unstacked) {
+    return false;
+  }
+
+  // If the node is unstacked, we copy it into outer_scope_ and
+  // add it to the map. Note that we don't clean up the nodes that are copied
+  // in map_defun_fn_, and rely on them being pruned out later.
+  Node* node = outer_scope_->AddNode(tensor.first->def(), status);
+  if (!status->ok()) return true;
+
+  // Add input edges to nodes that should already have been lifted.
+  for (auto edge : tensor.first->in_edges()) {
+    // Ignore Source nodes. Note that these are also ignored in the
+    // GraphToFunctionDef conversion.
+    if (edge->src()->IsSource()) continue;
+
+    if (auto found = gtl::FindOrNull(conversion_map_,
+                                     {edge->src(), edge->src_output()})) {
+      outer_scope_->AddEdge(found->tensor.first, found->tensor.second, node,
+                            edge->dst_input());
+    } else {
+      status->Update(errors::Internal(
+          "Could not find input conversion even though we did depth first "
+          "conversion."));
+    }
+  }
+
+  // Add output mappings
+  for (int i = 0; i < tensor.first->num_outputs(); ++i) {
+    conversion_map_.insert(
+        {{tensor.first, i}, WrappedTensor({node, i}, false)});
+  }
+  conversion_map_.insert({{tensor.first, Graph::kControlSlot},
+                          WrappedTensor({node, Graph::kControlSlot}, false)});
+
+  return true;
+}
+
+Status Vectorization::AddUnstackedNodeMappings() {
+  SetVector<Node*> unstacked_nodes;
+  Status s;
+  for (const auto& ret_node : map_defun_fn_->ret_nodes) {
+    const Edge* in_edge = nullptr;
+    TF_RETURN_IF_ERROR(ret_node->input_edge(0, &in_edge));
+    AddUnstackedNodeMappingsHelper({in_edge->src(), in_edge->src_output()}, &s);
+    TF_RETURN_IF_ERROR(s);
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc
index 1ff62217dd..a958d706c1 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc
@@ -670,6 +670,257 @@ TEST(VectorizeMapDefunTest, VectorizeDefunWithControlInputs) {
               cast_node.input(1) == control_input);
 }
 
+// Before:
+//
+//
+//                 +------+
+// +---------------+ Arg0 +---------+
+// |               +---+--+         |
+// |                   |            |
+// |               +---v--+         |
+// |   +-----------+ Arg0 +-----+   |
+// |   |           +------+     |   |
+// |   |                        |   |
+// |   |                        |   |
+// |   |           +------+     |   |
+// |   |           |Const |     |   |
+// |   |           +---+--+     |   |
+// |   |               |        |   |
+// |   | MapDefun  +---v--+     |   |
+// |   +-----------+ Ret0 +-----+   |
+// |               +---+--+         |
+// |                   |            |
+// |               +---v--+         |
+// +---------------+ Ret0 +---------+
+//                 +------+
+//
+//
+//  After:
+//
+//                 +------+
+// +---------------+ Arg0 +---------+
+// |               +------+         |
+// |                                |
+// |               +------+         |
+// |               |Const |         |
+// |               +---+--+         |
+// |                   |            |
+// |                   |            |
+// |                   |            |
+// |               +---v--+         |
+// |               |Stack*|         |
+// |               +---+--+         |
+// |                   |            |
+// |                   |            |
+// |                   |            |
+// |               +---v--+         |
+// +---------------+ Ret0 +---------+
+//                 +------+
+// *Not actually a Stack node, but does the equivalent.
+//
+TEST(VectorizeMapDefunTest, VectorizeConst) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      "inner_function", {"arg0: int32"}, {"ret0: int32"}, {/* attrs */},
+      {/* nodes */ FunctionDefHelper::Const("Const", 2)},
+      {{"ret0", "Const:output:0"}});
+  FunctionDef outer = FunctionDefHelper::Create(
+      "outer_function", {"outer_arg0: int32"}, {"mapdefun: int32"},
+      {/* attrs */}, {/* nodes */}, {{"mapdefun", "MapDefun:output:0"}});
+
+  NodeDef* map_defun =
+      AddMapDefunNode("MapDefun", {"outer_arg0"}, {DT_INT32}, {DT_INT32}, {{}},
+                      inner.signature().name(), &outer);
+
+  FunctionDefLibrary lib;
+  *lib.add_function() = outer;
+  *lib.add_function() = inner;
+  FunctionDef* vectorized;
+  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+  EXPECT_TRUE(function_utils::ContainsFunctionNodeWithOp("Const", *vectorized));
+}
+
+// Before:
+//
+//
+//                 +------+
+// +---------------+ Arg0 +---------+
+// |               +---+--+         |
+// |                   |            |
+// |               +---v--+         |
+// |   +-----------+ Arg0 +-----+   |
+// |   |           +------+     |   |
+// |   |                        |   |
+// |   |                        |   |
+// |   |           +------+     |   |
+// |   |           |Const |     |   |
+// |   |           +---+--+     |   |
+// |   |               |        |   |
+// |   |           +---v--+     |   |
+// |   |           | Cast |     |   |
+// |   |           +---+--+     |   |
+// |   |               |        |   |
+// |   | MapDefun  +---v--+     |   |
+// |   +-----------+ Ret0 +-----+   |
+// |               +---+--+         |
+// |                   |            |
+// |               +---v--+         |
+// +---------------+ Ret0 +---------+
+//                 +------+
+//
+//
+//  After:
+//
+//                 +------+
+// +---------------+ Arg0 +---------+
+// |               +------+         |
+// |                                |
+// |               +------+         |
+// |               |Const |         |
+// |               +---+--+         |
+// |                   |            |
+// |               +---v--+         |
+// |               | Cast |         |
+// |               +---+--+         |
+// |                   |            |
+// |               +---v--+         |
+// |               |Stack*|         |
+// |               +---+--+         |
+// |                   |            |
+// |                   |            |
+// |                   |            |
+// |               +---v--+         |
+// +---------------+ Ret0 +---------+
+//                 +------+
+// *Not actually a Stack node, but does the equivalent.
+//
+TEST(VectorizeMapDefunTest, VectorizeUnstackedOutput) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      "inner_function", {"arg0: int32"}, {"ret0: int64"}, {/* attrs */},
+      {/* nodes */ FunctionDefHelper::Const("Const", 2)},
+      {{"ret0", "Cast:y:0"}});
+  AddCastNode("Cast", {"Const:output:0"}, DT_INT32, DT_INT64, false, &inner);
+
+  FunctionDef outer = FunctionDefHelper::Create(
+      "outer_function", {"outer_arg0: int32"}, {"mapdefun: int64"},
+      {/* attrs */}, {/* nodes */}, {{"mapdefun", "MapDefun:output:0"}});
+
+  NodeDef* map_defun =
+      AddMapDefunNode("MapDefun", {"outer_arg0"}, {DT_INT32}, {DT_INT64}, {{}},
+                      inner.signature().name(), &outer);
+
+  FunctionDefLibrary lib;
+  *lib.add_function() = outer;
+  *lib.add_function() = inner;
+  FunctionDef* vectorized;
+  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+  auto const_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Const", *vectorized));
+  auto cast_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
+  EXPECT_EQ(cast_node.input(0).substr(0, cast_node.input(0).find(':')),
+            const_node.name());
+}
+
+// Before:
+//
+//
+//                 +------+
+// +---------------+ Arg0 +---------+
+// |               +---+--+         |
+// |                   |            |
+// |               +---v--+         |
+// |   +-----------+ Arg0 +-----+   |
+// |   |           +------+     |   |
+// |   |                        |   |
+// |   | +------+  +------+     |   |
+// |   | |Const |  |Const |     |   |
+// |   | +---+--+  +---+--+     |   |
+// |   |     :     +---v--+     |   |
+// |   |     ::::::> Cast |     |   |
+// |   |           +---+--+     |   |
+// |   |               |        |   |
+// |   | MapDefun  +---v--+     |   |
+// |   +-----------+ Ret0 +-----+   |
+// |               +---+--+         |
+// |                   |            |
+// |               +---v--+         |
+// +---------------+ Ret0 +---------+
+//                 +------+
+//
+//
+//  After:
+//
+//
+//                 +------+
+// +---------------+ Arg0 +---------+
+// |               +------+         |
+// |                                |
+// |                                |
+// |               +------+         |
+// |     +------+  |Const |         |
+// |     |Const |  +---+--+         |
+// |     +---+--+      |            |
+// |         :     +---v--+         |
+// |         ::::::> Cast |         |
+// |               +---+--+         |
+// |                   |            |
+// |               +---v--+         |
+// |               +Stack*+         |
+// |               +---+--+         |
+// |                   |            |
+// |               +---v--+         |
+// +---------------+ Ret0 +---------+
+//                 +------+
+// *Not actually a Stack node, but does the equivalent.
+//
+TEST(VectorizeMapDefunTest, VectorizeUnstackedControl) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      "inner_function", {"arg0: int32"}, {"ret0: int64"}, {/* attrs */},
+      {/* nodes */ FunctionDefHelper::Const("Const", 2),
+       FunctionDefHelper::Const("ConstDep", 3)},
+      {{"ret0", "Cast:y:0"}});
+  AddCastNode("Cast", {"Const:output:0", "^ConstDep"}, DT_INT32, DT_INT64,
+              false, &inner);
+
+  FunctionDef outer = FunctionDefHelper::Create(
+      "outer_function", {"outer_arg0: int32"}, {"mapdefun: int64"},
+      {/* attrs */}, {/* nodes */}, {{"mapdefun", "MapDefun:output:0"}});
+
+  NodeDef* map_defun =
+      AddMapDefunNode("MapDefun", {"outer_arg0"}, {DT_INT32}, {DT_INT64}, {{}},
+                      inner.signature().name(), &outer);
+
+  FunctionDefLibrary lib;
+  *lib.add_function() = outer;
+  *lib.add_function() = inner;
+
+  FunctionDef* vectorized;
+  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+
+  auto find_const = [vectorized](int val) -> const NodeDef* {
+    for (const auto& n : vectorized->node_def()) {
+      if (n.attr().at("value").tensor().int_val(0) == val) {
+        return &n;
+      }
+    }
+    return nullptr;
+  };
+
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+  auto const_node = find_const(2);
+  auto const_dep_node = find_const(3);
+  auto cast_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
+  EXPECT_EQ(cast_node.input(0).substr(0, cast_node.input(0).find(':')),
+            const_node->name());
+  EXPECT_EQ(cast_node.input(1), strings::StrCat("^", const_dep_node->name()));
+}
+
 // TODO(rachelim): More test cases when we get around to implementing them:
 // [] A badly defined converter, e.g. doesn't produce nodes that have the
 //    same number of outputs/inputs as the nodes to be converted
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
index 32ebc49c40..971a2d94b9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
@@ -78,6 +78,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
       ("Basic", lambda x: (x, x + 1), None),
+      ("Const", lambda x: 2, 12),
       ("Parallel", lambda x: (x, x + 1), 12),
       ("Gather", lambda x: array_ops.gather(x, 0), 12),
   )
@@ -207,6 +208,9 @@ class MapVectorizationBenchmark(test.Benchmark):
   def benchmarkAddConst(self):
     self._benchmark_helper(lambda *args: [x + 1 for x in args], "add_const")
 
+  def benchmarkReturnConst(self):
+    self._benchmark_helper(lambda *args: [constant_op.constant(2)], "ret_const")
+
   def benchmarkSelect(self):
     self._benchmark_helper(lambda *args: args[0], "select")
 
-- 
GitLab


From c2552cd33c05fa84f280e766e33ba01308ffbcb2 Mon Sep 17 00:00:00 2001
From: Youlong Cheng <ylc@google.com>
Date: Thu, 4 Oct 2018 09:42:13 -0700
Subject: [PATCH 411/570]  Skip numeric checking in BROADCAST mode.

PiperOrigin-RevId: 215752559
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 37 +++++++++++++------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 3aa5b6efa1..8d15c857f8 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -177,14 +177,29 @@ def _create_or_get_iterations_per_loop():
           use_resource=True)
 
 
-def _sync_variables_ops():
-  # Gets the variables back from TPU nodes. This means the variables updated
-  # by TPU will now be *synced* to host memory.
-  return [
-      array_ops.check_numerics(v.read_value(),
-                               'Gradient for %s is NaN' % v.name).op
-      for v in variables.trainable_variables()
-  ]
+def _sync_variables_ops(ctx):
+  """Create varriables synchronization ops.
+
+  Gets the variables back from TPU nodes. This means the variables updated
+  by TPU will now be *synced* to host memory.
+  In BROADCAST mode, we skip this sync since the variables are ususally too
+  big to transmit via RPC.
+
+  Args:
+    ctx: A `_InternalTPUContext` instance with mode.
+
+  Returns:
+    A list of sync ops.
+  """
+
+  if not ctx.is_input_broadcast_with_iterators():
+    return [
+        array_ops.check_numerics(v.read_value(),
+                                 'Gradient for %s is NaN' % v.name).op
+        for v in variables.trainable_variables()
+    ]
+  else:
+    return [control_flow_ops.no_op()]
 
 
 def _increase_eval_step_op(iterations_per_loop):
@@ -2567,7 +2582,7 @@ class TPUEstimator(estimator_lib.Estimator):
 
           summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
           with ops.control_dependencies([loss]):
-            update_ops = _sync_variables_ops()
+            update_ops = _sync_variables_ops(ctx)
 
           # Validate the TPU training graph to catch basic errors
           _validate_tpu_training_graph()
@@ -2600,7 +2615,7 @@ class TPUEstimator(estimator_lib.Estimator):
             # After TPU evaluation computation is done (the mean_loss tensor),
             # reads all variables back from TPU and updates the eval step
             # counter properly
-            internal_ops_to_run = _sync_variables_ops()
+            internal_ops_to_run = _sync_variables_ops(ctx)
             internal_ops_to_run.append(
                 _increase_eval_step_op(iterations_per_loop_var))
             with ops.control_dependencies(internal_ops_to_run):
@@ -2645,7 +2660,7 @@ class TPUEstimator(estimator_lib.Estimator):
          scaffold, prediction_hooks) = _predict_on_tpu_system(
              ctx, model_fn_wrapper, dequeue_fn)
         with ops.control_dependencies([dummy_predict_op]):
-          internal_ops_to_run = _sync_variables_ops()
+          internal_ops_to_run = _sync_variables_ops(ctx)
           with ops.control_dependencies(internal_ops_to_run):
             dummy_predict_op = control_flow_ops.no_op()
 
-- 
GitLab


From 5e1b45d0a8aa3f268745cdc683c26d9ebdd1ea8b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 10:10:58 -0700
Subject: [PATCH 412/570] Automated rollback of commit
 f22037abf5a6f4581f5fb6013f72f91747f22965

PiperOrigin-RevId: 215757701
---
 tensorflow/compiler/jit/xla_device_context.cc    | 15 ++++-----------
 tensorflow/compiler/jit/xla_device_context.h     |  3 +--
 .../xla/service/generic_transfer_manager.cc      |  2 +-
 .../xla/service/generic_transfer_manager.h       |  7 +++----
 .../compiler/xla/service/transfer_manager.h      | 16 +---------------
 5 files changed, 10 insertions(+), 33 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index e083652978..af83c792e5 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -75,9 +75,8 @@ XlaTransferManager::XlaTransferManager(
   }
 }
 
-Status XlaTransferManager::TransferLiteralToDevice(const Tensor& host_tensor,
-                                                   Tensor* device_tensor,
-                                                   bool buffer_is_fresh) const {
+Status XlaTransferManager::TransferLiteralToDevice(
+    const Tensor& host_tensor, Tensor* device_tensor) const {
   xla::Shape xla_shape;
   TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor.dtype(),
                                            host_tensor.shape(), &xla_shape));
@@ -98,11 +97,8 @@ Status XlaTransferManager::TransferLiteralToDevice(const Tensor& host_tensor,
     // synchronized.
     host_to_device_stream_->ThenWaitFor(stream_.get());
   }
-  xla::TransferManager::TransferToDeviceHint hint =
-      buffer_is_fresh ? xla::TransferManager::kBufferUndefined
-                      : xla::TransferManager::kNoHint;
   TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync(
-      host_to_device_stream_.get(), *literal, shaped_buffer, hint));
+      host_to_device_stream_.get(), *literal, shaped_buffer));
   if (UseMultipleStreams()) {
     auto event = std::make_shared<se::Event>(stream_->parent());
     TF_RET_CHECK(event->Init()) << "Event failed to initialize!";
@@ -169,7 +165,6 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
     return;
   }
   TensorShape shape = shape_or_status.ValueOrDie();
-  bool buffer_is_fresh = false;
   if (!xla_tensor->has_shaped_buffer()) {
     Status s =
         xla_tensor->AllocateShapedBuffer(device_tensor->dtype(), shape, client_,
@@ -178,7 +173,6 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
       done(s);
       return;
     }
-    buffer_is_fresh = true;
   }
 
   Status status;
@@ -189,8 +183,7 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
           "Tensor::CopyFrom failed when copying from CPU to XLA device"));
       return;
     }
-    status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor,
-                                     buffer_is_fresh);
+    status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
   } else {
     se::DeviceMemoryBase dev_dst_ptr =
         XlaTensor::DeviceMemoryFromTensor(*device_tensor);
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index a4c0c296fc..df82421294 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -67,8 +67,7 @@ class XlaTransferManager {
 
  private:
   Status TransferLiteralToDevice(const Tensor& host_tensor,
-                                 Tensor* device_tensor,
-                                 bool buffer_is_fresh) const;
+                                 Tensor* device_tensor) const;
   void TransferLiteralFromDevice(Tensor* host_tensor,
                                  const Tensor& device_tensor,
                                  const StatusCallback& done) const;
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index f92fde7f46..bec02e14f9 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -98,7 +98,7 @@ Status GenericTransferManager::TransferLiteralFromDeviceInternal(
 
 Status GenericTransferManager::TransferLiteralToDeviceAsync(
     se::Stream* stream, const LiteralSlice& literal,
-    const ShapedBuffer& device_buffer, TransferToDeviceHint /*hint*/) {
+    const ShapedBuffer& device_buffer) {
   const Shape& shape = literal.shape();
   VLOG(2) << "transferring literal shape to device: "
           << ShapeUtil::HumanString(shape)
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index b1cba82b9f..86c8b1c145 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -45,10 +45,9 @@ class GenericTransferManager : public TransferManager {
                                  MutableBorrowingLiteral literal,
                                  std::function<void(Status)> done) override;
 
-  Status TransferLiteralToDeviceAsync(se::Stream* stream,
-                                      const LiteralSlice& literal,
-                                      const ShapedBuffer& device_buffer,
-                                      TransferToDeviceHint hint) override;
+  Status TransferLiteralToDeviceAsync(
+      se::Stream* stream, const LiteralSlice& literal,
+      const ShapedBuffer& device_buffer) override;
 
   Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const LiteralSlice& literal) override;
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 9199e32d0f..f952e64af2 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -89,16 +89,6 @@ class TransferManager {
                                          const LiteralSlice& literal,
                                          const ShapedBuffer& device_buffer);
 
-  // Hint type given to TransferLiteralToDeviceAsync.
-  enum TransferToDeviceHint {
-    // No hint available.
-    kNoHint,
-
-    // The destination buffer is undefined on the device, meaning it can be
-    // transferred to eagerly rather than waiting for Stream ordering.
-    kBufferUndefined,
-  };
-
   // Transfers the given literal into the previously allocated device memory
   // represented by the given ShapedBuffer using the given executor. The shape
   // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible,
@@ -106,13 +96,9 @@ class TransferManager {
   //
   // This operation is performed asynchronously on the given stream. It returns
   // once the transfer is enqueued.
-  //
-  // The optional hint can allow implementations to optimize transfers. It is
-  // not mandatory for an implementation to obey the hint.
   virtual Status TransferLiteralToDeviceAsync(
       se::Stream* stream, const LiteralSlice& literal,
-      const ShapedBuffer& device_buffer,
-      TransferToDeviceHint hint = kNoHint) = 0;
+      const ShapedBuffer& device_buffer) = 0;
 
   // Convenience methods for transferring an array to or from the device at a
   // known address. This avoids having to construct a ShapedBuffer just to
-- 
GitLab


From 100714d9e5eb723525eb54142769f9bd8eec5edd Mon Sep 17 00:00:00 2001
From: Alan Chiao <alanchiao@google.com>
Date: Thu, 4 Oct 2018 10:11:56 -0700
Subject: [PATCH 413/570] Fix quantization util test to pass with defined
 behavior on 32-bit architectures.

PiperOrigin-RevId: 215757844
---
 .../contrib/lite/kernels/internal/quantization_util_test.cc     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
index 14281f25c6..25ea72b886 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
@@ -259,7 +259,7 @@ TEST(QuantizationUtilTest, IntegerFrExpVersusDouble) {
   EXPECT_EQ(double_shift, 1);
 
   result = IntegerFrExp(123.45, &shift);
-  EXPECT_NEAR(result, (0.964453 * (1L << 31)), 1000);
+  EXPECT_NEAR(result, (0.964453 * (1LL << 31)), 1000);
   EXPECT_EQ(shift, 7);
   double_result = std::frexp(123.45, &double_shift);
   EXPECT_NEAR(double_result, 0.964453, 1e-5);
-- 
GitLab


From 8622f05a62948d8966be8962a6a33e0a8b5a116d Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Thu, 4 Oct 2018 10:17:02 -0700
Subject: [PATCH 414/570] Don't CHECK-fail on malformed graphs in deadness
 analysis

Instead return a friendlier failed Status from the following two methods which
used to CHECK-fail before:  GetIncomingPreds, FindUniqueBackedge.

While at it, also rename GetIncomingPreds to GetInputPreds to be consistent with
the variable names.

PiperOrigin-RevId: 215758757
---
 tensorflow/compiler/jit/deadness_analysis.cc | 77 ++++++++++++++------
 1 file changed, 55 insertions(+), 22 deletions(-)

diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index e0b9932d80..b7ae7fbeb3 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/jit/deadness_analysis_internal.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -579,7 +580,8 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis {
  private:
   enum class EdgeKind { kDataAndControl, kDataOnly, kControlOnly };
 
-  std::vector<Predicate*> GetIncomingPreds(Node* n, EdgeKind edge_kind);
+  Status GetInputPreds(Node* n, EdgeKind edge_kind,
+                       std::vector<Predicate*>* result);
 
   // Sets the predicate for output `output_idx` of `n` to `pred`.  Sets the i'th
   // bit of `should_revisit` if `pred` is different from the current predicate
@@ -625,9 +627,10 @@ TensorId InputEdgeToTensorId(const Edge* e) {
   return TensorId(e->src()->name(), e->src_output());
 }
 
-std::vector<Predicate*> DeadnessAnalysisImpl::GetIncomingPreds(
-    Node* n, DeadnessAnalysisImpl::EdgeKind edge_kind) {
-  std::vector<Predicate*> incoming_preds;
+Status DeadnessAnalysisImpl::GetInputPreds(
+    Node* n, DeadnessAnalysisImpl::EdgeKind edge_kind,
+    std::vector<Predicate*>* result) {
+  result->clear();
   for (const Edge* in_edge : n->in_edges()) {
     bool should_process =
         edge_kind == EdgeKind::kDataAndControl ||
@@ -636,17 +639,27 @@ std::vector<Predicate*> DeadnessAnalysisImpl::GetIncomingPreds(
 
     if (should_process) {
       auto it = predicate_map_.find(InputEdgeToTensorId(in_edge));
-      CHECK(it != predicate_map_.end()) << n->name();
-      incoming_preds.push_back(it->second);
+      if (it == predicate_map_.end()) {
+        GraphCycles graph_cycles;
+        TF_RETURN_IF_ERROR(CreateCycleDetectionGraph(&graph_, &graph_cycles));
+
+        // If we didn't return with an error above then the graph is probably
+        // fine and we have a bug in deadness analysis.
+        return errors::Internal("Could not find input ", in_edge->DebugString(),
+                                " to ", n->name(),
+                                " when visiting the graph in post-order.  Most "
+                                "likely indicates a bug in deadness analysis.");
+      }
+      result->push_back(it->second);
     }
   }
-  return incoming_preds;
+  return Status::OK();
 }
 
 Status DeadnessAnalysisImpl::HandleSwitch(Node* n,
                                           std::vector<bool>* should_revisit) {
-  std::vector<Predicate*> input_preds =
-      GetIncomingPreds(n, EdgeKind::kDataAndControl);
+  std::vector<Predicate*> input_preds;
+  TF_RETURN_IF_ERROR(GetInputPreds(n, EdgeKind::kDataAndControl, &input_preds));
   const Edge* pred_edge;
   TF_RETURN_IF_ERROR(n->input_edge(1, &pred_edge));
   Predicate* true_switch = predicate_factory_.MakeSymbolPredicate(
@@ -675,17 +688,31 @@ Status DeadnessAnalysisImpl::HandleSwitch(Node* n,
 }
 
 namespace {
-const Edge* FindUniqueBackedge(Node* merge) {
+Status CreateMultipleNextIterationInputsError(Node* merge) {
+  std::vector<string> backedges;
+  for (const Edge* backedge : merge->in_edges()) {
+    if (backedge->src()->IsNextIteration()) {
+      backedges.push_back(absl::StrCat("  ", SummarizeNode(*backedge->src())));
+    }
+  }
+  return errors::InvalidArgument(
+      "Multiple NextIteration inputs to merge node ", SummarizeNode(*merge),
+      ": \n", absl::StrJoin(backedges, "\n"),
+      "\nMerge nodes can have at most one incoming NextIteration edge.");
+}
+
+Status FindUniqueBackedge(Node* merge, const Edge** result) {
+  *result = nullptr;
   CHECK(merge->IsMerge());
-  const Edge* result = nullptr;
   for (const Edge* e : merge->in_edges()) {
     if (e->src()->IsNextIteration()) {
-      CHECK_EQ(result, nullptr)
-          << "Multiple backedges to " << merge->DebugString();
-      result = e;
+      if (*result != nullptr) {
+        return CreateMultipleNextIterationInputsError(merge);
+      }
+      *result = e;
     }
   }
-  return result;
+  return Status::OK();
 }
 
 // If `backedge_predicate` is equal to `symbolic_predicate` & Step where Step
@@ -764,9 +791,12 @@ Status DeadnessAnalysisImpl::HandleMerge(Node* n,
       return Status::OK();
     }
 
+    std::vector<Predicate*> input_preds;
+    TF_RETURN_IF_ERROR(GetInputPreds(n, EdgeKind::kDataOnly, &input_preds));
+
     // We're visiting this merge for the first time and it is a acyclic merge.
-    Predicate* input_data_pred = predicate_factory_.MakeOrPredicate(
-        GetIncomingPreds(n, EdgeKind::kDataOnly));
+    Predicate* input_data_pred =
+        predicate_factory_.MakeOrPredicate(input_preds);
     SetPredicate(n, {0, 1, Graph::kControlSlot}, input_data_pred,
                  should_revisit);
     return Status::OK();
@@ -777,7 +807,9 @@ Status DeadnessAnalysisImpl::HandleMerge(Node* n,
     // of an unvisited backedge.  Try to pattern match the predicate expression
     // for that backedge (which should be visited now) into an and recurrence
     // for the merge node.
-    if (const Edge* unique_backedge = FindUniqueBackedge(n)) {
+    const Edge* unique_backedge;
+    TF_RETURN_IF_ERROR(FindUniqueBackedge(n, &unique_backedge));
+    if (unique_backedge) {
       if (Predicate* step = DeduceStepPredicate(
               &predicate_factory_, it->second,
               predicate_map_[InputEdgeToTensorId(unique_backedge)])) {
@@ -808,8 +840,8 @@ Status DeadnessAnalysisImpl::HandleRecv(Node* n,
                                         std::vector<bool>* should_revisit) {
   // In addition to being alive or dead based on the inputs, a _Recv can also
   // acquire a dead signal from a _Send.
-  std::vector<Predicate*> input_preds =
-      GetIncomingPreds(n, EdgeKind::kDataAndControl);
+  std::vector<Predicate*> input_preds;
+  TF_RETURN_IF_ERROR(GetInputPreds(n, EdgeKind::kDataAndControl, &input_preds));
   input_preds.push_back(predicate_factory_.MakeSymbolPredicate(
       TensorId(n->name(), 0), /*must_be_true=*/false));
   SetPredicate(n, {0, Graph::kControlSlot},
@@ -821,8 +853,9 @@ Status DeadnessAnalysisImpl::HandleRecv(Node* n,
 Status DeadnessAnalysisImpl::HandleGeneric(Node* n,
                                            std::vector<bool>* should_revisit) {
   // Generally nodes are alive iff all their inputs are alive.
-  Predicate* pred = predicate_factory_.MakeAndPredicate(
-      GetIncomingPreds(n, EdgeKind::kDataAndControl));
+  std::vector<Predicate*> input_preds;
+  TF_RETURN_IF_ERROR(GetInputPreds(n, EdgeKind::kDataAndControl, &input_preds));
+  Predicate* pred = predicate_factory_.MakeAndPredicate(input_preds);
   for (int output_idx = 0; output_idx < n->num_outputs(); output_idx++) {
     SetPredicate(n, output_idx, pred, should_revisit);
   }
-- 
GitLab


From 8ac087482f7224273fb6697a66191b2661e86477 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Thu, 4 Oct 2018 10:27:57 -0700
Subject: [PATCH 415/570] Add tensorflow_estimator pip package to
 install_pip_packages.sh

We will need this for remote-build presubmits to pass.

PiperOrigin-RevId: 215760872
---
 tensorflow/tools/ci_build/install/install_pip_packages.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 7f293e8604..329d05342a 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -124,6 +124,10 @@ pip3 install keras_preprocessing==1.0.5 --no-deps
 pip2 install --upgrade h5py==2.8.0
 pip3 install --upgrade h5py==2.8.0
 
+# Estimator
+pip2 install tensorflow_estimator --no-deps
+pip3 install tensorflow_estimator --no-deps
+
 # Install last working version of setuptools.
 pip2 install --upgrade setuptools==39.1.0
 pip3 install --upgrade setuptools==39.1.0
-- 
GitLab


From 419fff9de94ea9573f2e368fd6a68fdf54c59bab Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 10:44:58 -0700
Subject: [PATCH 416/570] Implement LiteralBase::Slice for all primitive type

PiperOrigin-RevId: 215764305
---
 tensorflow/compiler/xla/literal.cc | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index deeb140b8f..177f39cc74 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -727,16 +727,34 @@ Literal LiteralBase::Slice(absl::Span<const int64> start_indices,
       ShapeUtil::MakeShapeWithLayout(shape().element_type(), result_dimensions,
                                      LayoutUtil::MinorToMajor(shape()));
   switch (result_shape.element_type()) {
-    case F32:
-      return SliceInternal<float>(result_shape, start_indices);
+    case PRED:
+      return SliceInternal<bool>(result_shape, start_indices);
+    case U8:
+      return SliceInternal<uint8>(result_shape, start_indices);
+    case U16:
+      return SliceInternal<uint16>(result_shape, start_indices);
+    case U32:
+      return SliceInternal<uint32>(result_shape, start_indices);
+    case U64:
+      return SliceInternal<uint64>(result_shape, start_indices);
+    case S8:
+      return SliceInternal<int8>(result_shape, start_indices);
+    case S16:
+      return SliceInternal<int16>(result_shape, start_indices);
+    case S32:
+      return SliceInternal<int32>(result_shape, start_indices);
+    case S64:
+      return SliceInternal<int64>(result_shape, start_indices);
+    case F16:
+      return SliceInternal<half>(result_shape, start_indices);
     case BF16:
       return SliceInternal<bfloat16>(result_shape, start_indices);
+    case F32:
+      return SliceInternal<float>(result_shape, start_indices);
+    case F64:
+      return SliceInternal<double>(result_shape, start_indices);
     case C64:
       return SliceInternal<complex64>(result_shape, start_indices);
-    case S32:
-      return SliceInternal<int32>(result_shape, start_indices);
-    case U32:
-      return SliceInternal<uint32>(result_shape, start_indices);
     default:
       LOG(FATAL) << "not yet implemented: "
                  << PrimitiveType_Name(result_shape.element_type());
-- 
GitLab


From 5e9bd578802fcfff5de9729332eea4ae85c05c9e Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 4 Oct 2018 10:46:16 -0700
Subject: [PATCH 417/570] [tf.data] Fix C++ shape inference for
 `Dataset.concatenate()`.

Previously, we were returning an unknown shape in
`Dataset::output_shapes()` for the "most specific compatible shape"
between the two inputs. While this does not cause correctness problems
(since the unknown shape *is* compatible), we gain the ability to
raise errors earlier when more shape information is available.

PiperOrigin-RevId: 215764530
---
 tensorflow/core/kernels/data/concatenate_dataset_op.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
index a04f150e71..9607e9444c 100644
--- a/tensorflow/core/kernels/data/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -171,16 +171,16 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
 
     static PartialTensorShape MostSpecificCompatibleShape(
         const PartialTensorShape& ts1, const PartialTensorShape& ts2) {
-      PartialTensorShape output_tensorshape;
       if (ts1.dims() != ts2.dims() || ts1.unknown_rank() || ts2.unknown_rank())
-        return output_tensorshape;
+        return PartialTensorShape();
+      PartialTensorShape output_tensorshape({});
       auto dims1 = ts1.dim_sizes();
       auto dims2 = ts2.dim_sizes();
       for (int d = 0; d < ts1.dims(); d++) {
         if (dims1[d] == dims2[d])
-          output_tensorshape.Concatenate(dims1[d]);
+          output_tensorshape.AddDim(dims1[d]);
         else
-          output_tensorshape.Concatenate(-1);
+          output_tensorshape.AddDim(-1);
       }
       return output_tensorshape;
     }
-- 
GitLab


From e1a8f4b03df2ef84538c01788b6043eb723cd046 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 11:04:41 -0700
Subject: [PATCH 418/570] Automated rollback of commit
 8dc7bc7764150253c03a666eee84fc48f867d6a2

PiperOrigin-RevId: 215768310
---
 .../toco/graph_transformations/resolve_constant_binary.cc | 8 --------
 .../resolve_constant_concatenation.cc                     | 7 -------
 .../graph_transformations/resolve_constant_fake_quant.cc  | 7 -------
 .../toco/graph_transformations/resolve_constant_fill.cc   | 7 -------
 .../toco/graph_transformations/resolve_constant_gather.cc | 8 --------
 .../toco/graph_transformations/resolve_constant_pack.cc   | 8 --------
 .../resolve_constant_random_uniform.cc                    | 7 -------
 .../toco/graph_transformations/resolve_constant_range.cc  | 8 --------
 .../graph_transformations/resolve_constant_reshape.cc     | 7 -------
 .../toco/graph_transformations/resolve_constant_select.cc | 8 --------
 .../resolve_constant_shape_or_rank.cc                     | 8 --------
 .../toco/graph_transformations/resolve_constant_slice.cc  | 8 --------
 .../resolve_constant_strided_slice.cc                     | 8 --------
 .../toco/graph_transformations/resolve_constant_tile.cc   | 7 -------
 .../graph_transformations/resolve_constant_transpose.cc   | 8 --------
 .../toco/graph_transformations/resolve_constant_unary.cc  | 8 --------
 16 files changed, 122 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
index 3e57d3f467..f7e5aa6609 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
@@ -191,14 +191,6 @@ void EvaluateBinaryOperatorOnConstantInputs(Model* model,
 bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
   const auto binary_it = model->operators.begin() + op_index;
   const auto* binary_op = binary_it->get();
-
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, binary_op->outputs[0])) {
-    return false;
-  }
-
   // Test for binary ops of types that we know how to resolve
   if (binary_op->type != OperatorType::kAdd &&
       binary_op->type != OperatorType::kMul &&
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index c6c5035a51..d916ae0ddf 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -144,13 +144,6 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
   const auto* concat_op =
       static_cast<const ConcatenationOperator*>(concat_base_op);
 
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, concat_op->outputs[0])) {
-    return false;
-  }
-
   for (const string& input_name : concat_op->inputs) {
     // We only expect constant unquantized arrays as input, otherwise we return.
     // We  also make sure the shapes of the input arrays are known and they are
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
index 3d797533c9..f5f2f77460 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
@@ -69,13 +69,6 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
   const auto* fakequant_op =
       static_cast<const FakeQuantOperator*>(fakequant_base_op);
 
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, fakequant_op->outputs[0])) {
-    return false;
-  }
-
   // Yield until the fakequant MinMax has been resolved.
   if (!fakequant_op->minmax) {
     return false;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
index 2cb1e64f3a..f6f95481b5 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
@@ -52,13 +52,6 @@ bool ResolveConstantFill::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
 
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, op->outputs[0])) {
-    return false;
-  }
-
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
index 4dfe203a25..36d7dad0ce 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
@@ -71,14 +71,6 @@ bool ResolveConstantGather::Run(Model* model, std::size_t op_index) {
 
   CHECK_GE(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
-
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, op->outputs[0])) {
-    return false;
-  }
-
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_pack.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_pack.cc
index 6f44025dd4..e86616574d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_pack.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_pack.cc
@@ -59,14 +59,6 @@ bool ResolveConstantPack::Run(Model* model, std::size_t op_index) {
 
   CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
-
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, op->outputs[0])) {
-    return false;
-  }
-
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
index c9f2b95d09..88d06d7dc7 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
@@ -70,13 +70,6 @@ bool ResolveConstantRandomUniform::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
 
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, op->outputs[0])) {
-    return false;
-  }
-
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
index e347286dd4..1a0ba9e2bc 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
@@ -28,14 +28,6 @@ bool ResolveConstantRange::Run(Model* model, std::size_t op_index) {
   auto* op = static_cast<RangeOperator*>(base_op);
 
   CHECK_EQ(op->inputs.size(), 3);
-
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, op->outputs[0])) {
-    return false;
-  }
-
   const auto& start_array = model->GetArray(op->inputs[0]);
   if (!start_array.has_shape()) {
     // Yield until all input dims have been resolved.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
index bfdaa8aafd..a6f665b5f0 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
@@ -33,13 +33,6 @@ bool ResolveConstantReshape::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
 
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, op->outputs[0])) {
-    return false;
-  }
-
   // We require constant inputs.
   if (!IsConstantParameterArray(*model, op->inputs[0]) ||
       !IsConstantParameterArray(*model, op->inputs[1])) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc
index 3a95d39cd4..e880a3f44d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc
@@ -37,14 +37,6 @@ bool ResolveConstantSelect::Run(Model* model, std::size_t op_index) {
 
   CHECK_GE(op->inputs.size(), 3);
   CHECK_EQ(op->outputs.size(), 1);
-
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, op->outputs[0])) {
-    return false;
-  }
-
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
index 452bef1f16..8a0e3e8995 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
@@ -27,14 +27,6 @@ bool ResolveConstantShapeOrRank::Run(Model* model, std::size_t op_index) {
   }
 
   CHECK_EQ(op->outputs.size(), 1);
-
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, op->outputs[0])) {
-    return false;
-  }
-
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been resolved
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc
index 58d6797e1c..b35c3e19c4 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc
@@ -96,14 +96,6 @@ bool ResolveConstantSlice::Run(Model* model, std::size_t op_index) {
   const SliceOperator* op = static_cast<const SliceOperator*>(base_op);
 
   CHECK_EQ(op->outputs.size(), 1);
-
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, op->outputs[0])) {
-    return false;
-  }
-
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
index e275447a0c..8853ed87e6 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
@@ -114,14 +114,6 @@ bool ResolveConstantStridedSlice::Run(Model* model, std::size_t op_index) {
       static_cast<const StridedSliceOperator*>(base_op);
 
   CHECK_EQ(op->outputs.size(), 1);
-
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, op->outputs[0])) {
-    return false;
-  }
-
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
index 378a38f14b..5cfa1a5582 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
@@ -105,13 +105,6 @@ bool ResolveConstantTile::Run(Model* model, std::size_t op_index) {
   }
   const auto* op = static_cast<const TensorFlowTileOperator*>(base_op);
 
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, op->outputs[0])) {
-    return false;
-  }
-
   CHECK_GE(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
   auto& output_array = model->GetArray(op->outputs[0]);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
index 5d3f4a6240..fe15dfa06f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
@@ -111,14 +111,6 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
 
   CHECK_EQ(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
-
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, op->outputs[0])) {
-    return false;
-  }
-
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index e35ed0898b..c698a9567a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -48,14 +48,6 @@ bool CopyMinMaxFromFirstInput(const Operator& op, Model* model) {
 bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
   const auto unary_it = model->operators.begin() + op_index;
   const auto* unary_op = unary_it->get();
-
-  // If the output of this op is a non-discardable array such as an input_array
-  // or a state array of the model, then this is a job for RemoveUnusedOp, not
-  // for constants-propagation.
-  if (!IsDiscardableArray(*model, unary_op->outputs[0])) {
-    return false;
-  }
-
   // Test for unary ops of types that we know how to resolve.
   switch (unary_op->type) {
     case OperatorType::kCast:
-- 
GitLab


From 6850dafeeaaa48efa748134688844bd079ef3949 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 11:09:52 -0700
Subject: [PATCH 419/570] collective_param_resolver_local.cc: delete
 DCHECK(!ir->out_mu.try_lock()); in a lambda

UNLOCK_FUNCTION(ir->out_mu) annotates that the lock is held on entry.
try_lock() should not be called.

PiperOrigin-RevId: 215769341
---
 .../core/common_runtime/collective_param_resolver_local.cc       | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index 3b2dc6a050..7cb90de3c7 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -522,7 +522,6 @@ void CollectiveParamResolverLocal::CallInitInstanceSharedParams(
   InitInstanceSharedParams(
       gr, cp, ir,
       [this, ir, done](const Status& s) UNLOCK_FUNCTION(ir->out_mu) {
-        DCHECK(!ir->out_mu.try_lock());
         DCHECK(ir->out_mu_available);
         ir->status.Update(s);
         ir->out_mu.unlock();
-- 
GitLab


From c8d5054e8c12800f0c3db0e51f3d5902e04eaa37 Mon Sep 17 00:00:00 2001
From: Tong Shen <endlessroad@google.com>
Date: Thu, 4 Oct 2018 11:24:41 -0700
Subject: [PATCH 420/570] Roll forward change "Skip control flow
 functionalization if there is no Switch or Merge node.".

PiperOrigin-RevId: 215772272
---
 .../tf2xla/functionalize_control_flow.cc      | 129 ++++++++++++------
 .../core/common_runtime/constant_folding.cc   |  37 ++---
 .../core/common_runtime/constant_folding.h    |   4 +
 .../core/common_runtime/graph_optimizer.cc    |   5 +-
 .../core/common_runtime/graph_optimizer.h     |   5 +-
 5 files changed, 122 insertions(+), 58 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 36c6f5d316..28e09d7b79 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -79,7 +79,10 @@ Status FunctionalizeControlFlowForFunction(
     const string& func_name, const string& new_func_name,
     const protobuf::Map<string, tensorflow::AttrValue>& attrs,
     FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr,
-    std::map<string, string>* canonicalized_name_to_new_name) {
+    std::map<string, absl::optional<string>>* canonicalized_name_to_new_name,
+    bool* modified) {
+  *modified = false;
+
   // Convert the function to Graph.
   FunctionLibraryRuntime::Handle handle;
   TF_RETURN_IF_ERROR(flr->Instantiate(func_name, AttrSlice(&attrs), &handle));
@@ -92,6 +95,19 @@ Status FunctionalizeControlFlowForFunction(
   });
   const FunctionBody* body = flr->GetFunctionBody(handle);
 
+  // Check if the graph has Switch or Merge node before optimizing the graph.
+  bool has_switch_or_merge = false;
+  for (Node* n : body->graph->nodes()) {
+    if (n->type_string() == "Switch" || n->type_string() == "Merge") {
+      has_switch_or_merge = true;
+      break;
+    }
+  }
+  // We cannot return here directly if the graph has no Switch/Merge.
+  // It might contain function call nodes, or If/While nodes with Switch/Merge
+  // in function body. We still need to rewrite those functions and modify
+  // corresponding nodes.
+
   // Call graph optimizer. The most important optimization we need is constant
   // folding, which will replace ops like Shape/BroadcastGradientArgs with
   // constant shape input. Without this optimization, those ops might become
@@ -129,6 +145,13 @@ Status FunctionalizeControlFlowForFunction(
         absl::StrCat("functionalize_control_flow_after_opt_", func_name),
         *optimized_graph, fld);
   }
+  // Some inlined functions might have Switch/Merge nodes.
+  for (Node* n : optimized_graph->nodes()) {
+    if (n->type_string() == "Switch" || n->type_string() == "Merge") {
+      has_switch_or_merge = true;
+      break;
+    }
+  }
 
   // If any node has associated functions, functionalize them first.
   // Gather nodes with associated functions first, because rewriting those nodes
@@ -151,10 +174,15 @@ Status FunctionalizeControlFlowForFunction(
           Canonicalize(name, AttrSlice(&associated_function.attrs()));
       auto iter = canonicalized_name_to_new_name->find(canonicalized_name);
       string new_name;
+      bool function_modified;
       if (iter != canonicalized_name_to_new_name->end()) {
-        // If we already functionalized this function, skip functionalization
-        // but still rewrite the node.
-        new_name = iter->second;
+        // If we already processed this function, check if it was rewritten. If
+        // the function was rewritten, the entry will be non-empty. Otherwise
+        // the entry will be empty.
+        function_modified = iter->second.has_value();
+        if (function_modified) {
+          new_name = iter->second.value();
+        }
       } else {
         if (associated_function.type() ==
             AssociatedFunctionInfo::AssociatedFunctionType::kSymbolicGradient) {
@@ -166,42 +194,62 @@ Status FunctionalizeControlFlowForFunction(
         }
         TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction(
             name, new_name, associated_function.attrs(), fld, flr,
-            canonicalized_name_to_new_name));
-        (*canonicalized_name_to_new_name)[canonicalized_name] = new_name;
+            canonicalized_name_to_new_name, &function_modified));
+        if (function_modified) {
+          // If the function was rewritten, add an non-empty entry. So later we
+          // know we have processed this function, and it was rewritten into
+          // another function.
+          (*canonicalized_name_to_new_name)[canonicalized_name] = new_name;
+        } else {
+          // If the function was not rewritten, add an empty entry. So later
+          // we know we have processed this function, and it does not need to be
+          // rewritten.
+          (*canonicalized_name_to_new_name)[canonicalized_name] = absl::nullopt;
+        }
+      }
+      if (function_modified) {
+        *modified = true;
+
+        // Notice that if "n" is a function call, RewriteAssociatedFunction()
+        // will delete it and create a new node instead, making "n" an invalid
+        // pointer. That's fine because in that case, associated_functions will
+        // only have one member and the loop will only run once.
+        TF_RETURN_IF_ERROR(RewriteAssociatedFunction(
+            optimized_graph.get(), n, fld, associated_function, new_name));
       }
-      // Notice that if "n" is a function call, RewriteAssociatedFunction() will
-      // delete it and create a new node instead, making "n" an invalid pointer.
-      // That's fine because in that case, associated_functions will only have
-      // one member and the loop will only run once.
-      TF_RETURN_IF_ERROR(RewriteAssociatedFunction(
-          optimized_graph.get(), n, fld, associated_function, new_name));
     }
   }
 
-  // Functionalize the function body.
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("functionalize_control_flow_before_fdef_", func_name),
-        *optimized_graph, fld);
-  }
-  TF_RETURN_IF_ERROR(FunctionalizeControlFlow(optimized_graph.get(), fld));
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("functionalize_control_flow_after_fdef_", func_name),
-        *optimized_graph, fld);
+  if (has_switch_or_merge) {
+    *modified = true;
+
+    // Functionalize the function body.
+    if (VLOG_IS_ON(4)) {
+      dump_graph::DumpGraphToFile(
+          absl::StrCat("functionalize_control_flow_before_fdef_", func_name),
+          *optimized_graph, fld);
+    }
+    TF_RETURN_IF_ERROR(FunctionalizeControlFlow(optimized_graph.get(), fld));
+    if (VLOG_IS_ON(4)) {
+      dump_graph::DumpGraphToFile(
+          absl::StrCat("functionalize_control_flow_after_fdef_", func_name),
+          *optimized_graph, fld);
+    }
   }
-  FunctionDef functionalized_fdef;
-  TF_RETURN_IF_ERROR(GraphToFunctionDef(*optimized_graph, new_func_name,
-                                        &functionalized_fdef));
 
-  // Add rewritten FunctionDef into library.
-  if (func_name == new_func_name) {
-    VLOG(2) << "Replacing function " << func_name;
-    TF_RETURN_IF_ERROR(
-        fld->ReplaceFunction(new_func_name, functionalized_fdef));
-  } else {
-    VLOG(2) << "Adding function " << new_func_name;
-    TF_RETURN_IF_ERROR(fld->AddFunctionDef(functionalized_fdef));
+  if (*modified) {
+    // Add rewritten FunctionDef into library.
+    FunctionDef functionalized_fdef;
+    TF_RETURN_IF_ERROR(GraphToFunctionDef(*optimized_graph, new_func_name,
+                                          &functionalized_fdef));
+    if (func_name == new_func_name) {
+      VLOG(2) << "Replacing function " << func_name;
+      TF_RETURN_IF_ERROR(
+          fld->ReplaceFunction(new_func_name, functionalized_fdef));
+    } else {
+      VLOG(2) << "Adding function " << new_func_name;
+      TF_RETURN_IF_ERROR(fld->AddFunctionDef(functionalized_fdef));
+    }
   }
 
   return ret_status;
@@ -227,7 +275,7 @@ Status FunctionalizeControlFlowPass::Run(
           {"TPUCompile", "function"},
           {"XlaLaunch", "function"},
       };
-  std::map<string, string> canonicalized_name_to_new_name;
+  std::map<string, absl::optional<string>> canonicalized_name_to_new_name;
   for (Node* n : graph->nodes()) {
     auto it = kNodeTypeToFunctionAttrMapping->find(n->type_string());
     if (it == kNodeTypeToFunctionAttrMapping->end()) {
@@ -242,12 +290,15 @@ Status FunctionalizeControlFlowPass::Run(
               << ". Corresponding function: " << func.name();
       string new_func_name = options.flib_def->UniqueFunctionName(
           absl::StrCat(func.name(), "_f15n_"));
+      bool modified;
       TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction(
           func.name(), new_func_name, func.attr(), options.flib_def, flr,
-          &canonicalized_name_to_new_name));
-      n->ClearAttr(func_attr);
-      func.set_name(new_func_name);
-      n->AddAttr(func_attr, func);
+          &canonicalized_name_to_new_name, &modified));
+      if (modified) {
+        n->ClearAttr(func_attr);
+        func.set_name(new_func_name);
+        n->AddAttr(func_attr, func);
+      }
     }
   }
 
diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc
index 419867ff58..db137f1a19 100644
--- a/tensorflow/core/common_runtime/constant_folding.cc
+++ b/tensorflow/core/common_runtime/constant_folding.cc
@@ -466,7 +466,7 @@ Graph* GetConstantGraph(
 bool ReplaceTensorWithConstant(
     Graph* graph, Device* partition_device, NodeAndOutput tensor,
     const Tensor& constant, const gtl::FlatSet<Node*>& control_deps,
-    int64 max_constant_size_in_bytes,
+    int64 max_constant_size_in_bytes, bool disable_memory_output_type_check,
     const ConstantFoldNameGenerator& generate_new_name) {
   // Be conservative when replacing a tensor with a constant, when not
   // running on CPU.
@@ -535,21 +535,23 @@ bool ReplaceTensorWithConstant(
   if (!NodeBuilder(builder).Finalize(graph, &constant_node).ok()) {
     return false;
   }
-  if (partition_device && device_type != DEVICE_CPU) {
-    MemoryType original_output_memory_type;
-    if (!MemoryTypeForOutput(device_type, graph, tensor.first, tensor.second,
-                             &original_output_memory_type)
-             .ok()) {
-      return false;
-    }
-    MemoryType const_output_memory_type;
-    if (!MemoryTypeForOutput(device_type, graph, constant_node, 0,
-                             &const_output_memory_type)
-             .ok()) {
-      return false;
-    }
-    if (original_output_memory_type != const_output_memory_type) {
-      return false;
+  if (!disable_memory_output_type_check) {
+    if (partition_device && device_type != DEVICE_CPU) {
+      MemoryType original_output_memory_type;
+      if (!MemoryTypeForOutput(device_type, graph, tensor.first, tensor.second,
+                               &original_output_memory_type)
+               .ok()) {
+        return false;
+      }
+      MemoryType const_output_memory_type;
+      if (!MemoryTypeForOutput(device_type, graph, constant_node, 0,
+                               &const_output_memory_type)
+               .ok()) {
+        return false;
+      }
+      if (original_output_memory_type != const_output_memory_type) {
+        return false;
+      }
     }
   }
   for (auto edge : edges_to_remove) {
@@ -658,7 +660,8 @@ Status ConstantFold(const ConstantFoldingOptions& opts,
         constant_control_deps[tensors_to_replace[c].first];
     if (ReplaceTensorWithConstant(
             graph, partition_device, tensors_to_replace[c], outputs[c],
-            control_deps, opts.max_constant_size_in_bytes, generate_new_name)) {
+            control_deps, opts.max_constant_size_in_bytes,
+            opts.disable_memory_output_type_check, generate_new_name)) {
       ++num_nodes_replaced;
     }
   }
diff --git a/tensorflow/core/common_runtime/constant_folding.h b/tensorflow/core/common_runtime/constant_folding.h
index a9a84f761b..4c71b7bd27 100644
--- a/tensorflow/core/common_runtime/constant_folding.h
+++ b/tensorflow/core/common_runtime/constant_folding.h
@@ -45,6 +45,10 @@ struct ConstantFoldingOptions {
   // optimization.
   int64 max_constant_size_in_bytes = 10 * 1024 * 1024;
 
+  // If disable_memory_output_type_check is true, we will disable output memory
+  // type check for constant node replacement.
+  bool disable_memory_output_type_check = false;
+
   // A generator for the name suffix of constant folded nodes. A
   // default id generator that monotonically increases is used if nullptr is
   // passed.
diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc
index 37a979a8f1..91194bc86f 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.cc
+++ b/tensorflow/core/common_runtime/graph_optimizer.cc
@@ -39,7 +39,8 @@ void GraphOptimizer::Optimize(
     const std::unordered_map<string, std::vector<PartialTensorShape>>*
         shape_map,
     const std::function<bool(const Node*)>& cse_consider_fn,
-    const std::function<bool(const Node*)>& cf_consider_fn) {
+    const std::function<bool(const Node*)>& cf_consider_fn,
+    bool cf_disable_memory_output_type_check) {
   Graph* g = graph->get();
   DumpGraph("Initial", g);
 
@@ -64,6 +65,8 @@ void GraphOptimizer::Optimize(
       ConstantFoldingOptions cf_opts;
       cf_opts.shape_map = shape_map;
       cf_opts.consider = cf_consider_fn;
+      cf_opts.disable_memory_output_type_check =
+          cf_disable_memory_output_type_check;
       if (opts_.max_folded_constant_in_bytes() > 0) {
         cf_opts.max_constant_size_in_bytes =
             opts_.max_folded_constant_in_bytes();
diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h
index 789cc56942..8954e9612d 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.h
+++ b/tensorflow/core/common_runtime/graph_optimizer.h
@@ -47,13 +47,16 @@ class GraphOptimizer {
   // returns true will be considered for CSE.
   // If cf_consider_fn is not null then only nodes for which cf_consider_fn
   // returns true will be considered for CF.
+  // If cf_disable_memory_output_type_check is true, CF will discard output
+  // memory type check for constant node replacement.
   void Optimize(
       FunctionLibraryRuntime* runtime, Env* env, Device* device,
       std::unique_ptr<Graph>* graph,
       const std::unordered_map<string, std::vector<PartialTensorShape>>*
           shape_map,
       const std::function<bool(const Node*)>& cse_consider_fn = nullptr,
-      const std::function<bool(const Node*)>& cf_consider_fn = nullptr);
+      const std::function<bool(const Node*)>& cf_consider_fn = nullptr,
+      bool cf_disable_memory_output_type_check = false);
 
   const OptimizerOptions& options() { return opts_; }
 
-- 
GitLab


From 700c3325311e16be9bb4856cbf944d1871ff35c1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 11:30:52 -0700
Subject: [PATCH 421/570] Add "encoding" attribute to string substr op, which
 controls how each "character" is treated:   * BYTE: Position & length refer
 to bytes in the string.  (Default)   * UTF8: The string is interpreted as
 UTF-8 encoded Unicode code points, and position & length are treated relative
 to them.

RELNOTES: Add option to get substring using Unicode characters
PiperOrigin-RevId: 215773373
---
 .../api_def/base_api/api_def_Substr.pbtxt     |  10 +
 .../api_def/python_api/api_def_Substr.pbtxt   |   8 +-
 tensorflow/core/kernels/BUILD                 |   7 +-
 tensorflow/core/kernels/string_util.cc        |   4 -
 tensorflow/core/kernels/string_util.h         |  44 ++
 tensorflow/core/kernels/substr_op.cc          | 162 +++++-
 tensorflow/core/kernels/substr_op_test.cc     | 100 +++-
 tensorflow/core/ops/string_ops.cc             |   1 +
 .../python/kernel_tests/substr_op_test.py     | 503 ++++++++++++------
 tensorflow/python/ops/string_ops.py           |  16 +
 .../tools/api/golden/v1/tensorflow.pbtxt      |   2 +-
 .../api/golden/v1/tensorflow.strings.pbtxt    |   2 +-
 .../tools/api/golden/v2/tensorflow.pbtxt      |   2 +-
 .../api/golden/v2/tensorflow.strings.pbtxt    |   2 +-
 14 files changed, 655 insertions(+), 208 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_Substr.pbtxt b/tensorflow/core/api_def/base_api/api_def_Substr.pbtxt
index 5246090ab3..fe0fcc9508 100644
--- a/tensorflow/core/api_def/base_api/api_def_Substr.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Substr.pbtxt
@@ -16,6 +16,16 @@ END
     name: "len"
     description: <<END
 Scalar defining the number of characters to include in each substring
+END
+  }
+  attr {
+    name: "unit"
+    description: <<END
+The unit that is used to create the substring.  One of: `"BYTE"` (for
+defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8
+encoded Unicode code points).  The default is `"BYTE"`. Results are undefined if
+`unit=UTF8_CHAR` and the `input` strings do not contain structurally valid
+UTF-8.
 END
   }
   out_arg {
diff --git a/tensorflow/core/api_def/python_api/api_def_Substr.pbtxt b/tensorflow/core/api_def/python_api/api_def_Substr.pbtxt
index 4778d7927c..4fb9ee56e9 100644
--- a/tensorflow/core/api_def/python_api/api_def_Substr.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Substr.pbtxt
@@ -1,10 +1,4 @@
 op {
   graph_op_name: "Substr"
-  endpoint {
-    name: "strings.substr"
-  }
-  endpoint {
-    name: "substr"
-    deprecated: true
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 9439ab332c..3a920f26f3 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4458,7 +4458,12 @@ cc_library(
     name = "string_util",
     srcs = ["string_util.cc"],
     hdrs = ["string_util.h"],
-    deps = ["//tensorflow/core:lib"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "@icu//:common",
+    ],
 )
 
 STRING_DEPS = [
diff --git a/tensorflow/core/kernels/string_util.cc b/tensorflow/core/kernels/string_util.cc
index 3a9803a052..92c73220d8 100644
--- a/tensorflow/core/kernels/string_util.cc
+++ b/tensorflow/core/kernels/string_util.cc
@@ -16,10 +16,6 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/errors.h"
 
-namespace {
-inline bool IsTrailByte(char x) { return static_cast<signed char>(x) < -0x40; }
-}  // namespace
-
 namespace tensorflow {
 
 // Sets unit value based on str.
diff --git a/tensorflow/core/kernels/string_util.h b/tensorflow/core/kernels/string_util.h
index 390cf57702..d40e93ea33 100644
--- a/tensorflow/core/kernels/string_util.h
+++ b/tensorflow/core/kernels/string_util.h
@@ -30,6 +30,9 @@ enum class UnicodeEncoding { UTF8 };
 // TODO(edloper): Add support for: UTF32_CHAR, etc.
 enum class CharUnit { BYTE, UTF8_CHAR };
 
+// Whether or not the given byte is the trailing byte of a UTF-8/16/32 char.
+inline bool IsTrailByte(char x) { return static_cast<signed char>(x) < -0x40; }
+
 // Sets `encoding` based on `str`.
 Status ParseUnicodeEncoding(const string& str, UnicodeEncoding* encoding);
 
@@ -40,6 +43,47 @@ Status ParseCharUnit(const string& str, CharUnit* unit);
 // Result may be incorrect if the input string is not valid UTF-8.
 int32 UTF8StrLen(const string& string);
 
+// Get the next UTF8 character position starting at the given position and
+// skipping the given number of characters. Position is a byte offset, and
+// should never be `null`. The function return true if successful. However, if
+// the end of the string is reached before the requested characters, then the
+// position will point to the end of string and this function will return false.
+template <typename T>
+bool ForwardNUTF8CharPositions(const StringPiece in,
+                               const T num_utf8_chars_to_shift, T* pos) {
+  const size_t size = in.size();
+  T utf8_chars_counted = 0;
+  while (utf8_chars_counted < num_utf8_chars_to_shift && *pos < size) {
+    // move forward one utf-8 character
+    do {
+      ++*pos;
+    } while (IsTrailByte(in[*pos]) && *pos < size);
+    ++utf8_chars_counted;
+  }
+  return utf8_chars_counted == num_utf8_chars_to_shift;
+}
+
+// Get the previous UTF8 character position starting at the given position and
+// skipping the given number of characters. Position is a byte offset with a
+// positive value, relative to the beginning of the string, and should never be
+// `null`. The function return true if successful. However, if the beginning of
+// the string is reached before the requested character, then the position will
+// point to the beginning of the string and this function will return false.
+template <typename T>
+bool BackNUTF8CharPositions(const StringPiece in,
+                            const T num_utf8_chars_to_shift, T* pos) {
+  const size_t start = 0;
+  T utf8_chars_counted = 0;
+  while (utf8_chars_counted < num_utf8_chars_to_shift && (*pos > start)) {
+    // move back one utf-8 character
+    do {
+      --*pos;
+    } while (IsTrailByte(in[*pos]) && *pos > start);
+    ++utf8_chars_counted;
+  }
+  return utf8_chars_counted == num_utf8_chars_to_shift;
+}
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_
diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc
index 07f1d6e767..93c427039d 100644
--- a/tensorflow/core/kernels/substr_op.cc
+++ b/tensorflow/core/kernels/substr_op.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/string_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/types.h"
@@ -37,7 +38,11 @@ namespace tensorflow {
 template <typename T>
 class SubstrOp : public OpKernel {
  public:
-  using OpKernel::OpKernel;
+  explicit SubstrOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    string unit;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("unit", &unit));
+    OP_REQUIRES_OK(ctx, ParseCharUnit(unit, &unit_));
+  }
 
   void Compute(OpKernelContext* context) override {
     // Get inputs
@@ -69,11 +74,23 @@ class SubstrOp : public OpKernel {
             tensorflow::internal::SubtleMustCopy(len_tensor.scalar<T>()());
         for (size_t i = 0; i < input_tensor.NumElements(); ++i) {
           StringPiece in(input(i));
-          OP_REQUIRES(
-              context, FastBoundsCheck(std::abs(pos), in.size() + 1),
-              errors::InvalidArgument("pos ", pos, " out of range for string",
-                                      "b'", in, "' at index ", i));
-          StringPiece sub_in = in.substr(AdjustedPosIndex(pos, in), len);
+          T byte_pos = pos;
+          T byte_len = len;
+          switch (unit_) {
+            case CharUnit::UTF8_CHAR:
+              OP_REQUIRES(
+                  context, UpdatePosAndLenForUtf8(in, &byte_pos, &byte_len),
+                  errors::InvalidArgument("pos ", pos, " out of range for ",
+                                          "string at index ", i));
+              break;
+            case CharUnit::BYTE:
+              byte_pos = AdjustedPosIndex(byte_pos, in);
+              OP_REQUIRES(
+                  context, FastBoundsCheck(byte_pos, in.size() + 1),
+                  errors::InvalidArgument("pos ", pos, " out of range for ",
+                                          "string b'", in, "' at index ", i));
+          }
+          StringPiece sub_in = in.substr(byte_pos, byte_len);
           output(i).assign(sub_in.data(), sub_in.size());
         }
       } else {
@@ -84,11 +101,23 @@ class SubstrOp : public OpKernel {
           StringPiece in(input(i));
           const T pos = tensorflow::internal::SubtleMustCopy(pos_flat(i));
           const T len = tensorflow::internal::SubtleMustCopy(len_flat(i));
-          OP_REQUIRES(
-              context, FastBoundsCheck(std::abs(pos), in.size() + 1),
-              errors::InvalidArgument("pos ", pos, " out of range for string",
-                                      "b'", in, "' at index ", i));
-          StringPiece sub_in = in.substr(AdjustedPosIndex(pos, in), len);
+          T byte_pos = pos;
+          T byte_len = len;
+          switch (unit_) {
+            case CharUnit::UTF8_CHAR:
+              OP_REQUIRES(
+                  context, UpdatePosAndLenForUtf8(in, &byte_pos, &byte_len),
+                  errors::InvalidArgument("pos ", pos, " out of range for ",
+                                          "string at index ", i));
+              break;
+            case CharUnit::BYTE:
+              byte_pos = AdjustedPosIndex(byte_pos, in);
+              OP_REQUIRES(
+                  context, FastBoundsCheck(byte_pos, in.size() + 1),
+                  errors::InvalidArgument("pos ", pos, " out of range for ",
+                                          "string b'", in, "' at index ", i));
+          }
+          StringPiece sub_in = in.substr(byte_pos, byte_len);
           output(i).assign(sub_in.data(), sub_in.size());
         }
       }
@@ -151,12 +180,24 @@ class SubstrOp : public OpKernel {
             StringPiece in(input_bcast(i));
             const T pos = tensorflow::internal::SubtleMustCopy(pos_bcast(i));
             const T len = tensorflow::internal::SubtleMustCopy(len_bcast(i));
-            OP_REQUIRES(
-                context,
-                FastBoundsCheck(std::abs(pos), input_bcast(i).size() + 1),
-                errors::InvalidArgument("pos ", pos, " out of range for string",
-                                        "b'", in, "' at index ", i));
-            StringPiece sub_in = in.substr(AdjustedPosIndex(pos, in), len);
+            T byte_pos = pos;
+            T byte_len = len;
+            switch (unit_) {
+              case CharUnit::UTF8_CHAR:
+                OP_REQUIRES(
+                    context, UpdatePosAndLenForUtf8(in, &byte_pos, &byte_len),
+                    errors::InvalidArgument("pos ", pos, " out of range for ",
+                                            "string at index ", i));
+                break;
+              case CharUnit::BYTE:
+                byte_pos = AdjustedPosIndex(byte_pos, in);
+                OP_REQUIRES(
+                    context,
+                    FastBoundsCheck(byte_pos, input_bcast(i).size() + 1),
+                    errors::InvalidArgument("pos ", pos, " out of range for ",
+                                            "string b'", in, "' at index ", i));
+            }
+            StringPiece sub_in = in.substr(byte_pos, byte_len);
             output(i).assign(sub_in.data(), sub_in.size());
           }
           break;
@@ -205,12 +246,24 @@ class SubstrOp : public OpKernel {
                   tensorflow::internal::SubtleMustCopy(pos_bcast(i, j));
               const T len =
                   tensorflow::internal::SubtleMustCopy(len_bcast(i, j));
-              OP_REQUIRES(
-                  context, FastBoundsCheck(std::abs(pos), in.size() + 1),
-                  errors::InvalidArgument("pos ", pos, " out of range for ",
-                                          "string b'", in, "' at index (", i,
-                                          ", ", j, ")"));
-              StringPiece sub_in = in.substr(AdjustedPosIndex(pos, in), len);
+              T byte_pos = pos;
+              T byte_len = len;
+              switch (unit_) {
+                case CharUnit::UTF8_CHAR:
+                  OP_REQUIRES(
+                      context, UpdatePosAndLenForUtf8(in, &byte_pos, &byte_len),
+                      errors::InvalidArgument("pos ", pos, " out of range for ",
+                                              "string at index ", i));
+                  break;
+                case CharUnit::BYTE:
+                  byte_pos = AdjustedPosIndex(byte_pos, in);
+                  OP_REQUIRES(
+                      context, FastBoundsCheck(byte_pos, in.size() + 1),
+                      errors::InvalidArgument("pos ", pos, " out of range for ",
+                                              "string b'", in, "' at index (",
+                                              i, ", ", j, ")"));
+              }
+              StringPiece sub_in = in.substr(byte_pos, byte_len);
               output(i, j).assign(sub_in.data(), sub_in.size());
             }
           }
@@ -227,12 +280,73 @@ class SubstrOp : public OpKernel {
  private:
   // This adjusts the requested position. Note it does not perform any bound
   // checks.
-  T AdjustedPosIndex(const T pos_requested, const StringPiece s) {
+  static inline T AdjustedPosIndex(const T pos_requested, const StringPiece s) {
     if (pos_requested < 0) {
       return s.size() + pos_requested;
     }
     return pos_requested;
   }
+
+  // Return true if successful; otherwise, return false if the `pos` argument
+  // is out of range in the string.
+  static inline bool UpdatePosAndLenForUtf8(const StringPiece in, T* pos,
+                                            T* len) {
+    if (*pos >= 0) {
+      return UpdatePositivePosAndLenForUtf8(in, *pos, *len, pos, len);
+    } else {
+      return UpdateNegativePosAndLenForUtf8(in, *pos, *len, pos, len);
+    }
+  }
+
+  static bool UpdatePositivePosAndLenForUtf8(const StringPiece in, const T pos,
+                                             const T len, T* char_pos,
+                                             T* char_len) {
+    *char_pos = 0;
+    // Determine byte position of the substring start.
+    if (!ForwardNUTF8CharPositions(in, pos, char_pos)) {
+      return false;
+    }
+    // Determine position of the end of the substring.
+    // The length will be capped at the end of the string, and we ignore whether
+    // the string had enough characters to handle it or not.
+    *char_len = *char_pos;
+    ForwardNUTF8CharPositions(in, len, char_len);
+    // The length in bytes is the position end of the substring less the start.
+    *char_len = *char_len - *char_pos;
+    return true;
+  }
+
+  // This function expects a negative position relative to the end of the
+  // string, but will update the character position to a positive number
+  // relative to the beginning of the string.
+  static bool UpdateNegativePosAndLenForUtf8(const StringPiece in, const T pos,
+                                             const T len, T* char_pos,
+                                             T* char_len) {
+    // Initially treat the length as position of the end of the substring.
+    *char_len = in.size();
+    // This is the number of character to skip from the end of the string to
+    // arrive at the position where the substring should end.
+    T utf8_chars_to_skip = -pos - len;
+    if (utf8_chars_to_skip < 0) {
+      utf8_chars_to_skip = 0;
+    }
+    // Find the byte position where the substring should end using the computed
+    // number of characters to skip.
+    if (!BackNUTF8CharPositions(in, utf8_chars_to_skip, char_len)) {
+      return false;
+    }
+    // Next, determine where the substring should begin. The number of chars to
+    // skip is the requested position minus the chars we've previously skipped.
+    *char_pos = *char_len;
+    if (!BackNUTF8CharPositions(in, -pos - utf8_chars_to_skip, char_pos)) {
+      return false;
+    }
+    // The length in bytes is the position end of the substring less the start.
+    *char_len = *char_len - *char_pos;
+    return true;
+  }
+
+  CharUnit unit_ = CharUnit::BYTE;
 };
 
 #define REGISTER_SUBSTR(type)                                      \
diff --git a/tensorflow/core/kernels/substr_op_test.cc b/tensorflow/core/kernels/substr_op_test.cc
index 2e07050260..ea6b1ed500 100644
--- a/tensorflow/core/kernels/substr_op_test.cc
+++ b/tensorflow/core/kernels/substr_op_test.cc
@@ -42,7 +42,7 @@ limitations under the License.
 namespace tensorflow {
 
 // Test data from the TensorFlow README.md.
-const char* lines[] = {
+const char* ascii_lines[] = {
     "**TensorFlow** is an open source software library for numerical "
     "computation using data flow graphs.",
     "The graph nodes represent mathematical operations, while the graph edges "
@@ -64,17 +64,76 @@ const char* lines[] = {
     "backwards compatibility guarantee like C++, Go, Java, JavaScript and "
     "Swift."};
 
+const char* unicode_lines[] = {
+    "TensorFlow\xe6\x98\xaf\xe4\xb8\x80\xe4\xb8\xaa\xe4\xbd\xbf\xe7\x94\xa8\xe6"
+    "\x95\xb0\xe6\x8d\xae\xe6\xb5\x81\xe5\x9b\xbe\xe8\xbf\x9b\xe8\xa1\x8c\xe6"
+    "\x95\xb0\xe5\x80\xbc\xe8\xae\xa1\xe7\xae\x97\xe7\x9a\x84\xe5\xbc\x80\xe6"
+    "\xba\x90\xe8\xbd\xaf\xe4\xbb\xb6\xe5\xba\x93\xe3\x80\x82",
+    "\xe5\x9b\xbe\xe5\xbd\xa2\xe8\x8a\x82\xe7\x82\xb9\xe8\xa1\xa8\xe7\xa4\xba"
+    "\xe6\x95\xb0\xe5\xad\xa6\xe8\xbf\x90\xe7\xae\x97\xef\xbc\x8c\xe8\x80\x8c"
+    "\xe5\x9b\xbe\xe5\xbd\xa2\xe8\xbe\xb9\xe7\xbc\x98\xe8\xa1\xa8\xe7\xa4\xba"
+    "\xe5\x9c\xa8\xe5\xae\x83\xe4\xbb\xac\xe4\xb9\x8b\xe9\x97\xb4\xe6\xb5\x81"
+    "\xe5\x8a\xa8\xe7\x9a\x84\xe5\xa4\x9a\xe7\xbb\xb4\xe6\x95\xb0\xe6\x8d\xae"
+    "\xe9\x98\xb5\xe5\x88\x97\xef\xbc\x88\xe5\xbc\xa0\xe9\x87\x8f\xef\xbc\x89"
+    "\xe3\x80\x82",
+    "\xe8\xbf\x99\xe7\xa7\x8d\xe7\x81\xb5\xe6\xb4\xbb\xe7\x9a\x84\xe4\xbd\x93"
+    "\xe7\xb3\xbb\xe7\xbb\x93\xe6\x9e\x84\xe4\xbd\xbf\xe6\x82\xa8\xe5\x8f\xaf"
+    "\xe4\xbb\xa5\xe5\xb0\x86\xe8\xae\xa1\xe7\xae\x97\xe9\x83\xa8\xe7\xbd\xb2"
+    "\xe5\x88\xb0\xe6\xa1\x8c\xe9\x9d\xa2\xef\xbc\x8c\xe6\x9c\x8d\xe5\x8a\xa1"
+    "\xe5\x99\xa8\xe6\x88\x96\xe7\xa7\xbb\xe5\x8a\xa8\xe8\xae\xbe\xe5\xa4\x87"
+    "\xe4\xb8\xad\xe7\x9a\x84\xe4\xb8\x80\xe4\xb8\xaa\xe6\x88\x96\xe5\xa4\x9a"
+    "\xe4\xb8\xaa CPU\xe6\x88\x96GPU\xef\xbc\x8c\xe8\x80\x8c\xe6\x97\xa0\xe9"
+    "\x9c\x80\xe9\x87\x8d\xe5\x86\x99\xe4\xbb\xa3\xe7\xa0\x81\xe3\x80\x82",
+    "TensorFlow\xe8\xbf\x98\xe5\x8c\x85\xe6\x8b\xac[TensorBoard]\xef\xbc\x88"
+    "https://www.tensorflow.org/guide/summaries_and_tensorboard\xef\xbc\x89\xef"
+    "\xbc\x8c\xe8\xbf\x99\xe6\x98\xaf\xe4\xb8\x80\xe4\xb8\xaa\xe6\x95\xb0\xe6"
+    "\x8d\xae\xe5\x8f\xaf\xe8\xa7\x86\xe5\x8c\x96\xe5\xb7\xa5\xe5\x85\xb7\xe5"
+    "\x8c\x85\xe3\x80\x82",
+    "TensorFlow\xe6\x9c\x80\xe5\x88\x9d\xe6\x98\xaf\xe7\x94\xb1\xe7\xa0\x94\xe7"
+    "\xa9\xb6\xe4\xba\xba\xe5\x91\x98\xe5\x92\x8c\xe5\xb7\xa5\xe7\xa8\x8b\xe5"
+    "\xb8\x88\xe5\x9c\xa8Google\xe6\x9c\xba\xe5\x99\xa8\xe6\x99\xba\xe8\x83\xbd"
+    "\xe7\xa0\x94\xe7\xa9\xb6\xe7\xbb\x84\xe7\xbb\x87\xe7\x9a\x84Google Brain"
+    "\xe5\x9b\xa2\xe9\x98\x9f\xe5\xbc\x80\xe5\x8f\x91\xe7\x9a\x84\xef\xbc\x8c"
+    "\xe7\x9b\xae\xe7\x9a\x84\xe6\x98\xaf\xe8\xbf\x9b\xe8\xa1\x8c\xe6\x9c\xba"
+    "\xe5\x99\xa8\xe5\xad\xa6\xe4\xb9\xa0\xe5\x92\x8c\xe6\xb7\xb1\xe5\xba\xa6"
+    "\xe7\xa5\x9e\xe7\xbb\x8f\xe7\xbd\x91\xe7\xbb\x9c\xe7\xa0\x94\xe7\xa9\xb6"
+    "\xe3\x80\x82",
+    "\xe8\xaf\xa5\xe7\xb3\xbb\xe7\xbb\x9f\xe8\xb6\xb3\xe4\xbb\xa5\xe9\x80\x82"
+    "\xe7\x94\xa8\xe4\xba\x8e\xe5\x90\x84\xe7\xa7\x8d\xe5\x85\xb6\xe4\xbb\x96"
+    "\xe9\xa2\x86\xe5\x9f\x9f\xe4\xb9\x9f\xe6\x98\xaf\xe5\xa6\x82\xe6\xad\xa4"
+    "\xe3\x80\x82",
+    "TensorFlow\xe6\x8f\x90\xe4\xbe\x9b\xe7\xa8\xb3\xe5\xae\x9a\xe7\x9a\x84"
+    "Python API\xe5\x92\x8c C API\xef\xbc\x8c\xe4\xbb\xa5\xe5\x8f\x8a\xe6\xb2"
+    "\xa1\xe6\x9c\x89 API\xe5\x90\x91\xe5\x90\x8e\xe5\x85\xbc\xe5\xae\xb9\xe6"
+    "\x80\xa7\xe4\xbf\x9d\xe8\xaf\x81\xef\xbc\x8c\xe5\xa6\x82 C ++\xef\xbc\x8c"
+    "Go\xef\xbc\x8cJava\xef\xbc\x8cJavaScript\xe5\x92\x8cSwift\xe3\x80\x82",
+};
+
+const char* const kByteUnit = "BYTE";
+const char* const kUTF8Unit = "UTF8_CHAR";
+
 Tensor GetTestTensor(int batch) {
-  const int sz = TF_ARRAYSIZE(lines);
+  const int sz = TF_ARRAYSIZE(ascii_lines);
+  Tensor t(DT_STRING, {batch});
+  auto s = t.flat<string>();
+  for (int i = 0; i < batch; ++i) {
+    s(i) = ascii_lines[i % sz];
+  }
+  return t;
+}
+
+Tensor GetTestUTF8Tensor(int batch) {
+  const int sz = TF_ARRAYSIZE(unicode_lines);
   Tensor t(DT_STRING, {batch});
   auto s = t.flat<string>();
   for (int i = 0; i < batch; ++i) {
-    s(i) = lines[i % sz];
+    s(i) = unicode_lines[i % sz];
   }
   return t;
 }
 
-Graph* SetupSubstrGraph(const Tensor& input, const int32 pos, const int32 len) {
+Graph* SetupSubstrGraph(const Tensor& input, const int32 pos, const int32 len,
+                        const char* const unit) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor position(DT_INT32, TensorShape({}));
   position.flat<int32>().setConstant(pos);
@@ -85,21 +144,46 @@ Graph* SetupSubstrGraph(const Tensor& input, const int32 pos, const int32 len) {
                   .Input(test::graph::Constant(g, input))
                   .Input(test::graph::Constant(g, position))
                   .Input(test::graph::Constant(g, length))
+                  .Attr("unit", unit)
                   .Finalize(g, nullptr /* node */));
   return g;
 }
 
-void BM_Substr(int iters, int batch_size) {
+void BM_SubstrByte(int iters, int batch_size) {
   testing::StopTiming();
   testing::ItemsProcessed(static_cast<int64>(iters));
   testing::UseRealTime();
   Tensor input = GetTestTensor(batch_size);
-  Graph* g = SetupSubstrGraph(input, 3, 30);
+  Graph* g = SetupSubstrGraph(input, 3, 30, kByteUnit);
+  testing::StartTiming();
+  test::Benchmark("cpu", g).Run(iters);
+}
+
+void BM_SubstrUTF8(int iters, int batch_size) {
+  testing::StopTiming();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  testing::UseRealTime();
+  Tensor input = GetTestUTF8Tensor(batch_size);
+  Graph* g = SetupSubstrGraph(input, 3, 30, kUTF8Unit);
   testing::StartTiming();
   test::Benchmark("cpu", g).Run(iters);
 }
 
-BENCHMARK(BM_Substr)->Arg(1)->Arg(8)->Arg(16)->Arg(32)->Arg(64)->Arg(128)->Arg(
-    256);
+BENCHMARK(BM_SubstrByte)
+    ->Arg(1)
+    ->Arg(8)
+    ->Arg(16)
+    ->Arg(32)
+    ->Arg(64)
+    ->Arg(128)
+    ->Arg(256);
+BENCHMARK(BM_SubstrUTF8)
+    ->Arg(1)
+    ->Arg(8)
+    ->Arg(16)
+    ->Arg(32)
+    ->Arg(64)
+    ->Arg(128)
+    ->Arg(256);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index b4fbde54d9..94d71a4113 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -223,6 +223,7 @@ REGISTER_OP("Substr")
     .Input("len: T")
     .Output("output: string")
     .Attr("T: {int32, int64}")
+    .Attr("unit: {'BYTE', 'UTF8_CHAR'} = 'BYTE'")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle pos_shape = c->input(1);
       ShapeHandle len_shape = c->input(2);
diff --git a/tensorflow/python/kernel_tests/substr_op_test.py b/tensorflow/python/kernel_tests/substr_op_test.py
index cd3fe14883..37aa624b07 100644
--- a/tensorflow/python/kernel_tests/substr_op_test.py
+++ b/tensorflow/python/kernel_tests/substr_op_test.py
@@ -28,270 +28,448 @@ from tensorflow.python.platform import test
 
 class SubstrOpTest(test.TestCase, parameterized.TestCase):
 
-  def _testScalarString(self, dtype):
-    test_string = b"Hello"
-    position = np.array(1, dtype)
+  @parameterized.parameters(
+      (np.int32, 1, "BYTE"),
+      (np.int64, 1, "BYTE"),
+      (np.int32, -4, "BYTE"),
+      (np.int64, -4, "BYTE"),
+      (np.int32, 1, "UTF8_CHAR"),
+      (np.int64, 1, "UTF8_CHAR"),
+      (np.int32, -4, "UTF8_CHAR"),
+      (np.int64, -4, "UTF8_CHAR"),
+  )
+  def testScalarString(self, dtype, pos, unit):
+    test_string = {
+        "BYTE": b"Hello",
+        "UTF8_CHAR": u"He\xc3\xc3\U0001f604".encode("utf-8"),
+    }[unit]
+    expected_value = {
+        "BYTE": b"ell",
+        "UTF8_CHAR": u"e\xc3\xc3".encode("utf-8"),
+    }[unit]
+    position = np.array(pos, dtype)
     length = np.array(3, dtype)
-    expected_value = b"ell"
-
-    substr_op = string_ops.substr(test_string, position, length)
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       substr = substr_op.eval()
       self.assertAllEqual(substr, expected_value)
 
-    # Negative position.
-    test_string = b"Hello"
-    position = np.array(-4, dtype)
+  @parameterized.parameters(
+      (np.int32, "BYTE"),
+      (np.int64, "BYTE"),
+      (np.int32, "UTF8_CHAR"),
+      (np.int64, "UTF8_CHAR"),
+  )
+  def testScalarString_EdgeCases(self, dtype, unit):
+    # Empty string
+    test_string = {
+        "BYTE": b"",
+        "UTF8_CHAR": u"".encode("utf-8"),
+    }[unit]
+    expected_value = b""
+    position = np.array(0, dtype)
     length = np.array(3, dtype)
-    expected_value = b"ell"
-
-    substr_op = string_ops.substr(test_string, position, length)
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       substr = substr_op.eval()
       self.assertAllEqual(substr, expected_value)
 
-    # Position is equal to the length of string.
-    test_string = b""
+    # Full string
+    test_string = {
+        "BYTE": b"Hello",
+        "UTF8_CHAR": u"H\xc3ll\U0001f604".encode("utf-8"),
+    }[unit]
     position = np.array(0, dtype)
-    length = np.array(2, dtype)
-    expected_value = b""
-
-    substr_op = string_ops.substr(test_string, position, length)
+    length = np.array(5, dtype)
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       substr = substr_op.eval()
-      self.assertAllEqual(substr, expected_value)
-
-    # Negative position magnitude is equal to the length of string.
-    test_string = b"yo"
-    position = np.array(-2, dtype)
-    length = np.array(1, dtype)
-    expected_value = b"y"
-
-    substr_op = string_ops.substr(test_string, position, length)
+      self.assertAllEqual(substr, test_string)
+
+    # Full string (Negative)
+    test_string = {
+        "BYTE": b"Hello",
+        "UTF8_CHAR": u"H\xc3ll\U0001f604".encode("utf-8"),
+    }[unit]
+    position = np.array(-5, dtype)
+    length = np.array(5, dtype)
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       substr = substr_op.eval()
-      self.assertAllEqual(substr, expected_value)
-
-  def _testVectorStrings(self, dtype):
-    test_string = [b"Hello", b"World"]
-    position = np.array(1, dtype)
-    length = np.array(3, dtype)
-    expected_value = [b"ell", b"orl"]
-
-    substr_op = string_ops.substr(test_string, position, length)
+      self.assertAllEqual(substr, test_string)
+
+    # Length is larger in magnitude than a negative position
+    test_string = {
+        "BYTE": b"Hello",
+        "UTF8_CHAR": u"H\xc3ll\U0001f604".encode("utf-8"),
+    }[unit]
+    expected_string = {
+        "BYTE": b"ello",
+        "UTF8_CHAR": u"\xc3ll\U0001f604".encode("utf-8"),
+    }[unit]
+    position = np.array(-4, dtype)
+    length = np.array(5, dtype)
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       substr = substr_op.eval()
-      self.assertAllEqual(substr, expected_value)
-
-    # Negative position.
-    test_string = [b"Hello", b"World"]
-    position = np.array(-4, dtype)
+      self.assertAllEqual(substr, expected_string)
+
+  @parameterized.parameters(
+      (np.int32, 1, "BYTE"),
+      (np.int64, 1, "BYTE"),
+      (np.int32, -4, "BYTE"),
+      (np.int64, -4, "BYTE"),
+      (np.int32, 1, "UTF8_CHAR"),
+      (np.int64, 1, "UTF8_CHAR"),
+      (np.int32, -4, "UTF8_CHAR"),
+      (np.int64, -4, "UTF8_CHAR"),
+  )
+  def testVectorStrings(self, dtype, pos, unit):
+    test_string = {
+        "BYTE": [b"Hello", b"World"],
+        "UTF8_CHAR": [x.encode("utf-8") for x in [u"H\xc3llo",
+                                                  u"W\U0001f604rld"]],
+    }[unit]
+    expected_value = {
+        "BYTE": [b"ell", b"orl"],
+        "UTF8_CHAR": [x.encode("utf-8") for x in [u"\xc3ll", u"\U0001f604rl"]],
+    }[unit]
+    position = np.array(pos, dtype)
     length = np.array(3, dtype)
-    expected_value = [b"ell", b"orl"]
-
-    substr_op = string_ops.substr(test_string, position, length)
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       substr = substr_op.eval()
       self.assertAllEqual(substr, expected_value)
 
-  def _testMatrixStrings(self, dtype):
-    test_string = [[b"ten", b"eleven", b"twelve"],
-                   [b"thirteen", b"fourteen", b"fifteen"],
-                   [b"sixteen", b"seventeen", b"eighteen"]]
+  @parameterized.parameters(
+      (np.int32, "BYTE"),
+      (np.int64, "BYTE"),
+      (np.int32, "UTF8_CHAR"),
+      (np.int64, "UTF8_CHAR"),
+  )
+  def testMatrixStrings(self, dtype, unit):
+    test_string = {
+        "BYTE": [[b"ten", b"eleven", b"twelve"],
+                 [b"thirteen", b"fourteen", b"fifteen"],
+                 [b"sixteen", b"seventeen", b"eighteen"]],
+        "UTF8_CHAR": [[x.encode("utf-8") for x in [u"\U0001d229\U0001d227n",
+                                                   u"\xc6\u053c\u025bv\u025bn",
+                                                   u"tw\u0c1dlv\u025b"]],
+                      [x.encode("utf-8") for x in [u"He\xc3\xc3o",
+                                                   u"W\U0001f604rld",
+                                                   u"d\xfcd\xea"]]],
+    }[unit]
     position = np.array(1, dtype)
     length = np.array(4, dtype)
-    expected_value = [[b"en", b"leve", b"welv"], [b"hirt", b"ourt", b"ifte"],
-                      [b"ixte", b"even", b"ight"]]
-
-    substr_op = string_ops.substr(test_string, position, length)
+    expected_value = {
+        "BYTE": [[b"en", b"leve", b"welv"], [b"hirt", b"ourt", b"ifte"],
+                 [b"ixte", b"even", b"ight"]],
+        "UTF8_CHAR": [[x.encode("utf-8") for x in [u"\U0001d227n",
+                                                   u"\u053c\u025bv\u025b",
+                                                   u"w\u0c1dlv"]],
+                      [x.encode("utf-8") for x in [u"e\xc3\xc3o",
+                                                   u"\U0001f604rld",
+                                                   u"\xfcd\xea"]]],
+    }[unit]
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       substr = substr_op.eval()
       self.assertAllEqual(substr, expected_value)
 
-    # Negative position
-    test_string = [[b"ten", b"eleven", b"twelve"],
-                   [b"thirteen", b"fourteen", b"fifteen"],
-                   [b"sixteen", b"seventeen", b"eighteen"]]
-    position = np.array(-2, dtype)
+    position = np.array(-3, dtype)
     length = np.array(2, dtype)
-    expected_value = [[b"en", b"en", b"ve"], [b"en", b"en", b"en"],
-                      [b"en", b"en", b"en"]]
-
-    substr_op = string_ops.substr(test_string, position, length)
+    expected_value = {
+        "BYTE": [[b"te", b"ve", b"lv"], [b"ee", b"ee", b"ee"],
+                 [b"ee", b"ee", b"ee"]],
+        "UTF8_CHAR": [[x.encode("utf-8") for x in [u"\U0001d229\U0001d227",
+                                                   u"v\u025b", u"lv"]],
+                      [x.encode("utf-8") for x in [u"\xc3\xc3", u"rl",
+                                                   u"\xfcd"]]],
+    }[unit]
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       substr = substr_op.eval()
       self.assertAllEqual(substr, expected_value)
 
-  def _testElementWisePosLen(self, dtype):
-    test_string = [[b"ten", b"eleven", b"twelve"],
-                   [b"thirteen", b"fourteen", b"fifteen"],
-                   [b"sixteen", b"seventeen", b"eighteen"]]
+  @parameterized.parameters(
+      (np.int32, "BYTE"),
+      (np.int64, "BYTE"),
+      (np.int32, "UTF8_CHAR"),
+      (np.int64, "UTF8_CHAR"),
+  )
+  def testElementWisePosLen(self, dtype, unit):
+    test_string = {
+        "BYTE": [[b"ten", b"eleven", b"twelve"],
+                 [b"thirteen", b"fourteen", b"fifteen"],
+                 [b"sixteen", b"seventeen", b"eighteen"]],
+        "UTF8_CHAR": [[x.encode("utf-8") for x in [u"\U0001d229\U0001d227n",
+                                                   u"\xc6\u053c\u025bv\u025bn",
+                                                   u"tw\u0c1dlv\u025b"]],
+                      [x.encode("utf-8") for x in [u"He\xc3\xc3o",
+                                                   u"W\U0001f604rld",
+                                                   u"d\xfcd\xea"]],
+                      [x.encode("utf-8") for x in [u"sixt\xea\xean",
+                                                   u"se\U00010299enteen",
+                                                   u"ei\U0001e920h\x86een"]]],
+    }[unit]
     position = np.array([[1, -4, 3], [1, 2, -4], [-5, 2, 3]], dtype)
     length = np.array([[2, 2, 4], [4, 3, 2], [5, 5, 5]], dtype)
-    expected_value = [[b"en", b"ev", b"lve"], [b"hirt", b"urt", b"te"],
-                      [b"xteen", b"vente", b"hteen"]]
-
-    substr_op = string_ops.substr(test_string, position, length)
+    expected_value = {
+        "BYTE": [[b"en", b"ev", b"lve"], [b"hirt", b"urt", b"te"],
+                 [b"xteen", b"vente", b"hteen"]],
+        "UTF8_CHAR": [[x.encode("utf-8") for x in [u"\U0001d227n",
+                                                   u"\u025bv",
+                                                   u"lv\u025b"]],
+                      [x.encode("utf-8") for x in [u"e\xc3\xc3o",
+                                                   u"rld",
+                                                   u"d\xfc"]],
+                      [x.encode("utf-8") for x in [u"xt\xea\xean",
+                                                   u"\U00010299ente",
+                                                   u"h\x86een"]]],
+    }[unit]
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       substr = substr_op.eval()
       self.assertAllEqual(substr, expected_value)
 
-  def _testBroadcast(self, dtype):
+  @parameterized.parameters(
+      (np.int32, "BYTE"),
+      (np.int64, "BYTE"),
+      (np.int32, "UTF8_CHAR"),
+      (np.int64, "UTF8_CHAR"),
+  )
+  def testBroadcast(self, dtype, unit):
     # Broadcast pos/len onto input string
-    test_string = [[b"ten", b"eleven", b"twelve"],
-                   [b"thirteen", b"fourteen", b"fifteen"],
-                   [b"sixteen", b"seventeen", b"eighteen"],
-                   [b"nineteen", b"twenty", b"twentyone"]]
+    test_string = {
+        "BYTE": [[b"ten", b"eleven", b"twelve"],
+                 [b"thirteen", b"fourteen", b"fifteen"],
+                 [b"sixteen", b"seventeen", b"eighteen"],
+                 [b"nineteen", b"twenty", b"twentyone"]],
+        "UTF8_CHAR": [[x.encode("utf-8") for x in [u"\U0001d229\U0001d227n",
+                                                   u"\xc6\u053c\u025bv\u025bn",
+                                                   u"tw\u0c1dlv\u025b"]],
+                      [x.encode("utf-8") for x in [u"th\xcdrt\xea\xean",
+                                                   u"f\U0001f604urt\xea\xean",
+                                                   u"f\xcd\ua09ctee\ua0e4"]],
+                      [x.encode("utf-8") for x in [u"s\xcdxt\xea\xean",
+                                                   u"se\U00010299enteen",
+                                                   u"ei\U0001e920h\x86een"]],
+                      [x.encode("utf-8") for x in [u"nineteen",
+                                                   u"twenty",
+                                                   u"twentyone"]]],
+    }[unit]
     position = np.array([1, -4, 3], dtype)
     length = np.array([1, 2, 3], dtype)
-    expected_value = [[b"e", b"ev", b"lve"], [b"h", b"te", b"tee"],
-                      [b"i", b"te", b"hte"], [b"i", b"en", b"nty"]]
-    substr_op = string_ops.substr(test_string, position, length)
+    expected_value = {
+        "BYTE": [[b"e", b"ev", b"lve"], [b"h", b"te", b"tee"],
+                 [b"i", b"te", b"hte"], [b"i", b"en", b"nty"]],
+        "UTF8_CHAR": [[x.encode("utf-8") for x in [u"\U0001d227",
+                                                   u"\u025bv", u"lv\u025b"]],
+                      [x.encode("utf-8") for x in [u"h", u"t\xea", u"tee"]],
+                      [x.encode("utf-8") for x in [u"\xcd", u"te", u"h\x86e"]],
+                      [x.encode("utf-8") for x in [u"i", u"en", u"nty"]]],
+    }[unit]
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       substr = substr_op.eval()
       self.assertAllEqual(substr, expected_value)
 
     # Broadcast input string onto pos/len
-    test_string = [b"thirteen", b"fourteen", b"fifteen"]
+    test_string = {
+        "BYTE": [b"thirteen", b"fourteen", b"fifteen"],
+        "UTF8_CHAR": [x.encode("utf-8") for x in [u"th\xcdrt\xea\xean",
+                                                  u"f\U0001f604urt\xea\xean",
+                                                  u"f\xcd\ua09ctee\ua0e4"]],
+    }[unit]
     position = np.array([[1, -2, 3], [-3, 2, 1], [5, 5, -5]], dtype)
     length = np.array([[3, 2, 1], [1, 2, 3], [2, 2, 2]], dtype)
-    expected_value = [[b"hir", b"en", b"t"], [b"e", b"ur", b"ift"],
-                      [b"ee", b"ee", b"ft"]]
-    substr_op = string_ops.substr(test_string, position, length)
+    expected_value = {
+        "BYTE": [[b"hir", b"en", b"t"], [b"e", b"ur", b"ift"],
+                 [b"ee", b"ee", b"ft"]],
+        "UTF8_CHAR": [[x.encode("utf-8") for x in [u"h\xcdr", u"\xean", u"t"]],
+                      [x.encode("utf-8") for x in [u"\xea", u"ur",
+                                                   u"\xcd\ua09ct"]],
+                      [x.encode("utf-8") for x in [u"\xea\xea", u"\xea\xea",
+                                                   u"\ua09ct"]]],
+    }[unit]
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       substr = substr_op.eval()
       self.assertAllEqual(substr, expected_value)
 
     # Test 1D broadcast
-    test_string = b"thirteen"
-    position = np.array([1, -5, 7], dtype)
+    test_string = {
+        "BYTE": b"thirteen",
+        "UTF8_CHAR": u"th\xcdrt\xea\xean".encode("utf-8"),
+    }[unit]
+    position = np.array([1, -4, 7], dtype)
     length = np.array([3, 2, 1], dtype)
-    expected_value = [b"hir", b"rt", b"n"]
-    substr_op = string_ops.substr(test_string, position, length)
+    expected_value = {
+        "BYTE": [b"hir", b"te", b"n"],
+        "UTF8_CHAR": [x.encode("utf-8") for x in [u"h\xcdr", u"t\xea", u"n"]],
+    }[unit]
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       substr = substr_op.eval()
       self.assertAllEqual(substr, expected_value)
 
-  def _testBadBroadcast(self, dtype):
+  @parameterized.parameters(
+      (np.int32, "BYTE"),
+      (np.int64, "BYTE"),
+      (np.int32, "UTF8_CHAR"),
+      (np.int64, "UTF8_CHAR"),
+  )
+  def testBadBroadcast(self, dtype, unit):
     test_string = [[b"ten", b"eleven", b"twelve"],
                    [b"thirteen", b"fourteen", b"fifteen"],
                    [b"sixteen", b"seventeen", b"eighteen"]]
     position = np.array([1, 2, -3, 4], dtype)
     length = np.array([1, 2, 3, 4], dtype)
     with self.assertRaises(ValueError):
-      substr_op = string_ops.substr(test_string, position, length)
-
-  def _testOutOfRangeError(self, dtype):
+      string_ops.substr(test_string, position, length, unit=unit)
+
+  @parameterized.parameters(
+      (np.int32, 6, "BYTE"),
+      (np.int64, 6, "BYTE"),
+      (np.int32, -6, "BYTE"),
+      (np.int64, -6, "BYTE"),
+      (np.int32, 6, "UTF8_CHAR"),
+      (np.int64, 6, "UTF8_CHAR"),
+      (np.int32, -6, "UTF8_CHAR"),
+      (np.int64, -6, "UTF8_CHAR"),
+  )
+  def testOutOfRangeError_Scalar(self, dtype, pos, unit):
     # Scalar/Scalar
-    test_string = b"Hello"
-    position = np.array(7, dtype)
-    length = np.array(3, dtype)
-    substr_op = string_ops.substr(test_string, position, length)
-    with self.cached_session():
-      with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr = substr_op.eval()
-
-    # Scalar/Scalar (with negative)
-    test_string = b"Hello"
-    position = np.array(-7, dtype)
+    test_string = {
+        "BYTE": b"Hello",
+        "UTF8_CHAR": u"H\xc3ll\U0001f604".encode("utf-8"),
+    }[unit]
+    position = np.array(pos, dtype)
     length = np.array(3, dtype)
-    substr_op = string_ops.substr(test_string, position, length)
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr = substr_op.eval()
-
+        substr_op.eval()
+
+  @parameterized.parameters(
+      (np.int32, 4, "BYTE"),
+      (np.int64, 4, "BYTE"),
+      (np.int32, -4, "BYTE"),
+      (np.int64, -4, "BYTE"),
+      (np.int32, 4, "UTF8_CHAR"),
+      (np.int64, 4, "UTF8_CHAR"),
+      (np.int32, -4, "UTF8_CHAR"),
+      (np.int64, -4, "UTF8_CHAR"),
+  )
+  def testOutOfRangeError_VectorScalar(self, dtype, pos, unit):
     # Vector/Scalar
-    test_string = [b"good", b"good", b"bad", b"good"]
-    position = np.array(4, dtype)
-    length = np.array(1, dtype)
-    substr_op = string_ops.substr(test_string, position, length)
-    with self.cached_session():
-      with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr = substr_op.eval()
-
-    # Vector/Scalar (with negative)
-    test_string = [b"good", b"good", b"bad", b"good"]
-    position = np.array(-4, dtype)
+    test_string = {
+        "BYTE": [b"good", b"good", b"bad", b"good"],
+        "UTF8_CHAR": [x.encode("utf-8") for x in [u"g\xc3\xc3d", u"b\xc3d",
+                                                  u"g\xc3\xc3d"]],
+    }[unit]
+    position = np.array(pos, dtype)
     length = np.array(1, dtype)
-    substr_op = string_ops.substr(test_string, position, length)
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr = substr_op.eval()
-
+        substr_op.eval()
+
+  @parameterized.parameters(
+      (np.int32, "BYTE"),
+      (np.int64, "BYTE"),
+      (np.int32, "UTF8_CHAR"),
+      (np.int64, "UTF8_CHAR"),
+  )
+  def testOutOfRangeError_MatrixMatrix(self, dtype, unit):
     # Matrix/Matrix
-    test_string = [[b"good", b"good", b"good"], [b"good", b"good", b"bad"],
-                   [b"good", b"good", b"good"]]
+    test_string = {
+        "BYTE": [[b"good", b"good", b"good"], [b"good", b"good", b"bad"],
+                 [b"good", b"good", b"good"]],
+        "UTF8_CHAR": [[x.encode("utf-8") for x in [u"g\xc3\xc3d", u"g\xc3\xc3d",
+                                                   u"g\xc3\xc3d"]],
+                      [x.encode("utf-8") for x in [u"g\xc3\xc3d", u"g\xc3\xc3d",
+                                                   u"b\xc3d"]],
+                      [x.encode("utf-8") for x in [u"g\xc3\xc3d", u"g\xc3\xc3d",
+                                                   u"g\xc3\xc3d"]]],
+    }[unit]
     position = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 3]], dtype)
     length = np.array([[3, 2, 1], [1, 2, 3], [2, 2, 2]], dtype)
-    substr_op = string_ops.substr(test_string, position, length)
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr = substr_op.eval()
+        substr_op.eval()
 
     # Matrix/Matrix (with negative)
-    test_string = [[b"good", b"good", b"good"], [b"good", b"good", b"bad"],
-                   [b"good", b"good", b"good"]]
     position = np.array([[1, 2, -3], [1, 2, -4], [1, 2, -3]], dtype)
     length = np.array([[3, 2, 1], [1, 2, 3], [2, 2, 2]], dtype)
-    substr_op = string_ops.substr(test_string, position, length)
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr = substr_op.eval()
-
+        substr_op.eval()
+
+  @parameterized.parameters(
+      (np.int32, "BYTE"),
+      (np.int64, "BYTE"),
+      (np.int32, "UTF8_CHAR"),
+      (np.int64, "UTF8_CHAR"),
+  )
+  def testOutOfRangeError_Broadcast(self, dtype, unit):
     # Broadcast
-    test_string = [[b"good", b"good", b"good"], [b"good", b"good", b"bad"]]
+    test_string = {
+        "BYTE": [[b"good", b"good", b"good"], [b"good", b"good", b"bad"]],
+        "UTF8_CHAR": [[x.encode("utf-8") for x in [u"g\xc3\xc3d", u"g\xc3\xc3d",
+                                                   u"g\xc3\xc3d"]],
+                      [x.encode("utf-8") for x in [u"g\xc3\xc3d", u"g\xc3\xc3d",
+                                                   u"b\xc3d"]]],
+    }[unit]
     position = np.array([1, 2, 4], dtype)
     length = np.array([1, 2, 3], dtype)
-    substr_op = string_ops.substr(test_string, position, length)
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr = substr_op.eval()
+        substr_op.eval()
 
     # Broadcast (with negative)
-    test_string = [[b"good", b"good", b"good"], [b"good", b"good", b"bad"]]
     position = np.array([-1, -2, -4], dtype)
     length = np.array([1, 2, 3], dtype)
-    substr_op = string_ops.substr(test_string, position, length)
+    substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr = substr_op.eval()
-
-  def _testMismatchPosLenShapes(self, dtype):
-    test_string = [[b"ten", b"eleven", b"twelve"],
-                   [b"thirteen", b"fourteen", b"fifteen"],
-                   [b"sixteen", b"seventeen", b"eighteen"]]
+        substr_op.eval()
+
+  @parameterized.parameters(
+      (np.int32, "BYTE"),
+      (np.int64, "BYTE"),
+      (np.int32, "UTF8_CHAR"),
+      (np.int64, "UTF8_CHAR"),
+  )
+  def testMismatchPosLenShapes(self, dtype, unit):
+    test_string = {
+        "BYTE": [[b"ten", b"eleven", b"twelve"],
+                 [b"thirteen", b"fourteen", b"fifteen"],
+                 [b"sixteen", b"seventeen", b"eighteen"]],
+        "UTF8_CHAR": [[x.encode("utf-8") for x in [u"\U0001d229\U0001d227n",
+                                                   u"\xc6\u053c\u025bv\u025bn",
+                                                   u"tw\u0c1dlv\u025b"]],
+                      [x.encode("utf-8") for x in [u"th\xcdrt\xea\xean",
+                                                   u"f\U0001f604urt\xea\xean",
+                                                   u"f\xcd\ua09ctee\ua0e4"]],
+                      [x.encode("utf-8") for x in [u"s\xcdxt\xea\xean",
+                                                   u"se\U00010299enteen",
+                                                   u"ei\U0001e920h\x86een"]]],
+    }[unit]
     position = np.array([[1, 2, 3]], dtype)
     length = np.array([2, 3, 4], dtype)
     # Should fail: position/length have different rank
     with self.assertRaises(ValueError):
-      substr_op = string_ops.substr(test_string, position, length)
+      string_ops.substr(test_string, position, length)
 
     position = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]], dtype)
     length = np.array([[2, 3, 4]], dtype)
     # Should fail: position/length have different dimensionality
     with self.assertRaises(ValueError):
-      substr_op = string_ops.substr(test_string, position, length)
-
-    # Negative position.
-    test_string = [[b"ten", b"eleven", b"twelve"],
-                   [b"thirteen", b"fourteen", b"fifteen"],
-                   [b"sixteen", b"seventeen", b"eighteen"]]
-    position = np.array([[-1, -2, -3]], dtype)
-    length = np.array([1, 2, 3], dtype)
-    # Should fail: position/length have different rank
-    with self.assertRaises(ValueError):
-      substr_op = string_ops.substr(test_string, position, length)
-
-  @parameterized.parameters(np.int32, np.int64)
-  def testAll(self, dtype):
-    self._testScalarString(dtype)
-    self._testVectorStrings(dtype)
-    self._testMatrixStrings(dtype)
-    self._testElementWisePosLen(dtype)
-    self._testBroadcast(dtype)
-    self._testBadBroadcast(dtype)
-    self._testOutOfRangeError(dtype)
-    self._testMismatchPosLenShapes(dtype)
+      string_ops.substr(test_string, position, length)
 
   def testWrongDtype(self):
     with self.cached_session():
@@ -300,6 +478,11 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       with self.assertRaises(TypeError):
         string_ops.substr(b"test", 3, 1.0)
 
+  def testInvalidUnit(self):
+    with self.cached_session():
+      with self.assertRaises(ValueError):
+        string_ops.substr(b"test", 3, 1, unit="UTF8")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 0812f901a2..f26388efea 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -347,6 +347,22 @@ def string_length(input, name=None, unit="BYTE"):
 string_length.__doc__ = gen_string_ops.string_length.__doc__
 
 
+@tf_export("substr")
+@deprecation.deprecated(None, "Use `tf.strings.substr` instead of `tf.substr`.")
+def substr_deprecated(input, pos, len, name=None, unit="BYTE"):
+  return substr(input, pos, len, name=name, unit=unit)
+
+substr_deprecated.__doc__ = gen_string_ops.substr.__doc__
+
+
+@tf_export("strings.substr")
+def substr(input, pos, len, name=None, unit="BYTE"):
+  return gen_string_ops.substr(input, pos, len, unit=unit, name=name)
+
+
+substr.__doc__ = gen_string_ops.substr.__doc__
+
+
 ops.NotDifferentiable("RegexReplace")
 ops.NotDifferentiable("StringToHashBucket")
 ops.NotDifferentiable("StringToHashBucketFast")
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index c1cc7322f0..247dfcc1ca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -2094,7 +2094,7 @@ tf_module {
   }
   member_method {
     name: "substr"
-    argspec: "args=[\'input\', \'pos\', \'len\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'pos\', \'len\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
   }
   member_method {
     name: "subtract"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index ebdaf57231..5ba48e7f57 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -34,7 +34,7 @@ tf_module {
   }
   member_method {
     name: "substr"
-    argspec: "args=[\'input\', \'pos\', \'len\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'pos\', \'len\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
   }
   member_method {
     name: "to_hash_bucket"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 571abc3b19..978afcf985 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -1934,7 +1934,7 @@ tf_module {
   }
   member_method {
     name: "substr"
-    argspec: "args=[\'input\', \'pos\', \'len\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'pos\', \'len\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
   }
   member_method {
     name: "subtract"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index ebdaf57231..5ba48e7f57 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -34,7 +34,7 @@ tf_module {
   }
   member_method {
     name: "substr"
-    argspec: "args=[\'input\', \'pos\', \'len\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'pos\', \'len\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
   }
   member_method {
     name: "to_hash_bucket"
-- 
GitLab


From 31619b408551907030dc25d8270f8997a0d9e6aa Mon Sep 17 00:00:00 2001
From: Yanan Cao <ycao@google.com>
Date: Thu, 4 Oct 2018 11:34:55 -0700
Subject: [PATCH 422/570] Add xla library into contrib_py

PiperOrigin-RevId: 215774158
---
 tensorflow/contrib/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index fbe0573d5d..fa06d351d4 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -29,6 +29,7 @@ py_library(
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/coder:coder_py",
         "//tensorflow/contrib/compiler:compiler_py",
+        "//tensorflow/contrib/compiler:xla",
         "//tensorflow/contrib/autograph",
         "//tensorflow/contrib/constrained_optimization",
         "//tensorflow/contrib/copy_graph:copy_graph_py",
-- 
GitLab


From 2390b48b11efda60a0f68a683c94af9612a5306f Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Thu, 4 Oct 2018 11:54:24 -0700
Subject: [PATCH 423/570] Add a separator between shape and dtype in cache key
 encoding.

It was possible that we could mix shapes and types (T111 could mean a tensor of dtype 1 and shape (1, 1) or a tensor of dtype 11 and shape (1)).

PiperOrigin-RevId: 215777629
---
 tensorflow/python/eager/function_test.py  | 44 +++++++++++++++++++++--
 tensorflow/python/eager/pywrap_tfe_src.cc | 34 +++++++++---------
 2 files changed, 58 insertions(+), 20 deletions(-)

diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 9ce367a837..a2cfb4b476 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -1255,6 +1255,44 @@ class FunctionTest(test.TestCase):
     defined(Foo())
     self.assertEqual(len(defined._function_cache), 2)
 
+  def testCacheTensorShapeDtypeCollision(self):
+
+    def func(t):
+      return t + t
+
+    defined = function.defun(func)
+    t = constant_op.constant([[1.0]], dtype=dtypes.complex64)
+    defined(t)
+    self.assertEqual(len(defined._function_cache), 1)
+
+    t = constant_op.constant([1.0], dtype=dtypes.complex128)
+    defined(t)
+    self.assertEqual(len(defined._function_cache), 2)
+
+  def testCacheTensorUnknownShapesCollision(self):
+
+    def func(t):
+      return t + t
+
+    with context.graph_mode(), self.cached_session():
+      defined = function.defun(func)
+
+      p = array_ops.placeholder(dtype=dtypes.float32, shape=None)
+      defined(p)
+      self.assertEqual(len(defined._function_cache), 1)
+
+      p = array_ops.placeholder(dtype=dtypes.float32, shape=[None])
+      defined(p)
+      self.assertEqual(len(defined._function_cache), 2)
+
+      p = array_ops.placeholder(dtype=dtypes.float32, shape=[None, None])
+      defined(p)
+      self.assertEqual(len(defined._function_cache), 3)
+
+      t = constant_op.constant(1.0, dtype=dtypes.float32)
+      defined(t)
+      self.assertEqual(len(defined._function_cache), 4)
+
   def testPythonFunctionWithDefaultArgs(self):
 
     def func(foo, bar=1, baz=2):
@@ -1271,17 +1309,17 @@ class FunctionTest(test.TestCase):
       return tuple(key[0] for key in defined._function_cache)
 
     # `True` corresponds to the fact that we're executing eagerly
-    self.assertIn(('tRRR', (0, 1, 20)), cache_keys())
+    self.assertIn(('URRR', (0, 1, 20)), cache_keys())
 
     defined(1)  # bar=1, baz=2
-    self.assertIn(('tRRR', (1, 1, 2)), cache_keys())
+    self.assertIn(('URRR', (1, 1, 2)), cache_keys())
 
     # This matches the previous call.
     defined(foo=1)
     self.assertEqual(len(defined._function_cache), 2)
 
     defined(1, 2, 3)
-    self.assertIn(('tRRR', (1, 2, 3)), cache_keys())
+    self.assertIn(('URRR', (1, 2, 3)), cache_keys())
 
     # This matches the previous call.
     defined(1, bar=2, baz=3)
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index ae1e12f9c3..6193f40ce8 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -2747,11 +2747,15 @@ PyObject* TFE_Py_RecordGradient(PyObject* op_name, PyObject* inputs,
 }
 
 namespace {
-
-tensorflow::int64 GetPyNoneHash() {
-  tensorflow::int64 py_none_hash = PyObject_Hash(Py_None);
-  return py_none_hash;
-}
+const char kTensor[] = "T";
+const char kIndexedSlices[] = "I";
+const char kList[] = "L";
+const char kTuple[] = "U";
+const char kDict[] = "D";
+const char kRaw[] = "R";
+const char kShape[] = "s";
+const char kDType[] = "d";
+const char kNone[] = "n";
 
 struct EncodeResult {
   string str;
@@ -2784,8 +2788,10 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
     TFE_TensorHandle* t = EagerTensor_Handle(arg);
     tensorflow::TensorShape tensor_shape;
     TF_RETURN_IF_ERROR(t->handle->Shape(&tensor_shape));
-    absl::StrAppend(&result->str, t->handle->dtype);
 
+    absl::StrAppend(&result->str, kDType, t->handle->dtype);
+
+    absl::StrAppend(&result->str, kShape);
     for (tensorflow::int64 dim_size : tensor_shape.dim_sizes()) {
       absl::StrAppend(&result->str, dim_size);
     }
@@ -2812,7 +2818,7 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
   tensorflow::DataType dtype =
       static_cast<tensorflow::DataType>(MakeInt(dtype_enum.get()));
 
-  absl::StrAppend(&result->str, dtype);
+  absl::StrAppend(&result->str, kDType, dtype);
   static char _shape_tuple[] = "_shape_tuple";
   tensorflow::Safe_PyObjectPtr shape_tuple(
       PyObject_CallMethod(arg, _shape_tuple, nullptr));
@@ -2824,10 +2830,11 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
 
   if (shape_tuple.get() == Py_None) {
     // Unknown shape, encode that directly.
-    absl::StrAppend(&result->str, GetPyNoneHash());
+    absl::StrAppend(&result->str, kNone);
     return tensorflow::Status::OK();
   }
 
+  absl::StrAppend(&result->str, kShape);
   tensorflow::Safe_PyObjectPtr shape_seq(PySequence_Fast(
       shape_tuple.get(), "shape_tuple didn't return a sequence"));
 
@@ -2835,7 +2842,7 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
   for (int i = 0; i < len; ++i) {
     PyObject* item = PySequence_Fast_GET_ITEM(shape_seq.get(), i);
     if (item == Py_None) {
-      absl::StrAppend(&result->str, GetPyNoneHash());
+      absl::StrAppend(&result->str, kNone);
     } else {
       absl::StrAppend(&result->str, MakeInt(item));
     }
@@ -2844,13 +2851,6 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
   return tensorflow::Status::OK();
 }
 
-const char kTensor[] = "T";
-const char kIndexedSlices[] = "I";
-const char kList[] = "L";
-const char kTuple[] = "t";
-const char kDict[] = "D";
-const char kRaw[] = "R";
-
 tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result);
 
 // This function doesn't set the type of sequence before
@@ -2864,7 +2864,7 @@ tensorflow::Status TFE_Py_EncodeSequence(PyObject* arg, const char* type,
   for (int i = 0; i < len; ++i) {
     PyObject* item = PySequence_Fast_GET_ITEM(arg_seq.get(), i);
     if (item == Py_None) {
-      absl::StrAppend(&result->str, GetPyNoneHash());
+      absl::StrAppend(&result->str, kNone);
     } else {
       TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(item, result));
     }
-- 
GitLab


From b82c4dad705bffac6d14a189605c9ece89f8c17b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 11:55:48 -0700
Subject: [PATCH 424/570] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 215777837

---
 tensorflow/go/op/wrappers.go | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index b4d4db3e4d..a7bbb80c82 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -29094,6 +29094,17 @@ func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source
 	return op.Output(0)
 }
 
+// SubstrAttr is an optional argument to Substr.
+type SubstrAttr func(optionalAttr)
+
+// SubstrUnit sets the optional unit attribute to value.
+// If not specified, defaults to "BYTE"
+func SubstrUnit(value string) SubstrAttr {
+	return func(m optionalAttr) {
+		m["unit"] = value
+	}
+}
+
 // Return substrings from `Tensor` of strings.
 //
 // For each string in the input `Tensor`, creates a substring starting at index
@@ -29178,15 +29189,20 @@ func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source
 //	len: Scalar defining the number of characters to include in each substring
 //
 // Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "Substr",
 		Input: []tf.Input{
 			input, pos, len,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
-- 
GitLab


From 2667ed3bf01e7153f466b27c450fc2b662c00bdd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 11:59:37 -0700
Subject: [PATCH 425/570] Makes sure Keras Layer's `__call__` is always used in
 Eager.

Currently if a Layer is invoked with the Functional API in Eager, `__call__` is only used
during setup, and thereafter `call` is used internally. This limits the ability
to add pre/post processing steps to `call` in Eager in the future.
Additionally, the Subclassed Model API already always uses `__call__` in Eager.

PiperOrigin-RevId: 215778408
---
 tensorflow/python/keras/engine/network.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 8d34006967..918488bd7a 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -1028,7 +1028,10 @@ class Network(base_layer.Layer):
                 output_tensors, output_masks = layer._call_and_compute_mask(
                     computed_tensor, **kwargs)
               else:
-                output_tensors = layer.call(computed_tensor, **kwargs)
+                if context.executing_eagerly():
+                  output_tensors = layer(computed_tensor, **kwargs)
+                else:
+                  output_tensors = layer.call(computed_tensor, **kwargs)
                 if hasattr(layer, 'compute_mask'):
                   output_masks = layer.compute_mask(computed_tensor,
                                                     computed_mask)
@@ -1049,7 +1052,10 @@ class Network(base_layer.Layer):
                 output_tensors, output_masks = layer._call_and_compute_mask(
                     computed_tensors, **kwargs)
               else:
-                output_tensors = layer.call(computed_tensors, **kwargs)
+                if context.executing_eagerly():
+                  output_tensors = layer(computed_tensors, **kwargs)
+                else:
+                  output_tensors = layer.call(computed_tensors, **kwargs)
                 if hasattr(layer, 'compute_mask'):
                   output_masks = layer.compute_mask(computed_tensors,
                                                     computed_masks)
-- 
GitLab


From 5bdd0f7c2807ed413cfc60319f1e75b1e6a4a5b5 Mon Sep 17 00:00:00 2001
From: Paul Donnelly <pauldonnelly@google.com>
Date: Thu, 4 Oct 2018 12:12:39 -0700
Subject: [PATCH 426/570] Remove obsolete TODO.

PiperOrigin-RevId: 215780734
---
 tensorflow/core/kernels/dequantize_op.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/core/kernels/dequantize_op.cc b/tensorflow/core/kernels/dequantize_op.cc
index 42fbf95cd3..28940e0849 100644
--- a/tensorflow/core/kernels/dequantize_op.cc
+++ b/tensorflow/core/kernels/dequantize_op.cc
@@ -96,8 +96,6 @@ class DequantizeOp : public OpKernel {
             output);
       }
     } else if (mode_ == QUANTIZE_MODE_SCALED) {
-      // TODO(pauldonnelly): Update QuantizeAndDequantizeV2 and
-      // QuantizeAndDequantizeV3 to match this SCALED mode again.
       const float scale_factor =
           std::numeric_limits<T>::min() == 0
               ? (max_range / std::numeric_limits<T>::max())
-- 
GitLab


From 900d115135656229e3667025f925eb92687dce18 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 12:29:50 -0700
Subject: [PATCH 427/570] [XLA] Move FusionQueue class declaration into
 separate header

PiperOrigin-RevId: 215783391
---
 tensorflow/compiler/xla/service/BUILD         |  9 ++++
 .../compiler/xla/service/fusion_queue.h       | 53 +++++++++++++++++++
 .../xla/service/instruction_fusion.cc         |  1 +
 .../compiler/xla/service/instruction_fusion.h | 28 +---------
 4 files changed, 64 insertions(+), 27 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/fusion_queue.h

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index f329a27e14..2f8bab0614 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1323,11 +1323,20 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "fusion_queue",
+    hdrs = ["fusion_queue.h"],
+    deps = [
+        ":hlo",
+    ],
+)
+
 cc_library(
     name = "instruction_fusion",
     srcs = ["instruction_fusion.cc"],
     hdrs = ["instruction_fusion.h"],
     deps = [
+        ":fusion_queue",
         ":hlo",
         ":hlo_pass",
         "//tensorflow/compiler/xla:util",
diff --git a/tensorflow/compiler/xla/service/fusion_queue.h b/tensorflow/compiler/xla/service/fusion_queue.h
new file mode 100644
index 0000000000..1208a7dda8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/fusion_queue.h
@@ -0,0 +1,53 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_FUSION_QUEUE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_FUSION_QUEUE_H_
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+
+namespace xla {
+
+// A queue interface that allows implementations to choose fusion candidates in
+// custom order.
+class FusionQueue {
+ public:
+  FusionQueue() = default;
+  virtual ~FusionQueue() = default;
+
+  // Dequeues the next fusion candidates: a consumer and the list of producers
+  // as operand indices.
+  virtual std::pair<HloInstruction*, std::vector<int64>>
+  DequeueNextInstructionAndOperandsToFuseInOrder() = 0;
+
+  // A callback passed to the queue implementation right before the producer is
+  // fused into the consumer.
+  virtual void PreFusion(HloInstruction* producer, HloInstruction* consumer) {}
+
+  // A callback passed to the queue implementation right after the fusion is
+  // created. Note that original_producer could have been destroyed.
+  virtual void OnFusingInstruction(HloInstruction* fusion,
+                                   HloInstruction* original_producer,
+                                   HloInstruction* original_consumer) {}
+
+  // A callback passed to the queue implementation to notify the removal of an
+  // instruction.
+  virtual void RemoveInstruction(HloInstruction* instruction) = 0;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_FUSION_QUEUE_H_
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 5a99c40df4..69a4c160ee 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/service/fusion_queue.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h
index da2032f6c7..f14c667520 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/instruction_fusion.h
@@ -17,6 +17,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INSTRUCTION_FUSION_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_INSTRUCTION_FUSION_H_
 
+#include "tensorflow/compiler/xla/service/fusion_queue.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -25,33 +26,6 @@ limitations under the License.
 
 namespace xla {
 
-// A queue interface that allows implementations to choose fusion candidates in
-// custom order.
-class FusionQueue {
- public:
-  FusionQueue() = default;
-  virtual ~FusionQueue() = default;
-
-  // Dequeues the next fusion candidates: a consumer and the list of producers
-  // as operand indices.
-  virtual std::pair<HloInstruction*, std::vector<int64>>
-  DequeueNextInstructionAndOperandsToFuseInOrder() = 0;
-
-  // A callback passed to the queue implementation right before the producer is
-  // fused into the consumer.
-  virtual void PreFusion(HloInstruction* producer, HloInstruction* consumer) {}
-
-  // A callback passed to the queue implementation right after the fusion is
-  // created. Note that original_producer could have been destroyed.
-  virtual void OnFusingInstruction(HloInstruction* fusion,
-                                   HloInstruction* original_producer,
-                                   HloInstruction* original_consumer) {}
-
-  // A callback passed to the queue implementation to notify the removal of an
-  // instruction.
-  virtual void RemoveInstruction(HloInstruction* instruction) = 0;
-};
-
 // HLO pass which performs instruction fusion. Instructions are fused
 // "vertically", meaning producing instructions are fused into their consumers
 // with the intent that the loops which compute their values will be fused in
-- 
GitLab


From 2c75da86ffdb9d04b2b94ce89891f17a8656da22 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 4 Oct 2018 12:41:23 -0700
Subject: [PATCH 428/570] [tf.data] Clean up tests for `tf.data.experimental`.

This change splits up large test files into smaller ones, and re-enables tests that were disabled for obsolete reasons.

PiperOrigin-RevId: 215785396
---
 .../python/data/experimental/benchmarks/BUILD |  25 +
 .../map_benchmark.py}                         | 114 ---
 .../data/experimental/kernel_tests/BUILD      | 545 ++++++------
 .../kernel_tests/batch_dataset_op_test.py     | 686 ---------------
 .../bucket_by_sequence_length_test.py         | 322 +++++++
 .../kernel_tests/bucketing_test.py            | 824 ------------------
 ...ing_ops_test.py => copy_to_device_test.py} | 417 +--------
 .../experimental/kernel_tests/counter_test.py |  51 ++
 ...dataset_op_test.py => csv_dataset_test.py} |   4 +-
 .../dataset_serialization_test_base.py        | 692 ---------------
 .../dense_to_sparse_batch_test.py             | 124 +++
 ...t_op_test.py => enumerate_dataset_test.py} |  26 +-
 .../function_buffering_resource_test.py       | 247 ++++++
 .../kernel_tests/group_by_reducer_test.py     | 199 +++++
 .../kernel_tests/group_by_window_test.py      | 367 ++++++++
 .../kernel_tests/ignore_errors_test.py        | 115 +++
 .../make_batched_features_dataset_test.py     | 239 +++++
 ...t_ops_test.py => make_csv_dataset_test.py} | 425 +--------
 .../make_tf_record_dataset_test.py            | 243 ++++++
 .../kernel_tests/map_and_batch_test.py        | 337 +++++++
 ...ps_test.py => override_threadpool_test.py} |   6 +-
 ...op_test.py => parallel_interleave_test.py} |   4 +-
 ..._test.py => parse_example_dataset_test.py} |   4 +-
 .../kernel_tests/prefetch_to_device_test.py   | 234 +++++
 .../reader_dataset_ops_test_base.py           |   4 +-
 ...ple_test.py => rejection_resample_test.py} |   4 +-
 ...p_test.py => restructured_dataset_test.py} |   4 +-
 .../{scan_dataset_op_test.py => scan_test.py} |   4 +-
 .../kernel_tests/serialization/BUILD          |  22 +-
 .../checkpoint_input_pipeline_hook_test.py}   |   0
 ...arse_example_dataset_serialization_test.py |   2 +-
 .../sql_dataset_serialization_test.py         |   4 +-
 .../serialization_integration_test.py         |  85 --
 ..._op_test.py => shuffle_and_repeat_test.py} |   2 +-
 ...dataset_op_test.py => sql_dataset_test.py} |   6 +-
 ..._test_base.py => sql_dataset_test_base.py} |   3 +-
 .../kernel_tests/stats_dataset_ops_test.py    |   2 +-
 ...r_ops_test.py => tf_record_writer_test.py} |   2 +-
 .../experimental/kernel_tests/unbatch_test.py | 300 +++++++
 ...ique_dataset_op_test.py => unique_test.py} |   4 +-
 .../data/kernel_tests/map_dataset_op_test.py  |  31 +-
 41 files changed, 3172 insertions(+), 3557 deletions(-)
 create mode 100644 tensorflow/python/data/experimental/benchmarks/BUILD
 rename tensorflow/python/data/experimental/{kernel_tests/map_dataset_op_test.py => benchmarks/map_benchmark.py} (71%)
 delete mode 100644 tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
 delete mode 100644 tensorflow/python/data/experimental/kernel_tests/bucketing_test.py
 rename tensorflow/python/data/experimental/kernel_tests/{prefetching_ops_test.py => copy_to_device_test.py} (56%)
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/counter_test.py
 rename tensorflow/python/data/experimental/kernel_tests/{csv_dataset_op_test.py => csv_dataset_test.py} (99%)
 delete mode 100644 tensorflow/python/data/experimental/kernel_tests/dataset_serialization_test_base.py
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
 rename tensorflow/python/data/experimental/kernel_tests/{range_dataset_op_test.py => enumerate_dataset_test.py} (68%)
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
 rename tensorflow/python/data/experimental/kernel_tests/{reader_dataset_ops_test.py => make_csv_dataset_test.py} (57%)
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
 rename tensorflow/python/data/experimental/kernel_tests/{threadpool_dataset_ops_test.py => override_threadpool_test.py} (94%)
 rename tensorflow/python/data/experimental/kernel_tests/{interleave_dataset_op_test.py => parallel_interleave_test.py} (99%)
 rename tensorflow/python/data/experimental/kernel_tests/{parsing_ops_test.py => parse_example_dataset_test.py} (99%)
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
 rename tensorflow/python/data/experimental/kernel_tests/{resample_test.py => rejection_resample_test.py} (97%)
 rename tensorflow/python/data/experimental/kernel_tests/{dataset_constructor_op_test.py => restructured_dataset_test.py} (95%)
 rename tensorflow/python/data/experimental/kernel_tests/{scan_dataset_op_test.py => scan_test.py} (98%)
 rename tensorflow/python/data/experimental/kernel_tests/{iterator_ops_test.py => serialization/checkpoint_input_pipeline_hook_test.py} (100%)
 delete mode 100644 tensorflow/python/data/experimental/kernel_tests/serialization_integration_test.py
 rename tensorflow/python/data/experimental/kernel_tests/{shuffle_dataset_op_test.py => shuffle_and_repeat_test.py} (98%)
 rename tensorflow/python/data/experimental/kernel_tests/{sql_dataset_op_test.py => sql_dataset_test.py} (99%)
 rename tensorflow/python/data/experimental/kernel_tests/{sql_dataset_op_test_base.py => sql_dataset_test_base.py} (98%)
 rename tensorflow/python/data/experimental/kernel_tests/{writer_ops_test.py => tf_record_writer_test.py} (98%)
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
 rename tensorflow/python/data/experimental/kernel_tests/{unique_dataset_op_test.py => unique_test.py} (96%)

diff --git a/tensorflow/python/data/experimental/benchmarks/BUILD b/tensorflow/python/data/experimental/benchmarks/BUILD
new file mode 100644
index 0000000000..b9398aebe7
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/BUILD
@@ -0,0 +1,25 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_test(
+    name = "map_benchmark",
+    size = "medium",
+    srcs = ["map_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/experimental/benchmarks/map_benchmark.py
similarity index 71%
rename from tensorflow/python/data/experimental/kernel_tests/map_dataset_op_test.py
rename to tensorflow/python/data/experimental/benchmarks/map_benchmark.py
index 2f0bd1456b..ad253cffa5 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/benchmarks/map_benchmark.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 
 import hashlib
 import itertools
-import os
 import time
 
 import numpy as np
@@ -27,128 +26,15 @@ import numpy as np
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.experimental.ops import error_ops
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
-from tensorflow.python.util import compat
 
 _NUMPY_RANDOM_SEED = 42
 
 
-class MapDatasetTest(test_base.DatasetTestBase):
-
-  def testMapIgnoreError(self):
-    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: array_ops.check_numerics(x, "message")).apply(
-            error_ops.ignore_errors()))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for x in [1., 2., 3., 5.]:
-        self.assertEqual(x, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testParallelMapIgnoreError(self):
-    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(
-            lambda x: array_ops.check_numerics(x, "message"),
-            num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for x in [1., 2., 3., 5.]:
-        self.assertEqual(x, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testReadFileIgnoreError(self):
-
-    def write_string_to_file(value, filename):
-      with open(filename, "w") as f:
-        f.write(value)
-
-    filenames = [
-        os.path.join(self.get_temp_dir(), "file_%d.txt" % i) for i in range(5)
-    ]
-    for filename in filenames:
-      write_string_to_file(filename, filename)
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(filenames).map(
-            io_ops.read_file,
-            num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # All of the files are present.
-      sess.run(init_op)
-      for filename in filenames:
-        self.assertEqual(compat.as_bytes(filename), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Delete one of the files.
-      os.remove(filenames[0])
-
-      # Attempting to read filenames[0] will fail, but ignore_errors()
-      # will catch the error.
-      sess.run(init_op)
-      for filename in filenames[1:]:
-        self.assertEqual(compat.as_bytes(filename), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testCaptureResourceInMapFn(self):
-
-    def _build_ds(iterator):
-
-      def _map_fn(x):
-        get_next = iterator.get_next()
-        return x * get_next
-
-      return dataset_ops.Dataset.range(10).map(_map_fn)
-
-    def _build_graph():
-      captured_iterator = dataset_ops.Dataset.range(
-          10).make_initializable_iterator()
-      ds = _build_ds(captured_iterator)
-      iterator = ds.make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      return captured_iterator.initializer, init_op, get_next
-
-    with ops.Graph().as_default() as g:
-      captured_init_op, init_op, get_next = _build_graph()
-      with self.session(graph=g) as sess:
-        sess.run(captured_init_op)
-        sess.run(init_op)
-        for i in range(10):
-          self.assertEquals(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-
 class MapDatasetBenchmark(test.Benchmark):
 
   # The purpose of this benchmark is to compare the performance of chaining vs
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index f56127f3ef..4eef9580ad 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -8,75 +8,62 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_test(
-    name = "batch_dataset_op_test",
+    name = "bucket_by_sequence_length_test",
     size = "medium",
-    srcs = ["batch_dataset_op_test.py"],
+    srcs = ["bucket_by_sequence_length_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",  # (b/79552534)
-        "no_pip",
-        "no_windows",
-    ],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python:session",
         "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:grouping",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
+cuda_py_test(
+    name = "copy_to_device_test",
+    size = "small",
+    srcs = ["copy_to_device_test.py"],
+    additional_deps = [
+        "//tensorflow/python/data/experimental/ops:prefetching_ops",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/compat:compat",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+    tags = ["no_windows_gpu"],
+)
+
 py_test(
-    name = "bucketing_test",
-    size = "medium",
-    srcs = ["bucketing_test.py"],
+    name = "counter_test",
+    size = "small",
+    srcs = ["counter_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
     deps = [
-        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/experimental/ops:grouping",
+        "//tensorflow/python/data/experimental/ops:counter",
         "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
     ],
 )
 
 py_test(
-    name = "csv_dataset_op_test",
+    name = "csv_dataset_test",
     size = "medium",
-    srcs = ["csv_dataset_op_test.py"],
+    srcs = ["csv_dataset_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
+    tags = ["no_pip"],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -97,25 +84,18 @@ py_test(
 )
 
 py_test(
-    name = "dataset_constructor_op_test",
-    size = "medium",
-    srcs = ["dataset_constructor_op_test.py"],
+    name = "dense_to_sparse_batch_test",
+    srcs = ["dense_to_sparse_batch_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "no_oss",
-        "no_pip",
-        "no_windows",
-        "nomac",  # b/62040583
-    ],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
         "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -124,11 +104,6 @@ py_test(
     size = "medium",
     srcs = ["directed_interleave_dataset_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
@@ -140,15 +115,68 @@ py_test(
     ],
 )
 
+py_test(
+    name = "enumerate_dataset_test",
+    size = "small",
+    srcs = ["enumerate_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/experimental/ops:enumerate_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_test(
+    name = "filter_dataset_op_test",
+    size = "medium",
+    srcs = ["filter_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "function_buffering_resource_test",
+    size = "small",
+    srcs = ["function_buffering_resource_test.py"],
+    additional_deps = [
+        "//tensorflow/python/data/experimental/ops:prefetching_ops",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:function",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+    tags = ["no_windows_gpu"],
+)
+
 py_test(
     name = "get_single_element_test",
     size = "small",
     srcs = ["get_single_element_test.py"],
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -165,19 +193,20 @@ py_test(
 )
 
 py_test(
-    name = "indexed_dataset_ops_test",
-    srcs = ["indexed_dataset_ops_test.py"],
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
+    name = "group_by_reducer_test",
+    size = "medium",
+    srcs = ["group_by_reducer_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python/data/experimental/ops:indexed_dataset_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/experimental/ops:grouping",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
@@ -185,107 +214,134 @@ py_test(
 )
 
 py_test(
-    name = "interleave_dataset_op_test",
+    name = "group_by_window_test",
     size = "medium",
-    srcs = ["interleave_dataset_op_test.py"],
+    srcs = ["group_by_window_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-        "notap",
-    ],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/experimental/ops:interleave_ops",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/experimental/ops:grouping",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "@six_archive//:six",
+        "//third_party/py/numpy",
     ],
 )
 
 py_test(
-    name = "iterator_ops_test",
-    size = "small",
-    srcs = ["iterator_ops_test.py"],
+    name = "ignore_errors_test",
+    srcs = ["ignore_errors_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:error_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
     ],
+)
+
+py_test(
+    name = "indexed_dataset_ops_test",
+    srcs = ["indexed_dataset_ops_test.py"],
     deps = [
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/experimental/ops:iterator_ops",
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python/data/experimental/ops:indexed_dataset_ops",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/estimator:estimator_py",
+        "//third_party/py/numpy",
     ],
 )
 
 py_test(
-    name = "map_dataset_op_test",
+    name = "make_batched_features_dataset_test",
     size = "medium",
-    srcs = ["map_dataset_op_test.py"],
+    srcs = ["make_batched_features_dataset_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-        "noasan",  # times out
-        "optonly",
+    tags = ["no_pip"],
+    deps = [
+        ":reader_dataset_ops_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python/data/experimental/ops:readers",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:nest",
+        "//third_party/py/numpy",
     ],
+)
+
+py_test(
+    name = "make_csv_dataset_test",
+    size = "medium",
+    srcs = ["make_csv_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
-        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/experimental/ops:batching",
-        "//tensorflow/python/data/experimental/ops:error_ops",
-        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:readers",
         "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
         "//third_party/py/numpy",
     ],
 )
 
 py_test(
-    name = "filter_dataset_op_test",
+    name = "make_tf_record_dataset_test",
     size = "medium",
-    srcs = ["filter_dataset_op_test.py"],
+    srcs = ["make_tf_record_dataset_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
+    tags = ["no_pip"],
+    deps = [
+        ":reader_dataset_ops_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python/data/experimental/ops:readers",
+        "//tensorflow/python/data/util:nest",
     ],
+)
+
+py_test(
+    name = "map_and_batch_test",
+    size = "medium",
+    srcs = ["map_and_batch_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -294,11 +350,7 @@ py_test(
     size = "small",
     srcs = ["map_defun_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
+    tags = ["no_pip"],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
@@ -317,15 +369,56 @@ py_test(
 )
 
 py_test(
-    name = "parsing_ops_test",
+    name = "override_threadpool_test",
     size = "small",
-    srcs = ["parsing_ops_test.py"],
+    srcs = ["override_threadpool_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python/data/experimental/ops:threadpool",
+        "//tensorflow/python/data/experimental/ops:unique",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "parallel_interleave_test",
+    size = "medium",
+    srcs = ["parallel_interleave_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "no_oss",
         "no_pip",
-        "no_windows",
+        "notap",
     ],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/experimental/ops:interleave_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "parse_example_dataset_test",
+    size = "small",
+    srcs = ["parse_example_dataset_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
@@ -344,53 +437,20 @@ py_test(
 )
 
 cuda_py_test(
-    name = "prefetching_ops_test",
+    name = "prefetch_to_device_test",
     size = "small",
-    srcs = ["prefetching_ops_test.py"],
+    srcs = ["prefetch_to_device_test.py"],
     additional_deps = [
         "//tensorflow/python/data/experimental/ops:prefetching_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python/compat:compat",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-        "no_windows_gpu",
-    ],
-)
-
-py_test(
-    name = "range_dataset_op_test",
-    size = "small",
-    srcs = ["range_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
-    deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/experimental/ops:counter",
-        "//tensorflow/python/data/experimental/ops:enumerate_ops",
-        "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
+    tags = ["no_windows_gpu"],
 )
 
 py_library(
@@ -421,41 +481,12 @@ py_library(
 )
 
 py_test(
-    name = "reader_dataset_ops_test",
-    size = "medium",
-    srcs = ["reader_dataset_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
-    deps = [
-        ":reader_dataset_ops_test_base",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python/data/experimental/ops:readers",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/python/data/util:nest",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "resample_test",
+    name = "rejection_resample_test",
     size = "medium",
-    srcs = ["resample_test.py"],
+    srcs = ["rejection_resample_test.py"],
     shard_count = 2,
     srcs_version = "PY2AND3",
     tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
         "noasan",
         "optonly",
     ],
@@ -477,15 +508,27 @@ py_test(
 )
 
 py_test(
-    name = "scan_dataset_op_test",
-    size = "small",
-    srcs = ["scan_dataset_op_test.py"],
+    name = "restructured_dataset_test",
+    size = "medium",
+    srcs = ["restructured_dataset_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
     ],
+)
+
+py_test(
+    name = "scan_test",
+    size = "small",
+    srcs = ["scan_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -503,14 +546,12 @@ py_test(
 )
 
 py_test(
-    name = "shuffle_dataset_op_test",
+    name = "shuffle_and_repeat_test",
     size = "medium",
-    srcs = ["shuffle_dataset_op_test.py"],
+    srcs = ["shuffle_and_repeat_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "no_oss",
         "no_pip",
-        "no_windows",
         "optonly",
     ],
     deps = [
@@ -525,8 +566,8 @@ py_test(
 )
 
 py_library(
-    name = "sql_dataset_op_test_base",
-    srcs = ["sql_dataset_op_test_base.py"],
+    name = "sql_dataset_test_base",
+    srcs = ["sql_dataset_test_base.py"],
     srcs_version = "PY2AND3",
     visibility = [
         "//tensorflow/python/data/experimental/kernel_tests:__pkg__",
@@ -543,17 +584,13 @@ py_library(
 )
 
 py_test(
-    name = "sql_dataset_op_test",
+    name = "sql_dataset_test",
     size = "small",
-    srcs = ["sql_dataset_op_test.py"],
+    srcs = ["sql_dataset_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
+    tags = ["no_pip"],
     deps = [
-        ":sql_dataset_op_test_base",
+        ":sql_dataset_test_base",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -565,11 +602,7 @@ py_test(
     size = "medium",
     srcs = ["stats_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
+    tags = ["no_pip"],
     deps = [
         ":reader_dataset_ops_test_base",
         ":stats_dataset_test_base",
@@ -595,68 +628,60 @@ py_library(
 )
 
 py_test(
-    name = "threadpool_dataset_ops_test",
+    name = "tf_record_writer_test",
     size = "small",
-    srcs = ["threadpool_dataset_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
+    srcs = ["tf_record_writer_test.py"],
     deps = [
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python/data/experimental/ops:threadpool",
-        "//tensorflow/python/data/experimental/ops:unique",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:writers",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python/data/ops:readers",
     ],
 )
 
 py_test(
-    name = "unique_dataset_op_test",
-    size = "small",
-    srcs = ["unique_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
+    name = "unbatch_test",
+    size = "medium",
+    srcs = ["unbatch_test.py"],
     deps = [
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
-        "//tensorflow/python/data/experimental/ops:unique",
+        "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
 py_test(
-    name = "writer_ops_test",
+    name = "unique_test",
     size = "small",
-    srcs = ["writer_ops_test.py"],
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-    ],
+    srcs = ["unique_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
-        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
-        "//tensorflow/python:lib",
+        "//tensorflow/python:errors",
         "//tensorflow/python:util",
-        "//tensorflow/python/data/experimental/ops:writers",
+        "//tensorflow/python/data/experimental/ops:unique",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:readers",
     ],
 )
diff --git a/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
deleted file mode 100644
index 956b4518f6..0000000000
--- a/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
+++ /dev/null
@@ -1,686 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import time
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.client import session
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import script_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import compat
-
-
-class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  def testDenseToSparseBatchDataset(self):
-    components = np.random.randint(12, size=(100,)).astype(np.int32)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: array_ops.fill([x], x)).apply(
-            batching.dense_to_sparse_batch(4, [12]))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-
-      for start in range(0, len(components), 4):
-        results = sess.run(get_next)
-        self.assertAllEqual([[i, j]
-                             for i, c in enumerate(components[start:start + 4])
-                             for j in range(c)], results.indices)
-        self.assertAllEqual(
-            [c for c in components[start:start + 4] for _ in range(c)],
-            results.values)
-        self.assertAllEqual([min(4,
-                                 len(components) - start), 12],
-                            results.dense_shape)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testDenseToSparseBatchDatasetWithUnknownShape(self):
-    components = np.random.randint(5, size=(40,)).astype(np.int32)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: array_ops.fill([x, x], x)).apply(
-            batching.dense_to_sparse_batch(
-                4, [5, None])).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-
-      for start in range(0, len(components), 4):
-        results = sess.run(get_next)
-        self.assertAllEqual([[i, j, z]
-                             for i, c in enumerate(components[start:start + 4])
-                             for j in range(c)
-                             for z in range(c)], results.indices)
-        self.assertAllEqual([
-            c
-            for c in components[start:start + 4] for _ in range(c)
-            for _ in range(c)
-        ], results.values)
-        self.assertAllEqual([
-            min(4,
-                len(components) - start), 5,
-            np.max(components[start:start + 4])
-        ], results.dense_shape)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testDenseToSparseBatchDatasetWithInvalidShape(self):
-    input_tensor = array_ops.constant([[1]])
-    with self.assertRaisesRegexp(ValueError, "Dimension -2 must be >= 0"):
-      dataset_ops.Dataset.from_tensors(input_tensor).apply(
-          batching.dense_to_sparse_batch(4, [-2])).make_initializable_iterator()
-
-  def testDenseToSparseBatchDatasetShapeErrors(self):
-    input_tensor = array_ops.placeholder(dtypes.int32)
-    iterator = (
-        dataset_ops.Dataset.from_tensors(input_tensor).apply(
-            batching.dense_to_sparse_batch(4, [12]))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Initialize with an input tensor of incompatible rank.
-      sess.run(init_op, feed_dict={input_tensor: [[1]]})
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "incompatible with the row shape"):
-        sess.run(get_next)
-
-      # Initialize with an input tensor that is larger than `row_shape`.
-      sess.run(init_op, feed_dict={input_tensor: range(13)})
-      with self.assertRaisesRegexp(errors.DataLossError,
-                                   "larger than the row shape"):
-        sess.run(get_next)
-
-  def testUnbatchWithUnknownRankInput(self):
-    placeholder = array_ops.placeholder(dtypes.int32)
-    dataset = dataset_ops.Dataset.from_tensors(placeholder).apply(
-        batching.unbatch())
-    iterator = dataset.make_initializable_iterator()
-    next_elem = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer, feed_dict={placeholder: [0, 1, 2, 3]})
-      for i in range(4):
-        self.assertEqual(i, sess.run(next_elem))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_elem)
-
-  def testUnbatchScalarDataset(self):
-    data = tuple([math_ops.range(10) for _ in range(3)])
-    data = dataset_ops.Dataset.from_tensor_slices(data)
-    expected_types = (dtypes.int32,) * 3
-    data = data.batch(2)
-    self.assertEqual(expected_types, data.output_types)
-    data = data.apply(batching.unbatch())
-    self.assertEqual(expected_types, data.output_types)
-
-    iterator = data.make_one_shot_iterator()
-    op = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual((i,) * 3, sess.run(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
-
-  def testUnbatchDatasetWithStrings(self):
-    data = tuple([math_ops.range(10) for _ in range(3)])
-    data = dataset_ops.Dataset.from_tensor_slices(data)
-    data = data.map(lambda x, y, z: (x, string_ops.as_string(y), z))
-    expected_types = (dtypes.int32, dtypes.string, dtypes.int32)
-    data = data.batch(2)
-    self.assertEqual(expected_types, data.output_types)
-    data = data.apply(batching.unbatch())
-    self.assertEqual(expected_types, data.output_types)
-
-    iterator = data.make_one_shot_iterator()
-    op = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual((i, compat.as_bytes(str(i)), i), sess.run(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
-
-  def testUnbatchDatasetWithSparseTensor(self):
-    st = sparse_tensor.SparseTensorValue(
-        indices=[[i, i] for i in range(10)],
-        values=list(range(10)),
-        dense_shape=[10, 10])
-    data = dataset_ops.Dataset.from_tensors(st)
-    data = data.apply(batching.unbatch())
-    data = data.batch(5)
-    data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        st_row = sess.run(next_element)
-        self.assertEqual([i], st_row.indices)
-        self.assertEqual([i], st_row.values)
-        self.assertEqual([10], st_row.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testUnbatchDatasetWithDenseAndSparseTensor(self):
-    st = sparse_tensor.SparseTensorValue(
-        indices=[[i, i] for i in range(10)],
-        values=list(range(10)),
-        dense_shape=[10, 10])
-    data = dataset_ops.Dataset.from_tensors((list(range(10)), st))
-    data = data.apply(batching.unbatch())
-    data = data.batch(5)
-    data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        dense_elem, st_row = sess.run(next_element)
-        self.assertEqual(i, dense_elem)
-        self.assertEqual([i], st_row.indices)
-        self.assertEqual([i], st_row.values)
-        self.assertEqual([10], st_row.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testUnbatchSingleElementTupleDataset(self):
-    data = tuple([(math_ops.range(10),) for _ in range(3)])
-    data = dataset_ops.Dataset.from_tensor_slices(data)
-    expected_types = ((dtypes.int32,),) * 3
-    data = data.batch(2)
-    self.assertEqual(expected_types, data.output_types)
-    data = data.apply(batching.unbatch())
-    self.assertEqual(expected_types, data.output_types)
-
-    iterator = data.make_one_shot_iterator()
-    op = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual(((i,),) * 3, sess.run(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
-
-  def testUnbatchMultiElementTupleDataset(self):
-    data = tuple([(math_ops.range(10 * i, 10 * i + 10),
-                   array_ops.fill([10], "hi")) for i in range(3)])
-    data = dataset_ops.Dataset.from_tensor_slices(data)
-    expected_types = ((dtypes.int32, dtypes.string),) * 3
-    data = data.batch(2)
-    self.assertAllEqual(expected_types, data.output_types)
-    data = data.apply(batching.unbatch())
-    self.assertAllEqual(expected_types, data.output_types)
-
-    iterator = data.make_one_shot_iterator()
-    op = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual(((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi")),
-                         sess.run(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
-
-  def testUnbatchEmpty(self):
-    data = dataset_ops.Dataset.from_tensors(
-        (constant_op.constant([]), constant_op.constant([], shape=[0, 4]),
-         constant_op.constant([], shape=[0, 4, 0])))
-    data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testUnbatchStaticShapeMismatch(self):
-    data = dataset_ops.Dataset.from_tensors((np.arange(7), np.arange(8),
-                                             np.arange(9)))
-    with self.assertRaises(ValueError):
-      data.apply(batching.unbatch())
-
-  def testUnbatchDynamicShapeMismatch(self):
-    ph1 = array_ops.placeholder(dtypes.int32, shape=[None])
-    ph2 = array_ops.placeholder(dtypes.int32, shape=None)
-    data = dataset_ops.Dataset.from_tensors((ph1, ph2))
-    data = data.apply(batching.unbatch())
-    iterator = data.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Mismatch in the 0th dimension.
-      sess.run(
-          iterator.initializer,
-          feed_dict={
-              ph1: np.arange(7).astype(np.int32),
-              ph2: np.arange(8).astype(np.int32)
-          })
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(next_element)
-
-      # No 0th dimension (i.e. scalar value) for one component.
-      sess.run(
-          iterator.initializer,
-          feed_dict={
-              ph1: np.arange(7).astype(np.int32),
-              ph2: 7
-          })
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(next_element)
-
-  @parameterized.named_parameters(
-      ("Default", None, None),
-      ("SequentialCalls", 1, None),
-      ("ParallelCalls", 2, None),
-      ("ParallelBatches", None, 10),
-  )
-  def testMapAndBatch(self, num_parallel_calls, num_parallel_batches):
-    """Test a dataset that maps a TF function across its input elements."""
-    # The pipeline is TensorSliceDataset ->
-    # RepeatDataset(count) -> MapAndBatchDataset(square_3, batch_size).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).repeat(count).apply(
-            batching.map_and_batch(
-                map_func=_map_fn,
-                batch_size=batch_size,
-                num_parallel_calls=num_parallel_calls,
-                num_parallel_batches=num_parallel_batches))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
-                     [t.shape.as_list() for t in get_next])
-
-    with self.cached_session() as sess:
-      # Batch of a finite input, where the batch_size divides the
-      # total number of elements.
-      sess.run(init_op, feed_dict={count: 28, batch_size: 14})
-      num_batches = (28 * 7) // 14
-      for i in range(num_batches):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(14):
-            self.assertAllEqual(component[(i * 14 + j) % 7]**2,
-                                result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Batch of a finite input, where the batch_size does not
-      # divide the total number of elements.
-      sess.run(init_op, feed_dict={count: 14, batch_size: 8})
-
-      # We expect (num_batches - 1) full-sized batches.
-      num_batches = int(math.ceil((14 * 7) / 8))
-      for i in range(num_batches - 1):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(8):
-            self.assertAllEqual(component[(i * 8 + j) % 7]**2,
-                                result_component[j])
-      result = sess.run(get_next)
-      for component, result_component in zip(components, result):
-        for j in range((14 * 7) % 8):
-          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
-                              result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Batch of an empty input should fail straight away.
-      sess.run(init_op, feed_dict={count: 0, batch_size: 8})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Empty batch should be an initialization time error.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={count: 14, batch_size: 0})
-
-  @parameterized.named_parameters(
-      ("Even", False),
-      ("Uneven", True),
-  )
-  def testMapAndBatchPartialBatch(self, drop_remainder):
-    iterator = (
-        dataset_ops.Dataset.range(10).apply(
-            batching.map_and_batch(
-                lambda x: array_ops.reshape(x * x, [1]),
-                batch_size=4,
-                drop_remainder=drop_remainder)).make_one_shot_iterator())
-    if drop_remainder:
-      self.assertEqual([4, 1], iterator.output_shapes.as_list())
-    else:
-      self.assertEqual([None, 1], iterator.output_shapes.as_list())
-    next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
-      if not drop_remainder:
-        self.assertAllEqual([[64], [81]], sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testMapAndBatchYieldsPartialBatch(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .apply(batching.map_and_batch(
-                    lambda x: array_ops.reshape(x * x, [1]), 4))
-                .make_one_shot_iterator())
-    self.assertEqual([None, 1], iterator.output_shapes.as_list())
-    next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
-      self.assertAllEqual([[64], [81]], sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testMapAndBatchParallelGetNext(self):
-    iterator = (dataset_ops.Dataset.range(50000)
-                .apply(batching.map_and_batch(lambda x: x, batch_size=100))
-                .make_one_shot_iterator())
-    elements = []
-    for _ in range(100):
-      elements.append(iterator.get_next())
-    with self.cached_session() as sess:
-      for i in range(5):
-        got = sess.run(elements)
-        got.sort(key=lambda x: x[0])
-        expected = []
-        for j in range(100):
-          expected.append(range(i*10000+j*100, i*10000+(j+1)*100))
-        self.assertAllEqual(got, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elements)
-
-  def testMapAndBatchParallelGetNextDropRemainder(self):
-    iterator = (
-        dataset_ops.Dataset.range(49999).apply(
-            batching.map_and_batch(
-                lambda x: x, batch_size=100, drop_remainder=True))
-        .make_one_shot_iterator())
-    elements = []
-    for _ in range(100):
-      elements.append(iterator.get_next())
-    with self.cached_session() as sess:
-      for i in range(4):
-        got = sess.run(elements)
-        got.sort(key=lambda x: x[0])
-        expected = []
-        for j in range(100):
-          expected.append(range(i*10000+j*100, i*10000+(j+1)*100))
-        self.assertAllEqual(got, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elements)
-
-  def testMapAndBatchSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).apply(
-        batching.map_and_batch(_sparse, 5)).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(2):
-        actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
-            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
-            dense_shape=[5, 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testMapAndBatchFails(self):
-    """Test a dataset that maps a TF function across its input elements."""
-    dataset = dataset_ops.Dataset.from_tensors(
-        array_ops.check_numerics(
-            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (
-        dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-        sess.run(init_op, feed_dict={batch_size: 14})
-
-  def testMapAndBatchShapeMismatch(self):
-    """Test a dataset that maps a TF function across its input elements."""
-
-    def generator():
-      yield [1]
-      yield [2]
-      yield [3]
-      yield [[4, 5, 6]]
-
-    dataset = dataset_ops.Dataset.from_generator(
-        generator, output_types=dtypes.int32)
-    batch_size = 4
-    iterator = (
-        dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "number of elements does not match"):
-        sess.run(get_next)
-
-  def testMapAndBatchImplicitDispose(self):
-    # Tests whether a map and batch dataset will be cleaned up correctly when
-    # the pipeline does not run it until exhaustion.
-    # The pipeline is TensorSliceDataset -> RepeatDataset(1000) ->
-    # MapAndBatchDataset(f=square_3, batch_size=100).
-    components = (np.arange(1000),
-                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
-                  np.array(37.0) * np.arange(1000))
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    dataset = dataset_ops.Dataset.from_tensor_slices(components).repeat(
-        1000).apply(batching.map_and_batch(_map_fn, batch_size=100))
-    dataset = dataset.prefetch(5)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(3):
-        sess.run(get_next)
-
-  @parameterized.named_parameters(
-      ("1", 0),
-      ("2", 5),
-      ("3", 10),
-      ("4", 90),
-      ("5", 95),
-      ("6", 99),
-  )
-  def testMapAndBatchOutOfRangeError(self, threshold):
-
-    def raising_py_fn(i):
-      if i >= threshold:
-        raise StopIteration()
-      else:
-        return i
-
-    iterator = (
-        dataset_ops.Dataset.range(100).apply(
-            batching.map_and_batch(
-                lambda x: script_ops.py_func(raising_py_fn, [x], dtypes.int64),
-                batch_size=10)).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(threshold // 10):
-        self.assertAllEqual([i * 10 + j for j in range(10)], sess.run(get_next))
-      if threshold % 10 != 0:
-        self.assertAllEqual(
-            [threshold // 10 * 10 + j for j in range(threshold % 10)],
-            sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  @parameterized.named_parameters(
-      ("1", False, dtypes.bool),
-      ("2", -42, dtypes.int8),
-      ("3", -42, dtypes.int16),
-      ("4", -42, dtypes.int32),
-      ("5", -42, dtypes.int64),
-      ("6", 42, dtypes.uint8),
-      ("7", 42, dtypes.uint16),
-      ("8", 42.0, dtypes.float16),
-      ("9", 42.0, dtypes.float32),
-      ("10", 42.0, dtypes.float64),
-      ("11", b"hello", dtypes.string),
-  )
-  def testMapAndBatchTypes(self, element, dtype):
-    def gen():
-      yield element
-
-    dataset = dataset_ops.Dataset.from_generator(gen, dtype).repeat(100).apply(
-        batching.map_and_batch(lambda x: x, batch_size=10))
-
-    get_next = dataset.make_one_shot_iterator().get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(10):
-        self.assertAllEqual([element for _ in range(10)], sess.run(get_next))
-
-
-class UnbatchDatasetBenchmark(test.Benchmark):
-
-  def benchmarkNativeUnbatch(self):
-    batch_sizes = [1, 2, 5, 10, 20, 50]
-    elems_per_trial = 10000
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
-      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-      dataset = dataset.batch(batch_size_placeholder)
-      dataset = dataset.apply(batching.unbatch())
-      dataset = dataset.skip(elems_per_trial)
-      iterator = dataset.make_initializable_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for batch_size in batch_sizes:
-          deltas = []
-          for _ in range(5):
-            sess.run(
-                iterator.initializer,
-                feed_dict={batch_size_placeholder: batch_size})
-            start = time.time()
-            sess.run(next_element.op)
-            end = time.time()
-            deltas.append((end - start) / elems_per_trial)
-
-          median_wall_time = np.median(deltas)
-          print("Unbatch (native) batch size: %d Median wall time per element:"
-                " %f microseconds" % (batch_size, median_wall_time * 1e6))
-          self.report_benchmark(
-              iters=10000,
-              wall_time=median_wall_time,
-              name="benchmark_unbatch_dataset_native_batch_size_%d" %
-              batch_size)
-
-  # Include a benchmark of the previous `unbatch()` implementation that uses
-  # a composition of more primitive ops. Eventually we'd hope to generate code
-  # that is as good in both cases.
-  def benchmarkOldUnbatchImplementation(self):
-    batch_sizes = [1, 2, 5, 10, 20, 50]
-    elems_per_trial = 10000
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
-      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-      dataset = dataset.batch(batch_size_placeholder)
-      dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
-      dataset = dataset.skip(elems_per_trial)
-      iterator = dataset.make_initializable_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for batch_size in batch_sizes:
-          deltas = []
-          for _ in range(5):
-            sess.run(
-                iterator.initializer,
-                feed_dict={batch_size_placeholder: batch_size})
-            start = time.time()
-            sess.run(next_element.op)
-            end = time.time()
-            deltas.append((end - start) / elems_per_trial)
-
-          median_wall_time = np.median(deltas)
-          print("Unbatch (unfused) batch size: %d Median wall time per element:"
-                " %f microseconds" % (batch_size, median_wall_time * 1e6))
-          self.report_benchmark(
-              iters=10000,
-              wall_time=median_wall_time,
-              name="benchmark_unbatch_dataset_unfused_batch_size_%d" %
-              batch_size)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
new file mode 100644
index 0000000000..3903ec49b9
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
@@ -0,0 +1,322 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.bucket_by_sequence_length()."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+
+from tensorflow.python.data.experimental.ops import grouping
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+def _element_length_fn(x, y=None):
+  del y
+  return array_ops.shape(x)[0]
+
+
+def _to_sparse_tensor(record):
+  return sparse_tensor.SparseTensor(**record)
+
+
+def _format_record(array, sparse):
+  if sparse:
+    return {
+        "values": array,
+        "indices": [[i] for i in range(len(array))],
+        "dense_shape": (len(array),)
+    }
+  return array
+
+
+def _get_record_type(sparse):
+  if sparse:
+    return {
+        "values": dtypes.int64,
+        "indices": dtypes.int64,
+        "dense_shape": dtypes.int64
+    }
+  return dtypes.int32
+
+
+def _get_record_shape(sparse):
+  if sparse:
+    return {
+        "values": tensor_shape.TensorShape([None,]),
+        "indices": tensor_shape.TensorShape([None, 1]),
+        "dense_shape": tensor_shape.TensorShape([1,])
+    }
+  return tensor_shape.TensorShape([None])
+
+
+class BucketBySequenceLengthTest(test_base.DatasetTestBase):
+
+  def testBucket(self):
+
+    boundaries = [10, 20, 30]
+    batch_sizes = [10, 8, 4, 2]
+    lengths = [8, 13, 25, 35]
+
+    def build_dataset(sparse):
+      def _generator():
+        # Produce 1 batch for each bucket
+        elements = []
+        for batch_size, length in zip(batch_sizes, lengths):
+          record_len = length - 1
+          for _ in range(batch_size):
+            elements.append([1] * record_len)
+            record_len = length
+        random.shuffle(elements)
+        for el in elements:
+          yield (_format_record(el, sparse),)
+      dataset = dataset_ops.Dataset.from_generator(
+          _generator,
+          (_get_record_type(sparse),),
+          (_get_record_shape(sparse),))
+      if sparse:
+        dataset = dataset.map(lambda x: (_to_sparse_tensor(x),))
+      return dataset
+
+    def _test_bucket_by_padding(no_padding):
+      dataset = build_dataset(sparse=no_padding)
+      dataset = dataset.apply(
+          grouping.bucket_by_sequence_length(
+              _element_length_fn,
+              boundaries,
+              batch_sizes,
+              no_padding=no_padding))
+      batch, = dataset.make_one_shot_iterator().get_next()
+
+      with self.cached_session() as sess:
+        batches = []
+        for _ in range(4):
+          batches.append(sess.run(batch))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(batch)
+      batch_sizes_val = []
+      lengths_val = []
+      for batch in batches:
+        shape = batch.dense_shape if no_padding else batch.shape
+        batch_size = shape[0]
+        length = shape[1]
+        batch_sizes_val.append(batch_size)
+        lengths_val.append(length)
+        sum_check = batch.values.sum() if no_padding else batch.sum()
+        self.assertEqual(sum_check, batch_size * length - 1)
+      self.assertEqual(sum(batch_sizes_val), sum(batch_sizes))
+      self.assertEqual(sorted(batch_sizes), sorted(batch_sizes_val))
+      self.assertEqual(sorted(lengths), sorted(lengths_val))
+
+    for no_padding in (True, False):
+      _test_bucket_by_padding(no_padding)
+
+  def testPadToBoundary(self):
+
+    boundaries = [10, 20, 30]
+    batch_sizes = [10, 8, 4, 2]
+    lengths = [8, 13, 25]
+
+    def element_gen():
+      # Produce 1 batch for each bucket
+      elements = []
+      for batch_size, length in zip(batch_sizes[:-1], lengths):
+        for _ in range(batch_size):
+          elements.append([1] * length)
+      random.shuffle(elements)
+      for el in elements:
+        yield (el,)
+      for _ in range(batch_sizes[-1]):
+        el = [1] * (boundaries[-1] + 5)
+        yield (el,)
+
+    element_len = lambda el: array_ops.shape(el)[0]
+    dataset = dataset_ops.Dataset.from_generator(
+        element_gen, (dtypes.int64,), ([None],)).apply(
+            grouping.bucket_by_sequence_length(
+                element_len, boundaries, batch_sizes,
+                pad_to_bucket_boundary=True))
+    batch, = dataset.make_one_shot_iterator().get_next()
+
+    with self.cached_session() as sess:
+      batches = []
+      for _ in range(3):
+        batches.append(sess.run(batch))
+      with self.assertRaisesOpError("bucket_boundaries"):
+        sess.run(batch)
+    batch_sizes_val = []
+    lengths_val = []
+    for batch in batches:
+      batch_size = batch.shape[0]
+      length = batch.shape[1]
+      batch_sizes_val.append(batch_size)
+      lengths_val.append(length)
+    batch_sizes = batch_sizes[:-1]
+    self.assertEqual(sum(batch_sizes_val), sum(batch_sizes))
+    self.assertEqual(sorted(batch_sizes), sorted(batch_sizes_val))
+    self.assertEqual([boundary - 1 for boundary in sorted(boundaries)],
+                     sorted(lengths_val))
+
+  def testPadToBoundaryNoExtraneousPadding(self):
+
+    boundaries = [3, 7, 11]
+    batch_sizes = [2, 2, 2, 2]
+    lengths = range(1, 11)
+
+    def element_gen():
+      for length in lengths:
+        yield ([1] * length,)
+
+    element_len = lambda element: array_ops.shape(element)[0]
+    dataset = dataset_ops.Dataset.from_generator(
+        element_gen, (dtypes.int64,), ([None],)).apply(
+            grouping.bucket_by_sequence_length(
+                element_len, boundaries, batch_sizes,
+                pad_to_bucket_boundary=True))
+    batch, = dataset.make_one_shot_iterator().get_next()
+
+    with self.cached_session() as sess:
+      batches = []
+      for _ in range(5):
+        batches.append(sess.run(batch))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(batch)
+
+    self.assertAllEqual(batches[0], [[1, 0],
+                                     [1, 1]])
+    self.assertAllEqual(batches[1], [[1, 1, 1, 0, 0, 0],
+                                     [1, 1, 1, 1, 0, 0]])
+    self.assertAllEqual(batches[2], [[1, 1, 1, 1, 1, 0],
+                                     [1, 1, 1, 1, 1, 1]])
+    self.assertAllEqual(batches[3], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
+                                     [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    self.assertAllEqual(batches[4], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
+                                     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
+
+  def testTupleElements(self):
+
+    def build_dataset(sparse):
+      def _generator():
+        text = [[1, 2, 3], [3, 4, 5, 6, 7], [1, 2], [8, 9, 0, 2, 3]]
+        label = [1, 2, 1, 2]
+        for x, y in zip(text, label):
+          yield (_format_record(x, sparse), y)
+      dataset = dataset_ops.Dataset.from_generator(
+          generator=_generator,
+          output_types=(_get_record_type(sparse), dtypes.int32),
+          output_shapes=(_get_record_shape(sparse),
+                         tensor_shape.TensorShape([])))
+      if sparse:
+        dataset = dataset.map(lambda x, y: (_to_sparse_tensor(x), y))
+      return dataset
+
+    def _test_tuple_elements_by_padding(no_padding):
+      dataset = build_dataset(sparse=no_padding)
+      dataset = dataset.apply(grouping.bucket_by_sequence_length(
+          element_length_func=_element_length_fn,
+          bucket_batch_sizes=[2, 2, 2],
+          bucket_boundaries=[0, 8],
+          no_padding=no_padding))
+      shapes = dataset.output_shapes
+      self.assertEqual([None, None], shapes[0].as_list())
+      self.assertEqual([None], shapes[1].as_list())
+
+    for no_padding in (True, False):
+      _test_tuple_elements_by_padding(no_padding)
+
+  def testBucketSparse(self):
+    """Tests bucketing of sparse tensors (case where `no_padding` == True).
+
+    Test runs on following dataset:
+      [
+        [0],
+        [0, 1],
+        [0, 1, 2]
+        ...
+        [0, ..., max_len - 1]
+      ]
+    Sequences are bucketed by length and batched with
+      `batch_size` < `bucket_size`.
+    """
+
+    min_len = 0
+    max_len = 100
+    batch_size = 7
+    bucket_size = 10
+
+    def _build_dataset():
+      input_data = [range(i+1) for i in range(min_len, max_len)]
+      def generator_fn():
+        for record in input_data:
+          yield _format_record(record, sparse=True)
+      dataset = dataset_ops.Dataset.from_generator(
+          generator=generator_fn,
+          output_types=_get_record_type(sparse=True))
+      dataset = dataset.map(_to_sparse_tensor)
+      return dataset
+
+    def _compute_expected_batches():
+      """Computes expected batch outputs and stores in a set."""
+      all_expected_sparse_tensors = set()
+      for bucket_start_len in range(min_len, max_len, bucket_size):
+        for batch_offset in range(0, bucket_size, batch_size):
+          batch_start_len = bucket_start_len + batch_offset
+          batch_end_len = min(batch_start_len + batch_size,
+                              bucket_start_len + bucket_size)
+          expected_indices = []
+          expected_values = []
+          for length in range(batch_start_len, batch_end_len):
+            for val in range(length + 1):
+              expected_indices.append((length - batch_start_len, val))
+              expected_values.append(val)
+          expected_sprs_tensor = (tuple(expected_indices),
+                                  tuple(expected_values))
+          all_expected_sparse_tensors.add(expected_sprs_tensor)
+      return all_expected_sparse_tensors
+
+    def _compute_batches(dataset):
+      """Computes actual batch outputs of dataset and stores in a set."""
+      batch = dataset.make_one_shot_iterator().get_next()
+      all_sparse_tensors = set()
+      with self.cached_session() as sess:
+        with self.assertRaises(errors.OutOfRangeError):
+          while True:
+            output = sess.run(batch)
+            sprs_tensor = (tuple([tuple(idx) for idx in output.indices]),
+                           tuple(output.values))
+            all_sparse_tensors.add(sprs_tensor)
+      return all_sparse_tensors
+
+    dataset = _build_dataset()
+    boundaries = range(min_len + bucket_size + 1, max_len, bucket_size)
+    dataset = dataset.apply(grouping.bucket_by_sequence_length(
+        _element_length_fn,
+        boundaries,
+        [batch_size] * (len(boundaries) + 1),
+        no_padding=True))
+    batches = _compute_batches(dataset)
+    expected_batches = _compute_expected_batches()
+    self.assertEqual(batches, expected_batches)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/bucketing_test.py b/tensorflow/python/data/experimental/kernel_tests/bucketing_test.py
deleted file mode 100644
index 153a03989b..0000000000
--- a/tensorflow/python/data/experimental/kernel_tests/bucketing_test.py
+++ /dev/null
@@ -1,824 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import random
-
-import numpy as np
-
-from tensorflow.python.data.experimental.ops import grouping
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.platform import test
-
-
-class GroupByReducerTest(test_base.DatasetTestBase):
-
-  def checkResults(self, dataset, shapes, values):
-    self.assertEqual(shapes, dataset.output_shapes)
-    get_next = dataset.make_one_shot_iterator().get_next()
-    with self.cached_session() as sess:
-      for expected in values:
-        got = sess.run(get_next)
-        self.assertEqual(got, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSum(self):
-    reducer = grouping.Reducer(
-        init_func=lambda _: np.int64(0),
-        reduce_func=lambda x, y: x + y,
-        finalize_func=lambda x: x)
-    for i in range(1, 11):
-      dataset = dataset_ops.Dataset.range(2 * i).apply(
-          grouping.group_by_reducer(lambda x: x % 2, reducer))
-      self.checkResults(
-          dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i])
-
-  def testAverage(self):
-
-    def reduce_fn(x, y):
-      return (x[0] * x[1] + math_ops.cast(y, dtypes.float32)) / (
-          x[1] + 1), x[1] + 1
-
-    reducer = grouping.Reducer(
-        init_func=lambda _: (0.0, 0.0),
-        reduce_func=reduce_fn,
-        finalize_func=lambda x, _: x)
-    for i in range(1, 11):
-      dataset = dataset_ops.Dataset.range(2 * i).apply(
-          grouping.group_by_reducer(
-              lambda x: math_ops.cast(x, dtypes.int64) % 2, reducer))
-      self.checkResults(
-          dataset, shapes=tensor_shape.scalar(), values=[i - 1, i])
-
-  def testConcat(self):
-    components = np.array(list("abcdefghijklmnopqrst")).view(np.chararray)
-    reducer = grouping.Reducer(
-        init_func=lambda x: "",
-        reduce_func=lambda x, y: x + y[0],
-        finalize_func=lambda x: x)
-    for i in range(1, 11):
-      dataset = dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.from_tensor_slices(components),
-           dataset_ops.Dataset.range(2 * i))).apply(
-               grouping.group_by_reducer(lambda x, y: y % 2, reducer))
-      self.checkResults(
-          dataset,
-          shapes=tensor_shape.scalar(),
-          values=[b"acegikmoqs" [:i], b"bdfhjlnprt" [:i]])
-
-  def testSparseSum(self):
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1], dtype=np.int64)),
-          dense_shape=np.array([1, 1]))
-
-    reducer = grouping.Reducer(
-        init_func=lambda _: _sparse(np.int64(0)),
-        reduce_func=lambda x, y: _sparse(x.values[0] + y.values[0]),
-        finalize_func=lambda x: x.values[0])
-    for i in range(1, 11):
-      dataset = dataset_ops.Dataset.range(2 * i).map(_sparse).apply(
-          grouping.group_by_reducer(lambda x: x.values[0] % 2, reducer))
-      self.checkResults(
-          dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i])
-
-  def testChangingStateShape(self):
-
-    def reduce_fn(x, _):
-      # Statically known rank, but dynamic length.
-      larger_dim = array_ops.concat([x[0], x[0]], 0)
-      # Statically unknown rank.
-      larger_rank = array_ops.expand_dims(x[1], 0)
-      return larger_dim, larger_rank
-
-    reducer = grouping.Reducer(
-        init_func=lambda x: ([0], 1),
-        reduce_func=reduce_fn,
-        finalize_func=lambda x, y: (x, y))
-
-    for i in range(1, 11):
-      dataset = dataset_ops.Dataset.from_tensors(np.int64(0)).repeat(i).apply(
-          grouping.group_by_reducer(lambda x: x, reducer))
-      self.assertEqual([None], dataset.output_shapes[0].as_list())
-      self.assertIs(None, dataset.output_shapes[1].ndims)
-      iterator = dataset.make_one_shot_iterator()
-      get_next = iterator.get_next()
-      with self.cached_session() as sess:
-        x, y = sess.run(get_next)
-        self.assertAllEqual([0] * (2**i), x)
-        self.assertAllEqual(np.array(1, ndmin=i), y)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testTypeMismatch(self):
-    reducer = grouping.Reducer(
-        init_func=lambda x: constant_op.constant(1, dtype=dtypes.int32),
-        reduce_func=lambda x, y: constant_op.constant(1, dtype=dtypes.int64),
-        finalize_func=lambda x: x)
-
-    dataset = dataset_ops.Dataset.range(10)
-    with self.assertRaisesRegexp(
-        TypeError,
-        "The element types for the new state must match the initial state."):
-      dataset.apply(
-          grouping.group_by_reducer(lambda _: np.int64(0), reducer))
-
-  # TODO(b/78665031): Remove once non-scalar keys are supported.
-  def testInvalidKeyShape(self):
-    reducer = grouping.Reducer(
-        init_func=lambda x: np.int64(0),
-        reduce_func=lambda x, y: x + y,
-        finalize_func=lambda x: x)
-
-    dataset = dataset_ops.Dataset.range(10)
-    with self.assertRaisesRegexp(
-        ValueError, "`key_func` must return a single tf.int64 tensor."):
-      dataset.apply(
-          grouping.group_by_reducer(lambda _: np.int64((0, 0)), reducer))
-
-  # TODO(b/78665031): Remove once non-int64 keys are supported.
-  def testInvalidKeyType(self):
-    reducer = grouping.Reducer(
-        init_func=lambda x: np.int64(0),
-        reduce_func=lambda x, y: x + y,
-        finalize_func=lambda x: x)
-
-    dataset = dataset_ops.Dataset.range(10)
-    with self.assertRaisesRegexp(
-        ValueError, "`key_func` must return a single tf.int64 tensor."):
-      dataset.apply(
-          grouping.group_by_reducer(lambda _: "wrong", reducer))
-
-  def testTuple(self):
-    def init_fn(_):
-      return np.array([], dtype=np.int64), np.int64(0)
-
-    def reduce_fn(state, value):
-      s1, s2 = state
-      v1, v2 = value
-      return array_ops.concat([s1, [v1]], 0), s2 + v2
-
-    def finalize_fn(s1, s2):
-      return s1, s2
-
-    reducer = grouping.Reducer(init_fn, reduce_fn, finalize_fn)
-    dataset = dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.range(10), dataset_ops.Dataset.range(10))).apply(
-            grouping.group_by_reducer(lambda x, y: np.int64(0), reducer))
-    get_next = dataset.make_one_shot_iterator().get_next()
-    with self.cached_session() as sess:
-      x, y = sess.run(get_next)
-      self.assertAllEqual(x, np.asarray([x for x in range(10)]))
-      self.assertEqual(y, 45)
-
-
-class GroupByWindowTest(test_base.DatasetTestBase):
-
-  def testSimple(self):
-    components = np.random.randint(100, size=(200,)).astype(np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x)
-        .apply(
-            grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4)).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      counts = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          result = sess.run(get_next)
-          self.assertTrue(
-              all(x % 2 == 0
-                  for x in result) or all(x % 2 == 1)
-              for x in result)
-          counts.append(result.shape[0])
-
-      self.assertEqual(len(components), sum(counts))
-      num_full_batches = len([c for c in counts if c == 4])
-      self.assertGreaterEqual(num_full_batches, 24)
-      self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
-
-  def testImmediateOutput(self):
-    components = np.array(
-        [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply(
-            grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4),
-                                     4)).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      # The input is infinite, so this test demonstrates that:
-      # 1. We produce output without having to consume the entire input,
-      # 2. Different buckets can produce output at different rates, and
-      # 3. For deterministic input, the output is deterministic.
-      for _ in range(3):
-        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
-        self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
-        self.assertAllEqual([2, 2, 2, 2], sess.run(get_next))
-        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
-
-  def testSmallGroups(self):
-    components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).apply(
-            grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4)).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
-      self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
-      # The small outputs at the end are deterministically produced in key
-      # order.
-      self.assertAllEqual([0, 0, 0], sess.run(get_next))
-      self.assertAllEqual([1], sess.run(get_next))
-
-  def testEmpty(self):
-    iterator = (
-        dataset_ops.Dataset.range(4).apply(
-            grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "Window size must be greater than zero, but got 0."):
-        print(sess.run(get_next))
-
-  def testReduceFuncError(self):
-    components = np.random.randint(100, size=(200,)).astype(np.int64)
-
-    def reduce_func(_, xs):
-      # Introduce an incorrect padded shape that cannot (currently) be
-      # detected at graph construction time.
-      return xs.padded_batch(
-          4,
-          padded_shapes=(tensor_shape.TensorShape([]),
-                         constant_op.constant([5], dtype=dtypes.int64) * -1))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
-            grouping.group_by_window(lambda x, _: x % 2, reduce_func,
-                                     32)).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-
-  def testConsumeWindowDatasetMoreThanOnce(self):
-    components = np.random.randint(50, size=(200,)).astype(np.int64)
-
-    def reduce_func(key, window):
-      # Apply two different kinds of padding to the input: tight
-      # padding, and quantized (to a multiple of 10) padding.
-      return dataset_ops.Dataset.zip((
-          window.padded_batch(
-              4, padded_shapes=tensor_shape.TensorShape([None])),
-          window.padded_batch(
-              4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),
-      ))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x))
-        .apply(grouping.group_by_window(
-            lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64),
-            reduce_func, 4))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      counts = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          tight_result, multiple_of_10_result = sess.run(get_next)
-          self.assertEqual(0, multiple_of_10_result.shape[1] % 10)
-          self.assertAllEqual(tight_result,
-                              multiple_of_10_result[:, :tight_result.shape[1]])
-          counts.append(tight_result.shape[0])
-      self.assertEqual(len(components), sum(counts))
-
-
-# NOTE(mrry): These tests are based on the tests in bucket_ops_test.py.
-# Currently, they use a constant batch size, though should be made to use a
-# different batch size per key.
-class BucketTest(test_base.DatasetTestBase):
-
-  def _dynamicPad(self, bucket, window, window_size):
-    # TODO(mrry): To match `tf.contrib.training.bucket()`, implement a
-    # generic form of padded_batch that pads every component
-    # dynamically and does not rely on static shape information about
-    # the arguments.
-    return dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.from_tensors(bucket),
-         window.padded_batch(
-             32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape(
-                 [None]), tensor_shape.TensorShape([3])))))
-
-  def testSingleBucket(self):
-
-    def _map_fn(v):
-      return (v, array_ops.fill([v], v),
-              array_ops.fill([3], string_ops.as_string(v)))
-
-    input_dataset = (
-        dataset_ops.Dataset.from_tensor_slices(math_ops.range(32)).map(_map_fn))
-
-    bucketed_dataset = input_dataset.apply(
-        grouping.group_by_window(
-            lambda x, y, z: 0,
-            lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
-
-    iterator = bucketed_dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-
-      which_bucket, bucketed_values = sess.run(get_next)
-
-      self.assertEqual(0, which_bucket)
-
-      expected_scalar_int = np.arange(32, dtype=np.int64)
-      expected_unk_int64 = np.zeros((32, 31)).astype(np.int64)
-      for i in range(32):
-        expected_unk_int64[i, :i] = i
-      expected_vec3_str = np.vstack(3 * [np.arange(32).astype(bytes)]).T
-
-      self.assertAllEqual(expected_scalar_int, bucketed_values[0])
-      self.assertAllEqual(expected_unk_int64, bucketed_values[1])
-      self.assertAllEqual(expected_vec3_str, bucketed_values[2])
-
-  def testEvenOddBuckets(self):
-
-    def _map_fn(v):
-      return (v, array_ops.fill([v], v),
-              array_ops.fill([3], string_ops.as_string(v)))
-
-    input_dataset = (
-        dataset_ops.Dataset.from_tensor_slices(math_ops.range(64)).map(_map_fn))
-
-    bucketed_dataset = input_dataset.apply(
-        grouping.group_by_window(
-            lambda x, y, z: math_ops.cast(x % 2, dtypes.int64),
-            lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
-
-    iterator = bucketed_dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-
-      # Get two minibatches (one containing even values, one containing odds)
-      which_bucket_even, bucketed_values_even = sess.run(get_next)
-      which_bucket_odd, bucketed_values_odd = sess.run(get_next)
-
-      # Count number of bucket_tensors.
-      self.assertEqual(3, len(bucketed_values_even))
-      self.assertEqual(3, len(bucketed_values_odd))
-
-      # Ensure bucket 0 was used for all minibatch entries.
-      self.assertAllEqual(0, which_bucket_even)
-      self.assertAllEqual(1, which_bucket_odd)
-
-      # Test the first bucket outputted, the events starting at 0
-      expected_scalar_int = np.arange(0, 32 * 2, 2, dtype=np.int64)
-      expected_unk_int64 = np.zeros((32, 31 * 2)).astype(np.int64)
-      for i in range(0, 32):
-        expected_unk_int64[i, :2 * i] = 2 * i
-        expected_vec3_str = np.vstack(
-            3 * [np.arange(0, 32 * 2, 2).astype(bytes)]).T
-
-      self.assertAllEqual(expected_scalar_int, bucketed_values_even[0])
-      self.assertAllEqual(expected_unk_int64, bucketed_values_even[1])
-      self.assertAllEqual(expected_vec3_str, bucketed_values_even[2])
-
-      # Test the second bucket outputted, the odds starting at 1
-      expected_scalar_int = np.arange(1, 32 * 2 + 1, 2, dtype=np.int64)
-      expected_unk_int64 = np.zeros((32, 31 * 2 + 1)).astype(np.int64)
-      for i in range(0, 32):
-        expected_unk_int64[i, :2 * i + 1] = 2 * i + 1
-        expected_vec3_str = np.vstack(
-            3 * [np.arange(1, 32 * 2 + 1, 2).astype(bytes)]).T
-
-      self.assertAllEqual(expected_scalar_int, bucketed_values_odd[0])
-      self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1])
-      self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2])
-
-  def testEvenOddBucketsFilterOutAllOdd(self):
-
-    def _map_fn(v):
-      return {
-          "x": v,
-          "y": array_ops.fill([v], v),
-          "z": array_ops.fill([3], string_ops.as_string(v))
-      }
-
-    def _dynamic_pad_fn(bucket, window, _):
-      return dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.from_tensors(bucket),
-           window.padded_batch(
-               32, {
-                   "x": tensor_shape.TensorShape([]),
-                   "y": tensor_shape.TensorShape([None]),
-                   "z": tensor_shape.TensorShape([3])
-               })))
-
-    input_dataset = (
-        dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn)
-        .filter(lambda d: math_ops.equal(d["x"] % 2, 0)))
-
-    bucketed_dataset = input_dataset.apply(
-        grouping.group_by_window(
-            lambda d: math_ops.cast(d["x"] % 2, dtypes.int64),
-            lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32))
-
-    iterator = bucketed_dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-
-      # Get two minibatches ([0, 2, ...] and [64, 66, ...])
-      which_bucket0, bucketed_values_even0 = sess.run(get_next)
-      which_bucket1, bucketed_values_even1 = sess.run(get_next)
-
-      # Ensure that bucket 1 was completely filtered out
-      self.assertAllEqual(0, which_bucket0)
-      self.assertAllEqual(0, which_bucket1)
-      self.assertAllEqual(
-          np.arange(0, 64, 2, dtype=np.int64), bucketed_values_even0["x"])
-      self.assertAllEqual(
-          np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"])
-
-  def testDynamicWindowSize(self):
-    components = np.arange(100).astype(np.int64)
-
-    # Key fn: even/odd
-    # Reduce fn: batches of 5
-    # Window size fn: even=5, odd=10
-
-    def window_size_func(key):
-      window_sizes = constant_op.constant([5, 10], dtype=dtypes.int64)
-      return window_sizes[key]
-
-    dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
-        grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(20),
-                                 None, window_size_func))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      with self.assertRaises(errors.OutOfRangeError):
-        batches = 0
-        while True:
-          result = sess.run(get_next)
-          is_even = all(x % 2 == 0 for x in result)
-          is_odd = all(x % 2 == 1 for x in result)
-          self.assertTrue(is_even or is_odd)
-          expected_batch_size = 5 if is_even else 10
-          self.assertEqual(expected_batch_size, result.shape[0])
-          batches += 1
-
-      self.assertEqual(batches, 15)
-
-
-def _element_length_fn(x, y=None):
-  del y
-  return array_ops.shape(x)[0]
-
-
-def _to_sparse_tensor(record):
-  return sparse_tensor.SparseTensor(**record)
-
-
-def _format_record(array, sparse):
-  if sparse:
-    return {
-        "values": array,
-        "indices": [[i] for i in range(len(array))],
-        "dense_shape": (len(array),)
-    }
-  return array
-
-
-def _get_record_type(sparse):
-  if sparse:
-    return {
-        "values": dtypes.int64,
-        "indices": dtypes.int64,
-        "dense_shape": dtypes.int64
-    }
-  return dtypes.int32
-
-
-def _get_record_shape(sparse):
-  if sparse:
-    return {
-        "values": tensor_shape.TensorShape([None,]),
-        "indices": tensor_shape.TensorShape([None, 1]),
-        "dense_shape": tensor_shape.TensorShape([1,])
-    }
-  return tensor_shape.TensorShape([None])
-
-
-class BucketBySequenceLength(test_base.DatasetTestBase):
-
-  def testBucket(self):
-
-    boundaries = [10, 20, 30]
-    batch_sizes = [10, 8, 4, 2]
-    lengths = [8, 13, 25, 35]
-
-    def build_dataset(sparse):
-      def _generator():
-        # Produce 1 batch for each bucket
-        elements = []
-        for batch_size, length in zip(batch_sizes, lengths):
-          record_len = length - 1
-          for _ in range(batch_size):
-            elements.append([1] * record_len)
-            record_len = length
-        random.shuffle(elements)
-        for el in elements:
-          yield (_format_record(el, sparse),)
-      dataset = dataset_ops.Dataset.from_generator(
-          _generator,
-          (_get_record_type(sparse),),
-          (_get_record_shape(sparse),))
-      if sparse:
-        dataset = dataset.map(lambda x: (_to_sparse_tensor(x),))
-      return dataset
-
-    def _test_bucket_by_padding(no_padding):
-      dataset = build_dataset(sparse=no_padding)
-      dataset = dataset.apply(
-          grouping.bucket_by_sequence_length(
-              _element_length_fn,
-              boundaries,
-              batch_sizes,
-              no_padding=no_padding))
-      batch, = dataset.make_one_shot_iterator().get_next()
-
-      with self.cached_session() as sess:
-        batches = []
-        for _ in range(4):
-          batches.append(sess.run(batch))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(batch)
-      batch_sizes_val = []
-      lengths_val = []
-      for batch in batches:
-        shape = batch.dense_shape if no_padding else batch.shape
-        batch_size = shape[0]
-        length = shape[1]
-        batch_sizes_val.append(batch_size)
-        lengths_val.append(length)
-        sum_check = batch.values.sum() if no_padding else batch.sum()
-        self.assertEqual(sum_check, batch_size * length - 1)
-      self.assertEqual(sum(batch_sizes_val), sum(batch_sizes))
-      self.assertEqual(sorted(batch_sizes), sorted(batch_sizes_val))
-      self.assertEqual(sorted(lengths), sorted(lengths_val))
-
-    for no_padding in (True, False):
-      _test_bucket_by_padding(no_padding)
-
-  def testPadToBoundary(self):
-
-    boundaries = [10, 20, 30]
-    batch_sizes = [10, 8, 4, 2]
-    lengths = [8, 13, 25]
-
-    def element_gen():
-      # Produce 1 batch for each bucket
-      elements = []
-      for batch_size, length in zip(batch_sizes[:-1], lengths):
-        for _ in range(batch_size):
-          elements.append([1] * length)
-      random.shuffle(elements)
-      for el in elements:
-        yield (el,)
-      for _ in range(batch_sizes[-1]):
-        el = [1] * (boundaries[-1] + 5)
-        yield (el,)
-
-    element_len = lambda el: array_ops.shape(el)[0]
-    dataset = dataset_ops.Dataset.from_generator(
-        element_gen, (dtypes.int64,), ([None],)).apply(
-            grouping.bucket_by_sequence_length(
-                element_len, boundaries, batch_sizes,
-                pad_to_bucket_boundary=True))
-    batch, = dataset.make_one_shot_iterator().get_next()
-
-    with self.cached_session() as sess:
-      batches = []
-      for _ in range(3):
-        batches.append(sess.run(batch))
-      with self.assertRaisesOpError("bucket_boundaries"):
-        sess.run(batch)
-    batch_sizes_val = []
-    lengths_val = []
-    for batch in batches:
-      batch_size = batch.shape[0]
-      length = batch.shape[1]
-      batch_sizes_val.append(batch_size)
-      lengths_val.append(length)
-    batch_sizes = batch_sizes[:-1]
-    self.assertEqual(sum(batch_sizes_val), sum(batch_sizes))
-    self.assertEqual(sorted(batch_sizes), sorted(batch_sizes_val))
-    self.assertEqual([boundary - 1 for boundary in sorted(boundaries)],
-                     sorted(lengths_val))
-
-  def testPadToBoundaryNoExtraneousPadding(self):
-
-    boundaries = [3, 7, 11]
-    batch_sizes = [2, 2, 2, 2]
-    lengths = range(1, 11)
-
-    def element_gen():
-      for length in lengths:
-        yield ([1] * length,)
-
-    element_len = lambda element: array_ops.shape(element)[0]
-    dataset = dataset_ops.Dataset.from_generator(
-        element_gen, (dtypes.int64,), ([None],)).apply(
-            grouping.bucket_by_sequence_length(
-                element_len, boundaries, batch_sizes,
-                pad_to_bucket_boundary=True))
-    batch, = dataset.make_one_shot_iterator().get_next()
-
-    with self.cached_session() as sess:
-      batches = []
-      for _ in range(5):
-        batches.append(sess.run(batch))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(batch)
-
-    self.assertAllEqual(batches[0], [[1, 0],
-                                     [1, 1]])
-    self.assertAllEqual(batches[1], [[1, 1, 1, 0, 0, 0],
-                                     [1, 1, 1, 1, 0, 0]])
-    self.assertAllEqual(batches[2], [[1, 1, 1, 1, 1, 0],
-                                     [1, 1, 1, 1, 1, 1]])
-    self.assertAllEqual(batches[3], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
-                                     [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
-    self.assertAllEqual(batches[4], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
-                                     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-
-  def testTupleElements(self):
-
-    def build_dataset(sparse):
-      def _generator():
-        text = [[1, 2, 3], [3, 4, 5, 6, 7], [1, 2], [8, 9, 0, 2, 3]]
-        label = [1, 2, 1, 2]
-        for x, y in zip(text, label):
-          yield (_format_record(x, sparse), y)
-      dataset = dataset_ops.Dataset.from_generator(
-          generator=_generator,
-          output_types=(_get_record_type(sparse), dtypes.int32),
-          output_shapes=(_get_record_shape(sparse),
-                         tensor_shape.TensorShape([])))
-      if sparse:
-        dataset = dataset.map(lambda x, y: (_to_sparse_tensor(x), y))
-      return dataset
-
-    def _test_tuple_elements_by_padding(no_padding):
-      dataset = build_dataset(sparse=no_padding)
-      dataset = dataset.apply(grouping.bucket_by_sequence_length(
-          element_length_func=_element_length_fn,
-          bucket_batch_sizes=[2, 2, 2],
-          bucket_boundaries=[0, 8],
-          no_padding=no_padding))
-      shapes = dataset.output_shapes
-      self.assertEqual([None, None], shapes[0].as_list())
-      self.assertEqual([None], shapes[1].as_list())
-
-    for no_padding in (True, False):
-      _test_tuple_elements_by_padding(no_padding)
-
-  def testBucketSparse(self):
-    """Tests bucketing of sparse tensors (case where `no_padding` == True).
-
-    Test runs on following dataset:
-      [
-        [0],
-        [0, 1],
-        [0, 1, 2]
-        ...
-        [0, ..., max_len - 1]
-      ]
-    Sequences are bucketed by length and batched with
-      `batch_size` < `bucket_size`.
-    """
-
-    min_len = 0
-    max_len = 100
-    batch_size = 7
-    bucket_size = 10
-
-    def _build_dataset():
-      input_data = [range(i+1) for i in range(min_len, max_len)]
-      def generator_fn():
-        for record in input_data:
-          yield _format_record(record, sparse=True)
-      dataset = dataset_ops.Dataset.from_generator(
-          generator=generator_fn,
-          output_types=_get_record_type(sparse=True))
-      dataset = dataset.map(_to_sparse_tensor)
-      return dataset
-
-    def _compute_expected_batches():
-      """Computes expected batch outputs and stores in a set."""
-      all_expected_sparse_tensors = set()
-      for bucket_start_len in range(min_len, max_len, bucket_size):
-        for batch_offset in range(0, bucket_size, batch_size):
-          batch_start_len = bucket_start_len + batch_offset
-          batch_end_len = min(batch_start_len + batch_size,
-                              bucket_start_len + bucket_size)
-          expected_indices = []
-          expected_values = []
-          for length in range(batch_start_len, batch_end_len):
-            for val in range(length + 1):
-              expected_indices.append((length - batch_start_len, val))
-              expected_values.append(val)
-          expected_sprs_tensor = (tuple(expected_indices),
-                                  tuple(expected_values))
-          all_expected_sparse_tensors.add(expected_sprs_tensor)
-      return all_expected_sparse_tensors
-
-    def _compute_batches(dataset):
-      """Computes actual batch outputs of dataset and stores in a set."""
-      batch = dataset.make_one_shot_iterator().get_next()
-      all_sparse_tensors = set()
-      with self.cached_session() as sess:
-        with self.assertRaises(errors.OutOfRangeError):
-          while True:
-            output = sess.run(batch)
-            sprs_tensor = (tuple([tuple(idx) for idx in output.indices]),
-                           tuple(output.values))
-            all_sparse_tensors.add(sprs_tensor)
-      return all_sparse_tensors
-
-    dataset = _build_dataset()
-    boundaries = range(min_len + bucket_size + 1, max_len, bucket_size)
-    dataset = dataset.apply(grouping.bucket_by_sequence_length(
-        _element_length_fn,
-        boundaries,
-        [batch_size] * (len(boundaries) + 1),
-        no_padding=True))
-    batches = _compute_batches(dataset)
-    expected_batches = _compute_expected_batches()
-    self.assertEqual(batches, expected_batches)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetching_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
similarity index 56%
rename from tensorflow/python/data/experimental/kernel_tests/prefetching_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
index 7d7b842c17..adfacf1c9f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
@@ -12,440 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for prefetching_ops."""
+"""Tests for `tf.data.experimental.copy_to_device()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.compat import compat
 from tensorflow.python.data.experimental.ops import prefetching_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
 
-class PrefetchingKernelsOpsTest(test_base.DatasetTestBase):
-
-  def setUp(self):
-    self._event = threading.Event()
-
-  def _create_ds_and_iterator(self, device0, initializable=False):
-
-    def gen():
-      for i in range(1, 10):
-        yield [float(i)]
-        if i == 6:
-          self._event.set()
-
-    with ops.device(device0):
-      ds = dataset_ops.Dataset.from_generator(gen, (dtypes.float32))
-      if initializable:
-        ds_iterator = ds.make_initializable_iterator()
-      else:
-        ds_iterator = ds.make_one_shot_iterator()
-      return (ds, ds_iterator)
-
-  def _create_ops(self, ds, ds_iterator, buffer_name, device0, device1):
-    ds_iterator_handle = ds_iterator.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _remote_fn(h):
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, ds.output_types, ds.output_shapes)
-      return remote_iterator.get_next()
-
-    target = constant_op.constant(device0)
-    with ops.device(device1):
-      buffer_resource_handle = prefetching_ops.function_buffering_resource(
-          f=_remote_fn,
-          output_types=[dtypes.float32],
-          target_device=target,
-          string_arg=ds_iterator_handle,
-          buffer_size=3,
-          shared_name=buffer_name)
-
-    with ops.device(device1):
-      prefetch_op = prefetching_ops.function_buffering_resource_get_next(
-          function_buffer_resource=buffer_resource_handle,
-          output_types=[dtypes.float32])
-      reset_op = prefetching_ops.function_buffering_resource_reset(
-          function_buffer_resource=buffer_resource_handle)
-      destroy_op = resource_variable_ops.destroy_resource_op(
-          buffer_resource_handle, ignore_lookup_error=True)
-
-    return (prefetch_op, reset_op, destroy_op)
-
-  def _prefetch_fn_helper_one_shot(self, buffer_name, device0, device1):
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-
-    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=False)
-    prefetch_op, _, destroy_op = self._create_ops(ds, ds_iterator, buffer_name,
-                                                  device0, device1)
-
-    with self.test_session(config=worker_config) as sess:
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [1.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [2.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [3.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [4.0])
-      self._event.wait()
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [5.0])
-      sess.run(destroy_op)
-
-  def testSameDeviceCPU(self):
-    self._prefetch_fn_helper_one_shot("same_device_cpu",
-                                      "/job:localhost/replica:0/task:0/cpu:0",
-                                      "/job:localhost/replica:0/task:0/cpu:0")
-
-  def testDifferentDeviceCPU(self):
-    self._prefetch_fn_helper_one_shot("diff_device_cpu",
-                                      "/job:localhost/replica:0/task:0/cpu:0",
-                                      "/job:localhost/replica:0/task:0/cpu:1")
-
-  def testDifferentDeviceCPUGPU(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    self._prefetch_fn_helper_one_shot("cpu_gpu",
-                                      "/job:localhost/replica:0/task:0/cpu:0",
-                                      "/job:localhost/replica:0/task:0/gpu:0")
-
-  def testReinitialization(self):
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-
-    device0 = "/job:localhost/replica:0/task:0/cpu:0"
-    device1 = "/job:localhost/replica:0/task:0/cpu:1"
-    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=True)
-    prefetch_op, reset_op, destroy_op = self._create_ops(
-        ds, ds_iterator, "reinit", device0, device1)
-
-    with self.test_session(config=worker_config) as sess:
-      sess.run(ds_iterator.initializer)
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [1.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [2.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [3.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [4.0])
-      self._event.wait()
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [5.0])
-      # Lets reset the function buffering resource and reinitialize the
-      # iterator. Should be able to go through this again.
-      self._event.clear()
-      sess.run(reset_op)
-      sess.run(ds_iterator.initializer)
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [1.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [2.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [3.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [4.0])
-      self._event.wait()
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [5.0])
-      sess.run(destroy_op)
-
-  def testReinitializationOutOfRange(self):
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-
-    device0 = "/job:localhost/replica:0/task:0/cpu:0"
-    device1 = "/job:localhost/replica:0/task:0/cpu:1"
-    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=True)
-    prefetch_op, reset_op, destroy_op = self._create_ops(
-        ds, ds_iterator, "reinit", device0, device1)
-
-    with self.test_session(config=worker_config) as sess:
-      sess.run(ds_iterator.initializer)
-      for i in range(1, 10):
-        elem = sess.run(prefetch_op)
-        self.assertEqual(elem, [float(i)])
-      # Try fetching after its over twice to test out end of sequence.
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-
-      # Now reset everything and try it out again.
-      self._event.clear()
-      sess.run(reset_op)
-      sess.run(ds_iterator.initializer)
-      for i in range(1, 10):
-        elem = sess.run(prefetch_op)
-        self.assertEqual(elem, [float(i)])
-      # Try fetching after its over twice to test out end of sequence.
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-
-      sess.run(destroy_op)
-
-  def testStringsGPU(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    device0 = "/job:localhost/replica:0/task:0/cpu:0"
-    device1 = "/job:localhost/replica:0/task:0/gpu:0"
-
-    ds = dataset_ops.Dataset.from_tensor_slices(["a", "b", "c"])
-    ds_iterator = ds.make_one_shot_iterator()
-    ds_iterator_handle = ds_iterator.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _remote_fn(h):
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, ds.output_types, ds.output_shapes)
-      return remote_iterator.get_next()
-
-    target = constant_op.constant(device0)
-    with ops.device(device1):
-      buffer_resource_handle = prefetching_ops.function_buffering_resource(
-          f=_remote_fn,
-          output_types=[dtypes.string],
-          target_device=target,
-          string_arg=ds_iterator_handle,
-          buffer_size=3,
-          shared_name="strings")
-
-    with ops.device(device1):
-      prefetch_op = prefetching_ops.function_buffering_resource_get_next(
-          function_buffer_resource=buffer_resource_handle,
-          output_types=[dtypes.string])
-      destroy_op = resource_variable_ops.destroy_resource_op(
-          buffer_resource_handle, ignore_lookup_error=True)
-
-    with self.cached_session() as sess:
-      self.assertEqual([b"a"], sess.run(prefetch_op))
-      self.assertEqual([b"b"], sess.run(prefetch_op))
-      self.assertEqual([b"c"], sess.run(prefetch_op))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-
-      sess.run(destroy_op)
-
-
-class PrefetchToDeviceTest(test_base.DatasetTestBase):
-
-  def testPrefetchToDevice(self):
-    host_dataset = dataset_ops.Dataset.range(10)
-    device_dataset = host_dataset.apply(
-        prefetching_ops.prefetch_to_device("/cpu:1"))
-
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
-
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
-
-    next_element = iterator.get_next()
-    self.assertEqual(dtypes.int64, next_element.dtype)
-    self.assertEqual([], next_element.shape)
-
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
-      for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testPrefetchToSameDevice(self):
-    host_dataset = dataset_ops.Dataset.range(10)
-    device_dataset = host_dataset.apply(
-        prefetching_ops.prefetch_to_device(
-            "/job:localhost/replica:0/task:0/device:CPU:0"))
-
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
-
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
-
-    next_element = iterator.get_next()
-    self.assertEqual(dtypes.int64, next_element.dtype)
-    self.assertEqual([], next_element.shape)
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testPrefetchDictToDevice(self):
-    host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
-    device_dataset = host_dataset.apply(
-        prefetching_ops.prefetch_to_device("/cpu:1"))
-
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
-
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
-
-    next_element = iterator.get_next()
-    self.assertEqual(dtypes.int64, next_element["a"].dtype)
-    self.assertEqual([], next_element["a"].shape)
-
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
-      for i in range(10):
-        self.assertEqual({"a": i}, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testPrefetchSparseTensorsToDevice(self):
-    def make_tensor(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0, 0]], values=(i*[1]), dense_shape=[2, 2])
-    host_dataset = dataset_ops.Dataset.range(10).map(make_tensor)
-
-    device_dataset = host_dataset.apply(
-        prefetching_ops.prefetch_to_device("/cpu:1"))
-
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
-
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
-
-    next_element = iterator.get_next()
-    self.assertEqual(dtypes.int64, next_element.dtype)
-
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
-      for i in range(10):
-        actual = sess.run(next_element)
-        self.assertAllEqual([i], actual.values)
-        self.assertAllEqual([[0, 0]], actual.indices)
-        self.assertAllEqual([2, 2], actual.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testPrefetchToDeviceGpu(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    host_dataset = dataset_ops.Dataset.range(10)
-    device_dataset = host_dataset.apply(
-        prefetching_ops.prefetch_to_device("/gpu:0"))
-
-    iterator = device_dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testPrefetchToDeviceWithReInit(self):
-    host_dataset = dataset_ops.Dataset.range(10)
-    device_dataset = host_dataset.apply(
-        prefetching_ops.prefetch_to_device("/cpu:1"))
-
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
-
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
-
-    next_element = iterator.get_next()
-    self.assertEqual(dtypes.int64, next_element.dtype)
-    self.assertEqual([], next_element.shape)
-
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
-      sess.run(iterator.initializer)
-      for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
-      for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testPrefetchToDeviceGpuWithReInit(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    host_dataset = dataset_ops.Dataset.range(10)
-    device_dataset = host_dataset.apply(
-        prefetching_ops.prefetch_to_device("/gpu:0"))
-
-    iterator = device_dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
-      for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-
 class CopyToDeviceTest(test_base.DatasetTestBase):
 
   def testCopyToDevice(self):
diff --git a/tensorflow/python/data/experimental/kernel_tests/counter_test.py b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
new file mode 100644
index 0000000000..4e114ac479
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
@@ -0,0 +1,51 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.Counter`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import counter
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+class CounterTest(test_base.DatasetTestBase):
+
+  def testCounter(self):
+    """Test dataset construction using `count`."""
+    iterator = (counter.Counter(start=3, step=4)
+                .make_one_shot_iterator())
+    get_next = iterator.get_next()
+    self.assertEqual([], get_next.shape.as_list())
+    self.assertEqual(dtypes.int64, get_next.dtype)
+
+    negative_iterator = (counter.Counter(start=0, step=-1)
+                         .make_one_shot_iterator())
+    negative_get_next = negative_iterator.get_next()
+
+    with self.cached_session() as sess:
+      self.assertEqual(3, sess.run(get_next))
+      self.assertEqual(3 + 4, sess.run(get_next))
+      self.assertEqual(3 + 2 * 4, sess.run(get_next))
+
+      self.assertEqual(0, sess.run(negative_get_next))
+      self.assertEqual(-1, sess.run(negative_get_next))
+      self.assertEqual(-2, sess.run(negative_get_next))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py
similarity index 99%
rename from tensorflow/python/data/experimental/kernel_tests/csv_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py
index 4ee1779710..fb75be1fbc 100644
--- a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for CsvDatasetOp."""
+"""Tests for `tf.data.experimental.CsvDataset`."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -44,7 +44,7 @@ from tensorflow.python.platform import test
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class CsvDatasetOpTest(test_base.DatasetTestBase):
+class CsvDatasetTest(test_base.DatasetTestBase):
 
   def _setup_files(self, inputs, linebreak='\n', compression_type=None):
     filenames = []
diff --git a/tensorflow/python/data/experimental/kernel_tests/dataset_serialization_test_base.py b/tensorflow/python/data/experimental/kernel_tests/dataset_serialization_test_base.py
deleted file mode 100644
index 7f435b8239..0000000000
--- a/tensorflow/python/data/experimental/kernel_tests/dataset_serialization_test_base.py
+++ /dev/null
@@ -1,692 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Base class for testing serializable datasets."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-import numpy as np
-
-from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import lookup_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.util import nest
-
-
-def remove_variants(get_next_op):
-  # TODO(b/72408568): Remove this once session.run can get
-  # variant tensors.
-  """Remove variants from a nest structure, so sess.run will execute."""
-
-  def _remove_variant(x):
-    if isinstance(x, ops.Tensor) and x.dtype == dtypes.variant:
-      return ()
-    else:
-      return x
-
-  return nest.map_structure(_remove_variant, get_next_op)
-
-
-class DatasetSerializationTestBase(test.TestCase):
-  """Base class for testing serializable datasets."""
-
-  def tearDown(self):
-    self._delete_ckpt()
-
-  # TODO(b/72657739): Remove sparse_tensor argument, which is to test the
-  # (deprecated) saveable `SparseTensorSliceDataset`, once the API
-  # `from_sparse_tensor_slices()`and related tests are deleted.
-  def run_core_tests(self, ds_fn1, ds_fn2, num_outputs, sparse_tensors=False):
-    """Runs the core tests.
-
-    Args:
-      ds_fn1: 0-argument function that returns a Dataset.
-      ds_fn2: 0-argument function that returns a Dataset different from
-        ds_fn1. If None, verify_restore_in_modified_graph test is not run.
-      num_outputs: Total number of outputs expected from this Dataset.
-      sparse_tensors: Whether dataset is built from SparseTensor(s).
-
-    Raises:
-      AssertionError if any test fails.
-    """
-    self.verify_unused_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
-    self.verify_fully_used_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
-    self.verify_exhausted_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
-    self.verify_init_before_restore(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
-    self.verify_multiple_breaks(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
-    self.verify_reset_restored_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
-    self.verify_restore_in_empty_graph(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
-    if ds_fn2:
-      self.verify_restore_in_modified_graph(
-          ds_fn1, ds_fn2, num_outputs, sparse_tensors=sparse_tensors)
-
-  def verify_unused_iterator(self,
-                             ds_fn,
-                             num_outputs,
-                             sparse_tensors=False,
-                             verify_exhausted=True):
-    """Verifies that saving and restoring an unused iterator works.
-
-    Args:
-      ds_fn: See `run_core_tests`.
-      num_outputs: See `run_core_tests`.
-      sparse_tensors: See `run_core_tests`.
-      verify_exhausted: See `gen_outputs`.
-
-    Raises:
-      AssertionError if any test fails.
-    """
-    self.verify_run_with_breaks(
-        ds_fn, [0],
-        num_outputs,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=verify_exhausted)
-
-  def verify_fully_used_iterator(self, ds_fn, num_outputs,
-                                 sparse_tensors=False):
-    """Verifies that saving and restoring a fully used iterator works.
-
-    Note that this only checks saving and restoring an iterator from which
-    `num_outputs` items have been produced but does not check for an
-    exhausted iterator, i.e., one from which an OutOfRange error has been
-    returned.
-
-    Args:
-      ds_fn: See `run_core_tests`.
-      num_outputs: See `run_core_tests`.
-      sparse_tensors: See `run_core_tests`.
-
-    Raises:
-      AssertionError if test fails.
-    """
-    self.verify_run_with_breaks(
-        ds_fn, [num_outputs], num_outputs, sparse_tensors=sparse_tensors)
-
-  def verify_exhausted_iterator(self, ds_fn, num_outputs, sparse_tensors=False):
-    """Verifies that saving and restoring an exhausted iterator works.
-
-    An exhausted iterator is one which has returned an OutOfRange error.
-
-    Args:
-      ds_fn: See `run_core_tests`.
-      num_outputs: See `run_core_tests`.
-      sparse_tensors: See `run_core_tests`.
-
-    Raises:
-      AssertionError if any test fails.
-    """
-    self.gen_outputs(
-        ds_fn, [],
-        num_outputs,
-        verify_exhausted=True,
-        sparse_tensors=sparse_tensors)
-    actual = self.gen_outputs(
-        ds_fn, [],
-        0,
-        ckpt_saved=True,
-        verify_exhausted=True,
-        sparse_tensors=sparse_tensors)
-    self.assertEqual(len(actual), 0)
-
-  def verify_init_before_restore(self,
-                                 ds_fn,
-                                 num_outputs,
-                                 sparse_tensors=False,
-                                 verify_exhausted=True):
-    """Verifies that restoring into an already initialized iterator works.
-
-    Args:
-      ds_fn: See `run_core_tests`.
-      num_outputs: See `run_core_tests`.
-      sparse_tensors: See `run_core_tests`.
-      verify_exhausted: See `gen_outputs`.
-
-    Raises:
-      AssertionError if any test fails.
-    """
-    self.verify_run_with_breaks(
-        ds_fn,
-        self.gen_break_points(num_outputs),
-        num_outputs,
-        init_before_restore=True,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=verify_exhausted)
-
-  def verify_multiple_breaks(self,
-                             ds_fn,
-                             num_outputs,
-                             num_breaks=10,
-                             sparse_tensors=False,
-                             verify_exhausted=True):
-    """Attempts to save/restore at multiple break points.
-
-    Args:
-      ds_fn: See `run_core_tests`.
-      num_outputs: See `run_core_tests`.
-      num_breaks: The number of break points. These are uniformly spread in
-        [0, num_outputs] both inclusive.
-      sparse_tensors: See `run_core_tests`.
-      verify_exhausted: See `gen_outputs`.
-
-    Raises:
-      AssertionError if any test fails.
-    """
-    self.verify_run_with_breaks(
-        ds_fn,
-        self.gen_break_points(num_outputs, num_breaks),
-        num_outputs,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=verify_exhausted)
-
-  def verify_reset_restored_iterator(self,
-                                     ds_fn,
-                                     num_outputs,
-                                     break_point=None,
-                                     sparse_tensors=False,
-                                     verify_exhausted=True):
-    """Attempts to re-initialize a restored iterator.
-
-    This is useful when restoring a training checkpoint during validation.
-
-    Args:
-      ds_fn: See `run_core_tests`.
-      num_outputs: See `run_core_tests`.
-      break_point: Break point. Optional. Defaults to num_outputs/2.
-      sparse_tensors: See `run_core_tests`.
-      verify_exhausted: See `gen_outputs`.
-
-    Raises:
-      AssertionError if any test fails.
-    """
-    break_point = num_outputs // 2 if not break_point else break_point
-
-    # Collect ground truth containing all outputs.
-    expected = self.gen_outputs(
-        ds_fn, [],
-        num_outputs,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=verify_exhausted)
-
-    # Skip some items and save checkpoint.
-    self.gen_outputs(
-        ds_fn, [],
-        break_point,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=False)
-
-    actual = []
-    # Restore from checkpoint and then run init_op.
-    with ops.Graph().as_default() as g:
-      saver = self._import_meta_graph()
-      init_op, get_next_op = self._get_iterator_ops_from_collection(
-          ds_fn, sparse_tensors=sparse_tensors)
-      get_next_op = remove_variants(get_next_op)
-      with self.session(graph=g) as sess:
-        self._restore(saver, sess)
-        self._initialize(init_op, sess)
-        for _ in range(num_outputs):
-          actual.append(sess.run(get_next_op))
-        if verify_exhausted:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-    self.match(expected, actual)
-
-  def verify_restore_in_modified_graph(self,
-                                       ds_fn1,
-                                       ds_fn2,
-                                       num_outputs,
-                                       break_point=None,
-                                       sparse_tensors=False,
-                                       verify_exhausted=True):
-    """Attempts to restore an iterator in a modified graph.
-
-    Builds an input pipeline using ds_fn1, runs it for `break_point` steps
-    and saves a checkpoint. Then builds a new graph using ds_fn2, restores
-    the checkpoint from ds_fn1 and verifies that the restore is successful.
-
-    Args:
-      ds_fn1: See `run_core_tests`.
-      ds_fn2: See `run_core_tests`.
-      num_outputs: See `run_core_tests`.
-      break_point: Break point. Optional. Defaults to num_outputs/2.
-      sparse_tensors: See `run_core_tests`.
-      verify_exhausted: See `gen_outputs`.
-
-    Raises:
-      AssertionError if any test fails.
-    """
-    break_point = num_outputs // 2 if not break_point else break_point
-
-    # Skip `break_point` items and store the remaining produced from ds_fn1
-    # in `expected`.
-    self.gen_outputs(
-        ds_fn1, [],
-        break_point,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=False)
-    expected = self.gen_outputs(
-        ds_fn1, [],
-        num_outputs - break_point,
-        ckpt_saved=True,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=verify_exhausted)
-
-    # Generate `break_point` items from ds_fn1 and save checkpoint.
-    self.gen_outputs(
-        ds_fn1, [],
-        break_point,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=False)
-
-    actual = []
-    # Build graph for ds_fn2 but load checkpoint for ds_fn1.
-    with ops.Graph().as_default() as g:
-      _, get_next_op, saver = self._build_graph(
-          ds_fn2, sparse_tensors=sparse_tensors)
-      get_next_op = remove_variants(get_next_op)
-      with self.session(graph=g) as sess:
-        self._restore(saver, sess)
-        for _ in range(num_outputs - break_point):
-          actual.append(sess.run(get_next_op))
-        if verify_exhausted:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    self.match(expected, actual)
-
-  def verify_restore_in_empty_graph(self,
-                                    ds_fn,
-                                    num_outputs,
-                                    break_point=None,
-                                    sparse_tensors=False,
-                                    verify_exhausted=True):
-    """Attempts to restore an iterator in an empty graph.
-
-    Builds an input pipeline using ds_fn, runs it for `break_point` steps
-    and saves a checkpoint. Then builds a new empty graph, restores
-    the checkpoint from ds_fn and verifies that the restore is successful.
-
-    Args:
-      ds_fn: See `run_core_tests`.
-      num_outputs: See `run_core_tests`.
-      break_point: Break point. Optional. Defaults to num_outputs/2.
-      sparse_tensors: See `run_core_tests`.
-      verify_exhausted: See `gen_outputs`.
-
-    Raises:
-      AssertionError if any test fails.
-    """
-    break_point = num_outputs // 2 if not break_point else break_point
-
-    # Skip `break_point` items and store the remaining produced from ds_fn
-    # in `expected`.
-    self.gen_outputs(
-        ds_fn, [],
-        break_point,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=False)
-    expected = self.gen_outputs(
-        ds_fn, [],
-        num_outputs - break_point,
-        ckpt_saved=True,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=verify_exhausted)
-
-    # Generate `break_point` items from ds_fn and save checkpoint.
-    self.gen_outputs(
-        ds_fn, [],
-        break_point,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=False)
-
-    actual = []
-    # Build an empty graph but load checkpoint for ds_fn.
-    with ops.Graph().as_default() as g:
-      get_next_op, saver = self._build_empty_graph(
-          ds_fn, sparse_tensors=sparse_tensors)
-      get_next_op = remove_variants(get_next_op)
-      with self.session(graph=g) as sess:
-        self._restore(saver, sess)
-        for _ in range(num_outputs - break_point):
-          actual.append(sess.run(get_next_op))
-        if verify_exhausted:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    self.match(expected, actual)
-
-  def verify_error_on_save(self,
-                           ds_fn,
-                           num_outputs,
-                           error,
-                           break_point=None,
-                           sparse_tensors=False):
-    """Attempts to save a non-saveable iterator.
-
-    Args:
-      ds_fn: See `run_core_tests`.
-      num_outputs: See `run_core_tests`.
-      error: Declared error when trying to save iterator.
-      break_point: Break point. Optional. Defaults to num_outputs/2.
-      sparse_tensors: See `run_core_tests`.
-
-    Raises:
-      AssertionError if any test fails.
-    """
-
-    break_point = num_outputs // 2 if not break_point else break_point
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = self._build_graph(
-          ds_fn, sparse_tensors=sparse_tensors)
-      get_next_op = remove_variants(get_next_op)
-      with self.session(graph=g) as sess:
-        self._initialize(init_op, sess)
-        for _ in range(break_point):
-          sess.run(get_next_op)
-        with self.assertRaises(error):
-          self._save(sess, saver)
-
-  def verify_run_with_breaks(self,
-                             ds_fn,
-                             break_points,
-                             num_outputs,
-                             init_before_restore=False,
-                             sparse_tensors=False,
-                             verify_exhausted=True):
-    """Verifies that ds_fn() produces the same outputs with and without breaks.
-
-    1. Builds a Dataset using `ds_fn` and produces `num_outputs` items from it
-       *without* stopping at break points.
-    2. Builds a Dataset using `ds_fn` and produces `num_outputs` items from it
-       with stopping at break points.
-
-    Deep matches outputs from 1 and 2.
-
-    Args:
-      ds_fn: See `gen_outputs`.
-      break_points: See `gen_outputs`.
-      num_outputs: See `gen_outputs`.
-      init_before_restore: See `gen_outputs`.
-      sparse_tensors: See `run_core_tests`.
-      verify_exhausted: See `gen_outputs`.
-
-    Raises:
-      AssertionError if any test fails.
-    """
-    expected = self.gen_outputs(
-        ds_fn, [],
-        num_outputs,
-        init_before_restore=init_before_restore,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=verify_exhausted)
-
-    actual = self.gen_outputs(
-        ds_fn,
-        break_points,
-        num_outputs,
-        init_before_restore=init_before_restore,
-        sparse_tensors=sparse_tensors,
-        verify_exhausted=verify_exhausted)
-
-    self.match(expected, actual)
-
-  def gen_outputs(self,
-                  ds_fn,
-                  break_points,
-                  num_outputs,
-                  ckpt_saved=False,
-                  init_before_restore=False,
-                  sparse_tensors=False,
-                  verify_exhausted=True,
-                  save_checkpoint_at_end=True):
-    """Generates elements from input dataset while stopping at break points.
-
-    Produces `num_outputs` outputs and saves the state of the iterator in the
-    Saver checkpoint.
-
-    Args:
-      ds_fn: 0-argument function that returns the dataset.
-      break_points: A list of integers. For each `break_point` in
-        `break_points`, we produce outputs till `break_point` number of items
-        have been produced and then checkpoint the state. The current graph
-        and session are destroyed and a new graph and session are used to
-        produce outputs till next checkpoint or till `num_outputs` elements
-        have been produced. `break_point` must be <= `num_outputs`.
-      num_outputs: The total number of outputs to produce from the iterator.
-      ckpt_saved: Whether a checkpoint already exists. If False, we build the
-        graph from ds_fn.
-      init_before_restore: Whether init should be called before saver.restore.
-        This is just so that we can verify that restoring an already initialized
-        iterator works.
-      sparse_tensors:  Whether dataset is built from SparseTensor(s).
-      verify_exhausted: Whether to verify that the iterator has been exhausted
-        after producing `num_outputs` elements.
-      save_checkpoint_at_end: Whether to save a checkpoint after producing all
-        outputs. If False, checkpoints are saved each break point but not at the
-        end. Note that checkpoints overwrite each other so there is always only
-        a single checkpoint available. Defaults to True.
-
-    Returns:
-      A list of `num_outputs` items.
-    """
-    outputs = []
-
-    def get_ops():
-      if ckpt_saved:
-        saver = self._import_meta_graph()
-        init_op, get_next_op = self._get_iterator_ops_from_collection(
-            ds_fn, sparse_tensors=sparse_tensors)
-      else:
-        init_op, get_next_op, saver = self._build_graph(
-            ds_fn, sparse_tensors=sparse_tensors)
-      return init_op, get_next_op, saver
-
-    for i in range(len(break_points) + 1):
-      with ops.Graph().as_default() as g:
-        init_op, get_next_op, saver = get_ops()
-        get_next_op = remove_variants(get_next_op)
-        with self.session(graph=g) as sess:
-          if ckpt_saved:
-            if init_before_restore:
-              self._initialize(init_op, sess)
-            self._restore(saver, sess)
-          else:
-            self._initialize(init_op, sess)
-          start = break_points[i - 1] if i > 0 else 0
-          end = break_points[i] if i < len(break_points) else num_outputs
-          num_iters = end - start
-          for _ in range(num_iters):
-            outputs.append(sess.run(get_next_op))
-          if i == len(break_points) and verify_exhausted:
-            with self.assertRaises(errors.OutOfRangeError):
-              sess.run(get_next_op)
-          if save_checkpoint_at_end or i < len(break_points):
-            self._save(sess, saver)
-            ckpt_saved = True
-
-    return outputs
-
-  def match(self, expected, actual):
-    """Matches nested structures.
-
-    Recursively matches shape and values of `expected` and `actual`.
-    Handles scalars, numpy arrays and other python sequence containers
-    e.g. list, dict.
-
-    Args:
-      expected: Nested structure 1.
-      actual: Nested structure 2.
-
-    Raises:
-      AssertionError if matching fails.
-    """
-    if isinstance(expected, np.ndarray):
-      expected = expected.tolist()
-    if isinstance(actual, np.ndarray):
-      actual = actual.tolist()
-    self.assertEqual(type(expected), type(actual))
-
-    if nest.is_sequence(expected):
-      self.assertEqual(len(expected), len(actual))
-      if isinstance(expected, dict):
-        for key1, key2 in zip(sorted(expected), sorted(actual)):
-          self.assertEqual(key1, key2)
-          self.match(expected[key1], actual[key2])
-      else:
-        for item1, item2 in zip(expected, actual):
-          self.match(item1, item2)
-    else:
-      self.assertEqual(expected, actual)
-
-  def does_not_match(self, expected, actual):
-    with self.assertRaises(AssertionError):
-      self.match(expected, actual)
-
-  def gen_break_points(self, num_outputs, num_samples=10):
-    """Generates `num_samples` breaks points in [0, num_outputs]."""
-    return np.linspace(0, num_outputs, num_samples, dtype=int)
-
-  def _build_graph(self, ds_fn, sparse_tensors=False):
-    iterator = ds_fn().make_initializable_iterator()
-
-    saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-    init_op = iterator.initializer
-    if sparse_tensors:
-      get_next = sparse_tensor.SparseTensor(*iterator.get_next())
-    else:
-      get_next = iterator.get_next()
-    self._add_iterator_ops_to_collection(init_op, get_next, ds_fn,
-                                         sparse_tensors)
-    saver = saver_lib.Saver(allow_empty=True)
-    return init_op, get_next, saver
-
-  def _build_empty_graph(self, ds_fn, sparse_tensors=False):
-    iterator = iterator_ops.Iterator.from_structure(
-        self._get_output_types(ds_fn),
-        output_shapes=self._get_output_shapes(ds_fn),
-        output_classes=self._get_output_classes(ds_fn))
-    saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-    if sparse_tensors:
-      get_next = sparse_tensor.SparseTensor(*iterator.get_next())
-    else:
-      get_next = iterator.get_next()
-    saver = saver_lib.Saver(allow_empty=True)
-    return get_next, saver
-
-  def _add_iterator_ops_to_collection(self,
-                                      init_op,
-                                      get_next,
-                                      ds_fn,
-                                      sparse_tensors=False):
-    ops.add_to_collection("iterator_ops", init_op)
-    # `get_next` may be a tuple e.g. in TensorSliceDataset. Since Collections
-    # do not support tuples we flatten the tensors and restore the shape in
-    # `_get_iterator_ops_from_collection`.
-    if sparse_tensors:  # specific for deprecated `from_sparse_tensor_slices`.
-      ops.add_to_collection("iterator_ops", get_next.indices)
-      ops.add_to_collection("iterator_ops", get_next.values)
-      ops.add_to_collection("iterator_ops", get_next.dense_shape)
-      return
-
-    get_next_list = nest.flatten(get_next)
-    for i, output_class in enumerate(
-        nest.flatten(self._get_output_classes(ds_fn))):
-      if output_class is sparse_tensor.SparseTensor:
-        ops.add_to_collection("iterator_ops", get_next_list[i].indices)
-        ops.add_to_collection("iterator_ops", get_next_list[i].values)
-        ops.add_to_collection("iterator_ops", get_next_list[i].dense_shape)
-      else:
-        ops.add_to_collection("iterator_ops", get_next_list[i])
-
-  def _get_iterator_ops_from_collection(self, ds_fn, sparse_tensors=False):
-    all_ops = ops.get_collection("iterator_ops")
-    if sparse_tensors:  # specific for deprecated `from_sparse_tensor_slices`.
-      init_op, indices, values, dense_shape = all_ops
-      return init_op, sparse_tensor.SparseTensor(indices, values, dense_shape)
-    get_next_list = []
-    i = 1
-    for output_class in nest.flatten(self._get_output_classes(ds_fn)):
-      if output_class is sparse_tensor.SparseTensor:
-        indices, values, dense_shape = all_ops[i:i + 3]
-        i += 3
-        get_next_list.append(
-            sparse_tensor.SparseTensor(indices, values, dense_shape))
-      else:
-        get_next_list.append(all_ops[i])
-        i += 1
-    return all_ops[0], nest.pack_sequence_as(
-        self._get_output_types(ds_fn), get_next_list)
-
-  def _get_output_types(self, ds_fn):
-    with ops.Graph().as_default():
-      return ds_fn().output_types
-
-  def _get_output_shapes(self, ds_fn):
-    with ops.Graph().as_default():
-      return ds_fn().output_shapes
-
-  def _get_output_classes(self, ds_fn):
-    with ops.Graph().as_default():
-      return ds_fn().output_classes
-
-  def _ckpt_path(self):
-    return os.path.join(self.get_temp_dir(), "iterator")
-
-  def _latest_ckpt(self):
-    return checkpoint_management.latest_checkpoint(self.get_temp_dir())
-
-  def _save(self, sess, saver):
-    saver.save(sess, self._ckpt_path())
-
-  def _restore(self, saver, sess):
-    sess.run(lookup_ops.tables_initializer())
-    saver.restore(sess, self._latest_ckpt())
-
-  def _initialize(self, init_op, sess):
-    sess.run(variables.global_variables_initializer())
-    sess.run(lookup_ops.tables_initializer())
-    sess.run(init_op)
-
-  def _import_meta_graph(self):
-    meta_file_path = self._ckpt_path() + ".meta"
-    return saver_lib.import_meta_graph(meta_file_path)
-
-  def _delete_ckpt(self):
-    # Remove all checkpoint files.
-    prefix = self._ckpt_path()
-    pattern = prefix + "*"
-    files = gfile.Glob(pattern)
-    map(gfile.Remove, files)
diff --git a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
new file mode 100644
index 0000000000..73be6cbcca
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
@@ -0,0 +1,124 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.dense_to_sparse_batch()."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class DenseToSparseBatchTest(test_base.DatasetTestBase):
+
+  def testDenseToSparseBatchDataset(self):
+    components = np.random.randint(12, size=(100,)).astype(np.int32)
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .map(lambda x: array_ops.fill([x], x)).apply(
+            batching.dense_to_sparse_batch(4, [12]))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+
+      for start in range(0, len(components), 4):
+        results = sess.run(get_next)
+        self.assertAllEqual([[i, j]
+                             for i, c in enumerate(components[start:start + 4])
+                             for j in range(c)], results.indices)
+        self.assertAllEqual(
+            [c for c in components[start:start + 4] for _ in range(c)],
+            results.values)
+        self.assertAllEqual([min(4,
+                                 len(components) - start), 12],
+                            results.dense_shape)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testDenseToSparseBatchDatasetWithUnknownShape(self):
+    components = np.random.randint(5, size=(40,)).astype(np.int32)
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .map(lambda x: array_ops.fill([x, x], x)).apply(
+            batching.dense_to_sparse_batch(
+                4, [5, None])).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+
+      for start in range(0, len(components), 4):
+        results = sess.run(get_next)
+        self.assertAllEqual([[i, j, z]
+                             for i, c in enumerate(components[start:start + 4])
+                             for j in range(c)
+                             for z in range(c)], results.indices)
+        self.assertAllEqual([
+            c
+            for c in components[start:start + 4] for _ in range(c)
+            for _ in range(c)
+        ], results.values)
+        self.assertAllEqual([
+            min(4,
+                len(components) - start), 5,
+            np.max(components[start:start + 4])
+        ], results.dense_shape)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testDenseToSparseBatchDatasetWithInvalidShape(self):
+    input_tensor = array_ops.constant([[1]])
+    with self.assertRaisesRegexp(ValueError, "Dimension -2 must be >= 0"):
+      dataset_ops.Dataset.from_tensors(input_tensor).apply(
+          batching.dense_to_sparse_batch(4, [-2])).make_initializable_iterator()
+
+  def testDenseToSparseBatchDatasetShapeErrors(self):
+    input_tensor = array_ops.placeholder(dtypes.int32)
+    iterator = (
+        dataset_ops.Dataset.from_tensors(input_tensor).apply(
+            batching.dense_to_sparse_batch(4, [12]))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      # Initialize with an input tensor of incompatible rank.
+      sess.run(init_op, feed_dict={input_tensor: [[1]]})
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "incompatible with the row shape"):
+        sess.run(get_next)
+
+      # Initialize with an input tensor that is larger than `row_shape`.
+      sess.run(init_op, feed_dict={input_tensor: range(13)})
+      with self.assertRaisesRegexp(errors.DataLossError,
+                                   "larger than the row shape"):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/range_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
similarity index 68%
rename from tensorflow/python/data/experimental/kernel_tests/range_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
index 22412c3965..e54235d9f8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/range_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
@@ -12,12 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Test RangeDataset."""
+"""Tests for `tf.data.experimental.enumerate_dataset()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.experimental.ops import counter
 from tensorflow.python.data.experimental.ops import enumerate_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
@@ -28,7 +27,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import test
 
 
-class RangeDatasetTest(test_base.DatasetTestBase):
+class EnumerateDatasetTest(test_base.DatasetTestBase):
 
   def testEnumerateDataset(self):
     components = (["a", "b"], [1, 2], [37.0, 38])
@@ -52,27 +51,6 @@ class RangeDatasetTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testCounter(self):
-    """Test dataset construction using `count`."""
-    iterator = (counter.Counter(start=3, step=4)
-                .make_one_shot_iterator())
-    get_next = iterator.get_next()
-    self.assertEqual([], get_next.shape.as_list())
-    self.assertEqual(dtypes.int64, get_next.dtype)
-
-    negative_iterator = (counter.Counter(start=0, step=-1)
-                         .make_one_shot_iterator())
-    negative_get_next = negative_iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertEqual(3, sess.run(get_next))
-      self.assertEqual(3 + 4, sess.run(get_next))
-      self.assertEqual(3 + 2 * 4, sess.run(get_next))
-
-      self.assertEqual(0, sess.run(negative_get_next))
-      self.assertEqual(-1, sess.run(negative_get_next))
-      self.assertEqual(-2, sess.run(negative_get_next))
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py b/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py
new file mode 100644
index 0000000000..399fd284f4
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py
@@ -0,0 +1,247 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the private `FunctionBufferingResource` used in prefetching."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.experimental.ops import prefetching_ops
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import test
+
+
+class FunctionBufferingResourceTest(test_base.DatasetTestBase):
+
+  def setUp(self):
+    self._event = threading.Event()
+
+  def _create_ds_and_iterator(self, device0, initializable=False):
+
+    def gen():
+      for i in range(1, 10):
+        yield [float(i)]
+        if i == 6:
+          self._event.set()
+
+    with ops.device(device0):
+      ds = dataset_ops.Dataset.from_generator(gen, (dtypes.float32))
+      if initializable:
+        ds_iterator = ds.make_initializable_iterator()
+      else:
+        ds_iterator = ds.make_one_shot_iterator()
+      return (ds, ds_iterator)
+
+  def _create_ops(self, ds, ds_iterator, buffer_name, device0, device1):
+    ds_iterator_handle = ds_iterator.string_handle()
+
+    @function.Defun(dtypes.string)
+    def _remote_fn(h):
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          h, ds.output_types, ds.output_shapes)
+      return remote_iterator.get_next()
+
+    target = constant_op.constant(device0)
+    with ops.device(device1):
+      buffer_resource_handle = prefetching_ops.function_buffering_resource(
+          f=_remote_fn,
+          output_types=[dtypes.float32],
+          target_device=target,
+          string_arg=ds_iterator_handle,
+          buffer_size=3,
+          shared_name=buffer_name)
+
+    with ops.device(device1):
+      prefetch_op = prefetching_ops.function_buffering_resource_get_next(
+          function_buffer_resource=buffer_resource_handle,
+          output_types=[dtypes.float32])
+      reset_op = prefetching_ops.function_buffering_resource_reset(
+          function_buffer_resource=buffer_resource_handle)
+      destroy_op = resource_variable_ops.destroy_resource_op(
+          buffer_resource_handle, ignore_lookup_error=True)
+
+    return (prefetch_op, reset_op, destroy_op)
+
+  def _prefetch_fn_helper_one_shot(self, buffer_name, device0, device1):
+    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
+
+    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=False)
+    prefetch_op, _, destroy_op = self._create_ops(ds, ds_iterator, buffer_name,
+                                                  device0, device1)
+
+    with self.test_session(config=worker_config) as sess:
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [1.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [2.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [3.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [4.0])
+      self._event.wait()
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [5.0])
+      sess.run(destroy_op)
+
+  def testSameDeviceCPU(self):
+    self._prefetch_fn_helper_one_shot("same_device_cpu",
+                                      "/job:localhost/replica:0/task:0/cpu:0",
+                                      "/job:localhost/replica:0/task:0/cpu:0")
+
+  def testDifferentDeviceCPU(self):
+    self._prefetch_fn_helper_one_shot("diff_device_cpu",
+                                      "/job:localhost/replica:0/task:0/cpu:0",
+                                      "/job:localhost/replica:0/task:0/cpu:1")
+
+  def testDifferentDeviceCPUGPU(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    self._prefetch_fn_helper_one_shot("cpu_gpu",
+                                      "/job:localhost/replica:0/task:0/cpu:0",
+                                      "/job:localhost/replica:0/task:0/gpu:0")
+
+  def testReinitialization(self):
+    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
+
+    device0 = "/job:localhost/replica:0/task:0/cpu:0"
+    device1 = "/job:localhost/replica:0/task:0/cpu:1"
+    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=True)
+    prefetch_op, reset_op, destroy_op = self._create_ops(
+        ds, ds_iterator, "reinit", device0, device1)
+
+    with self.test_session(config=worker_config) as sess:
+      sess.run(ds_iterator.initializer)
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [1.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [2.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [3.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [4.0])
+      self._event.wait()
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [5.0])
+      # Lets reset the function buffering resource and reinitialize the
+      # iterator. Should be able to go through this again.
+      self._event.clear()
+      sess.run(reset_op)
+      sess.run(ds_iterator.initializer)
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [1.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [2.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [3.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [4.0])
+      self._event.wait()
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [5.0])
+      sess.run(destroy_op)
+
+  def testReinitializationOutOfRange(self):
+    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
+
+    device0 = "/job:localhost/replica:0/task:0/cpu:0"
+    device1 = "/job:localhost/replica:0/task:0/cpu:1"
+    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=True)
+    prefetch_op, reset_op, destroy_op = self._create_ops(
+        ds, ds_iterator, "reinit", device0, device1)
+
+    with self.test_session(config=worker_config) as sess:
+      sess.run(ds_iterator.initializer)
+      for i in range(1, 10):
+        elem = sess.run(prefetch_op)
+        self.assertEqual(elem, [float(i)])
+      # Try fetching after its over twice to test out end of sequence.
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+
+      # Now reset everything and try it out again.
+      self._event.clear()
+      sess.run(reset_op)
+      sess.run(ds_iterator.initializer)
+      for i in range(1, 10):
+        elem = sess.run(prefetch_op)
+        self.assertEqual(elem, [float(i)])
+      # Try fetching after its over twice to test out end of sequence.
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+
+      sess.run(destroy_op)
+
+  def testStringsGPU(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    device0 = "/job:localhost/replica:0/task:0/cpu:0"
+    device1 = "/job:localhost/replica:0/task:0/gpu:0"
+
+    ds = dataset_ops.Dataset.from_tensor_slices(["a", "b", "c"])
+    ds_iterator = ds.make_one_shot_iterator()
+    ds_iterator_handle = ds_iterator.string_handle()
+
+    @function.Defun(dtypes.string)
+    def _remote_fn(h):
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          h, ds.output_types, ds.output_shapes)
+      return remote_iterator.get_next()
+
+    target = constant_op.constant(device0)
+    with ops.device(device1):
+      buffer_resource_handle = prefetching_ops.function_buffering_resource(
+          f=_remote_fn,
+          output_types=[dtypes.string],
+          target_device=target,
+          string_arg=ds_iterator_handle,
+          buffer_size=3,
+          shared_name="strings")
+
+    with ops.device(device1):
+      prefetch_op = prefetching_ops.function_buffering_resource_get_next(
+          function_buffer_resource=buffer_resource_handle,
+          output_types=[dtypes.string])
+      destroy_op = resource_variable_ops.destroy_resource_op(
+          buffer_resource_handle, ignore_lookup_error=True)
+
+    with self.cached_session() as sess:
+      self.assertEqual([b"a"], sess.run(prefetch_op))
+      self.assertEqual([b"b"], sess.run(prefetch_op))
+      self.assertEqual([b"c"], sess.run(prefetch_op))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+
+      sess.run(destroy_op)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
new file mode 100644
index 0000000000..9030328593
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
@@ -0,0 +1,199 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.group_by_reducer()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.experimental.ops import grouping
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class GroupByReducerTest(test_base.DatasetTestBase):
+
+  def checkResults(self, dataset, shapes, values):
+    self.assertEqual(shapes, dataset.output_shapes)
+    get_next = dataset.make_one_shot_iterator().get_next()
+    with self.cached_session() as sess:
+      for expected in values:
+        got = sess.run(get_next)
+        self.assertEqual(got, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testSum(self):
+    reducer = grouping.Reducer(
+        init_func=lambda _: np.int64(0),
+        reduce_func=lambda x, y: x + y,
+        finalize_func=lambda x: x)
+    for i in range(1, 11):
+      dataset = dataset_ops.Dataset.range(2 * i).apply(
+          grouping.group_by_reducer(lambda x: x % 2, reducer))
+      self.checkResults(
+          dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i])
+
+  def testAverage(self):
+
+    def reduce_fn(x, y):
+      return (x[0] * x[1] + math_ops.cast(y, dtypes.float32)) / (
+          x[1] + 1), x[1] + 1
+
+    reducer = grouping.Reducer(
+        init_func=lambda _: (0.0, 0.0),
+        reduce_func=reduce_fn,
+        finalize_func=lambda x, _: x)
+    for i in range(1, 11):
+      dataset = dataset_ops.Dataset.range(2 * i).apply(
+          grouping.group_by_reducer(
+              lambda x: math_ops.cast(x, dtypes.int64) % 2, reducer))
+      self.checkResults(
+          dataset, shapes=tensor_shape.scalar(), values=[i - 1, i])
+
+  def testConcat(self):
+    components = np.array(list("abcdefghijklmnopqrst")).view(np.chararray)
+    reducer = grouping.Reducer(
+        init_func=lambda x: "",
+        reduce_func=lambda x, y: x + y[0],
+        finalize_func=lambda x: x)
+    for i in range(1, 11):
+      dataset = dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.from_tensor_slices(components),
+           dataset_ops.Dataset.range(2 * i))).apply(
+               grouping.group_by_reducer(lambda x, y: y % 2, reducer))
+      self.checkResults(
+          dataset,
+          shapes=tensor_shape.scalar(),
+          values=[b"acegikmoqs" [:i], b"bdfhjlnprt" [:i]])
+
+  def testSparseSum(self):
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1], dtype=np.int64)),
+          dense_shape=np.array([1, 1]))
+
+    reducer = grouping.Reducer(
+        init_func=lambda _: _sparse(np.int64(0)),
+        reduce_func=lambda x, y: _sparse(x.values[0] + y.values[0]),
+        finalize_func=lambda x: x.values[0])
+    for i in range(1, 11):
+      dataset = dataset_ops.Dataset.range(2 * i).map(_sparse).apply(
+          grouping.group_by_reducer(lambda x: x.values[0] % 2, reducer))
+      self.checkResults(
+          dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i])
+
+  def testChangingStateShape(self):
+
+    def reduce_fn(x, _):
+      # Statically known rank, but dynamic length.
+      larger_dim = array_ops.concat([x[0], x[0]], 0)
+      # Statically unknown rank.
+      larger_rank = array_ops.expand_dims(x[1], 0)
+      return larger_dim, larger_rank
+
+    reducer = grouping.Reducer(
+        init_func=lambda x: ([0], 1),
+        reduce_func=reduce_fn,
+        finalize_func=lambda x, y: (x, y))
+
+    for i in range(1, 11):
+      dataset = dataset_ops.Dataset.from_tensors(np.int64(0)).repeat(i).apply(
+          grouping.group_by_reducer(lambda x: x, reducer))
+      self.assertEqual([None], dataset.output_shapes[0].as_list())
+      self.assertIs(None, dataset.output_shapes[1].ndims)
+      iterator = dataset.make_one_shot_iterator()
+      get_next = iterator.get_next()
+      with self.cached_session() as sess:
+        x, y = sess.run(get_next)
+        self.assertAllEqual([0] * (2**i), x)
+        self.assertAllEqual(np.array(1, ndmin=i), y)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testTypeMismatch(self):
+    reducer = grouping.Reducer(
+        init_func=lambda x: constant_op.constant(1, dtype=dtypes.int32),
+        reduce_func=lambda x, y: constant_op.constant(1, dtype=dtypes.int64),
+        finalize_func=lambda x: x)
+
+    dataset = dataset_ops.Dataset.range(10)
+    with self.assertRaisesRegexp(
+        TypeError,
+        "The element types for the new state must match the initial state."):
+      dataset.apply(
+          grouping.group_by_reducer(lambda _: np.int64(0), reducer))
+
+  # TODO(b/78665031): Remove once non-scalar keys are supported.
+  def testInvalidKeyShape(self):
+    reducer = grouping.Reducer(
+        init_func=lambda x: np.int64(0),
+        reduce_func=lambda x, y: x + y,
+        finalize_func=lambda x: x)
+
+    dataset = dataset_ops.Dataset.range(10)
+    with self.assertRaisesRegexp(
+        ValueError, "`key_func` must return a single tf.int64 tensor."):
+      dataset.apply(
+          grouping.group_by_reducer(lambda _: np.int64((0, 0)), reducer))
+
+  # TODO(b/78665031): Remove once non-int64 keys are supported.
+  def testInvalidKeyType(self):
+    reducer = grouping.Reducer(
+        init_func=lambda x: np.int64(0),
+        reduce_func=lambda x, y: x + y,
+        finalize_func=lambda x: x)
+
+    dataset = dataset_ops.Dataset.range(10)
+    with self.assertRaisesRegexp(
+        ValueError, "`key_func` must return a single tf.int64 tensor."):
+      dataset.apply(
+          grouping.group_by_reducer(lambda _: "wrong", reducer))
+
+  def testTuple(self):
+    def init_fn(_):
+      return np.array([], dtype=np.int64), np.int64(0)
+
+    def reduce_fn(state, value):
+      s1, s2 = state
+      v1, v2 = value
+      return array_ops.concat([s1, [v1]], 0), s2 + v2
+
+    def finalize_fn(s1, s2):
+      return s1, s2
+
+    reducer = grouping.Reducer(init_fn, reduce_fn, finalize_fn)
+    dataset = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.range(10), dataset_ops.Dataset.range(10))).apply(
+            grouping.group_by_reducer(lambda x, y: np.int64(0), reducer))
+    get_next = dataset.make_one_shot_iterator().get_next()
+    with self.cached_session() as sess:
+      x, y = sess.run(get_next)
+      self.assertAllEqual(x, np.asarray([x for x in range(10)]))
+      self.assertEqual(y, 45)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
new file mode 100644
index 0000000000..557d56e8b9
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
@@ -0,0 +1,367 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.group_by_window()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.experimental.ops import grouping
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+# NOTE(mrry): These tests are based on the tests in bucket_ops_test.py.
+# Currently, they use a constant batch size, though should be made to use a
+# different batch size per key.
+class GroupByWindowTest(test_base.DatasetTestBase):
+
+  def _dynamicPad(self, bucket, window, window_size):
+    # TODO(mrry): To match `tf.contrib.training.bucket()`, implement a
+    # generic form of padded_batch that pads every component
+    # dynamically and does not rely on static shape information about
+    # the arguments.
+    return dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.from_tensors(bucket),
+         window.padded_batch(
+             32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape(
+                 [None]), tensor_shape.TensorShape([3])))))
+
+  def testSingleBucket(self):
+
+    def _map_fn(v):
+      return (v, array_ops.fill([v], v),
+              array_ops.fill([3], string_ops.as_string(v)))
+
+    input_dataset = (
+        dataset_ops.Dataset.from_tensor_slices(math_ops.range(32)).map(_map_fn))
+
+    bucketed_dataset = input_dataset.apply(
+        grouping.group_by_window(
+            lambda x, y, z: 0,
+            lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
+
+    iterator = bucketed_dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+
+      which_bucket, bucketed_values = sess.run(get_next)
+
+      self.assertEqual(0, which_bucket)
+
+      expected_scalar_int = np.arange(32, dtype=np.int64)
+      expected_unk_int64 = np.zeros((32, 31)).astype(np.int64)
+      for i in range(32):
+        expected_unk_int64[i, :i] = i
+      expected_vec3_str = np.vstack(3 * [np.arange(32).astype(bytes)]).T
+
+      self.assertAllEqual(expected_scalar_int, bucketed_values[0])
+      self.assertAllEqual(expected_unk_int64, bucketed_values[1])
+      self.assertAllEqual(expected_vec3_str, bucketed_values[2])
+
+  def testEvenOddBuckets(self):
+
+    def _map_fn(v):
+      return (v, array_ops.fill([v], v),
+              array_ops.fill([3], string_ops.as_string(v)))
+
+    input_dataset = (
+        dataset_ops.Dataset.from_tensor_slices(math_ops.range(64)).map(_map_fn))
+
+    bucketed_dataset = input_dataset.apply(
+        grouping.group_by_window(
+            lambda x, y, z: math_ops.cast(x % 2, dtypes.int64),
+            lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
+
+    iterator = bucketed_dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+
+      # Get two minibatches (one containing even values, one containing odds)
+      which_bucket_even, bucketed_values_even = sess.run(get_next)
+      which_bucket_odd, bucketed_values_odd = sess.run(get_next)
+
+      # Count number of bucket_tensors.
+      self.assertEqual(3, len(bucketed_values_even))
+      self.assertEqual(3, len(bucketed_values_odd))
+
+      # Ensure bucket 0 was used for all minibatch entries.
+      self.assertAllEqual(0, which_bucket_even)
+      self.assertAllEqual(1, which_bucket_odd)
+
+      # Test the first bucket outputted, the events starting at 0
+      expected_scalar_int = np.arange(0, 32 * 2, 2, dtype=np.int64)
+      expected_unk_int64 = np.zeros((32, 31 * 2)).astype(np.int64)
+      for i in range(0, 32):
+        expected_unk_int64[i, :2 * i] = 2 * i
+        expected_vec3_str = np.vstack(
+            3 * [np.arange(0, 32 * 2, 2).astype(bytes)]).T
+
+      self.assertAllEqual(expected_scalar_int, bucketed_values_even[0])
+      self.assertAllEqual(expected_unk_int64, bucketed_values_even[1])
+      self.assertAllEqual(expected_vec3_str, bucketed_values_even[2])
+
+      # Test the second bucket outputted, the odds starting at 1
+      expected_scalar_int = np.arange(1, 32 * 2 + 1, 2, dtype=np.int64)
+      expected_unk_int64 = np.zeros((32, 31 * 2 + 1)).astype(np.int64)
+      for i in range(0, 32):
+        expected_unk_int64[i, :2 * i + 1] = 2 * i + 1
+        expected_vec3_str = np.vstack(
+            3 * [np.arange(1, 32 * 2 + 1, 2).astype(bytes)]).T
+
+      self.assertAllEqual(expected_scalar_int, bucketed_values_odd[0])
+      self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1])
+      self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2])
+
+  def testEvenOddBucketsFilterOutAllOdd(self):
+
+    def _map_fn(v):
+      return {
+          "x": v,
+          "y": array_ops.fill([v], v),
+          "z": array_ops.fill([3], string_ops.as_string(v))
+      }
+
+    def _dynamic_pad_fn(bucket, window, _):
+      return dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.from_tensors(bucket),
+           window.padded_batch(
+               32, {
+                   "x": tensor_shape.TensorShape([]),
+                   "y": tensor_shape.TensorShape([None]),
+                   "z": tensor_shape.TensorShape([3])
+               })))
+
+    input_dataset = (
+        dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn)
+        .filter(lambda d: math_ops.equal(d["x"] % 2, 0)))
+
+    bucketed_dataset = input_dataset.apply(
+        grouping.group_by_window(
+            lambda d: math_ops.cast(d["x"] % 2, dtypes.int64),
+            lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32))
+
+    iterator = bucketed_dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+
+      # Get two minibatches ([0, 2, ...] and [64, 66, ...])
+      which_bucket0, bucketed_values_even0 = sess.run(get_next)
+      which_bucket1, bucketed_values_even1 = sess.run(get_next)
+
+      # Ensure that bucket 1 was completely filtered out
+      self.assertAllEqual(0, which_bucket0)
+      self.assertAllEqual(0, which_bucket1)
+      self.assertAllEqual(
+          np.arange(0, 64, 2, dtype=np.int64), bucketed_values_even0["x"])
+      self.assertAllEqual(
+          np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"])
+
+  def testDynamicWindowSize(self):
+    components = np.arange(100).astype(np.int64)
+
+    # Key fn: even/odd
+    # Reduce fn: batches of 5
+    # Window size fn: even=5, odd=10
+
+    def window_size_func(key):
+      window_sizes = constant_op.constant([5, 10], dtype=dtypes.int64)
+      return window_sizes[key]
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
+        grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(20),
+                                 None, window_size_func))
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      with self.assertRaises(errors.OutOfRangeError):
+        batches = 0
+        while True:
+          result = sess.run(get_next)
+          is_even = all(x % 2 == 0 for x in result)
+          is_odd = all(x % 2 == 1 for x in result)
+          self.assertTrue(is_even or is_odd)
+          expected_batch_size = 5 if is_even else 10
+          self.assertEqual(expected_batch_size, result.shape[0])
+          batches += 1
+
+      self.assertEqual(batches, 15)
+
+  def testSimple(self):
+    components = np.random.randint(100, size=(200,)).astype(np.int64)
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x)
+        .apply(
+            grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
+                                     4)).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      counts = []
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          result = sess.run(get_next)
+          self.assertTrue(
+              all(x % 2 == 0
+                  for x in result) or all(x % 2 == 1)
+              for x in result)
+          counts.append(result.shape[0])
+
+      self.assertEqual(len(components), sum(counts))
+      num_full_batches = len([c for c in counts if c == 4])
+      self.assertGreaterEqual(num_full_batches, 24)
+      self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
+
+  def testImmediateOutput(self):
+    components = np.array(
+        [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64)
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply(
+            grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4),
+                                     4)).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      # The input is infinite, so this test demonstrates that:
+      # 1. We produce output without having to consume the entire input,
+      # 2. Different buckets can produce output at different rates, and
+      # 3. For deterministic input, the output is deterministic.
+      for _ in range(3):
+        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
+        self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
+        self.assertAllEqual([2, 2, 2, 2], sess.run(get_next))
+        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
+
+  def testSmallGroups(self):
+    components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64)
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components).apply(
+            grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
+                                     4)).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
+      self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
+      # The small outputs at the end are deterministically produced in key
+      # order.
+      self.assertAllEqual([0, 0, 0], sess.run(get_next))
+      self.assertAllEqual([1], sess.run(get_next))
+
+  def testEmpty(self):
+    iterator = (
+        dataset_ops.Dataset.range(4).apply(
+            grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "Window size must be greater than zero, but got 0."):
+        print(sess.run(get_next))
+
+  def testReduceFuncError(self):
+    components = np.random.randint(100, size=(200,)).astype(np.int64)
+
+    def reduce_func(_, xs):
+      # Introduce an incorrect padded shape that cannot (currently) be
+      # detected at graph construction time.
+      return xs.padded_batch(
+          4,
+          padded_shapes=(tensor_shape.TensorShape([]),
+                         constant_op.constant([5], dtype=dtypes.int64) * -1))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
+            grouping.group_by_window(lambda x, _: x % 2, reduce_func,
+                                     32)).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next)
+
+  def testConsumeWindowDatasetMoreThanOnce(self):
+    components = np.random.randint(50, size=(200,)).astype(np.int64)
+
+    def reduce_func(key, window):
+      # Apply two different kinds of padding to the input: tight
+      # padding, and quantized (to a multiple of 10) padding.
+      return dataset_ops.Dataset.zip((
+          window.padded_batch(
+              4, padded_shapes=tensor_shape.TensorShape([None])),
+          window.padded_batch(
+              4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),
+      ))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x))
+        .apply(grouping.group_by_window(
+            lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64),
+            reduce_func, 4))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      counts = []
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          tight_result, multiple_of_10_result = sess.run(get_next)
+          self.assertEqual(0, multiple_of_10_result.shape[1] % 10)
+          self.assertAllEqual(tight_result,
+                              multiple_of_10_result[:, :tight_result.shape[1]])
+          counts.append(tight_result.shape[0])
+      self.assertEqual(len(components), sum(counts))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
new file mode 100644
index 0000000000..c0ec1486ab
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
@@ -0,0 +1,115 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.ignore_errors()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+from tensorflow.python.data.experimental.ops import error_ops
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+_NUMPY_RANDOM_SEED = 42
+
+
+class IgnoreErrorsTest(test_base.DatasetTestBase):
+
+  def testMapIgnoreError(self):
+    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .map(lambda x: array_ops.check_numerics(x, "message")).apply(
+            error_ops.ignore_errors()))
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for x in [1., 2., 3., 5.]:
+        self.assertEqual(x, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testParallelMapIgnoreError(self):
+    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(components).map(
+            lambda x: array_ops.check_numerics(x, "message"),
+            num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for x in [1., 2., 3., 5.]:
+        self.assertEqual(x, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testReadFileIgnoreError(self):
+
+    def write_string_to_file(value, filename):
+      with open(filename, "w") as f:
+        f.write(value)
+
+    filenames = [
+        os.path.join(self.get_temp_dir(), "file_%d.txt" % i) for i in range(5)
+    ]
+    for filename in filenames:
+      write_string_to_file(filename, filename)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(filenames).map(
+            io_ops.read_file,
+            num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      # All of the files are present.
+      sess.run(init_op)
+      for filename in filenames:
+        self.assertEqual(compat.as_bytes(filename), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Delete one of the files.
+      os.remove(filenames[0])
+
+      # Attempting to read filenames[0] will fail, but ignore_errors()
+      # will catch the error.
+      sess.run(init_op)
+      for filename in filenames[1:]:
+        self.assertEqual(compat.as_bytes(filename), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
new file mode 100644
index 0000000000..5ee94e14dc
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
@@ -0,0 +1,239 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.make_batched_features_dataset()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
+from tensorflow.python.data.ops import readers as core_readers
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import test
+
+
+class MakeBatchedFeaturesDatasetTest(
+    reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase):
+
+  def testRead(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 10]:
+        with ops.Graph().as_default() as g:
+          with self.session(graph=g) as sess:
+            # Basic test: read from file 0.
+            self.outputs = self.make_batch_feature(
+                filenames=self.test_filenames[0],
+                label_key="label",
+                num_epochs=num_epochs,
+                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.verify_records(
+                sess,
+                batch_size,
+                0,
+                num_epochs=num_epochs,
+                label_key_provided=True)
+            with self.assertRaises(errors.OutOfRangeError):
+              self._next_actual_batch(sess, label_key_provided=True)
+
+        with ops.Graph().as_default() as g:
+          with self.session(graph=g) as sess:
+            # Basic test: read from file 1.
+            self.outputs = self.make_batch_feature(
+                filenames=self.test_filenames[1],
+                label_key="label",
+                num_epochs=num_epochs,
+                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.verify_records(
+                sess,
+                batch_size,
+                1,
+                num_epochs=num_epochs,
+                label_key_provided=True)
+            with self.assertRaises(errors.OutOfRangeError):
+              self._next_actual_batch(sess, label_key_provided=True)
+
+        with ops.Graph().as_default() as g:
+          with self.session(graph=g) as sess:
+            # Basic test: read from both files.
+            self.outputs = self.make_batch_feature(
+                filenames=self.test_filenames,
+                label_key="label",
+                num_epochs=num_epochs,
+                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.verify_records(
+                sess,
+                batch_size,
+                num_epochs=num_epochs,
+                label_key_provided=True)
+            with self.assertRaises(errors.OutOfRangeError):
+              self._next_actual_batch(sess, label_key_provided=True)
+
+        with ops.Graph().as_default() as g:
+          with self.session(graph=g) as sess:
+            # Basic test: read from both files.
+            self.outputs = self.make_batch_feature(
+                filenames=self.test_filenames,
+                num_epochs=num_epochs,
+                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.verify_records(sess, batch_size, num_epochs=num_epochs)
+            with self.assertRaises(errors.OutOfRangeError):
+              self._next_actual_batch(sess)
+
+  def testReadWithEquivalentDataset(self):
+    features = {
+        "file": parsing_ops.FixedLenFeature([], dtypes.int64),
+        "record": parsing_ops.FixedLenFeature([], dtypes.int64),
+    }
+    dataset = (
+        core_readers.TFRecordDataset(self.test_filenames)
+        .map(lambda x: parsing_ops.parse_single_example(x, features))
+        .repeat(10).batch(2))
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for file_batch, _, _, _, record_batch, _ in self._next_expected_batch(
+          range(self._num_files), 2, 10):
+        actual_batch = sess.run(next_element)
+        self.assertAllEqual(file_batch, actual_batch["file"])
+        self.assertAllEqual(record_batch, actual_batch["record"])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testReadWithFusedShuffleRepeatDataset(self):
+    num_epochs = 5
+    total_records = num_epochs * self._num_records
+    for batch_size in [1, 2]:
+      # Test that shuffling with same seed produces the same result.
+      with ops.Graph().as_default() as g:
+        with self.session(graph=g) as sess:
+          outputs1 = self.make_batch_feature(
+              filenames=self.test_filenames[0],
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              shuffle=True,
+              shuffle_seed=5).make_one_shot_iterator().get_next()
+          outputs2 = self.make_batch_feature(
+              filenames=self.test_filenames[0],
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              shuffle=True,
+              shuffle_seed=5).make_one_shot_iterator().get_next()
+          for _ in range(total_records // batch_size):
+            batch1 = self._run_actual_batch(outputs1, sess)
+            batch2 = self._run_actual_batch(outputs2, sess)
+            for i in range(len(batch1)):
+              self.assertAllEqual(batch1[i], batch2[i])
+
+      # Test that shuffling with different seeds produces a different order.
+      with ops.Graph().as_default() as g:
+        with self.session(graph=g) as sess:
+          outputs1 = self.make_batch_feature(
+              filenames=self.test_filenames[0],
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              shuffle=True,
+              shuffle_seed=5).make_one_shot_iterator().get_next()
+          outputs2 = self.make_batch_feature(
+              filenames=self.test_filenames[0],
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              shuffle=True,
+              shuffle_seed=15).make_one_shot_iterator().get_next()
+          all_equal = True
+          for _ in range(total_records // batch_size):
+            batch1 = self._run_actual_batch(outputs1, sess)
+            batch2 = self._run_actual_batch(outputs2, sess)
+            for i in range(len(batch1)):
+              all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
+          self.assertFalse(all_equal)
+
+  def testParallelReadersAndParsers(self):
+    num_epochs = 5
+    for batch_size in [1, 2]:
+      for reader_num_threads in [2, 4]:
+        for parser_num_threads in [2, 4]:
+          with ops.Graph().as_default() as g:
+            with self.session(graph=g) as sess:
+              self.outputs = self.make_batch_feature(
+                  filenames=self.test_filenames,
+                  label_key="label",
+                  num_epochs=num_epochs,
+                  batch_size=batch_size,
+                  reader_num_threads=reader_num_threads,
+                  parser_num_threads=parser_num_threads).make_one_shot_iterator(
+                  ).get_next()
+              self.verify_records(
+                  sess,
+                  batch_size,
+                  num_epochs=num_epochs,
+                  label_key_provided=True,
+                  interleave_cycle_length=reader_num_threads)
+              with self.assertRaises(errors.OutOfRangeError):
+                self._next_actual_batch(sess, label_key_provided=True)
+
+          with ops.Graph().as_default() as g:
+            with self.session(graph=g) as sess:
+              self.outputs = self.make_batch_feature(
+                  filenames=self.test_filenames,
+                  num_epochs=num_epochs,
+                  batch_size=batch_size,
+                  reader_num_threads=reader_num_threads,
+                  parser_num_threads=parser_num_threads).make_one_shot_iterator(
+                  ).get_next()
+              self.verify_records(
+                  sess,
+                  batch_size,
+                  num_epochs=num_epochs,
+                  interleave_cycle_length=reader_num_threads)
+              with self.assertRaises(errors.OutOfRangeError):
+                self._next_actual_batch(sess)
+
+  def testDropFinalBatch(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 10]:
+        with ops.Graph().as_default():
+          # Basic test: read from file 0.
+          outputs = self.make_batch_feature(
+              filenames=self.test_filenames[0],
+              label_key="label",
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              drop_final_batch=True).make_one_shot_iterator().get_next()
+          for tensor in nest.flatten(outputs):
+            if isinstance(tensor, ops.Tensor):  # Guard against SparseTensor.
+              self.assertEqual(tensor.shape[0], batch_size)
+
+  def testIndefiniteRepeatShapeInference(self):
+    dataset = self.make_batch_feature(
+        filenames=self.test_filenames[0],
+        label_key="label",
+        num_epochs=None,
+        batch_size=32)
+    for shape, clazz in zip(nest.flatten(dataset.output_shapes),
+                            nest.flatten(dataset.output_classes)):
+      if issubclass(clazz, ops.Tensor):
+        self.assertEqual(32, shape[0])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
similarity index 57%
rename from tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
index a02f4bd14f..e4bf089184 100644
--- a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.experimental.make_csv_dataset()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -23,226 +23,16 @@ import zlib
 
 import numpy as np
 
-from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
-class ReadBatchFeaturesTest(
-    reader_dataset_ops_test_base.ReadBatchFeaturesTestBase):
-
-  def testRead(self):
-    for batch_size in [1, 2]:
-      for num_epochs in [1, 10]:
-        with ops.Graph().as_default() as g:
-          with self.session(graph=g) as sess:
-            # Basic test: read from file 0.
-            self.outputs = self.make_batch_feature(
-                filenames=self.test_filenames[0],
-                label_key="label",
-                num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
-            self.verify_records(
-                sess,
-                batch_size,
-                0,
-                num_epochs=num_epochs,
-                label_key_provided=True)
-            with self.assertRaises(errors.OutOfRangeError):
-              self._next_actual_batch(sess, label_key_provided=True)
-
-        with ops.Graph().as_default() as g:
-          with self.session(graph=g) as sess:
-            # Basic test: read from file 1.
-            self.outputs = self.make_batch_feature(
-                filenames=self.test_filenames[1],
-                label_key="label",
-                num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
-            self.verify_records(
-                sess,
-                batch_size,
-                1,
-                num_epochs=num_epochs,
-                label_key_provided=True)
-            with self.assertRaises(errors.OutOfRangeError):
-              self._next_actual_batch(sess, label_key_provided=True)
-
-        with ops.Graph().as_default() as g:
-          with self.session(graph=g) as sess:
-            # Basic test: read from both files.
-            self.outputs = self.make_batch_feature(
-                filenames=self.test_filenames,
-                label_key="label",
-                num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
-            self.verify_records(
-                sess,
-                batch_size,
-                num_epochs=num_epochs,
-                label_key_provided=True)
-            with self.assertRaises(errors.OutOfRangeError):
-              self._next_actual_batch(sess, label_key_provided=True)
-
-        with ops.Graph().as_default() as g:
-          with self.session(graph=g) as sess:
-            # Basic test: read from both files.
-            self.outputs = self.make_batch_feature(
-                filenames=self.test_filenames,
-                num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
-            self.verify_records(sess, batch_size, num_epochs=num_epochs)
-            with self.assertRaises(errors.OutOfRangeError):
-              self._next_actual_batch(sess)
-
-  def testReadWithEquivalentDataset(self):
-    features = {
-        "file": parsing_ops.FixedLenFeature([], dtypes.int64),
-        "record": parsing_ops.FixedLenFeature([], dtypes.int64),
-    }
-    dataset = (
-        core_readers.TFRecordDataset(self.test_filenames)
-        .map(lambda x: parsing_ops.parse_single_example(x, features))
-        .repeat(10).batch(2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for file_batch, _, _, _, record_batch, _ in self._next_expected_batch(
-          range(self._num_files), 2, 10):
-        actual_batch = sess.run(next_element)
-        self.assertAllEqual(file_batch, actual_batch["file"])
-        self.assertAllEqual(record_batch, actual_batch["record"])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testReadWithFusedShuffleRepeatDataset(self):
-    num_epochs = 5
-    total_records = num_epochs * self._num_records
-    for batch_size in [1, 2]:
-      # Test that shuffling with same seed produces the same result.
-      with ops.Graph().as_default() as g:
-        with self.session(graph=g) as sess:
-          outputs1 = self.make_batch_feature(
-              filenames=self.test_filenames[0],
-              num_epochs=num_epochs,
-              batch_size=batch_size,
-              shuffle=True,
-              shuffle_seed=5).make_one_shot_iterator().get_next()
-          outputs2 = self.make_batch_feature(
-              filenames=self.test_filenames[0],
-              num_epochs=num_epochs,
-              batch_size=batch_size,
-              shuffle=True,
-              shuffle_seed=5).make_one_shot_iterator().get_next()
-          for _ in range(total_records // batch_size):
-            batch1 = self._run_actual_batch(outputs1, sess)
-            batch2 = self._run_actual_batch(outputs2, sess)
-            for i in range(len(batch1)):
-              self.assertAllEqual(batch1[i], batch2[i])
-
-      # Test that shuffling with different seeds produces a different order.
-      with ops.Graph().as_default() as g:
-        with self.session(graph=g) as sess:
-          outputs1 = self.make_batch_feature(
-              filenames=self.test_filenames[0],
-              num_epochs=num_epochs,
-              batch_size=batch_size,
-              shuffle=True,
-              shuffle_seed=5).make_one_shot_iterator().get_next()
-          outputs2 = self.make_batch_feature(
-              filenames=self.test_filenames[0],
-              num_epochs=num_epochs,
-              batch_size=batch_size,
-              shuffle=True,
-              shuffle_seed=15).make_one_shot_iterator().get_next()
-          all_equal = True
-          for _ in range(total_records // batch_size):
-            batch1 = self._run_actual_batch(outputs1, sess)
-            batch2 = self._run_actual_batch(outputs2, sess)
-            for i in range(len(batch1)):
-              all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
-          self.assertFalse(all_equal)
-
-  def testParallelReadersAndParsers(self):
-    num_epochs = 5
-    for batch_size in [1, 2]:
-      for reader_num_threads in [2, 4]:
-        for parser_num_threads in [2, 4]:
-          with ops.Graph().as_default() as g:
-            with self.session(graph=g) as sess:
-              self.outputs = self.make_batch_feature(
-                  filenames=self.test_filenames,
-                  label_key="label",
-                  num_epochs=num_epochs,
-                  batch_size=batch_size,
-                  reader_num_threads=reader_num_threads,
-                  parser_num_threads=parser_num_threads).make_one_shot_iterator(
-                  ).get_next()
-              self.verify_records(
-                  sess,
-                  batch_size,
-                  num_epochs=num_epochs,
-                  label_key_provided=True,
-                  interleave_cycle_length=reader_num_threads)
-              with self.assertRaises(errors.OutOfRangeError):
-                self._next_actual_batch(sess, label_key_provided=True)
-
-          with ops.Graph().as_default() as g:
-            with self.session(graph=g) as sess:
-              self.outputs = self.make_batch_feature(
-                  filenames=self.test_filenames,
-                  num_epochs=num_epochs,
-                  batch_size=batch_size,
-                  reader_num_threads=reader_num_threads,
-                  parser_num_threads=parser_num_threads).make_one_shot_iterator(
-                  ).get_next()
-              self.verify_records(
-                  sess,
-                  batch_size,
-                  num_epochs=num_epochs,
-                  interleave_cycle_length=reader_num_threads)
-              with self.assertRaises(errors.OutOfRangeError):
-                self._next_actual_batch(sess)
-
-  def testDropFinalBatch(self):
-    for batch_size in [1, 2]:
-      for num_epochs in [1, 10]:
-        with ops.Graph().as_default():
-          # Basic test: read from file 0.
-          outputs = self.make_batch_feature(
-              filenames=self.test_filenames[0],
-              label_key="label",
-              num_epochs=num_epochs,
-              batch_size=batch_size,
-              drop_final_batch=True).make_one_shot_iterator().get_next()
-          for tensor in nest.flatten(outputs):
-            if isinstance(tensor, ops.Tensor):  # Guard against SparseTensor.
-              self.assertEqual(tensor.shape[0], batch_size)
-
-  def testIndefiniteRepeatShapeInference(self):
-    dataset = self.make_batch_feature(
-        filenames=self.test_filenames[0],
-        label_key="label",
-        num_epochs=None,
-        batch_size=32)
-    for shape, clazz in zip(nest.flatten(dataset.output_shapes),
-                            nest.flatten(dataset.output_classes)):
-      if issubclass(clazz, ops.Tensor):
-        self.assertEqual(32, shape[0])
-
-
 class MakeCsvDatasetTest(test_base.DatasetTestBase):
 
   def _make_csv_dataset(self, filenames, batch_size, num_epochs=1, **kwargs):
@@ -866,218 +656,5 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
       self.assertEqual(32, shape[0])
 
 
-class MakeTFRecordDatasetTest(
-    reader_dataset_ops_test_base.TFRecordDatasetTestBase):
-
-  def _interleave(self, iterators, cycle_length):
-    pending_iterators = iterators
-    open_iterators = []
-    num_open = 0
-    for i in range(cycle_length):
-      if pending_iterators:
-        open_iterators.append(pending_iterators.pop(0))
-        num_open += 1
-
-    while num_open:
-      for i in range(min(cycle_length, len(open_iterators))):
-        if open_iterators[i] is None:
-          continue
-        try:
-          yield next(open_iterators[i])
-        except StopIteration:
-          if pending_iterators:
-            open_iterators[i] = pending_iterators.pop(0)
-          else:
-            open_iterators[i] = None
-            num_open -= 1
-
-  def _next_expected_batch(self,
-                           file_indices,
-                           batch_size,
-                           num_epochs,
-                           cycle_length,
-                           drop_final_batch,
-                           use_parser_fn):
-
-    def _next_record(file_indices):
-      for j in file_indices:
-        for i in range(self._num_records):
-          yield j, i
-
-    def _next_record_interleaved(file_indices, cycle_length):
-      return self._interleave([_next_record([i]) for i in file_indices],
-                              cycle_length)
-
-    record_batch = []
-    batch_index = 0
-    for _ in range(num_epochs):
-      if cycle_length == 1:
-        next_records = _next_record(file_indices)
-      else:
-        next_records = _next_record_interleaved(file_indices, cycle_length)
-      for f, r in next_records:
-        record = self._record(f, r)
-        if use_parser_fn:
-          record = record[1:]
-        record_batch.append(record)
-        batch_index += 1
-        if len(record_batch) == batch_size:
-          yield record_batch
-          record_batch = []
-          batch_index = 0
-    if record_batch and not drop_final_batch:
-      yield record_batch
-
-  def _verify_records(self,
-                      sess,
-                      outputs,
-                      batch_size,
-                      file_index,
-                      num_epochs,
-                      interleave_cycle_length,
-                      drop_final_batch,
-                      use_parser_fn):
-    if file_index is not None:
-      file_indices = [file_index]
-    else:
-      file_indices = range(self._num_files)
-
-    for expected_batch in self._next_expected_batch(
-        file_indices, batch_size, num_epochs, interleave_cycle_length,
-        drop_final_batch, use_parser_fn):
-      actual_batch = sess.run(outputs)
-      self.assertAllEqual(expected_batch, actual_batch)
-
-  def _read_test(self, batch_size, num_epochs, file_index=None,
-                 num_parallel_reads=1, drop_final_batch=False, parser_fn=False):
-    if file_index is None:
-      file_pattern = self.test_filenames
-    else:
-      file_pattern = self.test_filenames[file_index]
-
-    if parser_fn:
-      fn = lambda x: string_ops.substr(x, 1, 999)
-    else:
-      fn = None
-
-    with ops.Graph().as_default() as g:
-      with self.session(graph=g) as sess:
-        outputs = readers.make_tf_record_dataset(
-            file_pattern=file_pattern,
-            num_epochs=num_epochs,
-            batch_size=batch_size,
-            parser_fn=fn,
-            num_parallel_reads=num_parallel_reads,
-            drop_final_batch=drop_final_batch,
-            shuffle=False).make_one_shot_iterator().get_next()
-        self._verify_records(
-            sess, outputs, batch_size, file_index, num_epochs=num_epochs,
-            interleave_cycle_length=num_parallel_reads,
-            drop_final_batch=drop_final_batch, use_parser_fn=parser_fn)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(outputs)
-
-  def testRead(self):
-    for batch_size in [1, 2]:
-      for num_epochs in [1, 3]:
-        # Basic test: read from file 0.
-        self._read_test(batch_size, num_epochs, 0)
-
-        # Basic test: read from file 1.
-        self._read_test(batch_size, num_epochs, 1)
-
-        # Basic test: read from both files.
-        self._read_test(batch_size, num_epochs)
-
-        # Basic test: read from both files, with parallel reads.
-        self._read_test(batch_size, num_epochs, num_parallel_reads=8)
-
-  def testDropFinalBatch(self):
-    for batch_size in [1, 2, 10]:
-      for num_epochs in [1, 3]:
-        # Read from file 0.
-        self._read_test(batch_size, num_epochs, 0, drop_final_batch=True)
-
-        # Read from both files.
-        self._read_test(batch_size, num_epochs, drop_final_batch=True)
-
-        # Read from both files, with parallel reads.
-        self._read_test(batch_size, num_epochs, num_parallel_reads=8,
-                        drop_final_batch=True)
-
-  def testParserFn(self):
-    for batch_size in [1, 2]:
-      for num_epochs in [1, 3]:
-        for drop_final_batch in [False, True]:
-          self._read_test(batch_size, num_epochs, parser_fn=True,
-                          drop_final_batch=drop_final_batch)
-          self._read_test(batch_size, num_epochs, num_parallel_reads=8,
-                          parser_fn=True, drop_final_batch=drop_final_batch)
-
-  def _shuffle_test(self, batch_size, num_epochs, num_parallel_reads=1,
-                    seed=None):
-    with ops.Graph().as_default() as g:
-      with self.session(graph=g) as sess:
-        dataset = readers.make_tf_record_dataset(
-            file_pattern=self.test_filenames,
-            num_epochs=num_epochs,
-            batch_size=batch_size,
-            num_parallel_reads=num_parallel_reads,
-            shuffle=True,
-            shuffle_seed=seed)
-        iterator = dataset.make_initializable_iterator()
-        next_element = iterator.get_next()
-
-        sess.run(iterator.initializer)
-        first_batches = []
-        try:
-          while True:
-            first_batches.append(sess.run(next_element))
-        except errors.OutOfRangeError:
-          pass
-
-        sess.run(iterator.initializer)
-        second_batches = []
-        try:
-          while True:
-            second_batches.append(sess.run(next_element))
-        except errors.OutOfRangeError:
-          pass
-
-        self.assertEqual(len(first_batches), len(second_batches))
-        if seed is not None:
-          # if you set a seed, should get the same results
-          for i in range(len(first_batches)):
-            self.assertAllEqual(first_batches[i], second_batches[i])
-
-        expected = []
-        for f in range(self._num_files):
-          for r in range(self._num_records):
-            expected.extend([self._record(f, r)] * num_epochs)
-
-        for batches in (first_batches, second_batches):
-          actual = []
-          for b in batches:
-            actual.extend(b)
-          self.assertAllEqual(sorted(expected), sorted(actual))
-
-  def testShuffle(self):
-    for batch_size in [1, 2]:
-      for num_epochs in [1, 3]:
-        for num_parallel_reads in [1, 2]:
-          # Test that all expected elements are produced
-          self._shuffle_test(batch_size, num_epochs, num_parallel_reads)
-          # Test that elements are produced in a consistent order if
-          # you specify a seed.
-          self._shuffle_test(batch_size, num_epochs, num_parallel_reads,
-                             seed=21345)
-
-  def testIndefiniteRepeatShapeInference(self):
-    dataset = readers.make_tf_record_dataset(
-        file_pattern=self.test_filenames, num_epochs=None, batch_size=32)
-    for shape in nest.flatten(dataset.output_shapes):
-      self.assertEqual(32, shape[0])
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
new file mode 100644
index 0000000000..657cf3c00e
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
@@ -0,0 +1,243 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.make_tf_record_dataset()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
+from tensorflow.python.data.experimental.ops import readers
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class MakeTFRecordDatasetTest(
+    reader_dataset_ops_test_base.TFRecordDatasetTestBase):
+
+  def _interleave(self, iterators, cycle_length):
+    pending_iterators = iterators
+    open_iterators = []
+    num_open = 0
+    for i in range(cycle_length):
+      if pending_iterators:
+        open_iterators.append(pending_iterators.pop(0))
+        num_open += 1
+
+    while num_open:
+      for i in range(min(cycle_length, len(open_iterators))):
+        if open_iterators[i] is None:
+          continue
+        try:
+          yield next(open_iterators[i])
+        except StopIteration:
+          if pending_iterators:
+            open_iterators[i] = pending_iterators.pop(0)
+          else:
+            open_iterators[i] = None
+            num_open -= 1
+
+  def _next_expected_batch(self,
+                           file_indices,
+                           batch_size,
+                           num_epochs,
+                           cycle_length,
+                           drop_final_batch,
+                           use_parser_fn):
+
+    def _next_record(file_indices):
+      for j in file_indices:
+        for i in range(self._num_records):
+          yield j, i
+
+    def _next_record_interleaved(file_indices, cycle_length):
+      return self._interleave([_next_record([i]) for i in file_indices],
+                              cycle_length)
+
+    record_batch = []
+    batch_index = 0
+    for _ in range(num_epochs):
+      if cycle_length == 1:
+        next_records = _next_record(file_indices)
+      else:
+        next_records = _next_record_interleaved(file_indices, cycle_length)
+      for f, r in next_records:
+        record = self._record(f, r)
+        if use_parser_fn:
+          record = record[1:]
+        record_batch.append(record)
+        batch_index += 1
+        if len(record_batch) == batch_size:
+          yield record_batch
+          record_batch = []
+          batch_index = 0
+    if record_batch and not drop_final_batch:
+      yield record_batch
+
+  def _verify_records(self,
+                      sess,
+                      outputs,
+                      batch_size,
+                      file_index,
+                      num_epochs,
+                      interleave_cycle_length,
+                      drop_final_batch,
+                      use_parser_fn):
+    if file_index is not None:
+      file_indices = [file_index]
+    else:
+      file_indices = range(self._num_files)
+
+    for expected_batch in self._next_expected_batch(
+        file_indices, batch_size, num_epochs, interleave_cycle_length,
+        drop_final_batch, use_parser_fn):
+      actual_batch = sess.run(outputs)
+      self.assertAllEqual(expected_batch, actual_batch)
+
+  def _read_test(self, batch_size, num_epochs, file_index=None,
+                 num_parallel_reads=1, drop_final_batch=False, parser_fn=False):
+    if file_index is None:
+      file_pattern = self.test_filenames
+    else:
+      file_pattern = self.test_filenames[file_index]
+
+    if parser_fn:
+      fn = lambda x: string_ops.substr(x, 1, 999)
+    else:
+      fn = None
+
+    with ops.Graph().as_default() as g:
+      with self.session(graph=g) as sess:
+        outputs = readers.make_tf_record_dataset(
+            file_pattern=file_pattern,
+            num_epochs=num_epochs,
+            batch_size=batch_size,
+            parser_fn=fn,
+            num_parallel_reads=num_parallel_reads,
+            drop_final_batch=drop_final_batch,
+            shuffle=False).make_one_shot_iterator().get_next()
+        self._verify_records(
+            sess, outputs, batch_size, file_index, num_epochs=num_epochs,
+            interleave_cycle_length=num_parallel_reads,
+            drop_final_batch=drop_final_batch, use_parser_fn=parser_fn)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(outputs)
+
+  def testRead(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 3]:
+        # Basic test: read from file 0.
+        self._read_test(batch_size, num_epochs, 0)
+
+        # Basic test: read from file 1.
+        self._read_test(batch_size, num_epochs, 1)
+
+        # Basic test: read from both files.
+        self._read_test(batch_size, num_epochs)
+
+        # Basic test: read from both files, with parallel reads.
+        self._read_test(batch_size, num_epochs, num_parallel_reads=8)
+
+  def testDropFinalBatch(self):
+    for batch_size in [1, 2, 10]:
+      for num_epochs in [1, 3]:
+        # Read from file 0.
+        self._read_test(batch_size, num_epochs, 0, drop_final_batch=True)
+
+        # Read from both files.
+        self._read_test(batch_size, num_epochs, drop_final_batch=True)
+
+        # Read from both files, with parallel reads.
+        self._read_test(batch_size, num_epochs, num_parallel_reads=8,
+                        drop_final_batch=True)
+
+  def testParserFn(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 3]:
+        for drop_final_batch in [False, True]:
+          self._read_test(batch_size, num_epochs, parser_fn=True,
+                          drop_final_batch=drop_final_batch)
+          self._read_test(batch_size, num_epochs, num_parallel_reads=8,
+                          parser_fn=True, drop_final_batch=drop_final_batch)
+
+  def _shuffle_test(self, batch_size, num_epochs, num_parallel_reads=1,
+                    seed=None):
+    with ops.Graph().as_default() as g:
+      with self.session(graph=g) as sess:
+        dataset = readers.make_tf_record_dataset(
+            file_pattern=self.test_filenames,
+            num_epochs=num_epochs,
+            batch_size=batch_size,
+            num_parallel_reads=num_parallel_reads,
+            shuffle=True,
+            shuffle_seed=seed)
+        iterator = dataset.make_initializable_iterator()
+        next_element = iterator.get_next()
+
+        sess.run(iterator.initializer)
+        first_batches = []
+        try:
+          while True:
+            first_batches.append(sess.run(next_element))
+        except errors.OutOfRangeError:
+          pass
+
+        sess.run(iterator.initializer)
+        second_batches = []
+        try:
+          while True:
+            second_batches.append(sess.run(next_element))
+        except errors.OutOfRangeError:
+          pass
+
+        self.assertEqual(len(first_batches), len(second_batches))
+        if seed is not None:
+          # if you set a seed, should get the same results
+          for i in range(len(first_batches)):
+            self.assertAllEqual(first_batches[i], second_batches[i])
+
+        expected = []
+        for f in range(self._num_files):
+          for r in range(self._num_records):
+            expected.extend([self._record(f, r)] * num_epochs)
+
+        for batches in (first_batches, second_batches):
+          actual = []
+          for b in batches:
+            actual.extend(b)
+          self.assertAllEqual(sorted(expected), sorted(actual))
+
+  def testShuffle(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 3]:
+        for num_parallel_reads in [1, 2]:
+          # Test that all expected elements are produced
+          self._shuffle_test(batch_size, num_epochs, num_parallel_reads)
+          # Test that elements are produced in a consistent order if
+          # you specify a seed.
+          self._shuffle_test(batch_size, num_epochs, num_parallel_reads,
+                             seed=21345)
+
+  def testIndefiniteRepeatShapeInference(self):
+    dataset = readers.make_tf_record_dataset(
+        file_pattern=self.test_filenames, num_epochs=None, batch_size=32)
+    for shape in nest.flatten(dataset.output_shapes):
+      self.assertEqual(32, shape[0])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
new file mode 100644
index 0000000000..afd0fc3abf
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
@@ -0,0 +1,337 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.map_and_batch()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import script_ops
+from tensorflow.python.platform import test
+
+
+class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ("Default", None, None),
+      ("SequentialCalls", 1, None),
+      ("ParallelCalls", 2, None),
+      ("ParallelBatches", None, 10),
+  )
+  def testMapAndBatch(self, num_parallel_calls, num_parallel_batches):
+    """Test a dataset that maps a TF function across its input elements."""
+    # The pipeline is TensorSliceDataset ->
+    # RepeatDataset(count) -> MapAndBatchDataset(square_3, batch_size).
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+
+    count = array_ops.placeholder(dtypes.int64, shape=[])
+    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components).repeat(count).apply(
+            batching.map_and_batch(
+                map_func=_map_fn,
+                batch_size=batch_size,
+                num_parallel_calls=num_parallel_calls,
+                num_parallel_batches=num_parallel_batches))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
+                     [t.shape.as_list() for t in get_next])
+
+    with self.cached_session() as sess:
+      # Batch of a finite input, where the batch_size divides the
+      # total number of elements.
+      sess.run(init_op, feed_dict={count: 28, batch_size: 14})
+      num_batches = (28 * 7) // 14
+      for i in range(num_batches):
+        result = sess.run(get_next)
+        for component, result_component in zip(components, result):
+          for j in range(14):
+            self.assertAllEqual(component[(i * 14 + j) % 7]**2,
+                                result_component[j])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Batch of a finite input, where the batch_size does not
+      # divide the total number of elements.
+      sess.run(init_op, feed_dict={count: 14, batch_size: 8})
+
+      # We expect (num_batches - 1) full-sized batches.
+      num_batches = int(math.ceil((14 * 7) / 8))
+      for i in range(num_batches - 1):
+        result = sess.run(get_next)
+        for component, result_component in zip(components, result):
+          for j in range(8):
+            self.assertAllEqual(component[(i * 8 + j) % 7]**2,
+                                result_component[j])
+      result = sess.run(get_next)
+      for component, result_component in zip(components, result):
+        for j in range((14 * 7) % 8):
+          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
+                              result_component[j])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Batch of an empty input should fail straight away.
+      sess.run(init_op, feed_dict={count: 0, batch_size: 8})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Empty batch should be an initialization time error.
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(init_op, feed_dict={count: 14, batch_size: 0})
+
+  @parameterized.named_parameters(
+      ("Even", False),
+      ("Uneven", True),
+  )
+  def testMapAndBatchPartialBatch(self, drop_remainder):
+    iterator = (
+        dataset_ops.Dataset.range(10).apply(
+            batching.map_and_batch(
+                lambda x: array_ops.reshape(x * x, [1]),
+                batch_size=4,
+                drop_remainder=drop_remainder)).make_one_shot_iterator())
+    if drop_remainder:
+      self.assertEqual([4, 1], iterator.output_shapes.as_list())
+    else:
+      self.assertEqual([None, 1], iterator.output_shapes.as_list())
+    next_element = iterator.get_next()
+    with self.cached_session() as sess:
+      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
+      if not drop_remainder:
+        self.assertAllEqual([[64], [81]], sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testMapAndBatchYieldsPartialBatch(self):
+    iterator = (dataset_ops.Dataset.range(10)
+                .apply(batching.map_and_batch(
+                    lambda x: array_ops.reshape(x * x, [1]), 4))
+                .make_one_shot_iterator())
+    self.assertEqual([None, 1], iterator.output_shapes.as_list())
+    next_element = iterator.get_next()
+    with self.cached_session() as sess:
+      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
+      self.assertAllEqual([[64], [81]], sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testMapAndBatchParallelGetNext(self):
+    iterator = (dataset_ops.Dataset.range(50000)
+                .apply(batching.map_and_batch(lambda x: x, batch_size=100))
+                .make_one_shot_iterator())
+    elements = []
+    for _ in range(100):
+      elements.append(iterator.get_next())
+    with self.cached_session() as sess:
+      for i in range(5):
+        got = sess.run(elements)
+        got.sort(key=lambda x: x[0])
+        expected = []
+        for j in range(100):
+          expected.append(range(i*10000+j*100, i*10000+(j+1)*100))
+        self.assertAllEqual(got, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(elements)
+
+  def testMapAndBatchParallelGetNextDropRemainder(self):
+    iterator = (
+        dataset_ops.Dataset.range(49999).apply(
+            batching.map_and_batch(
+                lambda x: x, batch_size=100, drop_remainder=True))
+        .make_one_shot_iterator())
+    elements = []
+    for _ in range(100):
+      elements.append(iterator.get_next())
+    with self.cached_session() as sess:
+      for i in range(4):
+        got = sess.run(elements)
+        got.sort(key=lambda x: x[0])
+        expected = []
+        for j in range(100):
+          expected.append(range(i*10000+j*100, i*10000+(j+1)*100))
+        self.assertAllEqual(got, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(elements)
+
+  def testMapAndBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(10).apply(
+        batching.map_and_batch(_sparse, 5)).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for i in range(2):
+        actual = sess.run(get_next)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+            dense_shape=[5, 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testMapAndBatchFails(self):
+    """Test a dataset that maps a TF function across its input elements."""
+    dataset = dataset_ops.Dataset.from_tensors(
+        array_ops.check_numerics(
+            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
+    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = (
+        dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    with self.cached_session() as sess:
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
+        sess.run(init_op, feed_dict={batch_size: 14})
+
+  def testMapAndBatchShapeMismatch(self):
+    """Test a dataset that maps a TF function across its input elements."""
+
+    def generator():
+      yield [1]
+      yield [2]
+      yield [3]
+      yield [[4, 5, 6]]
+
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=dtypes.int32)
+    batch_size = 4
+    iterator = (
+        dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "number of elements does not match"):
+        sess.run(get_next)
+
+  def testMapAndBatchImplicitDispose(self):
+    # Tests whether a map and batch dataset will be cleaned up correctly when
+    # the pipeline does not run it until exhaustion.
+    # The pipeline is TensorSliceDataset -> RepeatDataset(1000) ->
+    # MapAndBatchDataset(f=square_3, batch_size=100).
+    components = (np.arange(1000),
+                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
+                  np.array(37.0) * np.arange(1000))
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).repeat(
+        1000).apply(batching.map_and_batch(_map_fn, batch_size=100))
+    dataset = dataset.prefetch(5)
+    iterator = dataset.make_one_shot_iterator()
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for _ in range(3):
+        sess.run(get_next)
+
+  @parameterized.named_parameters(
+      ("1", 0),
+      ("2", 5),
+      ("3", 10),
+      ("4", 90),
+      ("5", 95),
+      ("6", 99),
+  )
+  def testMapAndBatchOutOfRangeError(self, threshold):
+
+    def raising_py_fn(i):
+      if i >= threshold:
+        raise StopIteration()
+      else:
+        return i
+
+    iterator = (
+        dataset_ops.Dataset.range(100).apply(
+            batching.map_and_batch(
+                lambda x: script_ops.py_func(raising_py_fn, [x], dtypes.int64),
+                batch_size=10)).make_one_shot_iterator())
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for i in range(threshold // 10):
+        self.assertAllEqual([i * 10 + j for j in range(10)], sess.run(get_next))
+      if threshold % 10 != 0:
+        self.assertAllEqual(
+            [threshold // 10 * 10 + j for j in range(threshold % 10)],
+            sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @parameterized.named_parameters(
+      ("1", False, dtypes.bool),
+      ("2", -42, dtypes.int8),
+      ("3", -42, dtypes.int16),
+      ("4", -42, dtypes.int32),
+      ("5", -42, dtypes.int64),
+      ("6", 42, dtypes.uint8),
+      ("7", 42, dtypes.uint16),
+      ("8", 42.0, dtypes.float16),
+      ("9", 42.0, dtypes.float32),
+      ("10", 42.0, dtypes.float64),
+      ("11", b"hello", dtypes.string),
+  )
+  def testMapAndBatchTypes(self, element, dtype):
+    def gen():
+      yield element
+
+    dataset = dataset_ops.Dataset.from_generator(gen, dtype).repeat(100).apply(
+        batching.map_and_batch(lambda x: x, batch_size=10))
+
+    get_next = dataset.make_one_shot_iterator().get_next()
+
+    with self.cached_session() as sess:
+      for _ in range(10):
+        self.assertAllEqual([element for _ in range(10)], sess.run(get_next))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/threadpool_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
similarity index 94%
rename from tensorflow/python/data/experimental/kernel_tests/threadpool_dataset_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
index 4432dcb05a..5e419a9b2f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/threadpool_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline statistics gathering ops."""
+"""Tests for the private `override_threadpool()` transformation."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -32,8 +32,8 @@ from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
-class OverrideThreadpoolDatasetTest(test_base.DatasetTestBase,
-                                    parameterized.TestCase):
+class OverrideThreadpoolTest(test_base.DatasetTestBase,
+                             parameterized.TestCase):
 
   @parameterized.named_parameters(
       ("1", 1, None),
diff --git a/tensorflow/python/data/experimental/kernel_tests/interleave_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
similarity index 99%
rename from tensorflow/python/data/experimental/kernel_tests/interleave_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
index 560902caad..90ac250df7 100644
--- a/tensorflow/python/data/experimental/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.experimental.parallel_interleave()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -37,7 +37,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
-class ParallelInterleaveDatasetTest(test_base.DatasetTestBase):
+class ParallelInterleaveTest(test_base.DatasetTestBase):
 
   def setUp(self):
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/parsing_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
similarity index 99%
rename from tensorflow/python/data/experimental/kernel_tests/parsing_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
index 13f924b656..723e709ae8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parsing_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tensorflow.ops.parsing_ops."""
+"""Tests for `tf.data.experimental.parse_example_dataset()."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -73,7 +73,7 @@ def _compare_output_to_expected(tester, dict_tensors, expected_tensors,
     i += 1
 
 
-class ParseExampleTest(test_base.DatasetTestBase):
+class ParseExampleDatasetTest(test_base.DatasetTestBase):
 
   def _test(self,
             input_tensor,
diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
new file mode 100644
index 0000000000..f73725366c
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
@@ -0,0 +1,234 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.prefetch_to_device()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.experimental.ops import prefetching_ops
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class PrefetchToDeviceTest(test_base.DatasetTestBase):
+
+  def testPrefetchToDevice(self):
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+    self.assertEqual([], next_element.shape)
+
+    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToSameDevice(self):
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device(
+            "/job:localhost/replica:0/task:0/device:CPU:0"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+    self.assertEqual([], next_element.shape)
+
+    with self.cached_session() as sess:
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchDictToDevice(self):
+    host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element["a"].dtype)
+    self.assertEqual([], next_element["a"].shape)
+
+    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        self.assertEqual({"a": i}, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchSparseTensorsToDevice(self):
+    def make_tensor(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0]], values=(i*[1]), dense_shape=[2, 2])
+    host_dataset = dataset_ops.Dataset.range(10).map(make_tensor)
+
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+
+    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        actual = sess.run(next_element)
+        self.assertAllEqual([i], actual.values)
+        self.assertAllEqual([[0, 0]], actual.indices)
+        self.assertAllEqual([2, 2], actual.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToDeviceGpu(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/gpu:0"))
+
+    iterator = device_dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToDeviceWithReInit(self):
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_initializable_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+    self.assertEqual([], next_element.shape)
+
+    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
+    with self.test_session(config=worker_config) as sess:
+      sess.run(iterator.initializer)
+      for i in range(5):
+        self.assertEqual(i, sess.run(next_element))
+      sess.run(iterator.initializer)
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToDeviceGpuWithReInit(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/gpu:0"))
+
+    iterator = device_dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(iterator.initializer)
+      for i in range(5):
+        self.assertEqual(i, sess.run(next_element))
+      sess.run(iterator.initializer)
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
index b6ab80d132..fe0b3b5f3b 100644
--- a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
@@ -63,11 +63,11 @@ class FixedLengthRecordDatasetTestBase(test_base.DatasetTestBase):
     return filenames
 
 
-class ReadBatchFeaturesTestBase(test_base.DatasetTestBase):
+class MakeBatchedFeaturesDatasetTestBase(test_base.DatasetTestBase):
   """Base class for setting up and testing `make_batched_feature_dataset`."""
 
   def setUp(self):
-    super(ReadBatchFeaturesTestBase, self).setUp()
+    super(MakeBatchedFeaturesDatasetTestBase, self).setUp()
     self._num_files = 2
     self._num_records = 7
     self.test_filenames = self._createFiles()
diff --git a/tensorflow/python/data/experimental/kernel_tests/resample_test.py b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
similarity index 97%
rename from tensorflow/python/data/experimental/kernel_tests/resample_test.py
rename to tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
index 775648c943..4c879dbae6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/resample_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.experimental.rejection_resample()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -58,7 +58,7 @@ def _time_resampling(
   return end_time - start_time
 
 
-class ResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
+class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
       ("InitialDistributionKnown", True),
diff --git a/tensorflow/python/data/experimental/kernel_tests/dataset_constructor_op_test.py b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
similarity index 95%
rename from tensorflow/python/data/experimental/kernel_tests/dataset_constructor_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
index 3fc7157bc5..516e489d04 100644
--- a/tensorflow/python/data/experimental/kernel_tests/dataset_constructor_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for the private `_RestructuredDataset` transformation."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -26,7 +26,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class DatasetConstructorTest(test_base.DatasetTestBase):
+class RestructuredDatasetTest(test_base.DatasetTestBase):
 
   def testRestructureDataset(self):
     components = (array_ops.placeholder(dtypes.int32),
diff --git a/tensorflow/python/data/experimental/kernel_tests/scan_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
similarity index 98%
rename from tensorflow/python/data/experimental/kernel_tests/scan_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/scan_test.py
index 78ec80de23..0730455431 100644
--- a/tensorflow/python/data/experimental/kernel_tests/scan_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.experimental.scan()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -34,7 +34,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class ScanDatasetTest(test_base.DatasetTestBase):
+class ScanTest(test_base.DatasetTestBase):
 
   def _counting_dataset(self, start, scan_fn):
     return dataset_ops.Dataset.from_tensors(0).repeat().apply(
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
index 58a335ae4f..e556b65b7c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
@@ -69,6 +69,26 @@ py_test(
     ],
 )
 
+py_test(
+    name = "checkpoint_input_pipeline_hook_test",
+    size = "small",
+    srcs = ["checkpoint_input_pipeline_hook_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/experimental/ops:iterator_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
 py_test(
     name = "concatenate_dataset_serialization_test",
     size = "small",
@@ -580,7 +600,7 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
-        "//tensorflow/python/data/experimental/kernel_tests:sql_dataset_op_test_base",
+        "//tensorflow/python/data/experimental/kernel_tests:sql_dataset_test_base",
         "//tensorflow/python/data/experimental/ops:readers",
     ],
 )
diff --git a/tensorflow/python/data/experimental/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
similarity index 100%
rename from tensorflow/python/data/experimental/kernel_tests/iterator_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/parse_example_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/parse_example_dataset_serialization_test.py
index a0dd6960b0..b3dfe21486 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/parse_example_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/parse_example_dataset_serialization_test.py
@@ -23,7 +23,7 @@ from tensorflow.python.platform import test
 
 
 class ParseExampleDatasetSerializationTest(
-    reader_dataset_ops_test_base.ReadBatchFeaturesTestBase,
+    reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase,
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
   def ParseExampleDataset(self, num_repeat, batch_size):
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/sql_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/sql_dataset_serialization_test.py
index b179770ce3..006279bbe1 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/sql_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/sql_dataset_serialization_test.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.data.experimental.kernel_tests import sql_dataset_op_test_base
+from tensorflow.python.data.experimental.kernel_tests import sql_dataset_test_base
 from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.framework import dtypes
@@ -28,7 +28,7 @@ from tensorflow.python.platform import test
 
 
 class SqlDatasetSerializationTest(
-    sql_dataset_op_test_base.SqlDatasetTestBase,
+    sql_dataset_test_base.SqlDatasetTestBase,
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
   def _build_dataset(self, num_repeats):
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization_integration_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization_integration_test.py
deleted file mode 100644
index 88d5c896c9..0000000000
--- a/tensorflow/python/data/experimental/kernel_tests/serialization_integration_test.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Integration test for dataset serialization."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import test
-from tensorflow.python.training import saver as saver_lib
-
-
-class SerializationIntegrationTest(test.TestCase):
-
-  def _build_input_pipeline(self, name, num_outputs):
-    with ops.name_scope(name):
-      ds = dataset_ops.Dataset.range(num_outputs).shuffle(
-          10, reshuffle_each_iteration=False).prefetch(10)
-      iterator = ds.make_initializable_iterator()
-      saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-      return iterator.initializer, iterator.get_next()
-
-  def _build_graph(self, num_pipelines, num_outputs):
-    init_ops = []
-    get_next_ops = []
-    for i in range(num_pipelines):
-      name = "input_pipeline_%d" % i
-      init_op, get_next_op = self._build_input_pipeline(name, num_outputs)
-      init_ops.append(init_op)
-      get_next_ops.append(get_next_op)
-    saver = saver_lib.Saver()
-    return init_ops, get_next_ops, saver
-
-  def _ckpt_path(self):
-    return os.path.join(self.get_temp_dir(), "iterator")
-
-  def testConcurrentSaves(self):
-    num_pipelines = 100
-    num_outputs = 100
-    break_point = 10
-    all_outputs = [[] for _ in range(num_pipelines)]
-    with ops.Graph().as_default() as g:
-      init_ops, get_next_ops, saver = self._build_graph(num_pipelines,
-                                                        num_outputs)
-      with self.session(graph=g) as sess:
-        sess.run(init_ops)
-        for _ in range(break_point):
-          output = sess.run(get_next_ops)
-          for i in range(num_pipelines):
-            all_outputs[i].append(output[i])
-        saver.save(sess, self._ckpt_path())
-
-    with ops.Graph().as_default() as g:
-      init_ops, get_next_ops, saver = self._build_graph(num_pipelines,
-                                                        num_outputs)
-      with self.session(graph=g) as sess:
-        saver.restore(sess, self._ckpt_path())
-        for _ in range(num_outputs - break_point):
-          output = sess.run(get_next_ops)
-          for i in range(num_pipelines):
-            all_outputs[i].append(output[i])
-
-    for output in all_outputs:
-      self.assertSequenceEqual(sorted(output), range(num_outputs))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
similarity index 98%
rename from tensorflow/python/data/experimental/kernel_tests/shuffle_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
index 50895b5945..c208963a86 100644
--- a/tensorflow/python/data/experimental/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.experimental.shuffle_and_repeat()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
similarity index 99%
rename from tensorflow/python/data/experimental/kernel_tests/sql_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
index 301f75488a..a2c1169638 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
@@ -12,19 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for experimental sql input op."""
+"""Tests for `tf.data.experimental.SqlDataset`."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.experimental.kernel_tests import sql_dataset_op_test_base
+from tensorflow.python.data.experimental.kernel_tests import sql_dataset_test_base
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
 
 
-class SqlDatasetTest(sql_dataset_op_test_base.SqlDatasetTestBase):
+class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
 
   # Test that SqlDataset can read from a database table.
   def testReadResultSet(self):
diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_op_test_base.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
similarity index 98%
rename from tensorflow/python/data/experimental/kernel_tests/sql_dataset_op_test_base.py
rename to tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
index a135c357f0..6aaaa90c65 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_op_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Base class for testing SqlDataset."""
-
+"""Base class for testing `tf.data.experimental.SqlDataset`."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
index 19f5a62d45..427654cd76 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
@@ -280,7 +280,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
 
 class FeatureStatsDatasetTest(
     stats_dataset_test_base.StatsDatasetTestBase,
-    reader_dataset_ops_test_base.ReadBatchFeaturesTestBase):
+    reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase):
 
   def testFeaturesStats(self):
     num_epochs = 5
diff --git a/tensorflow/python/data/experimental/kernel_tests/writer_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
similarity index 98%
rename from tensorflow/python/data/experimental/kernel_tests/writer_ops_test.py
rename to tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
index 25a2e63ba1..8fd0ad50c4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/writer_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.experimental.TFRecordWriter`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
new file mode 100644
index 0000000000..0278a208cb
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
@@ -0,0 +1,300 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.unbatch()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  def testUnbatchWithUnknownRankInput(self):
+    placeholder = array_ops.placeholder(dtypes.int32)
+    dataset = dataset_ops.Dataset.from_tensors(placeholder).apply(
+        batching.unbatch())
+    iterator = dataset.make_initializable_iterator()
+    next_elem = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(iterator.initializer, feed_dict={placeholder: [0, 1, 2, 3]})
+      for i in range(4):
+        self.assertEqual(i, sess.run(next_elem))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_elem)
+
+  def testUnbatchScalarDataset(self):
+    data = tuple([math_ops.range(10) for _ in range(3)])
+    data = dataset_ops.Dataset.from_tensor_slices(data)
+    expected_types = (dtypes.int32,) * 3
+    data = data.batch(2)
+    self.assertEqual(expected_types, data.output_types)
+    data = data.apply(batching.unbatch())
+    self.assertEqual(expected_types, data.output_types)
+
+    iterator = data.make_one_shot_iterator()
+    op = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for i in range(10):
+        self.assertEqual((i,) * 3, sess.run(op))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(op)
+
+  def testUnbatchDatasetWithStrings(self):
+    data = tuple([math_ops.range(10) for _ in range(3)])
+    data = dataset_ops.Dataset.from_tensor_slices(data)
+    data = data.map(lambda x, y, z: (x, string_ops.as_string(y), z))
+    expected_types = (dtypes.int32, dtypes.string, dtypes.int32)
+    data = data.batch(2)
+    self.assertEqual(expected_types, data.output_types)
+    data = data.apply(batching.unbatch())
+    self.assertEqual(expected_types, data.output_types)
+
+    iterator = data.make_one_shot_iterator()
+    op = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for i in range(10):
+        self.assertEqual((i, compat.as_bytes(str(i)), i), sess.run(op))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(op)
+
+  def testUnbatchDatasetWithSparseTensor(self):
+    st = sparse_tensor.SparseTensorValue(
+        indices=[[i, i] for i in range(10)],
+        values=list(range(10)),
+        dense_shape=[10, 10])
+    data = dataset_ops.Dataset.from_tensors(st)
+    data = data.apply(batching.unbatch())
+    data = data.batch(5)
+    data = data.apply(batching.unbatch())
+    iterator = data.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for i in range(10):
+        st_row = sess.run(next_element)
+        self.assertEqual([i], st_row.indices)
+        self.assertEqual([i], st_row.values)
+        self.assertEqual([10], st_row.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testUnbatchDatasetWithDenseAndSparseTensor(self):
+    st = sparse_tensor.SparseTensorValue(
+        indices=[[i, i] for i in range(10)],
+        values=list(range(10)),
+        dense_shape=[10, 10])
+    data = dataset_ops.Dataset.from_tensors((list(range(10)), st))
+    data = data.apply(batching.unbatch())
+    data = data.batch(5)
+    data = data.apply(batching.unbatch())
+    iterator = data.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for i in range(10):
+        dense_elem, st_row = sess.run(next_element)
+        self.assertEqual(i, dense_elem)
+        self.assertEqual([i], st_row.indices)
+        self.assertEqual([i], st_row.values)
+        self.assertEqual([10], st_row.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testUnbatchSingleElementTupleDataset(self):
+    data = tuple([(math_ops.range(10),) for _ in range(3)])
+    data = dataset_ops.Dataset.from_tensor_slices(data)
+    expected_types = ((dtypes.int32,),) * 3
+    data = data.batch(2)
+    self.assertEqual(expected_types, data.output_types)
+    data = data.apply(batching.unbatch())
+    self.assertEqual(expected_types, data.output_types)
+
+    iterator = data.make_one_shot_iterator()
+    op = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for i in range(10):
+        self.assertEqual(((i,),) * 3, sess.run(op))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(op)
+
+  def testUnbatchMultiElementTupleDataset(self):
+    data = tuple([(math_ops.range(10 * i, 10 * i + 10),
+                   array_ops.fill([10], "hi")) for i in range(3)])
+    data = dataset_ops.Dataset.from_tensor_slices(data)
+    expected_types = ((dtypes.int32, dtypes.string),) * 3
+    data = data.batch(2)
+    self.assertAllEqual(expected_types, data.output_types)
+    data = data.apply(batching.unbatch())
+    self.assertAllEqual(expected_types, data.output_types)
+
+    iterator = data.make_one_shot_iterator()
+    op = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for i in range(10):
+        self.assertEqual(((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi")),
+                         sess.run(op))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(op)
+
+  def testUnbatchEmpty(self):
+    data = dataset_ops.Dataset.from_tensors(
+        (constant_op.constant([]), constant_op.constant([], shape=[0, 4]),
+         constant_op.constant([], shape=[0, 4, 0])))
+    data = data.apply(batching.unbatch())
+    iterator = data.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testUnbatchStaticShapeMismatch(self):
+    data = dataset_ops.Dataset.from_tensors((np.arange(7), np.arange(8),
+                                             np.arange(9)))
+    with self.assertRaises(ValueError):
+      data.apply(batching.unbatch())
+
+  def testUnbatchDynamicShapeMismatch(self):
+    ph1 = array_ops.placeholder(dtypes.int32, shape=[None])
+    ph2 = array_ops.placeholder(dtypes.int32, shape=None)
+    data = dataset_ops.Dataset.from_tensors((ph1, ph2))
+    data = data.apply(batching.unbatch())
+    iterator = data.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      # Mismatch in the 0th dimension.
+      sess.run(
+          iterator.initializer,
+          feed_dict={
+              ph1: np.arange(7).astype(np.int32),
+              ph2: np.arange(8).astype(np.int32)
+          })
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(next_element)
+
+      # No 0th dimension (i.e. scalar value) for one component.
+      sess.run(
+          iterator.initializer,
+          feed_dict={
+              ph1: np.arange(7).astype(np.int32),
+              ph2: 7
+          })
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(next_element)
+
+
+class UnbatchBenchmark(test.Benchmark):
+
+  def benchmarkNativeUnbatch(self):
+    batch_sizes = [1, 2, 5, 10, 20, 50]
+    elems_per_trial = 10000
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
+      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset = dataset.batch(batch_size_placeholder)
+      dataset = dataset.apply(batching.unbatch())
+      dataset = dataset.skip(elems_per_trial)
+      iterator = dataset.make_initializable_iterator()
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for batch_size in batch_sizes:
+          deltas = []
+          for _ in range(5):
+            sess.run(
+                iterator.initializer,
+                feed_dict={batch_size_placeholder: batch_size})
+            start = time.time()
+            sess.run(next_element.op)
+            end = time.time()
+            deltas.append((end - start) / elems_per_trial)
+
+          median_wall_time = np.median(deltas)
+          print("Unbatch (native) batch size: %d Median wall time per element:"
+                " %f microseconds" % (batch_size, median_wall_time * 1e6))
+          self.report_benchmark(
+              iters=10000,
+              wall_time=median_wall_time,
+              name="benchmark_unbatch_dataset_native_batch_size_%d" %
+              batch_size)
+
+  # Include a benchmark of the previous `unbatch()` implementation that uses
+  # a composition of more primitive ops. Eventually we'd hope to generate code
+  # that is as good in both cases.
+  def benchmarkOldUnbatchImplementation(self):
+    batch_sizes = [1, 2, 5, 10, 20, 50]
+    elems_per_trial = 10000
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
+      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset = dataset.batch(batch_size_placeholder)
+      dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
+      dataset = dataset.skip(elems_per_trial)
+      iterator = dataset.make_initializable_iterator()
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for batch_size in batch_sizes:
+          deltas = []
+          for _ in range(5):
+            sess.run(
+                iterator.initializer,
+                feed_dict={batch_size_placeholder: batch_size})
+            start = time.time()
+            sess.run(next_element.op)
+            end = time.time()
+            deltas.append((end - start) / elems_per_trial)
+
+          median_wall_time = np.median(deltas)
+          print("Unbatch (unfused) batch size: %d Median wall time per element:"
+                " %f microseconds" % (batch_size, median_wall_time * 1e6))
+          self.report_benchmark(
+              iters=10000,
+              wall_time=median_wall_time,
+              name="benchmark_unbatch_dataset_unfused_batch_size_%d" %
+              batch_size)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/unique_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/unique_test.py
similarity index 96%
rename from tensorflow/python/data/experimental/kernel_tests/unique_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/unique_test.py
index b5a0b20f3f..847cff26b0 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unique_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unique_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.experimental.unique()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -26,7 +26,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-class UniqueDatasetTest(test_base.DatasetTestBase):
+class UniqueTest(test_base.DatasetTestBase):
 
   def _testSimpleHelper(self, dtype, test_cases):
     """Test the `unique()` transformation on a list of test cases.
diff --git a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index 230ae3f3fd..0c372ebb10 100644
--- a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.Dataset.map()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -267,6 +267,35 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testCaptureIterator(self):
+
+    def _build_ds(iterator):
+
+      def _map_fn(x):
+        get_next = iterator.get_next()
+        return x * get_next
+
+      return dataset_ops.Dataset.range(10).map(_map_fn)
+
+    def _build_graph():
+      captured_iterator = dataset_ops.Dataset.range(
+          10).make_initializable_iterator()
+      ds = _build_ds(captured_iterator)
+      iterator = ds.make_initializable_iterator()
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      return captured_iterator.initializer, init_op, get_next
+
+    with ops.Graph().as_default() as g:
+      captured_init_op, init_op, get_next = _build_graph()
+      with self.session(graph=g) as sess:
+        sess.run(captured_init_op)
+        sess.run(init_op)
+        for i in range(10):
+          self.assertEqual(i * i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
   def testCaptureHashTable(self):
     # NOTE(mrry): We must use the V2 variants of `HashTable`
     # etc. because these produce a `tf.resource`-typed output that is
-- 
GitLab


From 158b6b8becb6afd08f9d6c87f0c7f144ba5f0584 Mon Sep 17 00:00:00 2001
From: Jared Duke <jdduke@google.com>
Date: Thu, 4 Oct 2018 12:59:38 -0700
Subject: [PATCH 429/570] Use weak symbols to inject flex delegates

PiperOrigin-RevId: 215788183
---
 tensorflow/contrib/lite/BUILD                 |  26 ++++++++--
 tensorflow/contrib/lite/delegates/flex/BUILD  |   4 +-
 .../contrib/lite/delegates/flex/delegate.cc   |   9 ++++
 tensorflow/contrib/lite/interpreter.h         |  15 +++---
 tensorflow/contrib/lite/interpreter_test.cc   |   6 ++-
 tensorflow/contrib/lite/model.cc              |  35 ++++++++++----
 tensorflow/contrib/lite/model_flex_test.cc    |  45 ++++++++++++++++++
 tensorflow/contrib/lite/model_test.cc         |  22 +++++++++
 .../contrib/lite/testdata/multi_add_flex.bin  | Bin 0 -> 1052 bytes
 tensorflow/contrib/lite/tools/benchmark/BUILD |  24 ++--------
 .../tools/benchmark/benchmark_tflite_model.cc |  12 -----
 .../tools/benchmark/benchmark_tflite_model.h  |   6 ---
 12 files changed, 141 insertions(+), 63 deletions(-)
 create mode 100644 tensorflow/contrib/lite/model_flex_test.cc
 create mode 100644 tensorflow/contrib/lite/testdata/multi_add_flex.bin

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index f3ebe3b245..787a85644c 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -4,6 +4,7 @@ package(default_visibility = [
 
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
 
 exports_files(glob([
@@ -165,10 +166,6 @@ cc_library(
         "stderr_reporter.h",
     ],
     copts = tflite_copts(),
-    defines = select({
-        ":with_tflite_flex": ["TFLITE_FLEX"],
-        "//conditions:default": [],
-    }),
     linkopts = [
     ] + select({
         "//tensorflow:android": [
@@ -276,6 +273,7 @@ cc_test(
         "testdata/0_subgraphs.bin",
         "testdata/2_subgraphs.bin",
         "testdata/empty_model.bin",
+        "testdata/multi_add_flex.bin",
         "testdata/test_model.bin",
         "testdata/test_model_broken.bin",
     ],
@@ -283,6 +281,26 @@ cc_test(
         ":framework",
         "//tensorflow/contrib/lite/c:c_api_internal",
         "//tensorflow/contrib/lite/core/api",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test model framework with the flex library linked into the target.
+tf_cc_test(
+    name = "model_flex_test",
+    size = "small",
+    srcs = ["model_flex_test.cc"],
+    data = [
+        "testdata/multi_add_flex.bin",
+    ],
+    tags = ["no_windows"],  # TODO(b/116667551): No weak symbols with MSVC.
+    deps = [
+        ":framework",
+        "//tensorflow/contrib/lite/core/api",
+        "//tensorflow/contrib/lite/delegates/flex:delegate",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
         "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
diff --git a/tensorflow/contrib/lite/delegates/flex/BUILD b/tensorflow/contrib/lite/delegates/flex/BUILD
index 9dd38958e5..9b89ed4f84 100644
--- a/tensorflow/contrib/lite/delegates/flex/BUILD
+++ b/tensorflow/contrib/lite/delegates/flex/BUILD
@@ -2,7 +2,7 @@
 # This is a TF Lite delegate that is powered by TensorFlow's Eager.
 #
 package(default_visibility = [
-    "//visibility:public",
+    "//visibility:private",
 ])
 
 licenses(["notice"])  # Apache 2.0
@@ -50,6 +50,7 @@ cc_library(
     hdrs = [
         "delegate.h",
     ],
+    visibility = ["//visibility:public"],
     deps = [
         ":buffer_map",
         ":delegate_data",
@@ -66,6 +67,7 @@ cc_library(
             "//tensorflow/core:lib",
         ],
     }),
+    alwayslink = 1,
 )
 
 tf_cc_test(
diff --git a/tensorflow/contrib/lite/delegates/flex/delegate.cc b/tensorflow/contrib/lite/delegates/flex/delegate.cc
index ba065a8ff5..c72b0cf513 100644
--- a/tensorflow/contrib/lite/delegates/flex/delegate.cc
+++ b/tensorflow/contrib/lite/delegates/flex/delegate.cc
@@ -83,6 +83,15 @@ TfLiteStatus CopyFromBufferHandle(TfLiteContext* context,
 }  // namespace delegate
 }  // namespace flex
 
+// Corresponding weak declaration found in lite/model.cc.
+std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>
+AcquireFlexDelegate() {
+  return std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>(
+      tflite::FlexDelegate::Create().release(), [](TfLiteDelegate* delegate) {
+        delete reinterpret_cast<tflite::FlexDelegate*>(delegate);
+      });
+}
+
 std::unique_ptr<FlexDelegate> FlexDelegate::Create() {
   std::unique_ptr<flex::DelegateData> delegate_data;
   if (!flex::DelegateData::Create(&delegate_data).ok()) {
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 7ef736d01b..651a97e9dc 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -349,6 +349,10 @@ class Interpreter {
     return context_.allow_fp32_relax_to_fp16;
   }
 
+  // Owning handle to a TfLiteDelegate instance.
+  using TfLiteDelegatePtr =
+      std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>;
+
   // Allow a delegate to look at the graph and modify the graph to handle
   // parts of the graph themselves. After this is called, the graph may
   // contain new nodes that replace 1 more nodes.
@@ -574,19 +578,11 @@ class Interpreter {
                                  TfLiteExternalContextType type,
                                  TfLiteExternalContext* ctx);
 
-  using TfLiteDelegatePtr =
-      std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>;
-
   // Variant of the public ModifyGraphWithDelegate method that additionally
   // Assumes ownership of the provided delegate.
   // WARNING: This is an experimental API and subject to change.
-  template <typename Delegate>
-  TfLiteStatus ModifyGraphWithDelegate(std::unique_ptr<Delegate> typed_delegate,
+  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegatePtr delegate,
                                        bool allow_dynamic_tensors = false) {
-    TfLiteDelegatePtr delegate(typed_delegate.release(),
-                               [](TfLiteDelegate* delegate) {
-                                 delete static_cast<Delegate*>(delegate);
-                               });
     // Note that we retain ownership of the delegate even if graph modification
     // fails, as delegate use will be in an indeterminate state at that point.
     owned_delegates_.push_back(std::move(delegate));
@@ -676,6 +672,7 @@ class Interpreter {
   // List of delegates that have been installed and are owned by this
   // interpreter instance. Useful if client delegate ownership is burdensome.
   // WARNING: This is an experimental API and subject to change.
+  // TODO(b/116667551): Use TfLiteExternalContext for storing state.
   std::vector<TfLiteDelegatePtr> owned_delegates_;
 
   std::unique_ptr<MemoryPlanner> memory_planner_;
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index cdede430e2..6c71d5a8d7 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -30,7 +30,11 @@ class InterpreterTest : public ::testing::Test {
   template <typename Delegate>
   static TfLiteStatus ModifyGraphWithDelegate(
       Interpreter* interpreter, std::unique_ptr<Delegate> delegate) {
-    return interpreter->ModifyGraphWithDelegate(std::move(delegate));
+    Interpreter::TfLiteDelegatePtr tflite_delegate(
+        delegate.release(), [](TfLiteDelegate* delegate) {
+          delete reinterpret_cast<Delegate*>(delegate);
+        });
+    return interpreter->ModifyGraphWithDelegate(std::move(tflite_delegate));
   }
 
  protected:
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index d50c345194..d7b109ac1a 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -27,9 +27,6 @@ limitations under the License.
 #ifndef TFLITE_MCU
 #include "tensorflow/contrib/lite/nnapi_delegate.h"
 #endif
-#if defined(TFLITE_FLEX)
-#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
-#endif
 #include "tensorflow/contrib/lite/version.h"
 
 namespace tflite {
@@ -43,6 +40,25 @@ ErrorReporter* ValidateErrorReporter(ErrorReporter* e) {
 
 const char* kEmptyTensorName = "";
 
+// Normally we'd use ABSL_HAVE_ATTRIBUTE_WEAK and ABSL_ATTRIBUTE_WEAK, but
+// we avoid the absl dependency for binary size reasons.
+#ifdef __has_attribute
+#define TFLITE_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define TFLITE_HAS_ATTRIBUTE(x) 0
+#endif
+
+#if TFLITE_HAS_ATTRIBUTE(weak) || (defined(__GNUC__) && !defined(__clang__))
+// Using weak symbols for the flex delegate allows automatic injection of the
+// delegate simply by adding it as a dependency. See also the strong override in
+// lite/delegates/flex/delegate.cc.
+__attribute__((weak)) Interpreter::TfLiteDelegatePtr AcquireFlexDelegate() {
+  return Interpreter::TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {});
+}
+#else
+Interpreter::TfLiteDelegatePtr (*AcquireFlexDelegate)() = nullptr;
+#endif
+
 #ifndef TFLITE_MCU
 // Loads a model from `filename`. If `mmap_file` is true then use mmap,
 // otherwise make a copy of the model in a buffer.
@@ -450,13 +466,14 @@ TfLiteStatus InterpreterBuilder::operator()(
   }
   (**interpreter).SetVariables(std::move(variables));
 
-#if defined(TFLITE_FLEX)
-  if (auto delegate = FlexDelegate::Create()) {
-    (**interpreter)
-        .ModifyGraphWithDelegate(std::move(delegate),
-                                 /*allow_dynamic_tensors=*/true);
+  // TODO(b/116667551): Only create the flex delegate if the model has flex ops.
+  if (AcquireFlexDelegate != nullptr) {
+    if (auto flex_delegate = AcquireFlexDelegate()) {
+      (**interpreter)
+          .ModifyGraphWithDelegate(std::move(flex_delegate),
+                                   /*allow_dynamic_tensors=*/true);
+    }
   }
-#endif
 
   return kTfLiteOk;
 }
diff --git a/tensorflow/contrib/lite/model_flex_test.cc b/tensorflow/contrib/lite/model_flex_test.cc
new file mode 100644
index 0000000000..52e76bee49
--- /dev/null
+++ b/tensorflow/contrib/lite/model_flex_test.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/model.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+
+// Ensures that a model with TensorFlow ops can be imported as long as the
+// appropriate delegate is linked into the client.
+TEST(FlexModel, WithFlexDelegate) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/contrib/lite/testdata/multi_add_flex.bin");
+  ASSERT_TRUE(model);
+
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model,
+                               ops::builtin::BuiltinOpResolver{})(&interpreter),
+            kTfLiteOk);
+  ASSERT_TRUE(interpreter);
+
+  ASSERT_EQ(interpreter->AllocateTensors(), kTfLiteOk);
+}
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/model_test.cc b/tensorflow/contrib/lite/model_test.cc
index ec7d46af7c..b969bea5dc 100644
--- a/tensorflow/contrib/lite/model_test.cc
+++ b/tensorflow/contrib/lite/model_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/testing/util.h"
 
 // Comparison for TfLiteRegistration. Since TfLiteRegistration is a C object,
@@ -193,6 +194,27 @@ TEST(BasicFlatBufferModel, TestModelInInterpreter) {
   }
 }
 
+// Test that loading a model with TensorFlow ops fails when the flex delegate is
+// not linked into the target.
+TEST(FlexModel, FailureWithoutFlexDelegate) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/contrib/lite/testdata/multi_add_flex.bin");
+  ASSERT_TRUE(model);
+
+  // Note that creation will succeed when using the BuiltinOpResolver, but
+  // unless the appropriate delegate is linked into the target or the client
+  // explicitly installs the delegate, execution will fail.
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model,
+                               ops::builtin::BuiltinOpResolver{})(&interpreter),
+            kTfLiteOk);
+  ASSERT_TRUE(interpreter);
+
+  // As the flex ops weren't resolved implicitly by the flex delegate, runtime
+  // allocation and execution will fail.
+  ASSERT_EQ(interpreter->AllocateTensors(), kTfLiteError);
+}
+
 // This tests on a flatbuffer that defines a shape of 2 to be a memory mapped
 // buffer. But the buffer is provided to be only 1 element.
 TEST(BasicFlatBufferModel, TestBrokenMmap) {
diff --git a/tensorflow/contrib/lite/testdata/multi_add_flex.bin b/tensorflow/contrib/lite/testdata/multi_add_flex.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9aac2155fedd11b81ed32e587655dfe53e5749a9
GIT binary patch
literal 1052
zcmb1OU|<Mw^D$;%;A4<rU}4~3;9(G85MkhBU|?WoU|?9n%)r3Iz`!8Dz`&ruz`(%B
zz`&5fz`(!{(&z8&ui%`YSC(2-lA5B&z`?-4V8g(`V8Xz_pu@nxpu)hwAj81GAi@AK
zg9T*HzyJS5{{8<i@$dhCnScNPEByQaU*+Hb{~G`P|JV8V|343d2;3DQdkq*E7(i|W
znZ1XRfq{vEfq|8Qfq{*IfdOP51Jq3*agcl2p>~EaFfjCh>}6zNh+$-4*u%iUaEF0`
zVIRnD1_lO@I*^$le}Vjz2r>v{=KufyL1G}YLE;Py_0ABQf#DZaFUW3Wy-85L4tVsM
z{r~?T<PVVBk@Y4+_156gTLje$auc%N3aH+c|Ns9_gQ@|A0kWD(sG5%d|NrlTssV)+
zNDat6Ait$R)qvt2lmb9<xWWLGPCzll08J^NbOJL6qz@F=ATu+eX0kAVAS5k)U|?V<
z0EICF1B1c;|NlX017r^<U4i@t(gM;83SW>LL4E;=n}X7!V@e8x8W&@w5VK=SiWFmt
z6l10muYO8uS!Qyom2*I-m4OyJ7h{MJlK~?Gldh(S2#W^DtQmOh0mTK#9+180_EbRa
zNhH!95pXI8g&xQ>P|5}6B1rnj=0}iyAiF_s1kuQT%!JyVB*mCaj2}T}fXut_|Nnnb
eHU+Uk>Dw(QwE`4uFK~#1;+vg;1DtQM=>q_$K#X(%

literal 0
HcmV?d00001

diff --git a/tensorflow/contrib/lite/tools/benchmark/BUILD b/tensorflow/contrib/lite/tools/benchmark/BUILD
index 502e181139..71bf61657e 100644
--- a/tensorflow/contrib/lite/tools/benchmark/BUILD
+++ b/tensorflow/contrib/lite/tools/benchmark/BUILD
@@ -40,7 +40,7 @@ cc_binary(
     srcs = [
         "benchmark_main.cc",
     ],
-    copts = common_copts + ["-DTFLITE_FLEX"],
+    copts = common_copts,
     linkopts = tflite_linkopts() + select({
         "//tensorflow:android": [
             "-pie",  # Android 5.0 and later supports only PIE
@@ -49,8 +49,9 @@ cc_binary(
         "//conditions:default": [],
     }),
     deps = [
-        ":benchmark_tflite_model_plus_flex_lib",
+        ":benchmark_tflite_model_lib",
         ":logging",
+        "//tensorflow/contrib/lite/delegates/flex:delegate",
     ],
 )
 
@@ -110,25 +111,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "benchmark_tflite_model_plus_flex_lib",
-    srcs = [
-        "benchmark_tflite_model.cc",
-        "logging.h",
-    ],
-    hdrs = ["benchmark_tflite_model.h"],
-    copts = common_copts + ["-DTFLITE_FLEX"],
-    deps = [
-        ":benchmark_model_lib",
-        ":logging",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/delegates/flex:delegate",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/profiling:profile_summarizer",
-    ],
-)
-
 cc_library(
     name = "benchmark_params",
     srcs = [
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
index 463d5993f4..2a3df7f289 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -23,9 +23,6 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
-#ifdef TFLITE_FLEX
-#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
-#endif  // TFLITE_FLEX
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/op_resolver.h"
@@ -305,15 +302,6 @@ void BenchmarkTfLiteModel::Init() {
 
   interpreter->UseNNAPI(use_nnapi);
 
-#ifdef TFLITE_FLEX
-  TFLITE_LOG(INFO) << "Instantiating Flex Delegate";
-  delegate_ = FlexDelegate::Create();
-  if (delegate_) {
-    interpreter->ModifyGraphWithDelegate(delegate_.get(),
-                                         /*allow_dynamic_tensors=*/true);
-  }
-#endif  // TFLITE_FLEX
-
   auto interpreter_inputs = interpreter->inputs();
 
   if (!inputs.empty()) {
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
index b091e18a29..25a302b2aa 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
@@ -20,9 +20,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#ifdef TFLITE_FLEX
-#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
-#endif  // TFLITE_FLEX
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/profiling/profile_summarizer.h"
 #include "tensorflow/contrib/lite/tools/benchmark/benchmark_model.h"
@@ -73,9 +70,6 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   void PrepareInputsAndOutputs() override;
 
  private:
-#ifdef TFLITE_FLEX
-  std::unique_ptr<FlexDelegate> delegate_;
-#endif  // TFLITE_FLEX
   std::unique_ptr<tflite::FlatBufferModel> model;
   std::unique_ptr<tflite::Interpreter> interpreter;
   std::vector<InputLayerInfo> inputs;
-- 
GitLab


From 074ff471fefbcf3bfd49914ad80bd9f9751df363 Mon Sep 17 00:00:00 2001
From: Smit Hinsu <hinsu@google.com>
Date: Thu, 4 Oct 2018 13:00:49 -0700
Subject: [PATCH 430/570] Temporarily disable testCondInDefun test in
 control_flow_ops_py_test

PiperOrigin-RevId: 215788359
---
 tensorflow/python/kernel_tests/control_flow_ops_py_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index a1be77601c..c7e89dd5f9 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -3422,7 +3422,8 @@ class EagerTest(test.TestCase):
       self.assertAllEqual(r.numpy(), 10)
       self.assertFalse(isinstance(r, list))
 
-  def testCondInDefun(self):
+  # TODO(b/117279927): Re-enable once msan failure is fixed.
+  def DISABLED_testCondInDefun(self):
     if "GPU" in [d.device_type for d in device_lib.list_local_devices()]:
       return unittest.skip("b/113346829 (gpu failure)")
 
-- 
GitLab


From 7fcb05ff475a0c6c1076eacf9d11e17323d98bc2 Mon Sep 17 00:00:00 2001
From: Rachel Lim <rachelim@google.com>
Date: Thu, 4 Oct 2018 13:01:33 -0700
Subject: [PATCH 431/570] [tf.data] Add a notion of `captured args` to MapDefun

PiperOrigin-RevId: 215788485
---
 .../api_def/base_api/api_def_MapDefun.pbtxt   | 23 +++++--
 .../optimizers/data/map_vectorization.cc      |  1 +
 .../data/vectorization_utils_test.cc          |  3 +
 tensorflow/core/kernels/data/map_defun_op.cc  | 68 +++++++++----------
 tensorflow/core/ops/dataset_ops.cc            | 11 ++-
 .../kernel_tests/map_defun_op_test.py         | 12 ++++
 .../python/data/experimental/ops/map_defun.py |  8 ++-
 7 files changed, 77 insertions(+), 49 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_MapDefun.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapDefun.pbtxt
index 4433693759..d158f4b502 100644
--- a/tensorflow/core/api_def/base_api/api_def_MapDefun.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MapDefun.pbtxt
@@ -4,22 +4,33 @@ op {
   in_arg {
     name: "arguments"
     description: <<END
-    A list of tensors whose types are Targuments, corresponding to the inputs the
-    function should be mapped over.
+    A list of tensors whose types are `Targuments`, corresponding to the inputs
+    the function should be mapped over.
+END
+  }
+  in_arg {
+    name: "captured_inputs"
+    description: <<END
+    A list of tensors whose types are `Tcaptured`, corresponding to the captured
+    inputs of the defun.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-    A list of output tensors whose types are output_types and whose dimensions 0
-    are the same as the dimensions 0 of the tensors in arguments, and whose
-    remaining dimensions correspond to those in output_shapes.
+    A list of output tensors whose types are `output_types` and whose dimensions
+    0 are the same as the dimensions 0 of the tensors in `arguments`, and whose
+    remaining dimensions correspond to those in `output_shapes`.
 END
   }
   attr {
     name: "Targuments"
     description: "A list of types."
   }
+  attr {
+    name: "Tcaptured"
+    description: "A list of types."
+  }
   attr {
     name: "output_types"
     description: "A list of types."
@@ -29,6 +40,6 @@ END
     description: "A list of shapes."
   }
   summary: <<END
-  Maps a function on the list of tensors unpacked from inputs on dimension 0.
+  Maps a function on the list of tensors unpacked from arguments on dimension 0.
 END
 }
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
index ba521e79bc..a9254ed58b 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
@@ -67,6 +67,7 @@ FunctionDef* CreateMapDefunWrapper(const NodeDef& map_node,
     map_defun_node->add_input(input.name());
   }
   (*map_defun_node->mutable_attr())["Targuments"] = t_args;
+  AddNodeAttr("Tcaptured", DataTypeVector(), map_defun_node);
 
   // Set return values to match output names
   string output_prefix = strings::StrCat(map_defun_node->name(), ":output:");
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc
index a958d706c1..a6020e36bb 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc
@@ -55,6 +55,7 @@ NodeDef* AddMapDefunNode(const string& name, const std::vector<string>& inputs,
   func.set_name(function_name);
   NodeDef* node = function_utils::AddNode(name, "MapDefun", inputs, {}, fn);
   graph_transforms::SetNodeAttr("Targuments", t_arguments, node);
+  graph_transforms::SetNodeAttr("Tcaptured", DataTypeVector(), node);
   graph_transforms::SetNodeAttr("output_types", output_types, node);
   graph_transforms::SetNodeAttr("output_shapes", output_shapes, node);
   graph_transforms::SetNodeAttr("f", func, node);
@@ -142,6 +143,8 @@ TEST(VectorizeMapDefunTest, VectorizeDefunNoOps) {
   *lib.add_function() = outer;
   *lib.add_function() = inner;
   FunctionDef* vectorized;
+  Status s = VectorizeMapDefun(outer, *map_defun, &lib, &vectorized);
+  LOG(ERROR) << s;
   EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
   EXPECT_TRUE(
       !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
diff --git a/tensorflow/core/kernels/data/map_defun_op.cc b/tensorflow/core/kernels/data/map_defun_op.cc
index 6657f2b2b3..705b0393de 100644
--- a/tensorflow/core/kernels/data/map_defun_op.cc
+++ b/tensorflow/core/kernels/data/map_defun_op.cc
@@ -62,24 +62,6 @@ class MapDefunOp : public AsyncOpKernel {
 
   ~MapDefunOp() override {}
 
-  Status GetInputBatchSize(OpKernelContext* ctx, int64* batch_size) {
-    // Validates inputs and gets the size of their leading dimension.
-    *batch_size = ctx->input(0).dims() > 0 ? ctx->input(0).dim_size(0) : -1;
-    for (size_t i = 0; i < ctx->num_inputs(); ++i) {
-      if (ctx->input(i).dims() == 0) {
-        return errors::InvalidArgument(
-            "All inputs must have rank at least 1. Input ", i,
-            " has a rank of 0.");
-      } else if (ctx->input(i).dim_size(0) != *batch_size) {
-        return errors::InvalidArgument(
-            "All inputs must have the same dimension 0. Input ", i,
-            " has leading dimension ", ctx->input(i).dim_size(0),
-            ", while all previous inputs have leading dimension ", batch_size);
-      }
-    }
-    return Status::OK();
-  }
-
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     ComputeOptions* compute_opts = nullptr;
 
@@ -150,8 +132,9 @@ class MapDefunOp : public AsyncOpKernel {
     // all calls to the function are complete. This struct also encapsulates
     // all the components that need to be passed to each MapFunctionCallFrame.
 
-    const std::vector<Tensor> args;
+    OpInputList args;
     const std::vector<TensorShape> arg_shapes;
+    OpInputList captured_inputs;
     const int64 batch_size;
 
     // Output of a compute call
@@ -161,26 +144,31 @@ class MapDefunOp : public AsyncOpKernel {
 
     // Create a copy of output_shapes because every `Compute` may expect a
     // different output shape.
-    ComputeOptions(std::vector<Tensor> args,
+    ComputeOptions(OpInputList args, OpInputList captured_inputs,
                    std::vector<TensorShape> arg_shapes, int64 batch_size,
                    const std::vector<PartialTensorShape>& output_shapes_attr)
-        : args(std::move(args)),
+        : args(args),
           arg_shapes(std::move(arg_shapes)),
+          captured_inputs(captured_inputs),
           batch_size(batch_size),
           output_shapes(output_shapes_attr) {}
   };
 
   // Get inputs to Compute and check that they are valid.
   Status SetupArgs(OpKernelContext* ctx, ComputeOptions** compute_opts) {
-    int64 batch_size =
-        ctx->input(0).dims() > 0 ? ctx->input(0).dim_size(0) : -1;
+    OpInputList arguments;
+    TF_RETURN_IF_ERROR(ctx->input_list("arguments", &arguments));
+    OpInputList captured_inputs;
+    TF_RETURN_IF_ERROR(ctx->input_list("captured_inputs", &captured_inputs));
+
+    int64 batch_size = arguments[0].dims() > 0 ? arguments[0].dim_size(0) : -1;
 
-    for (size_t i = 0; i < ctx->num_inputs(); ++i) {
-      if (ctx->input(i).dims() == 0) {
+    for (size_t i = 0; i < arguments.size(); ++i) {
+      if (arguments[i].dims() == 0) {
         return errors::InvalidArgument(
             "All inputs must have rank at least 1. Input ", i,
             " has a rank of 0.");
-      } else if (ctx->input(i).dim_size(0) != batch_size) {
+      } else if (arguments[i].dim_size(0) != batch_size) {
         return errors::InvalidArgument(
             "All inputs must have the same dimension 0. Input ", i,
             " has leading dimension ", ctx->input(i).dim_size(0),
@@ -188,19 +176,17 @@ class MapDefunOp : public AsyncOpKernel {
       }
     }
 
-    std::vector<Tensor> args;
     std::vector<TensorShape> arg_shapes;
-    args.reserve(ctx->num_inputs());
-    arg_shapes.reserve(ctx->num_inputs());
+    arg_shapes.reserve(arguments.size());
 
-    for (size_t i = 0; i < ctx->num_inputs(); ++i) {
-      args.push_back(ctx->input(i));
-      arg_shapes.push_back(ctx->input(i).shape());
+    for (size_t i = 0; i < arguments.size(); ++i) {
+      arg_shapes.push_back(arguments[i].shape());
       arg_shapes.at(i).RemoveDim(0);
     }
 
-    *compute_opts = new ComputeOptions(std::move(args), std::move(arg_shapes),
-                                       batch_size, output_shapes_);
+    *compute_opts =
+        new ComputeOptions(arguments, captured_inputs, std::move(arg_shapes),
+                           batch_size, output_shapes_);
     return Status::OK();
   }
 
@@ -235,12 +221,21 @@ class MapDefunOp : public AsyncOpKernel {
     }
 
     Status GetArg(int index, Tensor* val) const override {
-      if (index < 0 || index >= compute_opts_->args.size()) {
+      if (index < 0 || index >= compute_opts_->args.size() +
+                                    compute_opts_->captured_inputs.size()) {
         return errors::InvalidArgument(
             "Mismatch in number of function inputs.");
       }
+
+      if (index >= compute_opts_->args.size()) {
+        // The function is calling for a captured input
+        *val =
+            compute_opts_->captured_inputs[index - compute_opts_->args.size()];
+        return Status::OK();
+      }
+
       bool result =
-          val->CopyFrom(compute_opts_->args.at(index).Slice(iter_, iter_ + 1),
+          val->CopyFrom(compute_opts_->args[index].Slice(iter_, iter_ + 1),
                         compute_opts_->arg_shapes.at(index));
       if (!result) {
         return errors::Internal("GetArg failed.");
@@ -248,7 +243,6 @@ class MapDefunOp : public AsyncOpKernel {
         // Ensure alignment
         *val = tensor::DeepCopy(*val);
       }
-
       return Status::OK();
     }
 
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 889a6a4640..ec22eee874 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -903,14 +903,18 @@ REGISTER_OP("ModelDataset")
 
 REGISTER_OP("MapDefun")
     .Input("arguments: Targuments")
+    .Input("captured_inputs: Tcaptured")
     .Output("output: output_types")
     .Attr("Targuments: list(type) >= 1")
+    .Attr("Tcaptured: list(type) >= 0 = []")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("f: func")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       std::vector<PartialTensorShape> output_shapes;
       TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+      DataTypeVector t_args;
+      TF_RETURN_IF_ERROR(c->GetAttr("Targuments", &t_args));
       if (output_shapes.size() != c->num_outputs()) {
         return errors::InvalidArgument(
             "`output_shapes` must be the same length as `output_types` (",
@@ -918,10 +922,11 @@ REGISTER_OP("MapDefun")
       }
 
       int64 dim_zero = -1;
-      for (size_t i = 0; i < static_cast<size_t>(c->num_inputs()); ++i) {
+      for (size_t i = 0; i < t_args.size(); ++i) {
         if (c->Rank(c->input(i)) == 0) {
           return errors::InvalidArgument(
-              "Inputs must have rank at least 1. Input ", i, " has rank of 0");
+              "Arguments must have rank at least 1. Input ", i,
+              " has rank of 0.");
         }
         auto dim_handle = c->Dim(c->input(i), 0);
         if (c->ValueKnown(dim_handle)) {
@@ -929,7 +934,7 @@ REGISTER_OP("MapDefun")
             dim_zero = c->Value(dim_handle);
           } else if (c->Value(dim_handle) != dim_zero) {
             return errors::InvalidArgument(
-                "Inputs must have the same dimension 0.");
+                "Arguments must have the same dimension 0.");
           }
         }
       }
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
index 612ee332c4..ae9dedb0ab 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
@@ -235,6 +235,18 @@ class MapDefunTest(test_base.DatasetTestBase):
       sess.close()
       thread.join()
 
+  def testMapDefunWithCapturedInputs(self):
+    c = constant_op.constant(2)
+
+    @function.Defun(dtypes.int32)
+    def fn(x):
+      return x + c
+
+    x = constant_op.constant([1, 2, 3, 4])
+    map_defun_op = map_defun.map_defun(fn, [x], [dtypes.int32], [()])[0]
+    expected = x + c
+    self.assertAllEqual(self.evaluate(expected), self.evaluate(map_defun_op))
+
 
 class MapDefunBenchmark(test.Benchmark):
 
diff --git a/tensorflow/python/data/experimental/ops/map_defun.py b/tensorflow/python/data/experimental/ops/map_defun.py
index 3d0d0993c9..3ac1158d8b 100644
--- a/tensorflow/python/data/experimental/ops/map_defun.py
+++ b/tensorflow/python/data/experimental/ops/map_defun.py
@@ -47,10 +47,12 @@ def map_defun(fn, elems, output_dtypes, output_shapes):
   if not isinstance(elems, list):
     raise ValueError("`elems` must be a list of tensors.")
   if not isinstance(output_dtypes, list):
-    raise ValueError("`output_dtypes` must be a list of tensors.")
+    raise ValueError("`output_dtypes` must be a list of `tf.DType` objects.")
   if not isinstance(output_shapes, list):
-    raise ValueError("`output_shapes` must be a list of tensors.")
+    raise ValueError("`output_shapes` must be a list of `tf.TensorShape` "
+                     "objects.")
 
   elems = [ops.convert_to_tensor(e) for e in elems]
   output_shapes = [tensor_shape.TensorShape(s) for s in output_shapes]
-  return gen_dataset_ops.map_defun(elems, output_dtypes, output_shapes, fn)
+  return gen_dataset_ops.map_defun(elems, fn.captured_inputs, output_dtypes,
+                                   output_shapes, fn)
-- 
GitLab


From b949f9ee60522ca43f7f8a89b15ea6eeed2ac570 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Thu, 4 Oct 2018 13:14:07 -0700
Subject: [PATCH 432/570] Enable masking through a Sequential model.

PiperOrigin-RevId: 215790636
---
 tensorflow/python/keras/engine/input_layer.py |  1 +
 .../python/keras/engine/topology_test.py      | 31 +++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py
index 8a4018a0df..6a69d0ed90 100644
--- a/tensorflow/python/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/engine/input_layer.py
@@ -82,6 +82,7 @@ class InputLayer(base_layer.Layer):
     self.built = True
     self.sparse = sparse
     self.batch_size = batch_size
+    self.supports_masking = True
 
     if isinstance(input_shape, tensor_shape.TensorShape):
       input_shape = tuple(input_shape.as_list())
diff --git a/tensorflow/python/keras/engine/topology_test.py b/tensorflow/python/keras/engine/topology_test.py
index a0da96334b..b4488033cd 100644
--- a/tensorflow/python/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/engine/topology_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
+from tensorflow.python.training import rmsprop
 
 try:
   import yaml  # pylint:disable=g-import-not-at-top
@@ -1182,6 +1183,36 @@ class DefaultShapeInferenceBehaviorTest(test.TestCase):
     output = model(sample_input)
     self.assertEqual(output.shape, (1, 3))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_sequential_as_downstream_of_masking_layer(self):
+    inputs = keras.layers.Input(shape=(3, 4))
+    x = keras.layers.Masking(mask_value=0., input_shape=(3, 4))(inputs)
+
+    s = keras.Sequential()
+    s.add(keras.layers.Dense(5, input_shape=(4,)))
+
+    x = keras.layers.wrappers.TimeDistributed(s)(x)
+    model = keras.Model(inputs=inputs, outputs=x)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(1e-3), loss='mse')
+
+    model_input = np.random.randint(
+        low=1, high=5, size=(10, 3, 4)).astype('float32')
+    for i in range(4):
+      model_input[i, i:, :] = 0.
+    model.fit(model_input,
+              np.random.random((10, 3, 5)), epochs=1, batch_size=6)
+
+    if not context.executing_eagerly():
+      # Note: this doesn't work in eager due to DeferredTensor/ops compatibility
+      # issue.
+      mask_outputs = [model.layers[1].compute_mask(model.layers[1].input)]
+      mask_outputs += [model.layers[2].compute_mask(
+          model.layers[2].input, mask_outputs[-1])]
+      func = keras.backend.function([model.input], mask_outputs)
+      mask_outputs_val = func([model_input])
+      self.assertAllClose(mask_outputs_val[0], np.any(model_input, axis=-1))
+      self.assertAllClose(mask_outputs_val[1], np.any(model_input, axis=-1))
+
 
 class GraphUtilsTest(test.TestCase):
 
-- 
GitLab


From 23a698e670a10eff362c575eb1297c2b4f0bbe11 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 13:18:18 -0700
Subject: [PATCH 433/570] Update ops-related pbtxt files.

PiperOrigin-RevId: 215791283
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 88 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 26 ++++++
 2 files changed, 114 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 33f18ae13f..780c6f6448 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -30566,6 +30566,52 @@ op {
     type: "func"
   }
 }
+op {
+  name: "MapDefun"
+  input_arg {
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "captured_inputs"
+    type_list_attr: "Tcaptured"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tcaptured"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
 op {
   name: "MapIncompleteSize"
   output_arg {
@@ -71843,6 +71889,48 @@ op {
     }
   }
 }
+op {
+  name: "Substr"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "pos"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "len"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "unit"
+    type: "string"
+    default_value {
+      s: "BYTE"
+    }
+    allowed_values {
+      list {
+        s: "BYTE"
+        s: "UTF8_CHAR"
+      }
+    }
+  }
+}
 op {
   name: "Sum"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 0e58a9475d..0d8997c1bd 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -15262,6 +15262,10 @@ op {
     name: "arguments"
     type_list_attr: "Targuments"
   }
+  input_arg {
+    name: "captured_inputs"
+    type_list_attr: "Tcaptured"
+  }
   output_arg {
     name: "output"
     type_list_attr: "output_types"
@@ -15272,6 +15276,15 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "Tcaptured"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
   attr {
     name: "output_types"
     type: "list(type)"
@@ -33748,6 +33761,19 @@ op {
       }
     }
   }
+  attr {
+    name: "unit"
+    type: "string"
+    default_value {
+      s: "BYTE"
+    }
+    allowed_values {
+      list {
+        s: "BYTE"
+        s: "UTF8_CHAR"
+      }
+    }
+  }
 }
 op {
   name: "Sum"
-- 
GitLab


From 589e876139f4c7fbdf96edaa16fdcfe12c7a4b03 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Thu, 4 Oct 2018 13:20:58 -0700
Subject: [PATCH 434/570] Error out when PartitionedCall is created with the
 wrong number of arguments.

(used to be a segfault)

PiperOrigin-RevId: 215791737
---
 tensorflow/core/kernels/partitioned_function_ops.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index fdb4c84c46..3979e4b53a 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -97,6 +97,13 @@ class PartitionedCallOp : public AsyncOpKernel {
         OP_REQUIRES_ASYNC(ctx, fbody != nullptr,
                           errors::Internal("Could not find handle ", handle),
                           done);
+        OP_REQUIRES_ASYNC(
+            ctx, args.size() == fbody->arg_nodes.size(),
+            errors::InvalidArgument(
+                "Wrong number of arguments to the op; function expects ",
+                fbody->arg_nodes.size(), " but PartitionedCall received ",
+                args.size()),
+            done);
         // We need to pass global op_registry as default_registry when creating
         // graph. So that graph optimization passes can lookup all possible ops
         // by name.
-- 
GitLab


From 9e8c7afa5867bd19b6684458566b064148b2665b Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Thu, 4 Oct 2018 13:34:31 -0700
Subject: [PATCH 435/570] Add TF_BUILD_TEST_TIMEOUT to
 ci_parameterized_build.sh

PiperOrigin-RevId: 215793932
---
 .../tools/ci_build/ci_parameterized_build.sh  | 27 ++++++++++++-------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 99bdedf7b4..fdff867ff0 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -83,6 +83,9 @@
 #                     Use the specified configurations when building.
 #                     When set, overrides TF_BUILD_IS_OPT and TF_BUILD_MAVX
 #                     options, as this will replace the two.
+#   TF_BUILD_TEST_TIMEOUT:
+#                     Sets the value of bazel --test_timeout, defaults to -1
+#                     which uses the bazel defaults.
 #   TF_SKIP_CONTRIB_TESTS:
 #                     If set to any non-empty or non-0 value, will skip running
 #                     contrib tests.
@@ -125,6 +128,8 @@ NO_DOCKER_OPT_FLAG="--genrule_strategy=standalone"
 
 DO_DOCKER=1
 
+# Bazel uses defaults for all test sizes when given `-1`.
+TF_BUILD_TEST_TIMEOUT=${TF_BUILD_TEST_TIMEOUT:--1}
 
 # Helpful flags:
 # --test_summary=detailed: Tell us more about which targets are being built
@@ -132,7 +137,16 @@ DO_DOCKER=1
 # --build_tests_only: Don't build targets depended on by tests if the test is
 #                     disabled. Also saves some compilation time. Otherwise,
 #                     tries to build everything.
-BAZEL_TEST_FLAGS="--test_summary=detailed --build_tests_only --keep_going"
+# --test_timeout: Test timeouts in the order short,moderate,long,eternal.
+# --test_env: Environment variables to set when running bazel tests. These are
+#             especially important when using --run_under with
+#             parallel_gpu_execute.
+BAZEL_TEST_FLAGS=""\
+"--test_summary=detailed --build_tests_only --keep_going "\
+"--test_timeout=${TF_BUILD_TEST_TIMEOUT} "\
+"--test_env=TF_GPU_COUNT=${TF_GPU_COUNT} "\
+"--test_env=TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU} "\
+"--test_env=TF_PER_DEVICE_MEMORY_LIMIT_MB=${TF_PER_DEVICE_MEMORY_LIMIT_MB}"
 BAZEL_BUILD_FLAGS="--keep_going"
 
 BAZEL_CMD="bazel test ${BAZEL_TEST_FLAGS}"
@@ -148,13 +162,6 @@ ANDROID_FULL_CMD="${CI_BUILD_DIR}/builds/android_full.sh"
 TF_GPU_COUNT=${TF_GPU_COUNT:-4}
 PARALLEL_GPU_TEST_CMD='//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute'
 
-# Environment variables to set when running bazel tests.  These are especially
-# important when using --run_under with parallel_gpu_execute.
-BAZEL_TEST_ENV=""\
-"--test_env=TF_GPU_COUNT=${TF_GPU_COUNT} "\
-"--test_env=TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU} "\
-"--test_env=TF_PER_DEVICE_MEMORY_LIMIT_MB=${TF_PER_DEVICE_MEMORY_LIMIT_MB} "
-
 BENCHMARK_CMD="${CI_BUILD_DIR}/builds/benchmark.sh"
 
 EXTRA_PARAMS=""
@@ -415,11 +422,11 @@ if [[ ${TF_BUILD_IS_PIP} == "no_pip" ]] ||
   if [[ ${CTYPE} == cpu* ]] || \
      [[ ${CTYPE} == "debian.jessie.cpu" ]]; then
     # CPU only command, fully parallel.
-    NO_PIP_MAIN_CMD="${MAIN_CMD} ${BAZEL_CMD} ${BAZEL_TEST_ENV} ${OPT_FLAG} "\
+    NO_PIP_MAIN_CMD="${MAIN_CMD} ${BAZEL_CMD} ${OPT_FLAG} "\
       "${EXTRA_ARGS} -- ${BAZEL_TARGET}"
   elif [[ ${CTYPE} == gpu* ]]; then
     # GPU only command, run as many jobs as the GPU count only.
-    NO_PIP_MAIN_CMD="${BAZEL_CMD} ${BAZEL_TEST_ENV} ${OPT_FLAG} "\
+    NO_PIP_MAIN_CMD="${BAZEL_CMD} ${OPT_FLAG} "\
 "--local_test_jobs=${TF_GPU_COUNT} "\
 "--run_under=${PARALLEL_GPU_TEST_CMD} "\
 "${EXTRA_ARGS} -- ${BAZEL_TARGET}"
-- 
GitLab


From 9f2d1e2cf6be4a17b6318b429447a71d9d48af32 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 13:35:31 -0700
Subject: [PATCH 436/570] Few more fixes for issued in parsing invalid HLO
 module proto.

PiperOrigin-RevId: 215794086
---
 tensorflow/compiler/xla/literal.cc                |  8 ++++----
 .../compiler/xla/service/hlo_instruction.cc       |  4 ++--
 .../compiler/xla/service/hlo_parser_test.cc       |  2 +-
 tensorflow/compiler/xla/service/hlo_sharding.cc   | 15 +++++++++++++++
 tensorflow/compiler/xla/shape_util.cc             |  7 ++-----
 5 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 177f39cc74..656ce720a1 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -1945,11 +1945,11 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) {
       }
     } break;
     case TUPLE:
-      LOG(FATAL) << "Should not be called on tuple shapes: "
-                 << ShapeUtil::HumanString(subshape());
-      break;
+      return InvalidArgument("Should not be called on tuple shapes: %s",
+                             ShapeUtil::HumanString(subshape()));
     default:
-      LOG(FATAL) << "Unhandled primitive type " << subshape().element_type();
+      return InvalidArgument("Is called on unsupported shape: %s",
+                             ShapeUtil::HumanString(subshape()));
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index fb91adc302..2f6db7cd7c 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -465,8 +465,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kIota:
-      TF_RET_CHECK(proto.dimensions_size() <= 1)
-          << "Iota instruction should have at most 1 dimension but sees "
+      TF_RET_CHECK(proto.dimensions_size() == 1)
+          << "Iota instruction should have 1 dimension but sees "
           << proto.dimensions_size();
       instruction = CreateIota(proto.shape(), proto.dimensions(0));
       break;
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index b618510640..255123d331 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -1304,7 +1304,7 @@ TEST_F(HloParserTest, MoreConstants) {
 
 ENTRY %SelectScalarS32True.v4 () -> s32[] {
   %constant.2 = pred[] constant(true)
-  %constant.1 = s32[] constant(-42), sharding={s32[5,6] devices=[2,3]1,2,3,4}
+  %constant.1 = s32[] constant(-42), sharding={s32[5,6] devices=[2,2]1,2,3,4}
   %constant = s32[] constant(42)
   %select = s32[] select(pred[] %constant.2, s32[] %constant.1, s32[] %constant)
 }
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 94c7bafd3b..188f4acc79 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/overflow_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
@@ -377,6 +378,20 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
       << "Maximal sharding is expected to have single device assignment, but "
       << proto.tile_assignment_devices().size() << " has provided.";
 
+  TF_RET_CHECK(proto.tile_assignment_devices().size() > 1);
+  TF_RET_CHECK(!proto.tile_assignment_dimensions().empty());
+
+  // RE: the product of tile assignment tensor dimensions must be
+  // equal to tile_assignment_devices.size().
+  int64 product_of_dimensions = 1;
+  for (auto dimension : proto.tile_assignment_dimensions()) {
+    TF_RET_CHECK(dimension > 0);
+    product_of_dimensions =
+        MultiplyWithoutOverflow(product_of_dimensions, dimension);
+    TF_RET_CHECK(product_of_dimensions > 0);
+  }
+  TF_RET_CHECK(product_of_dimensions == proto.tile_assignment_devices().size());
+
   // Some versions of gcc cannot infer the TileAssignment constructor from a
   // braced initializer-list, so create one manually.
   std::vector<int64> devices(proto.tile_assignment_devices().begin(),
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 476a9fe868..d244923532 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -869,11 +869,8 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
     return Status::OK();
   }
 
-  if (Rank(shape) != shape.dimensions_size()) {
-    return InvalidArgument(
-        "shape's rank is mismatched with dimension count; rank=%d "
-        "dimensions_size=%d",
-        Rank(shape), shape.dimensions_size());
+  if (LayoutUtil::IsSparseArray(shape) && Rank(shape) == 0) {
+    return InvalidArgument("sparse arrays must have rank > 0");
   }
   for (int64 i = 0; i < Rank(shape); ++i) {
     int64 dimension = shape.dimensions(i);
-- 
GitLab


From d96e073e77929006c519cd3082461d9757865dd7 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 4 Oct 2018 13:42:48 -0700
Subject: [PATCH 437/570] [TF:XLA] Fix inverted condition in randomized test.

PiperOrigin-RevId: 215795518
---
 tensorflow/compiler/tests/randomized_tests.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index 7a96f4c25c..dc119fb0f8 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -1820,7 +1820,7 @@ TEST_F(OpTest, Diag) {
     do {
       dims = RandomDims(1);
       size = TensorShape(dims).num_elements();
-    } while (size * size < tf_xla_max_tensor_size);
+    } while (size * size > tf_xla_max_tensor_size);
     return ExpectTfAndXlaOutputsAreClose(
         OpTestBuilder("Diag").RandomInput(type, dims).Attr("T", type));
   });
-- 
GitLab


From 08ecc62a38dc58e85cb46ad281486d1c75b1db9b Mon Sep 17 00:00:00 2001
From: Dimitris Vardoulakis <dimvar@google.com>
Date: Thu, 4 Oct 2018 13:43:31 -0700
Subject: [PATCH 438/570] [TF:XLA] Improve the accounting for subcomputations
 in the List scheduler to avoid double-counting.

PiperOrigin-RevId: 215795640
---
 .../xla/service/hlo_memory_scheduler.cc       | 29 ++++++++++++++-----
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
index bf30764488..5cee865b7a 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
@@ -195,13 +195,15 @@ class ListScheduler {
     return entry;
   }
 
-  // Returns the number of bytes freed if the HLO instruction is scheduled.
-  // If the instruction calls subcomputations, we count the memory used by the
-  // subcomputations as memory "defined" by the instruction. This is not
-  // entirely accurate, because subcomputation memory will be freed after the
-  // instruction finishes. But it is more accurate than not taking
-  // subcomputations into account at all. In the future, we may improve
-  // accounting for subcomputation memory (b/65409243).
+  // Returns the number of bytes freed *after* the HLO instruction finishes.
+  // The current List algorithm only considers two states for an instruction:
+  // right before it runs, and after it finishes. We don't represent memory
+  // usage during the execution of an instruction. But if the instruction calls
+  // subcomputations, they are only live during the instruction's execution.
+  // We end up counting the memory used by subcomputations as memory "defined"
+  // by the instruction. This is not entirely accurate, but it is more accurate
+  // than not taking subcomputations into account at all. In the future, we may
+  // improve accounting for subcomputation memory (b/65409243).
   int64 BytesFreedIfScheduled(const ReadyListEntry& entry) {
     int64 freed_bytes = 0;
     for (const auto& kv : entry.used_buffer_unscheduled_use_counts) {
@@ -223,7 +225,18 @@ class ListScheduler {
         }
       }
     }
-    return freed_bytes - entry.bytes_defined - max_subcomputation_bytes;
+    int64 bytes_defined;
+    if (max_subcomputation_bytes > 0 &&
+        (entry.instruction->opcode() == HloOpcode::kWhile ||
+         entry.instruction->opcode() == HloOpcode::kCall ||
+         entry.instruction->opcode() == HloOpcode::kConditional)) {
+      // The output buffer of while/call/conditional is always aliased with the
+      // output buffer of the root instruction in the body. Don't double count.
+      bytes_defined = max_subcomputation_bytes;
+    } else {
+      bytes_defined = entry.bytes_defined + max_subcomputation_bytes;
+    }
+    return freed_bytes - bytes_defined;
   }
 
   // Constructs the scheduling priority of the given instruction.
-- 
GitLab


From 4c1da53840fed235409cb2c571ea081e28388f75 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 4 Oct 2018 13:53:19 -0700
Subject: [PATCH 439/570] Internal change.

PiperOrigin-RevId: 215797256
---
 tensorflow/python/kernel_tests/depthwise_conv_op_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 6aee2eb0a3..737a73f97a 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -131,7 +131,7 @@ class DepthwiseConv2DTest(test.TestCase):
     with self.session(graph=graph, use_gpu=use_gpu) as sess:
       tolerance = {
           dtypes.float16: 4e-2,
-          dtypes.float32: 1e-6,
+          dtypes.float32: 1e-5,
           dtypes.float64: 1e-12,
       }[data_type]
 
-- 
GitLab


From a2e48d849f5c7a97b788ba8d2499e95aaef95945 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 14:18:22 -0700
Subject: [PATCH 440/570] Fix problem in quantized version of Comparison op
 handler

PiperOrigin-RevId: 215801773
---
 tensorflow/contrib/lite/kernels/comparisons.cc   | 16 +++++-----------
 .../contrib/lite/kernels/comparisons_test.cc     | 11 +++++++++++
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/comparisons.cc b/tensorflow/contrib/lite/kernels/comparisons.cc
index f765235e04..3926af5b97 100644
--- a/tensorflow/contrib/lite/kernels/comparisons.cc
+++ b/tensorflow/contrib/lite/kernels/comparisons.cc
@@ -66,31 +66,25 @@ TfLiteStatus ComparisonPrepare(TfLiteContext* context, TfLiteNode* node) {
     if (input1->type == kTfLiteUInt8) {                                        \
       auto input1_offset = -input1->params.zero_point;                         \
       auto input2_offset = -input2->params.zero_point;                         \
-      const int left_shift = 20;                                               \
-      const double twice_max_input_scale =                                     \
-          2 * std::max(input1->params.scale, input2->params.scale);            \
-      const double real_input1_multiplier =                                    \
-          input1->params.scale / twice_max_input_scale;                        \
-      const double real_input2_multiplier =                                    \
-          input2->params.scale / twice_max_input_scale;                        \
+      const int left_shift = 8;                                                \
                                                                                \
       int32 input1_multiplier;                                                 \
       int input1_shift;                                                        \
-      QuantizeMultiplierSmallerThanOneExp(real_input1_multiplier,              \
+      QuantizeMultiplierSmallerThanOneExp(input1->params.scale,                \
                                           &input1_multiplier, &input1_shift);  \
       int32 input2_multiplier;                                                 \
       int input2_shift;                                                        \
-      QuantizeMultiplierSmallerThanOneExp(real_input2_multiplier,              \
+      QuantizeMultiplierSmallerThanOneExp(input2->params.scale,                \
                                           &input2_multiplier, &input2_shift);  \
                                                                                \
       ComparisonParams op_params;                                              \
       op_params.left_shift = left_shift;                                       \
       op_params.input1_offset = input1_offset;                                 \
       op_params.input1_multiplier = input1_multiplier;                         \
-      op_params.input1_shift = -input1_shift;                                  \
+      op_params.input1_shift = input1_shift;                                   \
       op_params.input2_offset = input2_offset;                                 \
       op_params.input2_multiplier = input2_multiplier;                         \
-      op_params.input2_shift = -input2_shift;                                  \
+      op_params.input2_shift = input2_shift;                                   \
       if (requires_broadcast) {                                                \
         reference_ops::Broadcast4DSlow##opname##WithScaling(                   \
             op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1), \
diff --git a/tensorflow/contrib/lite/kernels/comparisons_test.cc b/tensorflow/contrib/lite/kernels/comparisons_test.cc
index 67a91c17fd..04c8bf2e30 100644
--- a/tensorflow/contrib/lite/kernels/comparisons_test.cc
+++ b/tensorflow/contrib/lite/kernels/comparisons_test.cc
@@ -402,6 +402,17 @@ TEST(ComparisonsTest, GreaterQuantized) {
   EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false));
 }
 
+TEST(ComparisonsTest, GreaterQuantizedSmallRange) {
+  ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, 0.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, 0.0, 2.0},
+                          TensorType_UINT8, BuiltinOperator_GREATER);
+  model.QuantizeAndPopulate<uint8_t>(model.input1(), {1.0, 0.5, 0.35, 0.1});
+  model.QuantizeAndPopulate<uint8_t>(model.input2(), {1.01, 0.25, 0.3, 0.4});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false));
+}
+
 TEST(ComparisonsTest, GreaterEqualQuantized) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-- 
GitLab


From b01ea7a51c07f6d2988d7f2aa117374591d1e25a Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Thu, 4 Oct 2018 14:18:58 -0700
Subject: [PATCH 441/570] Rename "Inliner" to "MapInliner".

PiperOrigin-RevId: 215801897
---
 tensorflow/compiler/xla/service/BUILD         | 69 +++++++++----------
 tensorflow/compiler/xla/service/cpu/BUILD     |  2 +-
 .../compiler/xla/service/cpu/cpu_compiler.cc  |  4 +-
 .../compiler/xla/service/interpreter/BUILD    |  2 +-
 .../xla/service/interpreter/compiler.cc       |  2 +-
 .../service/{inliner.cc => map_inliner.cc}    | 19 +++--
 .../xla/service/{inliner.h => map_inliner.h}  | 22 +++---
 .../{inliner_test.cc => map_inliner_test.cc}  | 20 +++---
 8 files changed, 68 insertions(+), 72 deletions(-)
 rename tensorflow/compiler/xla/service/{inliner.cc => map_inliner.cc} (87%)
 rename tensorflow/compiler/xla/service/{inliner.h => map_inliner.h} (59%)
 rename tensorflow/compiler/xla/service/{inliner_test.cc => map_inliner_test.cc} (95%)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 2f8bab0614..4797cf3330 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1841,42 +1841,6 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "inliner",
-    srcs = ["inliner.cc"],
-    hdrs = ["inliner.h"],
-    deps = [
-        ":hlo",
-        ":hlo_pass",
-        ":hlo_query",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
-tf_cc_test(
-    name = "inliner_test",
-    srcs = ["inliner_test.cc"],
-    deps = [
-        ":cpu_plugin",
-        ":hlo",
-        ":hlo_matchers",
-        ":inliner",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
-        "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "@com_google_absl//absl/memory",
-    ],
-)
-
 cc_library(
     name = "computation_placer",
     srcs = ["computation_placer.cc"],
@@ -3492,6 +3456,39 @@ cc_library(
     deps = ["//tensorflow/core:lib"],
 )
 
+cc_library(
+    name = "map_inliner",
+    srcs = ["map_inliner.cc"],
+    hdrs = ["map_inliner.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        ":hlo_query",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "map_inliner_test",
+    srcs = ["map_inliner_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_matchers",
+        ":map_inliner",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "@com_google_absl//absl/memory",
+    ],
+)
+
 tf_cc_test(
     name = "hlo_casting_utils_test",
     srcs = ["hlo_casting_utils_test.cc"],
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index ae4c6e962d..58abb330a6 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -94,6 +94,7 @@ cc_library(
         ":target_machine_features",
         "@com_google_absl//absl/types:span",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
+        "//tensorflow/compiler/xla/service:map_inliner",
         "//tensorflow/compiler/xla/service:scatter_expander",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
@@ -127,7 +128,6 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_subcomputation_unification",
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/service:indexed_array_analysis",
-        "//tensorflow/compiler/xla/service:inliner",
         "//tensorflow/compiler/xla/service:llvm_compiler",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index afc94f2185..5834f67285 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -86,8 +86,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/service/indexed_array_analysis.h"
-#include "tensorflow/compiler/xla/service/inliner.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/map_inliner.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/scatter_expander.h"
@@ -249,7 +249,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
       &pipeline, module->config().debug_options(),
       ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION);
 
-  pipeline.AddPass<Inliner>();
+  pipeline.AddPass<MapInliner>();
 
   // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner
   // pass.
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index 146c9052f1..1484e14df1 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -45,8 +45,8 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:hlo_pass_pipeline",
         "//tensorflow/compiler/xla/service:hlo_subcomputation_unification",
-        "//tensorflow/compiler/xla/service:inliner",
         "//tensorflow/compiler/xla/service:layout_assignment",
+        "//tensorflow/compiler/xla/service:map_inliner",
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:while_loop_simplifier",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 27fe89375d..7c79eb7d79 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -28,9 +28,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
 #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
-#include "tensorflow/compiler/xla/service/inliner.h"
 #include "tensorflow/compiler/xla/service/interpreter/executable.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
+#include "tensorflow/compiler/xla/service/map_inliner.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/xla/service/inliner.cc b/tensorflow/compiler/xla/service/map_inliner.cc
similarity index 87%
rename from tensorflow/compiler/xla/service/inliner.cc
rename to tensorflow/compiler/xla/service/map_inliner.cc
index 50c408f5bb..2200ef054a 100644
--- a/tensorflow/compiler/xla/service/inliner.cc
+++ b/tensorflow/compiler/xla/service/map_inliner.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/inliner.h"
+#include "tensorflow/compiler/xla/service/map_inliner.h"
 
 #include <memory>
 #include <string>
@@ -32,10 +32,10 @@ limitations under the License.
 
 namespace xla {
 
-// InlinerVisitor traverses the HLO computation and inlines maps.
-class InlinerVisitor : public DfsHloVisitorWithDefault {
+// MapInlinerVisitor traverses the HLO computation and inlines maps.
+class MapInlinerVisitor : public DfsHloVisitorWithDefault {
  public:
-  explicit InlinerVisitor(HloComputation* computation)
+  explicit MapInlinerVisitor(HloComputation* computation)
       : computation_(computation) {}
 
   // Default visitor action is to do nothing and return OK.
@@ -49,24 +49,23 @@ class InlinerVisitor : public DfsHloVisitorWithDefault {
   StatusOr<bool> Run(HloComputation* computation);
 
  private:
-  // Current HloComputation instance the InlinerVisitor is traversing.
+  // Current HloComputation instance the MapInlinerVisitor is traversing.
   HloComputation* computation_;
 
   // Whether algebraic simplification has occurred.
   bool changed_ = false;
 };
 
-StatusOr<bool> InlinerVisitor::Run(HloComputation* computation) {
+StatusOr<bool> MapInlinerVisitor::Run(HloComputation* computation) {
   changed_ = false;
   computation_ = computation;
   TF_RETURN_IF_ERROR(computation->root_instruction()->Accept(this));
   return changed_;
 }
 
-Status InlinerVisitor::HandleMap(HloInstruction* map) {
+Status MapInlinerVisitor::HandleMap(HloInstruction* map) {
   HloComputation* function = map->to_apply();
   HloInstruction& root = *function->root_instruction();
-  // TODO(b/29249531): Add DCE pass to remove unused HloComputations.
   // Only inlining functions that are simply a single operation until a better
   // profitability model for inlining is defined.
   if (hlo_query::AllOperandsAreParameters(root)) {
@@ -112,8 +111,8 @@ Status InlinerVisitor::HandleMap(HloInstruction* map) {
   return Status::OK();
 }
 
-StatusOr<bool> Inliner::Run(HloModule* module) {
-  InlinerVisitor visitor(/*computation=*/nullptr);
+StatusOr<bool> MapInliner::Run(HloModule* module) {
+  MapInlinerVisitor visitor(/*computation=*/nullptr);
   bool changed = false;
   for (HloComputation* computation : module->computations()) {
     TF_ASSIGN_OR_RETURN(bool computation_changed, visitor.Run(computation));
diff --git a/tensorflow/compiler/xla/service/inliner.h b/tensorflow/compiler/xla/service/map_inliner.h
similarity index 59%
rename from tensorflow/compiler/xla/service/inliner.h
rename to tensorflow/compiler/xla/service/map_inliner.h
index e20af08fb7..b679118118 100644
--- a/tensorflow/compiler/xla/service/inliner.h
+++ b/tensorflow/compiler/xla/service/map_inliner.h
@@ -13,27 +13,27 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INLINER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_INLINER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MAP_INLINER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_MAP_INLINER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
 namespace xla {
 
-// A pass which performs inlining. Which can result, for example, in functions
-// that were previously being mapped by Map instead directly applied to the
-// forwarded operands (i.e., map({X, Y}, max) -> max(X, Y)).
-class Inliner : public HloModulePass {
+// A pass which performs map inlining. This replaces kMap instructions with
+// their equivalent sequence of array operations. For example:
+//   map({X, Y}, add) -> add(X, Y)).
+class MapInliner : public HloModulePass {
  public:
-  ~Inliner() override = default;
-  absl::string_view name() const override { return "inline"; }
+  ~MapInliner() override = default;
+  absl::string_view name() const override { return "map-inline"; }
 
-  // Run inlining on the given computation. Returns whether the computation was
-  // changed.
+  // Run map inlining on the given computation. Returns whether the computation
+  // was changed.
   StatusOr<bool> Run(HloModule* module) override;
 };
 
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INLINER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_MAP_INLINER_H_
diff --git a/tensorflow/compiler/xla/service/inliner_test.cc b/tensorflow/compiler/xla/service/map_inliner_test.cc
similarity index 95%
rename from tensorflow/compiler/xla/service/inliner_test.cc
rename to tensorflow/compiler/xla/service/map_inliner_test.cc
index 98e0f2cfd7..84059dd0f7 100644
--- a/tensorflow/compiler/xla/service/inliner_test.cc
+++ b/tensorflow/compiler/xla/service/map_inliner_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/inliner.h"
+#include "tensorflow/compiler/xla/service/map_inliner.h"
 
 #include <memory>
 #include <utility>
@@ -35,10 +35,10 @@ namespace op = xla::testing::opcode_matchers;
 namespace xla {
 namespace {
 
-using InlinerTest = HloVerifiedTestBase;
+using MapInlinerTest = HloVerifiedTestBase;
 
 // Test that `map` with `max` is transformed to `max`
-TEST_F(InlinerTest, MapMax) {
+TEST_F(MapInlinerTest, MapMax) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
 
   auto max_builder = HloComputation::Builder(TestName());
@@ -63,7 +63,7 @@ TEST_F(InlinerTest, MapMax) {
   hlo_module->AddEmbeddedComputation(std::move(max_f32));
   hlo_module->AddEntryComputation(std::move(computation));
 
-  Inliner inliner;
+  MapInliner inliner;
   EXPECT_TRUE(inliner.Run(hlo_module).ValueOrDie());
   EXPECT_THAT(hlo_module->entry_computation()->root_instruction(),
               op::Maximum(lhs, rhs));
@@ -75,7 +75,7 @@ TEST_F(InlinerTest, MapMax) {
 }
 
 // Test that `constant` function is changed to `broadcast`.
-TEST_F(InlinerTest, MapConstant) {
+TEST_F(MapInlinerTest, MapConstant) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
 
   auto const2_builder = HloComputation::Builder(TestName());
@@ -97,7 +97,7 @@ TEST_F(InlinerTest, MapConstant) {
   hlo_module->AddEmbeddedComputation(std::move(const2_f32));
   hlo_module->AddEntryComputation(std::move(computation));
   HloInstruction* root = hlo_module->entry_computation()->root_instruction();
-  Inliner inliner;
+  MapInliner inliner;
   EXPECT_TRUE(inliner.Run(hlo_module).ValueOrDie());
   root = hlo_module->entry_computation()->root_instruction();
   EXPECT_THAT(root, op::Broadcast(op::Constant()));
@@ -108,7 +108,7 @@ TEST_F(InlinerTest, MapConstant) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, expected));
 }
 
-TEST_F(InlinerTest, MapSubtractOppositeOrder) {
+TEST_F(MapInlinerTest, MapSubtractOppositeOrder) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
 
   // Note that the parameter ordinals are in the opposite order to their
@@ -135,7 +135,7 @@ TEST_F(InlinerTest, MapSubtractOppositeOrder) {
   hlo_module->AddEmbeddedComputation(std::move(max_f32));
   hlo_module->AddEntryComputation(std::move(computation));
 
-  Inliner inliner;
+  MapInliner inliner;
   EXPECT_TRUE(inliner.Run(hlo_module).ValueOrDie());
   EXPECT_THAT(hlo_module->entry_computation()->root_instruction(),
           op::Subtract(rhs, lhs));
@@ -146,7 +146,7 @@ TEST_F(InlinerTest, MapSubtractOppositeOrder) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, expected));
 }
 
-TEST_F(InlinerTest, MapParameter) {
+TEST_F(MapInlinerTest, MapParameter) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
 
   auto param_builder = HloComputation::Builder(TestName());
@@ -167,7 +167,7 @@ TEST_F(InlinerTest, MapParameter) {
   hlo_module->AddEmbeddedComputation(std::move(param_f32));
   hlo_module->AddEntryComputation(std::move(computation));
 
-  Inliner inliner;
+  MapInliner inliner;
   EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie());
   EXPECT_THAT(hlo_module->entry_computation()->root_instruction(), rhs);
 
-- 
GitLab


From b74c9aa65fcbe615495a972a5021e983707d02f6 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Thu, 4 Oct 2018 14:24:25 -0700
Subject: [PATCH 442/570] Add apidefs for the list ops.

PiperOrigin-RevId: 215802845
---
 .../api_def/python_api/api_defTensorListPushBackBatch.pbtxt   | 4 ++++
 .../core/api_def/python_api/api_def_EmptyTensorList.pbtxt     | 4 ++++
 .../api_def/python_api/api_def_TensorListConcatLists.pbtxt    | 4 ++++
 .../api_def/python_api/api_def_TensorListElementShape.pbtxt   | 4 ++++
 .../api_def/python_api/api_def_TensorListFromTensor.pbtxt     | 4 ++++
 .../core/api_def/python_api/api_def_TensorListGather.pbtxt    | 4 ++++
 .../core/api_def/python_api/api_def_TensorListGetItem.pbtxt   | 4 ++++
 .../core/api_def/python_api/api_def_TensorListLength.pbtxt    | 4 ++++
 .../core/api_def/python_api/api_def_TensorListPopBack.pbtxt   | 4 ++++
 .../core/api_def/python_api/api_def_TensorListPushBack.pbtxt  | 4 ++++
 .../core/api_def/python_api/api_def_TensorListReserve.pbtxt   | 4 ++++
 .../core/api_def/python_api/api_def_TensorListScatter.pbtxt   | 4 ++++
 .../core/api_def/python_api/api_def_TensorListSetItem.pbtxt   | 4 ++++
 .../core/api_def/python_api/api_def_TensorListStack.pbtxt     | 4 ++++
 14 files changed, 56 insertions(+)
 create mode 100644 tensorflow/core/api_def/python_api/api_defTensorListPushBackBatch.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_EmptyTensorList.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorListConcatLists.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorListElementShape.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorListFromTensor.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorListGather.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorListGetItem.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorListLength.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorListPopBack.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorListPushBack.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorListReserve.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorListScatter.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorListSetItem.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorListStack.pbtxt

diff --git a/tensorflow/core/api_def/python_api/api_defTensorListPushBackBatch.pbtxt b/tensorflow/core/api_def/python_api/api_defTensorListPushBackBatch.pbtxt
new file mode 100644
index 0000000000..3d937c745c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_defTensorListPushBackBatch.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListPushBackBatch"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_EmptyTensorList.pbtxt b/tensorflow/core/api_def/python_api/api_def_EmptyTensorList.pbtxt
new file mode 100644
index 0000000000..44f25b5d93
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EmptyTensorList.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "EmptyTensorList"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListConcatLists.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListConcatLists.pbtxt
new file mode 100644
index 0000000000..45fc55e71e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListConcatLists.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListConcatLists"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListElementShape.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListElementShape.pbtxt
new file mode 100644
index 0000000000..e1ad713e7f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListElementShape.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListElementShape"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListFromTensor.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListFromTensor.pbtxt
new file mode 100644
index 0000000000..4aaefba3c5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListFromTensor.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListFromTensor"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListGather.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListGather.pbtxt
new file mode 100644
index 0000000000..aaf607d70e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListGather.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListGather"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListGetItem.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListGetItem.pbtxt
new file mode 100644
index 0000000000..3bb5f39cbc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListGetItem.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListGetItem"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListLength.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListLength.pbtxt
new file mode 100644
index 0000000000..a04c20bb8a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListLength.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListLength"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListPopBack.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListPopBack.pbtxt
new file mode 100644
index 0000000000..9287162f22
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListPopBack.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListPopBack"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListPushBack.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListPushBack.pbtxt
new file mode 100644
index 0000000000..da2bc11721
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListPushBack.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListPushBack"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListReserve.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListReserve.pbtxt
new file mode 100644
index 0000000000..77e63747d5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListReserve.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListReserve"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListScatter.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListScatter.pbtxt
new file mode 100644
index 0000000000..0015189d7f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListScatter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListScatter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListSetItem.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListSetItem.pbtxt
new file mode 100644
index 0000000000..4999ee7ad9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListSetItem.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListSetItem"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListStack.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListStack.pbtxt
new file mode 100644
index 0000000000..2dc7b2784b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListStack.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListStack"
+  visibility: HIDDEN
+}
-- 
GitLab


From ac7b84de8803edbb2d4da573b3f8704e9fad8fa8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 14:45:32 -0700
Subject: [PATCH 443/570] Internal change.

PiperOrigin-RevId: 215806953
---
 tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
index 9f62ac3f2c..c22a457a71 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
@@ -113,6 +113,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // input configuration.
   TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
 
+  TF_LITE_ENSURE_EQ(context, input->dims->size, 3);
   const int batch_size = input->dims->data[0];
   const int max_time = input->dims->data[1];
   const int fw_num_units = fw_input_weights->dims->data[0];
-- 
GitLab


From a742575879db1df48daf929b8d29e43a1d168dd7 Mon Sep 17 00:00:00 2001
From: Smit Hinsu <hinsu@google.com>
Date: Thu, 4 Oct 2018 14:55:14 -0700
Subject: [PATCH 444/570] Automated rollback of commit
 6b538d9ce54e878576131cde0c76e43a893180c2

PiperOrigin-RevId: 215808649
---
 tensorflow/python/data/kernel_tests/BUILD     |  1 -
 tensorflow/tensorflow.bzl                     | 39 ++++++++-----------
 .../tools/pip_package/pip_smoke_test.py       |  2 +-
 3 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 10ec0dbe1c..c7295d6e69 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -306,7 +306,6 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
     ],
     tags = [
-        "no_oss",  # TODO(b/116813115): Investigate timeout and re-enable.
         "no_windows_gpu",
     ],
 )
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index dead44c57e..cad5de1b0c 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1798,29 +1798,22 @@ def cuda_py_test(
         flaky = 0,
         xla_enabled = False,
         grpc_enabled = False):
-    if main == None:
-        main = name + ".py"
-    for config in ["cpu", "gpu"]:
-        test_name = name
-        test_tags = tags
-        if config == "gpu":
-            test_name += "_gpu"
-            test_tags = test_tags + tf_cuda_tests_tags()
-        tf_py_test(
-            name = test_name,
-            size = size,
-            srcs = srcs,
-            data = data,
-            main = main,
-            args = args,
-            tags = test_tags,
-            shard_count = shard_count,
-            additional_deps = additional_deps,
-            kernels = kernels,
-            flaky = flaky,
-            xla_enabled = xla_enabled,
-            grpc_enabled = grpc_enabled,
-        )
+    test_tags = tags + tf_cuda_tests_tags()
+    tf_py_test(
+        name = name,
+        size = size,
+        srcs = srcs,
+        data = data,
+        main = main,
+        args = args,
+        tags = test_tags,
+        shard_count = shard_count,
+        additional_deps = additional_deps,
+        kernels = kernels,
+        flaky = flaky,
+        xla_enabled = xla_enabled,
+        grpc_enabled = grpc_enabled,
+    )
 
 register_extension_info(
     extension_name = "cuda_py_test",
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index e7f9628fa6..c6ef82ccdc 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -142,7 +142,7 @@ def main():
 
   missing_dependencies = []
   # File extensions and endings to ignore
-  ignore_extensions = ["_test", "_test.py", "_test_gpu", "_test_gpu.py"]
+  ignore_extensions = ["_test", "_test.py"]
 
   ignored_files = 0
   blacklisted_files = len(BLACKLIST)
-- 
GitLab


From 2e2e89699c1186eef157911b57e4b062de376ce9 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Thu, 4 Oct 2018 14:59:43 -0700
Subject: [PATCH 445/570] Add basic TensorList op support in bridge.

* Add kernels for TensorListReserve. EmptyTensorList, TensorListElementShape, TensorListPushBack, TensorlistPopBack;
* Treat list type pretty much identical to Stack in the bridge for now;
* Support variant output by treating variant like a uint8 and leaving the interpretation up to the XlaExpression (variant type does not support tensor_data());

PiperOrigin-RevId: 215809335
---
 tensorflow/compiler/tests/BUILD               |  16 ++
 .../compiler/tests/tensor_list_ops_test.py    | 105 ++++++++
 tensorflow/compiler/tf2xla/kernels/BUILD      |   2 +
 .../tf2xla/kernels/tensor_list_ops.cc         | 226 ++++++++++++++++++
 tensorflow/compiler/tf2xla/xla_op_kernel.cc   |  40 +++-
 tensorflow/compiler/tf2xla/xla_op_kernel.h    |   5 +
 6 files changed, 384 insertions(+), 10 deletions(-)
 create mode 100644 tensorflow/compiler/tests/tensor_list_ops_test.py
 create mode 100644 tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index ee36729fd1..ba2401ed26 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -894,6 +894,22 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "tensor_list_ops_test",
+    size = "small",
+    srcs = ["tensor_list_ops_test.py"],
+    # TensorList ops are not implemented in the on-demand compilation model yet.
+    disabled_backends = "cpu_ondemand",
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:list_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:function",
+    ],
+)
+
 tf_xla_py_test(
     name = "ternary_ops_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/tensor_list_ops_test.py b/tensorflow/compiler/tests/tensor_list_ops_test.py
new file mode 100644
index 0000000000..b556723eec
--- /dev/null
+++ b/tensorflow/compiler/tests/tensor_list_ops_test.py
@@ -0,0 +1,105 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ops which manipulate lists of tensors via bridge."""
+
+# pylint: disable=g-bad-name
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.client import session
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import list_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+def scalar_shape():
+  return ops.convert_to_tensor([], dtype=dtypes.int32)
+
+
+class ListOpsTest(xla_test.XLATestCase):
+
+  def testElementShape(self):
+    with self.cached_session() as sess, self.test_scope():
+      dim = array_ops.placeholder(dtypes.int32)
+      l = list_ops.tensor_list_reserve(
+          element_shape=(dim, 15), num_elements=20,
+          element_dtype=dtypes.float32)
+      e32 = list_ops.tensor_list_element_shape(l, shape_type=dtypes.int32)
+      e64 = list_ops.tensor_list_element_shape(l, shape_type=dtypes.int64)
+      self.assertAllEqual(sess.run(e32, {dim: 10}), (10, 15))
+      self.assertAllEqual(sess.run(e64, {dim: 7}), (7, 15))
+
+  def testPushPop(self):
+    with self.cached_session() as sess, self.test_scope():
+      num = array_ops.placeholder(dtypes.int32)
+      l = list_ops.tensor_list_reserve(
+          element_shape=(7, 15), num_elements=num, element_dtype=dtypes.float32)
+      l = list_ops.tensor_list_push_back(
+          l, constant_op.constant(1.0, shape=(7, 15)))
+      l = list_ops.tensor_list_push_back(
+          l, constant_op.constant(2.0, shape=(7, 15)))
+      l, e2 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      _, e1 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(sess.run(e2, {num: 10}), 2.0 * np.ones((7, 15)))
+      self.assertAllEqual(sess.run(e1, {num: 10}), 1.0 * np.ones((7, 15)))
+
+  def testPushPopSeparateLists(self):
+    with self.cached_session() as sess, self.test_scope():
+      num = array_ops.placeholder(dtypes.int32)
+      l = list_ops.tensor_list_reserve(
+          element_shape=scalar_shape(),
+          num_elements=num,
+          element_dtype=dtypes.float32)
+      l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
+      l2 = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
+      l3 = list_ops.tensor_list_push_back(l, constant_op.constant(3.0))
+      _, e11 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      l2, e21 = list_ops.tensor_list_pop_back(l2, element_dtype=dtypes.float32)
+      l2, e22 = list_ops.tensor_list_pop_back(l2, element_dtype=dtypes.float32)
+      l3, e31 = list_ops.tensor_list_pop_back(l3, element_dtype=dtypes.float32)
+      l3, e32 = list_ops.tensor_list_pop_back(l3, element_dtype=dtypes.float32)
+      result = sess.run([e11, [e21, e22], [e31, e32]], {num: 20})
+      self.assertEqual(result, [1.0, [2.0, 1.0], [3.0, 1.0]])
+
+  def testEmptyTensorList(self):
+    dim = 7
+    with self.cached_session() as sess, self.test_scope():
+      p = array_ops.placeholder(dtypes.int32)
+      l = list_ops.empty_tensor_list(
+          element_shape=(p, 15), element_dtype=dtypes.float32)
+      l = list_ops.tensor_list_push_back(
+          l, constant_op.constant(1.0, shape=(dim, 15)))
+      _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Use TensorListReserve instead"):
+        self.assertEqual(sess.run(e, {p: dim}), 1.0 * np.ones((dim, 15)))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 9a7130f253..95a010a119 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -95,6 +95,7 @@ tf_kernel_library(
         "stateless_random_ops.cc",
         "strided_slice_op.cc",
         "tensor_array_ops.cc",
+        "tensor_list_ops.cc",
         "tile_ops.cc",
         "topk_op.cc",
         "training_ops.cc",
@@ -158,6 +159,7 @@ tf_kernel_library(
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:conv_ops",
         "//tensorflow/core/kernels:cwise_op",
+        "//tensorflow/core/kernels:list_kernels",
         "//tensorflow/core/kernels:no_op",
         "//tensorflow/core/kernels:ops_util",
         "//tensorflow/core/kernels:pooling_ops",
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
new file mode 100644
index 0000000000..74d4fcc425
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
@@ -0,0 +1,226 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// XLA TensorList operators.
+
+#include <limits>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+
+Status GetTensorListShape(xla::XlaBuilder* builder, xla::XlaOp op,
+                          TensorShape* tensor_list_shape) {
+  auto shape_or_status = builder->GetShape(op);
+  if (!shape_or_status.ok()) {
+    return shape_or_status.status();
+  }
+  xla::Shape shape = shape_or_status.ValueOrDie();
+  TF_RET_CHECK(xla::ShapeUtil::IsTuple(shape));
+  return XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape(shape, 0),
+                               tensor_list_shape);
+}
+
+class TensorListReserveOp : public XlaOpKernel {
+ public:
+  explicit TensorListReserveOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape element_shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &element_shape));
+    int64 num_elements;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_elements));
+
+    TensorShape tensor_shape;
+    tensor_shape.AddDim(num_elements);
+    tensor_shape.AppendShape(element_shape);
+
+    xla::XlaBuilder* b = ctx->builder();
+    ctx->SetOutput(0, xla::Tuple(b, {xla::Broadcast(XlaHelpers::Zero(b, dtype_),
+                                                    tensor_shape.dim_sizes()),
+                                     xla::ConstantR0<int32>(b, 0)}));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListReserveOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListReserve")
+                    .CompileTimeConstInput("element_shape")
+                    .CompileTimeConstInput("num_elements"),
+                TensorListReserveOp);
+
+class EmptyTensorListOp : public XlaOpKernel {
+ public:
+  explicit EmptyTensorListOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    ctx->CtxFailure(
+        errors::InvalidArgument("XLA compilation requires a fixed tensor list "
+                                "size. Use TensorListReserve instead."));
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(EmptyTensorListOp);
+};
+
+REGISTER_XLA_OP(Name("EmptyTensorList"), EmptyTensorListOp);
+
+class TensorListElementShapeOp : public XlaOpKernel {
+ public:
+  explicit TensorListElementShapeOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("shape_type", &shape_type_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* b = ctx->builder();
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx, GetTensorListShape(b, ctx->Input(0), &shape));
+    shape.RemoveDim(0);
+
+    switch (shape_type_) {
+      case DT_INT64:
+        ctx->SetOutput(0, xla::ConstantR1<int64>(b, shape.dim_sizes()));
+        break;
+      case DT_INT32: {
+        std::vector<int32> size;
+        for (int64 s : shape.dim_sizes()) {
+          size.push_back(s);
+        }
+        ctx->SetOutput(0, xla::ConstantR1<int32>(b, size));
+        break;
+      }
+      default:
+        ctx->CtxFailure(
+            errors::InvalidArgument("Unsupported shape type requested"));
+        return;
+    }
+  }
+
+ private:
+  DataType shape_type_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListElementShapeOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListElementShape"), TensorListElementShapeOp);
+
+class TensorListPushBackOp : public XlaOpKernel {
+ public:
+  explicit TensorListPushBackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* b = ctx->builder();
+    xla::XlaOp list = ctx->Input(0);
+    TensorShape elem_shape = ctx->InputShape(1);
+
+    xla::XlaOp ta = xla::GetTupleElement(list, 0);
+    xla::XlaOp index = xla::GetTupleElement(list, 1);
+    xla::XlaOp value = ctx->Input(1);
+
+    // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
+    auto start_indices =
+        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
+                 xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+
+    TensorShape slice_shape = elem_shape;
+    slice_shape.InsertDim(0, 1LL);
+    auto update = xla::Reshape(value, slice_shape.dim_sizes());
+
+    // TODO(phawkins): We don't check the index is in bounds --- there is no
+    // error mechanism in XLA.
+    ctx->SetOutput(
+        0, xla::Tuple(b, {xla::DynamicUpdateSlice(ta, update, start_indices),
+                          index + xla::ConstantR0<int32>(b, 1)}));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListPushBackOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListPushBack"), TensorListPushBackOp);
+
+class TensorListPopBackOp : public XlaOpKernel {
+ public:
+  explicit TensorListPopBackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* b = ctx->builder();
+    xla::XlaOp state = ctx->Input(0);
+
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx, GetTensorListShape(b, state, &shape));
+
+    xla::XlaOp ta = xla::GetTupleElement(state, 0);
+    xla::XlaOp index = xla::GetTupleElement(state, 1);
+
+    index = index - xla::ConstantR0<int32>(b, 1);
+
+    // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
+    auto start_indices =
+        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
+                 xla::MakeEdgePaddingConfig({{0, shape.dims() - 1}}));
+
+    auto slice_shape = shape.dim_sizes();
+    slice_shape[0] = 1LL;
+
+    // TODO(phawkins): We don't check the index is in bounds --- there is no
+    // error mechanism in XLA.
+    xla::XlaOp read = xla::DynamicSlice(ta, start_indices, slice_shape);
+    // Remove the leading '1' dimension.
+    std::vector<int64> value_shape(slice_shape.begin() + 1, slice_shape.end());
+
+    ctx->SetOutput(0, xla::Tuple(b, {ta, index}));
+    ctx->SetOutput(1, xla::Reshape(read, value_shape));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListPopBackOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListPopBack"), TensorListPopBackOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 2a9eaeee14..dd3498ef7a 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -455,23 +455,43 @@ Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type,
   return Status::OK();
 }
 
+Status XlaOpKernelContext::allocate_output(int index, const xla::Shape& shape,
+                                           Tensor** output) {
+  // The step's default allocator is the dummy XlaCompilationAllocator which
+  // simply allocates a metadata buffer to hold the expression to which it
+  // corresponds.
+  if (expected_output_dtype(index) == DT_VARIANT) {
+    // tensor_data() is not supported for variant Tensor (i.e.,
+    // DataTypeCanUseMemcpy is false for DT_VARIANT), and so storing the
+    // XlaExpression inside the Tensor's tensor_data() does not work for
+    // variant. Instead construct a uint8 tensor and store the expression in its
+    // value.
+    // TODO(jpienaar): This should be refactored to stop masquerading
+    // XlaExpressions as Tensors.
+    *output = new Tensor();
+    TensorShape tensor_shape;
+    TF_RETURN_IF_ERROR(
+        context_->allocate_temp(DT_UINT8, tensor_shape, *output));
+    context_->set_output(index, **output);
+  } else {
+    TensorShape tensor_shape;
+    TF_RETURN_IF_ERROR(XLAShapeToTensorShape(shape, &tensor_shape));
+    TF_RETURN_IF_ERROR(context_->allocate_output(index, tensor_shape, output));
+  }
+  return Status::OK();
+}
+
 void XlaOpKernelContext::SetOutput(int index, const xla::XlaOp& handle) {
   // Makes the host Tensor that will refer to the expression.
   Tensor* output = nullptr;
-  auto shape = builder()->GetShape(handle);
-  if (!shape.ok()) {
-    SetStatus(shape.status());
+  auto shape_or = builder()->GetShape(handle);
+  if (!shape_or.ok()) {
+    SetStatus(shape_or.status());
     return;
   }
 
-  // The step's default allocator is the dummy XlaCompilationAllocator which
-  // simply allocates a metadata buffer to hold the expression to which it
-  // corresponds.
-  TensorShape tensor_shape;
-  OP_REQUIRES_OK(context_,
-                 XLAShapeToTensorShape(shape.ValueOrDie(), &tensor_shape));
   OP_REQUIRES_OK(context_,
-                 context_->allocate_output(index, tensor_shape, &output));
+                 allocate_output(index, shape_or.ValueOrDie(), &output));
 
   // The expression is stored in the tensor's data buffer. Fill in the
   // fields now.
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index a3a0d10cc0..aa00a45496 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -255,6 +255,11 @@ class XlaOpKernelContext {
   // Returns the tensor of input `name`.
   const Tensor& GetInputTensorByName(absl::string_view name);
 
+  // Wraps OpKernelContext's allocate_output method while providing special
+  // behavior for DT_VARIANT: a variant is treated as DT_UINT8 scalar as the
+  // type to allow mapping for variant to more generic types.
+  Status allocate_output(int index, const xla::Shape& shape, Tensor** output);
+
   OpKernelContext* const context_;
 };
 
-- 
GitLab


From 26d3617d2ab5f4874b73059be524e94b9535465b Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Thu, 4 Oct 2018 15:11:26 -0700
Subject: [PATCH 446/570] Avoid creating control edges on not-this-graph.

PiperOrigin-RevId: 215811680
---
 tensorflow/python/eager/function.py       | 17 +++++++----------
 tensorflow/python/ops/control_flow_ops.py |  3 +++
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index dd9f5e233c..2750461fb2 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -269,15 +269,6 @@ class FuncGraph(ops.Graph):
   def variables(self, var_list):
     self._weak_variables = [weakref.ref(v) for v in var_list]
 
-  def control_dependencies(self, control_inputs):
-    # Drop control dependencies to outside of the graph. TODO(b/117109273)
-    # unclear how to capture an op, not a tensor.
-    if not control_inputs:
-      return super(FuncGraph, self).control_dependencies(control_inputs)
-    return super(FuncGraph, self).control_dependencies(
-        [c for c in control_inputs
-         if getattr(c, "graph", None) is self])
-
   def create_op(
       self,
       op_type,
@@ -503,6 +494,9 @@ class _EagerDefinedFunction(object):
 
     Returns:
       The outputs of the function call.
+
+    Raises:
+      ValueError: if the number of arguments is incorrect.
     """
 
     executing_eagerly = ctx.executing_eagerly()
@@ -536,6 +530,10 @@ class _EagerDefinedFunction(object):
       # TODO(akshayka): Either remove this if the FunctionLibraryRuntime
       # creates `PartitionedCallOp` kernels by default, or remove the previous
       # branch if a TPU kernel is registered for `PartitionedCall`.
+      if len(args) != len(self.signature.input_arg):
+        raise ValueError(
+            "Arguments and signature arguments do not match: %s %s " %
+            (len(args), len(list(self.signature.input_arg))))
       outputs = functional_ops.partitioned_call(
           args=args,
           f=self,
@@ -756,7 +754,6 @@ class Function(object):
         BACKWARD_FUNCTION_ATTRIBUTE_NAME:
             self._backward_graph_function._inference_function.name})  # pylint: disable=protected-access
     forward_function_attr.update(self._attrs)
-
     self._forward_function = _EagerDefinedFunction(
         forward_function_name, self._func_graph, self._func_graph.inputs,
         self._func_graph.outputs + backwards_graph_captures,
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index f779c3d273..5bc217d355 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1333,6 +1333,9 @@ class ControlFlowState(object):
     """
     if util.IsLoopSwitch(op):
       return None
+    if op.graph._building_function:  # pylint: disable=protected-access
+      # The optimization here is tricky to apply to functions
+      return array_ops.zeros_like(op.outputs[index])
     dead_branch = util.IsSwitch(op)
     forward_ctxt = _GetWhileContext(op)
     grad_state = self._map.get(forward_ctxt)
-- 
GitLab


From bd99ed794264668ce77ed7527bc41df7aba3927b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 15:17:46 -0700
Subject: [PATCH 447/570] Fix bug in Grappler constant folding: The logic
 detecting full reductions was flawed. Added better test coverage.

Also added a extra test for a related symbolic shape inference operation that I first suspected to be broken.

PiperOrigin-RevId: 215812753
---
 .../grappler/costs/graph_properties_test.cc   |   6 +
 .../grappler/optimizers/constant_folding.cc   |  47 ++++---
 .../optimizers/constant_folding_test.cc       | 130 ++++++++++++------
 3 files changed, 118 insertions(+), 65 deletions(-)

diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 362092a6cf..db10f586bc 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -1340,6 +1340,8 @@ TEST_F(GraphPropertiesTest, SymbolicShapes) {
   Output zero = ops::Const(s.WithOpName("zero"), 0.0f, {});
   Output g = ops::Shape(s.WithOpName("g"), c);
   Output h = ops::Fill(s.WithOpName("h"), g, zero);
+  Output zero_idx = ops::Const(s.WithOpName("zero_idx"), {0}, {1});
+  Output j = ops::Sum(s.WithOpName("j"), a, zero_idx);
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -1382,6 +1384,10 @@ TEST_F(GraphPropertiesTest, SymbolicShapes) {
   ASSERT_EQ(2, shape_f.dim_size());
   EXPECT_EQ(shape_h.dim(0).size(), shape_c.dim(0).size());
   EXPECT_EQ(shape_h.dim(1).size(), shape_c.dim(1).size());
+
+  const auto shape_j = properties.GetOutputProperties("j").at(0).shape();
+  ASSERT_EQ(1, shape_j.dim_size());
+  EXPECT_EQ(shape_j.dim(0).size(), shape_a.dim(1).size());
 }
 
 TEST_F(GraphPropertiesTest, DoNotValidateColocationConstraints) {
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index ca5d3a6dfd..3d0d95bba7 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -616,28 +616,37 @@ Status ConstantFolding::MaterializeReductionIndices(
     // We can't do anything if we don't know the rank of the input.
     return Status::OK();
   }
-  const int rank = input_prop.shape().dim_size();
-  if (rank == 0) {
+  const int input_rank = input_prop.shape().dim_size();
+  if (input_rank < 1) {
     // Unexpected graph, don't try to change it.
     return Status::OK();
   }
+  const OpInfo::TensorProperties& reduction_indices_prop = input_props[1];
+  DataType dtype = reduction_indices_prop.dtype();
+  if (dtype != DT_INT32 && dtype != DT_INT64) {
+    return Status::OK();
+  }
+  PartialTensorShape reduction_indices_shape(reduction_indices_prop.shape());
+  const int num_reduction_indices = reduction_indices_shape.num_elements();
+
   const std::vector<OpInfo::TensorProperties>& output_props =
       properties.GetOutputProperties(node->name());
   if (output_props.size() != 1) {
     return Status::OK();
   }
-  const bool keep_dims =
-      node->attr().count("keep_dims") && node->attr().at("keep_dims").b();
   const OpInfo::TensorProperties& output_prop = output_props[0];
-  PartialTensorShape output_shape(output_prop.shape());
-  if (output_shape.num_elements() != 1) {
-    bool full_reduction = false;
+  const int output_rank =
+      output_prop.shape().unknown_rank() ? -1 : output_prop.shape().dim_size();
+
+  bool full_reduction = output_rank == 0 || num_reduction_indices == input_rank;
+  if (!full_reduction) {
+    // A full reduction will generate a tensor of one of the shapes
+    // [], [1], [1, 1], [1, 1, ...]. Even if we do not know the number of
+    // elements in the output of the reduction, we may deduce it from reshape
+    // nodes following it.
     for (const NodeDef* fanout : node_map_->GetOutputs(node->name())) {
-      if (!IsReshape(*fanout) && !keep_dims) {
-        // Depending on how it's setup, a full reduction will generate a tensor
-        // of shape [], [1], [1, 1], [1, 1, ...]. If keep_dims isn't true, we
-        // rely on the existence of a reshape node following the reduction to
-        // ensure that the fanout is fed a scalar of the right shape.
+      full_reduction = false;
+      if (!IsReshape(*fanout)) {
         return Status::OK();
       }
       const std::vector<OpInfo::TensorProperties>& reshape_props =
@@ -658,20 +667,15 @@ Status ConstantFolding::MaterializeReductionIndices(
     }
   }
 
-  const OpInfo::TensorProperties& reduction_prop = input_props[1];
-  DataType dtype = reduction_prop.dtype();
-  if (dtype != DT_INT32 && dtype != DT_INT64) {
-    return Status::OK();
-  }
-  // We know it's a full reduction. We can generate the set of indices to
-  // reduce.
+  // We know it's a full reduction. We can generate the full set of indices to
+  // reduce as a constant node.
   string const_name = OptimizedNodeName(*node, "-reduction_indices");
   if (node_map_->GetNode(const_name)) {
     return Status::OK();
   }
   NodeDef* reduction_indices = graph_->add_node();
-  Tensor value(dtype, TensorShape({rank}));
-  for (int i = 0; i < rank; ++i) {
+  Tensor value(dtype, TensorShape({input_rank}));
+  for (int i = 0; i < input_rank; ++i) {
     if (dtype == DT_INT32) {
       value.vec<int32>()(i) = i;
     } else {
@@ -680,6 +684,7 @@ Status ConstantFolding::MaterializeReductionIndices(
   }
   TF_RETURN_IF_ERROR(
       CreateNodeDef(const_name, TensorValue(&value), reduction_indices));
+
   reduction_indices->set_device(node->device());
   string ctrl_dep =
       AddControlDependency(node->input(1), graph_, node_map_.get());
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index b09360a2c2..fab01edfed 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -2591,58 +2591,100 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
 }
 
 TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output input =
-      ops::Placeholder(s.WithOpName("input"), DT_FLOAT,
-                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
-  Output indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32);
-  Output sum = ops::Sum(s.WithOpName("sum"), input, indices);
-  Output size = ops::Const(s.WithOpName("size"), 1, {1});
-  Output reshape = ops::Reshape(s.WithOpName("reshape"), sum, size);
+  for (bool use_reshape : {true, false}) {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output input =
+        ops::Placeholder(s.WithOpName("input"), DT_FLOAT,
+                         ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+    // If use_reshape is false, we need to now the number of indices to apply
+    // the rewrite.
+    Output indices = ops::Placeholder(
+        s.WithOpName("indices"), DT_INT32,
+        ops::Placeholder::Shape(PartialTensorShape({use_reshape ? -1 : 2})));
+    Output sum = ops::Sum(s.WithOpName("sum"), input, indices);
+    if (use_reshape) {
+      Output size = ops::Const(s.WithOpName("size"), 1, {1});
+      Output reshape = ops::Reshape(s.WithOpName("reshape"), sum, size);
+    }
 
-  GrapplerItem item;
-  TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  item.fetch.push_back("reshape");
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    item.fetch.push_back(use_reshape ? "reshape" : "sum");
 
-  auto input_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
-  Tensor indices_t(DT_INT32, TensorShape({2}));
-  indices_t.flat<int>()(0) = 0;
-  indices_t.flat<int>()(1) = 1;
-  auto tensors_expected = EvaluateNodes(
-      item.graph, item.fetch, {{"input", input_t}, {"indices", indices_t}});
-  EXPECT_EQ(1, tensors_expected.size());
+    auto input_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+    Tensor indices_t(DT_INT32, TensorShape({2}));
+    indices_t.flat<int>()(0) = 0;
+    indices_t.flat<int>()(1) = 1;
+    auto tensors_expected = EvaluateNodes(
+        item.graph, item.fetch, {{"input", input_t}, {"indices", indices_t}});
+    EXPECT_EQ(1, tensors_expected.size());
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
-  GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+    // Use aggressive mode to force the shape inference to propagate placeholder
+    // shapes.
+    ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                              nullptr /* cpu_device */);
+    GraphDef output;
+    Status status = optimizer.Optimize(nullptr, item, &output);
+    TF_EXPECT_OK(status);
 
-  // Run a second time to make sure the optimization is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+    // Run a second time to make sure the optimization is idempotent.
+    item.graph.Swap(&output);
+    status = optimizer.Optimize(nullptr, item, &output);
+    TF_EXPECT_OK(status);
 
-  int found = 0;
-  for (const auto& node : output.node()) {
-    if (node.name() == "ConstantFolding/sum-reduction_indices") {
-      ++found;
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ("^indices", node.input(0));
-      EXPECT_EQ(2, TensorShape(node.attr().at("value").tensor().tensor_shape())
-                       .num_elements());
-    } else if (node.name() == "sum") {
-      ++found;
-      EXPECT_EQ("ConstantFolding/sum-reduction_indices", node.input(1));
-    } else if (node.name() == "indices") {
-      ++found;
+    int found = 0;
+    for (const auto& node : output.node()) {
+      if (node.name() == "ConstantFolding/sum-reduction_indices") {
+        ++found;
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^indices", node.input(0));
+        EXPECT_EQ(2,
+                  TensorShape(node.attr().at("value").tensor().tensor_shape())
+                      .num_elements());
+      } else if (node.name() == "sum") {
+        ++found;
+        EXPECT_EQ("ConstantFolding/sum-reduction_indices", node.input(1));
+      } else if (node.name() == "indices") {
+        ++found;
+      }
     }
+    EXPECT_EQ(3, found);
+
+    auto tensors = EvaluateNodes(output, item.fetch,
+                                 {{"input", input_t}, {"indices", indices_t}});
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
   }
-  EXPECT_EQ(3, found);
+}
 
-  auto tensors = EvaluateNodes(output, item.fetch,
-                               {{"input", input_t}, {"indices", indices_t}});
-  EXPECT_EQ(1, tensors.size());
-  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
+TEST_F(ConstantFoldingTest, MaterializeReductionIndices_NotFullReduction) {
+  for (bool input_rank_known : {true, false}) {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output input =
+        (input_rank_known ? ops::Placeholder(s.WithOpName("input"), DT_FLOAT,
+                                             ops::Placeholder::Shape(
+                                                 PartialTensorShape({-1, -1})))
+                          : ops::Placeholder(s.WithOpName("input"), DT_FLOAT));
+    Output indices =
+        ops::Placeholder(s.WithOpName("indices"), DT_INT32,
+                         ops::Placeholder::Shape(
+                             PartialTensorShape({input_rank_known ? 1 : 2})));
+    Output sum = ops::Sum(s.WithOpName("sum"), input, indices);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    item.fetch.push_back("sum");
+
+    // Use aggressive mode to force the shape inference to propagate placeholder
+    // shapes.
+    ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                              nullptr /* cpu_device */);
+    GraphDef output;
+    Status status = optimizer.Optimize(nullptr, item, &output);
+    TF_EXPECT_OK(status);
+
+    CompareGraphs(item.graph, output);
+  }
 }
 
 TEST_F(ConstantFoldingTest, LargeConstant) {
-- 
GitLab


From feda8c786948b1c7cc6bd9fe447781ceaff6b3d3 Mon Sep 17 00:00:00 2001
From: Pete Warden <petewarden@google.com>
Date: Thu, 4 Oct 2018 15:20:56 -0700
Subject: [PATCH 448/570] Fix for memory issue in micro test code, spotted by
 asan checks

PiperOrigin-RevId: 215813259
---
 .../lite/experimental/micro/kernels/softmax_test.cc       | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/softmax_test.cc b/tensorflow/contrib/lite/experimental/micro/kernels/softmax_test.cc
index df7d87d623..694456d8ac 100644
--- a/tensorflow/contrib/lite/experimental/micro/kernels/softmax_test.cc
+++ b/tensorflow/contrib/lite/experimental/micro/kernels/softmax_test.cc
@@ -160,7 +160,7 @@ void TestSoftmaxQuantized(std::initializer_list<int> input_dims_data,
 TF_LITE_MICRO_TESTS_BEGIN
 
 TF_LITE_MICRO_TEST(SimpleTest) {
-  const int output_dims_count = 6;
+  const int output_dims_count = 10;
   float output_data[output_dims_count];
   tflite::testing::TestSoftmaxFloat(  //
       {2, 2, 5},                      // Input shape.
@@ -181,7 +181,7 @@ TF_LITE_MICRO_TEST(SimpleTest) {
           0.031684921,
           0.011656231,
       },
-      {2, 2, 3},  // Output shape.
+      {2, 2, 5},  // Output shape.
       output_data);
 }
 
@@ -192,7 +192,7 @@ TF_LITE_MICRO_TEST(SimpleTestQuantized) {
   const float input_max = 64.0f;
   const float output_min = 0.0f;
   const float output_max = (255.0f / 256.0f);
-  const int output_dims_count = 6;
+  const int output_dims_count = 5;
   uint8_t output_data[output_dims_count];
   tflite::testing::TestSoftmaxQuantized(  //
       {2, 1, 5},                          // Input shape.
@@ -212,7 +212,7 @@ TF_LITE_MICRO_TEST(SimpleTestQuantized) {
           F2Q(0.234121657, output_min, output_max),
           F2Q(0.636408647, output_min, output_max),
       },
-      {2, 1, 3},               // Output shape.
+      {2, 1, 5},               // Output shape.
       output_min, output_max,  // Output quantized range.
       output_data);
 }
-- 
GitLab


From 3a457c7252f09afd03483092ce9dcc7aa292b8c6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 15:27:44 -0700
Subject: [PATCH 449/570] This CL fixes a bug in the eager benchmarks test that
 caused the defun tests to execute a different-sized matrix multiply than the
 eager tests.

PiperOrigin-RevId: 215814346
---
 tensorflow/python/eager/benchmarks_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 3fe79ef244..2b0118c07f 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -353,7 +353,7 @@ class MicroBenchmarks(test.Benchmark):
                               num_iters,
                               execution_mode=None):
     f = function.defun(math_ops.matmul)
-    func = lambda: f(m, m, transpose_b)
+    func = lambda: f(m, m, transpose_b=transpose_b)
     self._run(func, num_iters, execution_mode=execution_mode)
 
   def _benchmark_defun_matmul_forward_backward(self,
@@ -366,7 +366,7 @@ class MicroBenchmarks(test.Benchmark):
     def func():
       with backprop.GradientTape() as gt:
         gt.watch(m)
-        y = f(m, m, transpose_b)
+        y = f(m, m, transpose_b=transpose_b)
       _ = gt.gradient(y, m)
 
     self._run(func, num_iters, execution_mode=execution_mode)
-- 
GitLab


From a08ca5bb74fcd828c19060216923ad0f378bb518 Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Thu, 4 Oct 2018 15:29:58 -0700
Subject: [PATCH 450/570] Disable tensorrt:unary_test in OSS since it crashes
 with SEGV.

PiperOrigin-RevId: 215814732
---
 tensorflow/contrib/tensorrt/BUILD | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 9e8979bce4..5c16fcb760 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -455,7 +455,6 @@ cuda_py_tests(
         "test/multi_connection_neighbor_engine_test.py",
         "test/neighboring_engine_test.py",
         "test/rank_two_test.py",
-        "test/unary_test.py",
         "test/vgg_block_nchw_test.py",
         "test/vgg_block_test.py",
     ],
@@ -471,6 +470,25 @@ cuda_py_tests(
     ],
 )
 
+cuda_py_tests(
+    name = "tf_trt_integration_test_no_oss",
+    srcs = [
+        "test/unary_test.py",
+    ],
+    additional_deps = [
+        ":tf_trt_integration_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_oss",  # TODO(b/117274186): re-enable in OSS after crash fixed
+        "no_pip",  # TODO(b/117274186): re-enable in OSS after crash fixed
+        "no_windows",
+        "nomac",
+    ],
+)
+
 cc_library(
     name = "utils",
     srcs = ["convert/utils.cc"],
-- 
GitLab


From d6a2e7bcca5683c377b592f177bcac9aeb1c550f Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Thu, 4 Oct 2018 15:54:20 -0700
Subject: [PATCH 451/570] Fix unused imports.

PiperOrigin-RevId: 215819072
---
 tensorflow/compiler/tests/tensor_list_ops_test.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/tensorflow/compiler/tests/tensor_list_ops_test.py b/tensorflow/compiler/tests/tensor_list_ops_test.py
index b556723eec..5c079d595c 100644
--- a/tensorflow/compiler/tests/tensor_list_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_list_ops_test.py
@@ -20,22 +20,13 @@ from __future__ import division
 from __future__ import print_function
 import numpy as np
 from tensorflow.compiler.tests import xla_test
-from tensorflow.python.client import session
-from tensorflow.python.eager import backprop
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import list_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import test
-from tensorflow.python.training import server_lib
 
 
 def scalar_shape():
-- 
GitLab


From cf8e7cf89abb4a7783b9a99f17574ea128fa767a Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Thu, 4 Oct 2018 16:10:21 -0700
Subject: [PATCH 452/570] Pin ops with small integer inputs (already on the
 cpu) to the cpu in eager.

An environment variable (TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING) is provided to turn this off if necessary (its on by default).

PiperOrigin-RevId: 215821915
---
 .../core/common_runtime/eager/context.cc      |  4 +-
 .../core/common_runtime/eager/context.h       |  2 +
 .../core/common_runtime/eager/execute.cc      | 67 ++++++++++++++++---
 tensorflow/python/eager/core_test.py          | 28 ++++++++
 4 files changed, 91 insertions(+), 10 deletions(-)

diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 18420b60fd..f23cefb33d 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -70,7 +70,9 @@ EagerContext::EagerContext(const SessionOptions& opts,
       async_default_(async),
       log_memory_(LogMemory::IsEnabled()),
       env_(opts.env),
-      use_send_tensor_rpc_(false) {
+      use_send_tensor_rpc_(false),
+      pin_small_ops_to_cpu_(ReadBoolFromEnvVar(
+          "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING", true)) {
   if (device_mgr_owned) {
     local_device_manager_.reset(device_mgr);
     local_unowned_device_manager_ = nullptr;
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 5ed6057ec6..15eeaa8066 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -202,6 +202,7 @@ class EagerContext {
   // EagerService.SendTensor RPC. If false, _Send/_Recv ops should be used
   // instead (which in-turn use WorkerService.RecvTensor RPCs).
   bool UseSendTensorRPC() { return use_send_tensor_rpc_; }
+  bool PinSmallOpsToCPU() { return pin_small_ops_to_cpu_; }
 
  private:
   void InitDeviceMapAndAsync();
@@ -293,6 +294,7 @@ class EagerContext {
 #endif
 
   bool use_send_tensor_rpc_;
+  const bool pin_small_ops_to_cpu_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 1bc63616d0..a52f933d75 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -579,19 +579,23 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
   return Status::OK();
 #endif
 }
-}  // namespace
 
-Status EagerExecute(EagerOperation* op,
-                    gtl::InlinedVector<TensorHandle*, 2>* retvals,
-                    int* num_retvals) {
-  // Ensure all resource-touching ops run in the device the resource is,
-  // regardless of anything else that has been specified. This is identical to
-  // the graph mode behavior.
+// The Op device may be updated if:
+// - A resource touching input is specified: all resource-touching ops run in
+// the device the resource is, regardless of anything else that has been
+// specified. This is identical to the graph mode behavior.
+//
+// - All op inputs are on the CPU, small (<64 elements) and integers
+// (int32/int64). This can be disabled by setting the environment variable
+// "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING" to "0" or "false".
+Status MaybeUpdateOpDevice(EagerOperation* op) {
   EagerContext* ctx = op->EagerContext();
+  bool device_set_for_resource_variable = false;
+  bool all_inputs_eligible_for_cpu_pinning = ctx->PinSmallOpsToCPU();
+
   for (int i = 0; i < op->Inputs().size(); ++i) {
     Device* input_op_device = nullptr;
-    auto status = op->Inputs()[i]->OpDevice(&input_op_device);
-    if (!status.ok()) return status;
+    TF_RETURN_IF_ERROR(op->Inputs()[i]->OpDevice(&input_op_device));
     VLOG(2) << "for op " << op->Name() << " input " << i << " "
             << DataTypeString(op->Inputs()[i]->dtype) << " "
             << (input_op_device == nullptr ? "cpu" : input_op_device->name())
@@ -603,8 +607,53 @@ Status EagerExecute(EagerOperation* op,
               << d->name() << " because input #" << i
               << " is a resource in this device.";
       op->SetDevice(d);
+
+      device_set_for_resource_variable = true;
+      all_inputs_eligible_for_cpu_pinning = false;
+    } else if (all_inputs_eligible_for_cpu_pinning) {
+      TensorHandle* handle = op->Inputs()[i];
+
+      // Input is on CPU.
+      if (input_op_device != nullptr && input_op_device != ctx->HostCPU()) {
+        all_inputs_eligible_for_cpu_pinning = false;
+        continue;
+      }
+
+      if (handle->dtype != DataType::DT_INT32 &&
+          handle->dtype != DataType::DT_INT64) {
+        all_inputs_eligible_for_cpu_pinning = false;
+        continue;
+      }
+
+      int64 num_elements;
+      TF_RETURN_IF_ERROR(handle->NumElements(&num_elements));
+      if (num_elements > 64) {
+        all_inputs_eligible_for_cpu_pinning = false;
+      }
     }
   }
+
+  // Ops without inputs are usually ops that generate a tensor in some way and
+  // usually require being present on whatever device they are scheduled on
+  // - for e.g. VarHandleOp or _Recv).
+  // TODO(nareshmodi): Is it possible there is no int32/int64 CPU kernel for
+  // an op, but there is a GPU kernel?
+  if (!op->Inputs().empty() && all_inputs_eligible_for_cpu_pinning) {
+    VLOG(1) << "Forcing op " << op->Name()
+            << " to be on the CPU since all input tensors have an "
+               "int32/int64 dtype, and are small (less than 64 elements).";
+    op->SetDevice(ctx->HostCPU());
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+Status EagerExecute(EagerOperation* op,
+                    gtl::InlinedVector<TensorHandle*, 2>* retvals,
+                    int* num_retvals) {
+  TF_RETURN_IF_ERROR(MaybeUpdateOpDevice(op));
+
   bool op_is_local = IsLocal(op->EagerContext(), op->Device());
 
   if (op_is_local) {
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index fb5442b646..e601aa376f 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -631,6 +631,34 @@ class TFETest(test_util.TensorFlowTestCase):
     for t in tensors:
       self.assertIsInstance(t, ops.EagerTensor)
 
+  def testSmallIntegerOpsForcedToCPU(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+
+    a = constant_op.constant((1, 2, 3, 4, 5), dtype=dtypes.int64)
+    b = constant_op.constant((2, 3, 4, 5, 6), dtype=dtypes.int64)
+    with context.device('gpu:0'):
+      c = a + b
+
+    # Op forced to CPU since all constants are integers and small.
+    self.assertEqual(c.device, '/job:localhost/replica:0/task:0/device:CPU:0')
+
+    a = array_ops.zeros((8, 10), dtype=dtypes.int64)
+    b = array_ops.ones((8, 10), dtype=dtypes.int64)
+
+    with context.device('gpu:0'):
+      c = a + b
+
+    # Op not forced to CPU since the tensors are larger than 64 elements.
+    self.assertEqual(c.device, '/job:localhost/replica:0/task:0/device:GPU:0')
+
+    a = constant_op.constant((1, 2, 3, 4, 5), dtype=dtypes.float32)
+    b = constant_op.constant((2, 3, 4, 5, 6), dtype=dtypes.float32)
+    with context.device('gpu:0'):
+      c = a + b
+
+    # Op not forced to CPU since the constants are not integers.
+    self.assertEqual(c.device, '/job:localhost/replica:0/task:0/device:GPU:0')
 
 class SendRecvTest(test_util.TensorFlowTestCase):
 
-- 
GitLab


From 4a00f2fc6514ad5ee60ab0a9645863fdf263499f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 16:29:47 -0700
Subject: [PATCH 453/570] Add Chaos Free Network (CFN) cell.

The implementation is based on: https://openreview.net/pdf?id=S1dIzvclg.

PiperOrigin-RevId: 215824867
---
 .../rnn/python/kernel_tests/rnn_cell_test.py  |  65 +++++++++
 tensorflow/contrib/rnn/python/ops/rnn_cell.py | 129 ++++++++++++++++++
 2 files changed, 194 insertions(+)

diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 0a27200015..aa1d7d2b01 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -1120,6 +1120,71 @@ class RNNCellTest(test.TestCase):
             r"input size \(3\) must be divisible by number_of_groups \(2\)"):
           gcell(glstm_input, gcell_zero_state)
 
+  def testCFNCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope("root"):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = contrib_rnn_cell.CFNCell(
+            units=2,
+            kernel_initializer=initializers.Constant(0.5))
+        g, _ = cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.17188203, 0.17188203]])
+      with variable_scope.variable_scope("other"):
+        # Test CFN with input_size != num_units.
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        cell = contrib_rnn_cell.CFNCell(
+            units=2,
+            kernel_initializer=initializers.Constant(0.5))
+        g, _ = cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.15535763, 0.15535763]])
+
+  def testCFNCellEndToEnd(self):
+    with self.cached_session() as sess:
+      input_shape = 10
+      output_shape = 5
+      timestep = 4
+      batch = 100
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = utils.to_categorical(y_train)
+      cell = contrib_rnn_cell.CFNCell(output_shape)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape))
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape))
+
+      outputs, state = rnn.dynamic_rnn(
+          cell, inputs, dtype=dtypes.float32)
+      self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape])
+      self.assertEqual(state.shape.as_list(), [None, output_shape])
+      loss = losses.softmax_cross_entropy(predict, state)
+      train_op = training.GradientDescentOptimizer(0.001).minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      _, outputs, state = sess.run(
+          [train_op, outputs, state], {inputs: x_train, predict: y_train})
+
+      self.assertEqual(len(outputs), batch)
+      self.assertEqual(len(state), batch)
+
   def testMinimalRNNCell(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope(
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 59a61af7b3..78cea8feb4 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -3510,3 +3510,132 @@ class MinimalRNNCell(rnn_cell_impl.LayerRNNCell):
 
     new_h = u * state + (1 - u) * feedforward
     return new_h, new_h
+
+
+class CFNCell(rnn_cell_impl.LayerRNNCell):
+  """Chaos Free Network cell.
+
+  The implementation is based on:
+
+    https://openreview.net/pdf?id=S1dIzvclg
+
+  Thomas Laurent, James von Brecht.
+  "A recurrent neural network without chaos." ICLR, 2017.
+
+  A CFN cell first projects the input to the hidden space. The hidden state
+  goes through a contractive mapping. The new hidden state is then calcuated
+  as a linear combination of the projected input and the contracted previous
+  hidden state, using decoupled input and forget gates.
+  """
+
+  def __init__(self,
+               units,
+               activation="tanh",
+               kernel_initializer="glorot_uniform",
+               bias_initializer="ones",
+               name=None,
+               dtype=None,
+               **kwargs):
+    """Initialize the parameters for a CFN cell.
+
+    Args:
+      units: int, The number of units in the CFN cell.
+      activation: Nonlinearity to use. Default: `tanh`.
+      kernel_initializer: Initializer for the `kernel` weights
+        matrix. Default: `glorot_uniform`.
+      bias_initializer: The initializer to use for the bias in the
+        gates. Default: `ones`.
+      name: String, the name of the cell.
+      dtype: Default dtype of the cell.
+      **kwargs: Dict, keyword named properties for common cell attributes.
+    """
+    super(CFNCell, self).__init__(name=name, dtype=dtype, **kwargs)
+
+    # Inputs must be 2-dimensional.
+    self.input_spec = base_layer.InputSpec(ndim=2)
+
+    self.units = units
+    self.activation = activations.get(activation)
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+
+  @property
+  def state_size(self):
+    return self.units
+
+  @property
+  def output_size(self):
+    return self.units
+
+  def build(self, inputs_shape):
+    if inputs_shape[-1] is None:
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
+                       % str(inputs_shape))
+
+    input_size = inputs_shape[-1]
+    # pylint: disable=protected-access
+    # `self.kernel` contains V_{\theta}, V_{\eta}, W.
+    # `self.recurrent_kernel` contains U_{\theta}, U_{\eta}.
+    # `self.bias` contains b_{\theta}, b_{\eta}.
+    self.kernel = self.add_weight(
+        shape=[input_size, 3 * self.units],
+        name=rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+        initializer=self.kernel_initializer)
+    self.recurrent_kernel = self.add_weight(
+        shape=[self.units, 2 * self.units],
+        name="recurrent_%s" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+        initializer=self.kernel_initializer)
+    self.bias = self.add_weight(
+        shape=[2 * self.units],
+        name=rnn_cell_impl._BIAS_VARIABLE_NAME,
+        initializer=self.bias_initializer)
+    # pylint: enable=protected-access
+
+    self.built = True
+
+  def call(self, inputs, state):
+    """Run one step of CFN.
+
+    Args:
+      inputs: input Tensor, must be 2-D, `[batch, input_size]`.
+      state: state Tensor, must be 2-D, `[batch, state_size]`.
+
+    Returns:
+      A tuple containing:
+
+      - Output: A `2-D` tensor with shape `[batch_size, state_size]`.
+      - New state: A `2-D` tensor with shape `[batch_size, state_size]`.
+
+    Raises:
+      ValueError: If input size cannot be inferred from inputs via
+        static shape inference.
+    """
+    input_size = inputs.get_shape()[-1]
+    if input_size.value is None:
+      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+    # The variable names u, v, w, b are consistent with the notations in the
+    # original paper.
+    v, w = array_ops.split(
+        value=self.kernel,
+        num_or_size_splits=[2 * self.units, self.units],
+        axis=1)
+    u = self.recurrent_kernel
+    b = self.bias
+
+    gates = math_ops.matmul(state, u) + math_ops.matmul(inputs, v)
+    gates = nn_ops.bias_add(gates, b)
+    gates = math_ops.sigmoid(gates)
+    theta, eta = array_ops.split(value=gates,
+                                 num_or_size_splits=2,
+                                 axis=1)
+
+    proj_input = math_ops.matmul(inputs, w)
+
+    # The input gate is (1 - eta), which is different from the original paper.
+    # This is for the propose of initialization. With the default
+    # bias_initializer `ones`, the input gate is initialized to a small number.
+    new_h = theta * self.activation(state) + (1 - eta) * self.activation(
+        proj_input)
+
+    return new_h, new_h
-- 
GitLab


From 83ff640fa5026b8bd3cb9c2ceff9e99e8e03823a Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 4 Oct 2018 18:46:53 -0700
Subject: [PATCH 454/570] [XLA:GPU] Fix old-ptxas-version detection logic.

This was completely broken for CUDA versions > 9 and resulted in spurious warnings.

Reported in #22706#issuecomment-426861394 -- thank you!

PiperOrigin-RevId: 215841354
---
 tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index b4ae2e42c7..50e47542c4 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -401,7 +401,7 @@ void WarnIfBadPtxasVersion(const string& ptxas_path) {
            "prefers >= 9.2.88).  Compilation of XLA kernels below will likely "
            "fail.\n\nYou do not need to update CUDA; cherry-picking the ptxas "
            "binary is sufficient.";
-  } else if ((vmaj < 9 || vmin < 2 || vdot < 88)) {
+  } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) {
     LOG(WARNING)
         << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
         << vdot
-- 
GitLab


From 5608454c31bb298096bb6aa463b33baa2fa68f08 Mon Sep 17 00:00:00 2001
From: Ruoxin Sang <rxsang@google.com>
Date: Thu, 4 Oct 2018 19:07:44 -0700
Subject: [PATCH 455/570] Add 'device' property to TPUMirroredVariable, so
 tf.train.init_from_checkpoint can be supported.

PiperOrigin-RevId: 215843249
---
 tensorflow/contrib/distribute/python/values.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 18ceba42c2..0dd78ba185 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -571,6 +571,10 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
           ValueError("Device %s not found in %s (current device %s)" %
                      (device, self._index.keys(), device_util.current())), e)
 
+  @property
+  def device(self):
+    return self._get().device
+
   # The arguments to update() are automatically unwrapped so the update()
   # function would normally see regular variables, not MirroredVariables.
   # However, the update function can still operate on wrapped MirroredVariables
-- 
GitLab


From f4cef34fad7b00a3b1f288ff5c95001c5b83c1f8 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Thu, 4 Oct 2018 19:26:26 -0700
Subject: [PATCH 456/570] Fix regression that caused xrange to be ignored.

PiperOrigin-RevId: 215844450
---
 tensorflow/python/autograph/operators/py_builtins.py          | 1 +
 tensorflow/python/autograph/pyct/inspect_utils.py             | 4 ++++
 .../python/autograph/pyct/static_analysis/live_values.py      | 4 ++++
 3 files changed, 9 insertions(+)

diff --git a/tensorflow/python/autograph/operators/py_builtins.py b/tensorflow/python/autograph/operators/py_builtins.py
index 91a2a22cc2..70e59272a9 100644
--- a/tensorflow/python/autograph/operators/py_builtins.py
+++ b/tensorflow/python/autograph/operators/py_builtins.py
@@ -228,5 +228,6 @@ BUILTIN_FUINCTIONS_MAP = {
     'len': len_,
     'print': print_,
     'range': range_,
+    # TODO(mdan): This might make more sense as tf.data.range.
     'xrange': range_,
 }
diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py
index eef74599a7..1416988ea3 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils.py
@@ -30,10 +30,14 @@ from tensorflow.python.util import tf_inspect
 
 
 def isbuiltin(f):
+  """Returns True if the argument is a built-in function."""
   # Note these return false for isinstance(f, types.BuiltinFunctionType) so we
   # need to specifically check for them.
   if f in (range, int, float):
     return True
+  if six.PY2:
+    if f in (xrange,):
+      return True
   if isinstance(f, types.BuiltinFunctionType):
     return True
   if tf_inspect.isbuiltin(f):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/live_values.py b/tensorflow/python/autograph/pyct/static_analysis/live_values.py
index 36b9e7074d..4ceddce53b 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/live_values.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/live_values.py
@@ -24,6 +24,7 @@ from __future__ import division
 from __future__ import print_function
 
 import gast
+import six
 
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import transformer
@@ -35,6 +36,9 @@ from tensorflow.python.autograph.pyct.static_analysis.annos import NodeAnno
 # These symbols are legal in Python, but don't appear in the namespace.
 _SPECIAL_SYMBOLS = {'range': range, 'print': print}
 
+if six.PY2:
+  _SPECIAL_SYMBOLS['xrange'] = xrange
+
 
 class LiveValueResolver(transformer.Base):
   """Annotates nodes with live values."""
-- 
GitLab


From 176e6993c5e11631389e05f82b3d71a3a367e392 Mon Sep 17 00:00:00 2001
From: Billy Lamberta <blamb@google.com>
Date: Thu, 4 Oct 2018 21:25:33 -0700
Subject: [PATCH 457/570] Fix link in eager notebook stub.

PiperOrigin-RevId: 215853105
---
 .../notebooks/automatic_differentiation.ipynb |    2 +-
 .../performance/xla/operation_semantics.md    | 2426 -----------------
 2 files changed, 1 insertion(+), 2427 deletions(-)
 delete mode 100644 tensorflow/docs_src/performance/xla/operation_semantics.md

diff --git a/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb
index 8fae622e12..446e340118 100644
--- a/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb
+++ b/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb
@@ -65,7 +65,7 @@
         "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/eager/automatic_differentiation.ipynb\"\u003e\n",
         "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
         "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/blob/master/site/en/tutorials/eager/automatic_differentiation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/docs/blob/master/site/en/tutorials/eager/automatic_differentiation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     }
   ],
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
deleted file mode 100644
index 96d269bec4..0000000000
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ /dev/null
@@ -1,2426 +0,0 @@
-# Operation Semantics
-
-The following describes the semantics of operations defined in the
-[`XlaBuilder`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
-interface. Typically, these operations map one-to-one to operations defined in
-the RPC interface in
-[`xla_data.proto`](https://www.tensorflow.org/code/tensorflow/compiler/xla/xla_data.proto).
-
-A note on nomenclature: the generalized data type XLA deals with is an
-N-dimensional array holding elements of some uniform type (such as 32-bit
-float). Throughout the documentation, *array* is used to denote an
-arbitrary-dimensional array. For convenience, special cases have more specific
-and familiar names; for example a *vector* is a 1-dimensional array and a
-*matrix* is a 2-dimensional array.
-
-## AllToAll
-
-See also
-[`XlaBuilder::AllToAll`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Alltoall is a collective operation that sends data from all cores to all cores.
-It has two phases:
-
-1.  the scatter phase. On each core, the operand is split into `split_count`
-    number of blocks along the `split_dimensions`, and the blocks are scattered
-    to all cores, e.g., the ith block is send to the ith core.
-2.  the gather phase. Each core concatenates the received blocks along the
-    `concat_dimension`.
-
-The participating cores can be configured by:
-
--   `replica_groups`: each ReplicaGroup contains a list of replica id. If empty,
-    all replicas belong to one group in the order of 0 - (n-1). Alltoall will be
-    applied within subgroups in the specified order. For example, replica
-    groups = {{1,2,3},{4,5,0}} means, an Alltoall will be applied within replica
-    1, 2, 3, and in the gather phase, the received blocks will be concatenated
-    in the order of 1, 2, 3; another Alltoall will be applied within replica 4,
-    5, 0, and the concatenation order is 4, 5, 0.
-
-Prerequisites:
-
--   The dimension size of the operand on the split_dimension is divisible by
-    split_count.
--   The operand's shape is not tuple.
-
-<b> `AllToAll(operand, split_dimension, concat_dimension, split_count,
-replica_groups)` </b>
-
-
-| Arguments          | Type                  | Semantics                       |
-| ------------------ | --------------------- | ------------------------------- |
-| `operand`          | `XlaOp`               | n dimensional input array       |
-| `split_dimension`  | `int64`               | A value in the interval `[0,    |
-:                    :                       : n)` that names the dimension    :
-:                    :                       : along which the operand is      :
-:                    :                       : split                           :
-| `concat_dimension` | `int64`               | a value in the interval `[0,    |
-:                    :                       : n)` that names the dimension    :
-:                    :                       : along which the split blocks    :
-:                    :                       : are concatenated                :
-| `split_count`      | `int64`               | the number of cores that        |
-:                    :                       : participate this operation. If  :
-:                    :                       : `replica_groups` is empty, this :
-:                    :                       : should be the number of         :
-:                    :                       : replicas; otherwise, this       :
-:                    :                       : should be equal to the number   :
-:                    :                       : of replicas in each group.      :
-| `replica_groups`   | `ReplicaGroup` vector | each group contains a list of   |
-:                    :                       : replica id.                     :
-
-Below shows an example of Alltoall.
-
-```
-XlaBuilder b("alltoall");
-auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x");
-AllToAll(x, /*split_dimension=*/1, /*concat_dimension=*/0, /*split_count=*/4);
-```
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/xla/ops_alltoall.png">
-</div>
-
-In this example, there are 4 cores participating the Alltoall. On each core, the
-operand is split into 4 parts along dimension 0, so each part has shape
-f32[4,4]. The 4 parts are scattered to all cores. Then each core concatenates
-the received parts along dimension 1, in the order or core 0-4. So the output on
-each core has shape f32[16,4].
-
-## BatchNormGrad
-
-See also
-[`XlaBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
-and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
-for a detailed description of the algorithm.
-
-Calculates gradients of batch norm.
-
-<b> `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` </b>
-
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `XlaOp`                 | n dimensional array to be        |
-:                 :                         : normalized (x)                   :
-| `scale`         | `XlaOp`                 | 1 dimensional array              |
-:                 :                         : (\\(\gamma\\))                   :
-| `mean`          | `XlaOp`                 | 1 dimensional array (\\(\mu\\))  |
-| `variance`      | `XlaOp`                 | 1 dimensional array              |
-:                 :                         : (\\(\sigma^2\\))                 :
-| `grad_output`   | `XlaOp`                 | Gradients passed to              |
-:                 :                         : `BatchNormTraining`              :
-:                 :                         : (\\( \nabla y\\))                :
-| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
-| `feature_index` | `int64`                 | Index to feature dimension in    |
-:                 :                         : `operand`                        :
-
-For each feature in the feature dimension (`feature_index` is the index for the
-feature dimension in `operand`), the operation calculates the gradients with
-respect to `operand`, `offset` and `scale` across all the other dimensions. The
-`feature_index` must be a valid index for the feature dimension in `operand`.
-
-The three gradients are defined by the following formulas (assuming a
-4-dimensional tensor as `operand` and with feature dimension index \\(l\\),
-batch size `m` and spatial sizes `w` and `h`):
-
-\\[ \begin{split} c_l&=
-\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h
-\left( \nabla y_{ijkl} \frac{x_{ijkl} - \mu_l}{\sigma^2_l+\epsilon} \right)
-\\\\
-\nabla x_{ijkl} &= \frac{\gamma_{l}}{\sqrt{\sigma^2_{l}+\epsilon}}
-\left( \nabla y_{ijkl} - \mathrm{mean}(\nabla y) - c_l (x_{ijkl} - \mu_{l})
-\right)
-\\\\
-\nabla \gamma_l &= \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \left( \nabla y_{ijkl}
-\frac{x_{ijkl} - \mu_l}{\sqrt{\sigma^2_{l}+\epsilon}} \right)
-\\\\\
-\nabla \beta_l &= \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \nabla y_{ijkl}
-\end{split} \\]
-
-The inputs `mean` and `variance` represent moments value
-across batch and spatial dimensions.
-
-The output type is a tuple of three handles:
-
-| Outputs        | Type                    | Semantics                         |
-| -------------  | ----------------------- | --------------------------------- |
-| `grad_operand` | `XlaOp`                 | gradient with respect to input    |
-:                :                         : `operand` (\\( \nabla x\\))       :
-| `grad_scale`   | `XlaOp`                 | gradient with respect to input    |
-:                :                         : `scale` (\\( \nabla \gamma\\))    :
-| `grad_offset`  | `XlaOp`                 | gradient with respect to input    |
-:                :                         : `offset`(\\( \nabla \beta\\))     :
-
-## BatchNormInference
-
-See also
-[`XlaBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
-and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
-for a detailed description of the algorithm.
-
-Normalizes an array across batch and spatial dimensions.
-
-<b> `BatchNormInference(operand, scale, offset, mean, variance, epsilon, feature_index)` </b>
-
-Arguments       | Type    | Semantics
---------------- | ------- | ---------------------------------------
-`operand`       | `XlaOp` | n dimensional array to be normalized
-`scale`         | `XlaOp` | 1 dimensional array
-`offset`        | `XlaOp` | 1 dimensional array
-`mean`          | `XlaOp` | 1 dimensional array
-`variance`      | `XlaOp` | 1 dimensional array
-`epsilon`       | `float` | Epsilon value
-`feature_index` | `int64` | Index to feature dimension in `operand`
-
-For each feature in the feature dimension (`feature_index` is the index for the
-feature dimension in `operand`), the operation calculates the mean and variance
-across all the other dimensions and uses the mean and variance to normalize each
-element in `operand`. The `feature_index` must be a valid index for the feature
-dimension in `operand`.
-
-`BatchNormInference`  is equivalent to calling `BatchNormTraining` without
-computing `mean` and `variance` for each batch. It uses the input `mean` and
-`variance` instead as estimated values. The purpose of this op is to reduce
-latency in inference, hence the name `BatchNormInference`.
-
-The output is an n-dimensional, normalized array with the same shape as input
-`operand`.
-
-## BatchNormTraining
-
-See also
-[`XlaBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
-and [`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
-for a detailed description of the algorithm.
-
-Normalizes an array across batch and spatial dimensions.
-
-<b> `BatchNormTraining(operand, scale, offset, epsilon, feature_index)` </b>
-
-Arguments       | Type    | Semantics
---------------- | ------- | ----------------------------------------
-`operand`       | `XlaOp` | n dimensional array to be normalized (x)
-`scale`         | `XlaOp` | 1 dimensional array (\\(\gamma\\))
-`offset`        | `XlaOp` | 1 dimensional array (\\(\beta\\))
-`epsilon`       | `float` | Epsilon value (\\(\epsilon\\))
-`feature_index` | `int64` | Index to feature dimension in `operand`
-
-For each feature in the feature dimension (`feature_index` is the index for the
-feature dimension in `operand`), the operation calculates the mean and variance
-across all the other dimensions and uses the mean and variance to normalize each
-element in `operand`. The `feature_index` must be a valid index for the feature
-dimension in `operand`.
-
-The algorithm goes as follows for each batch in `operand` \\(x\\) that
-contains `m` elements with `w` and `h` as the size of spatial dimensions
-(assuming `operand` is an 4 dimensional array):
-
-- Calculates batch mean \\(\mu_l\\) for each feature `l` in feature dimension:
-\\(\mu_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h x_{ijkl}\\)
-
-- Calculates batch variance \\(\sigma^2_l\\):
-\\(\sigma^2_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h (x_{ijkl} - \mu_l)^2\\)
-
-- Normalizes, scales and shifts:
-\\(y_{ijkl}=\frac{\gamma_l(x_{ijkl}-\mu_l)}{\sqrt[2]{\sigma^2_l+\epsilon}}+\beta_l\\)
-
-The epsilon value, usually a small number, is added to avoid divide-by-zero errors.
-
-The output type is a tuple of three `XlaOp`s:
-
-| Outputs      | Type                    | Semantics                            |
-| ------------ | ----------------------- | -------------------------------------|
-| `output`     | `XlaOp`                 | n dimensional array with the same    |
-:              :                         : shape as input `operand` (y)         :
-| `batch_mean` | `XlaOp`                 | 1 dimensional array (\\(\mu\\))      |
-| `batch_var`  | `XlaOp`                 | 1 dimensional array (\\(\sigma^2\\)) |
-
-The `batch_mean` and `batch_var` are moments calculated across the batch and
-spatial dimensions using the formulas above.
-
-## BitcastConvertType
-
-See also
-[`XlaBuilder::BitcastConvertType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Similar to a `tf.bitcast` in TensorFlow, performs an element-wise bitcast
-operation from a data shape to a target shape. The dimensions must match, and
-the conversion is an element-wise one; e.g. `s32` elements become `f32` elements
-via bitcast routine. Bitcast is implemented as a low-level cast, so machines
-with different floating-point representations will give different results.
-
-<b> `BitcastConvertType(operand, new_element_type)` </b>
-
-Arguments          | Type            | Semantics
------------------- | --------------- | ---------------------------
-`operand`          | `XlaOp`         | array of type T with dims D
-`new_element_type` | `PrimitiveType` | type U
-
-The dimensions of the operand and the target shape must match. The bit-width of
-the source and destination element types must be equal. The source
-and destination element types must not be tuples.
-
-## Broadcast
-
-See also
-[`XlaBuilder::Broadcast`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Adds dimensions to an array by duplicating the data in the array.
-
-<b> `Broadcast(operand, broadcast_sizes)` </b>
-
-Arguments         | Type                | Semantics
------------------ | ------------------- | -------------------------------
-`operand`         | `XlaOp`             | The array to duplicate
-`broadcast_sizes` | `ArraySlice<int64>` | The sizes of the new dimensions
-
-The new dimensions are inserted on the left, i.e. if `broadcast_sizes` has
-values `{a0, ..., aN}` and the operand shape has dimensions `{b0, ..., bM}` then
-the shape of the output has dimensions `{a0, ..., aN, b0, ..., bM}`.
-
-The new dimensions index into copies of the operand, i.e.
-
-```
-output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
-```
-
-For example, if `operand` is a scalar `f32` with value `2.0f`, and
-`broadcast_sizes` is `{2, 3}`, then the result will be an array with shape
-`f32[2, 3]` and all the values in the result will be `2.0f`.
-
-## Call
-
-See also
-[`XlaBuilder::Call`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Invokes a computation with the given arguments.
-
-<b> `Call(computation, args...)` </b>
-
-| Arguments     | Type                   | Semantics                           |
-| ------------- | ---------------------- | ----------------------------------- |
-| `computation` | `XlaComputation`       | computation of type `T_0, T_1, ..., |
-:               :                        : T_N -> S` with N parameters of      :
-:               :                        : arbitrary type                      :
-| `args`        | sequence of N `XlaOp`s | N arguments of arbitrary type       |
-
-The arity and types of the `args` must match the parameters of the
-`computation`. It is allowed to have no `args`.
-
-## Clamp
-
-See also
-[`XlaBuilder::Clamp`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Clamps an operand to within the range between a minimum and maximum value.
-
-<b> `Clamp(min, operand, max)` </b>
-
-Arguments | Type    | Semantics
---------- | ------- | ---------------
-`min`     | `XlaOp` | array of type T
-`operand` | `XlaOp` | array of type T
-`max`     | `XlaOp` | array of type T
-
-Given an operand and minimum and maximum values, returns the operand if it is in
-the range between the minimum and maximum, else returns the minimum value if the
-operand is below this range or the maximum value if the operand is above this
-range.  That is, `clamp(a, x, b) =  min(max(a, x), b)`.
-
-All three arrays must be the same shape. Alternatively, as a restricted form of
-[broadcasting](broadcasting.md), `min` and/or `max` can be a scalar of type `T`.
-
-Example with scalar `min` and `max`:
-
-```
-let operand: s32[3] = {-1, 5, 9};
-let min: s32 = 0;
-let max: s32 = 6;
-==>
-Clamp(min, operand, max) = s32[3]{0, 5, 6};
-```
-
-## Collapse
-
-See also
-[`XlaBuilder::Collapse`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
-and the `tf.reshape` operation.
-
-Collapses dimensions of an array into one dimension.
-
-<b> `Collapse(operand, dimensions)` </b>
-
-Arguments    | Type           | Semantics
------------- | -------------- | -----------------------------------------------
-`operand`    | `XlaOp`        | array of type T
-`dimensions` | `int64` vector | in-order, consecutive subset of T's dimensions.
-
-Collapse replaces the given subset of the operand's dimensions by a single
-dimension. The input arguments are an arbitrary array of type T and a
-compile-time-constant vector of dimension indices. The dimension indices must be
-an in-order (low to high dimension numbers), consecutive subset of T's
-dimensions. Thus, {0, 1, 2}, {0, 1}, or {1, 2} are all valid dimension sets, but
-{1, 0} or {0, 2} are not. They are replaced by a single new dimension, in the
-same position in the dimension sequence as those they replace, with the new
-dimension size equal to the product of original dimension sizes. The lowest
-dimension number in `dimensions` is the slowest varying dimension (most major)
-in the loop nest which collapses these dimension, and the highest dimension
-number is fastest varying (most minor). See the `tf.reshape` operator
-if more general collapse ordering is needed.
-
-For example, let v be an array of 24 elements:
-
-```
-let v = f32[4x2x3] {{{10, 11, 12},  {15, 16, 17}},
-                    {{20, 21, 22},  {25, 26, 27}},
-                    {{30, 31, 32},  {35, 36, 37}},
-                    {{40, 41, 42},  {45, 46, 47}}};
-
-// Collapse to a single dimension, leaving one dimension.
-let v012 = Collapse(v, {0,1,2});
-then v012 == f32[24] {10, 11, 12, 15, 16, 17,
-                      20, 21, 22, 25, 26, 27,
-                      30, 31, 32, 35, 36, 37,
-                      40, 41, 42, 45, 46, 47};
-
-// Collapse the two lower dimensions, leaving two dimensions.
-let v01 = Collapse(v, {0,1});
-then v01 == f32[4x6] {{10, 11, 12, 15, 16, 17},
-                      {20, 21, 22, 25, 26, 27},
-                      {30, 31, 32, 35, 36, 37},
-                      {40, 41, 42, 45, 46, 47}};
-
-// Collapse the two higher dimensions, leaving two dimensions.
-let v12 = Collapse(v, {1,2});
-then v12 == f32[8x3] {{10, 11, 12},
-                      {15, 16, 17},
-                      {20, 21, 22},
-                      {25, 26, 27},
-                      {30, 31, 32},
-                      {35, 36, 37},
-                      {40, 41, 42},
-                      {45, 46, 47}};
-
-```
-
-## Concatenate
-
-See also
-[`XlaBuilder::ConcatInDim`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Concatenate composes an array from multiple array operands. The array is of the
-same rank as each of the input array operands (which must be of the same rank as
-each other) and contains the arguments in the order that they were specified.
-
-<b> `Concatenate(operands..., dimension)` </b>
-
-| Arguments   | Type                  | Semantics                              |
-| ----------- | --------------------- | -------------------------------------- |
-| `operands`  | sequence of N `XlaOp` | N arrays of type T with dimensions     |
-:             :                       : [L0, L1, ...]. Requires N >= 1.        :
-| `dimension` | `int64`               | A value in the interval `[0, N)` that  |
-:             :                       : names the dimension to be concatenated :
-:             :                       : between the `operands`.                :
-
-With the exception of `dimension` all dimensions must be the same. This is
-because XLA does not support "ragged" arrays. Also note that rank-0 values
-cannot be concatenated (as it's impossible to name the dimension along which the
-concatenation occurs).
-
-1-dimensional example:
-
-```
-Concat({{2, 3}, {4, 5}, {6, 7}}, 0)
->>> {2, 3, 4, 5, 6, 7}
-```
-
-2-dimensional example:
-
-```
-let a = {
-  {1, 2},
-  {3, 4},
-  {5, 6},
-};
-let b = {
-  {7, 8},
-};
-Concat({a, b}, 0)
->>> {
-  {1, 2},
-  {3, 4},
-  {5, 6},
-  {7, 8},
-}
-```
-
-Diagram:
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="https://www.tensorflow.org/images/ops_concatenate.png">
-</div>
-
-## Conditional
-
-See also
-[`XlaBuilder::Conditional`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-<b> `Conditional(pred, true_operand, true_computation, false_operand,
-false_computation)` </b>
-
-Arguments           | Type             | Semantics
-------------------- | ---------------- | ---------------------------------
-`pred`              | `XlaOp`          | Scalar of type `PRED`
-`true_operand`      | `XlaOp`          | Argument of type `T_0`
-`true_computation`  | `XlaComputation` | XlaComputation of type `T_0 -> S`
-`false_operand`     | `XlaOp`          | Argument of type `T_1`
-`false_computation` | `XlaComputation` | XlaComputation of type `T_1 -> S`
-
-Executes `true_computation` if `pred` is `true`, `false_computation` if `pred`
-is `false`, and returns the result.
-
-The `true_computation` must take in a single argument of type `T_0` and will be
-invoked with `true_operand` which must be of the same type. The
-`false_computation` must take in a single argument of type `T_1` and will be
-invoked with `false_operand` which must be of the same type. The type of the
-returned value of `true_computation` and `false_computation` must be the same.
-
-Note that only one of `true_computation` and `false_computation` will be
-executed depending on the value of `pred`.
-
-## Conv (convolution)
-
-See also
-[`XlaBuilder::Conv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-As ConvWithGeneralPadding, but the padding is specified in a short-hand way as
-either SAME or VALID. SAME padding pads the input (`lhs`) with zeroes so that
-the output has the same shape as the input when not taking striding into
-account. VALID padding simply means no padding.
-
-## ConvWithGeneralPadding (convolution)
-
-See also
-[`XlaBuilder::ConvWithGeneralPadding`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Computes a convolution of the kind used in neural networks. Here, a convolution
-can be thought of as a n-dimensional window moving across a n-dimensional base
-area and a computation is performed for each possible position of the window.
-
-| Arguments             | Type                 | Semantics                     |
-| --------------------- | -------------------- | ----------------------------- |
-| `lhs`                 | `XlaOp`              | rank n+2 array of inputs      |
-| `rhs`                 | `XlaOp`              | rank n+2 array of kernel      |
-:                       :                      : weights                       :
-| `window_strides`      | `ArraySlice<int64>`  | n-d array of kernel strides   |
-| `padding`             | `ArraySlice<         | n-d array of (low, high)      |
-:                       : pair<int64, int64>>` : padding                       :
-| `lhs_dilation`        | `ArraySlice<int64>`  | n-d lhs dilation factor array |
-| `rhs_dilation`        | `ArraySlice<int64>`  | n-d rhs dilation factor array |
-| `feature_group_count` | int64                | the number of feature groups  |
-
-Let n be the number of spatial dimensions. The `lhs` argument is a rank n+2
-array describing the base area. This is called the input, even though of course
-the rhs is also an input. In a neural network, these are the input activations.
-The n+2 dimensions are, in this order:
-
-*   `batch`: Each coordinate in this dimension represents an independent input
-    for which convolution is carried out.
-*   `z/depth/features`: Each (y,x) position in the base area has a vector
-    associated to it, which goes into this dimension.
-*   `spatial_dims`: Describes the `n` spatial dimensions that define the base
-    area that the window moves across.
-
-The `rhs` argument is a rank n+2 array describing the convolutional
-filter/kernel/window. The dimensions are, in this order:
-
-*   `output-z`: The `z` dimension of the output.
-*   `input-z`: The size of this dimension times `feature_group_count` should
-    equal the size of the `z` dimension in lhs.
-*   `spatial_dims`: Describes the `n` spatial dimensions that define the n-d
-    window that moves across the base area.
-
-The `window_strides` argument specifies the stride of the convolutional window
-in the spatial dimensions. For example, if the stride in the first spatial
-dimension is 3, then the window can only be placed at coordinates where the
-first spatial index is divisible by 3.
-
-The `padding` argument specifies the amount of zero padding to be applied to the
-base area. The amount of padding can be negative -- the absolute value of
-negative padding indicates the number of elements to remove from the specified
-dimension before doing the convolution. `padding[0]` specifies the padding for
-dimension `y` and `padding[1]` specifies the padding for dimension `x`. Each
-pair has the low padding as the first element and the high padding as the second
-element. The low padding is applied in the direction of lower indices while the
-high padding is applied in the direction of higher indices. For example, if
-`padding[1]` is `(2,3)` then there will be a padding by 2 zeroes on the left and
-by 3 zeroes on the right in the second spatial dimension. Using padding is
-equivalent to inserting those same zero values into the input (`lhs`) before
-doing the convolution.
-
-The `lhs_dilation` and `rhs_dilation` arguments specify the dilation factor to
-be applied to the lhs and rhs, respectively, in each spatial dimension. If the
-dilation factor in a spatial dimension is d, then d-1 holes are implicitly
-placed between each of the entries in that dimension, increasing the size of the
-array. The holes are filled with a no-op value, which for convolution means
-zeroes.
-
-Dilation of the rhs is also called atrous convolution. For more details, see
-`tf.nn.atrous_conv2d`. Dilation of the lhs is also called transposed
-convolution. For more details, see `tf.nn.conv2d_transpose`.
-
-The `feature_group_count` argument (default value 1) can be used for grouped
-convolutions. `feature_group_count` needs to be a divisor of both the input and
-the output feature dimension. If `feature_group_count` is greater than 1, it
-means that conceptually the input and output feature dimension and the `rhs`
-output feature dimension are split evenly into `feature_group_count` many
-groups, each group consisting of a consecutive subsequence of features. The
-input feature dimension of `rhs` needs to be equal to the `lhs` input feature
-dimension divided by `feature_group_count` (so it already has the size of a
-group of input features). The i-th groups are used together to compute
-`feature_group_count` many separate convolutions. The results of these
-convolutions are concatenated together in the output feature dimension.
-
-For depthwise convolution the `feature_group_count` argument would be set to the
-input feature dimension, and the filter would be reshaped from
-`[filter_height, filter_width, in_channels, channel_multiplier]` to
-`[filter_height, filter_width, 1, in_channels * channel_multiplier]`. For more
-details, see `tf.nn.depthwise_conv2d`.
-
-The output shape has these dimensions, in this order:
-
-*   `batch`: Same size as `batch` on the input (`lhs`).
-*   `z`: Same size as `output-z` on the kernel (`rhs`).
-*   `spatial_dims`: One value for each valid placement of the convolutional
-    window.
-
-The valid placements of the convolutional window are determined by the strides
-and the size of the base area after padding.
-
-To describe what a convolution does, consider a 2d convolution, and pick some
-fixed `batch`, `z`, `y`, `x` coordinates in the output. Then `(y,x)` is a
-position of a corner of the window within the base area (e.g. the upper left
-corner, depending on how you interpret the spatial dimensions). We now have a 2d
-window, taken from the base area, where each 2d point is associated to a 1d
-vector, so we get a 3d box. From the convolutional kernel, since we fixed the
-output coordinate `z`, we also have a 3d box. The two boxes have the same
-dimensions, so we can take the sum of the element-wise products between the two
-boxes (similar to a dot product). That is the output value.
-
-Note that if `output-z` is e.g., 5, then each position of the window produces 5
-values in the output into the `z` dimension of the output. These values differ
-in what part of the convolutional kernel is used - there is a separate 3d box of
-values used for each `output-z` coordinate. So you could think of it as 5
-separate convolutions with a different filter for each of them.
-
-Here is pseudo-code for a 2d convolution with padding and striding:
-
-```
-for (b, oz, oy, ox) {  // output coordinates
-  value = 0;
-  for (iz, ky, kx) {  // kernel coordinates and input z
-    iy = oy*stride_y + ky - pad_low_y;
-    ix = ox*stride_x + kx - pad_low_x;
-    if ((iy, ix) inside the base area considered without padding) {
-      value += input(b, iz, iy, ix) * kernel(oz, iz, ky, kx);
-    }
-  }
-  output(b, oz, oy, ox) = value;
-}
-```
-
-## ConvertElementType
-
-See also
-[`XlaBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Similar to an element-wise `static_cast` in C++, performs an element-wise
-conversion operation from a data shape to a target shape. The dimensions must
-match, and the conversion is an element-wise one; e.g. `s32` elements become
-`f32` elements via an `s32`-to-`f32` conversion routine.
-
-<b> `ConvertElementType(operand, new_element_type)` </b>
-
-Arguments          | Type            | Semantics
------------------- | --------------- | ---------------------------
-`operand`          | `XlaOp`         | array of type T with dims D
-`new_element_type` | `PrimitiveType` | type U
-
-The dimensions of the operand and the target shape must match. The source and
-destination element types must not be tuples.
-
-A conversion such as `T=s32` to `U=f32` will perform a normalizing int-to-float
-conversion routine such as round-to-nearest-even.
-
-> Note: The precise float-to-int and visa-versa conversions are currently
-> unspecified, but may become additional arguments to the convert operation in
-> the future.  Not all possible conversions have been implemented for all
->targets.
-
-```
-let a: s32[3] = {0, 1, 2};
-let b: f32[3] = convert(a, f32);
-then b == f32[3]{0.0, 1.0, 2.0}
-```
-
-## CrossReplicaSum
-
-See also
-[`XlaBuilder::CrossReplicaSum`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Computes a sum across replicas.
-
-<b> `CrossReplicaSum(operand)` </b>
-
-Arguments | Type    | Semantics
---------- | ------- | -----------------------------
-`operand` | `XlaOp` | Array to sum across replicas.
-| `replica_group_ids`    | `int64` vector | Group ID for each replica.      |
-
-The output shape is the same as the input shape. For example, if there are two
-replicas and the operand has the value `(1.0, 2.5)` and `(3.0, 5.25)`
-respectively on the two replicas, then the output value from this op will be
-`(4.0, 7.75)` on both replicas.
-
-`replica_group_ids` identifies the group ID of each replica. The group ID must
-either be empty (all replicas belong to a single group), or contain the same
-number of elements as the number of replicas. For example, if
-`replica_group_ids` = {0, 1, 2, 3, 0, 1, 2, 3} has eight replicas, there are
-four subgroups of replica IDs: {0, 4}, {1, 5}, {2, 6}, and {3, 7}. The size of
-each subgroup *must* be identical, so, for example, using:
-`replica_group_ids` = {0, 1, 2, 0} for four replicas is invalid.
-
-Computing the result of CrossReplicaSum requires having one input from each
-replica, so if one replica executes a CrossReplicaSum node more times than
-another, then the former replica will wait forever. Since the replicas are all
-running the same program, there are not a lot of ways for that to happen, but it
-is possible when a while loop's condition depends on data from infeed and the
-data that is infed causes the while loop to iterate more times on one replica
-than another.
-
-## CustomCall
-
-See also
-[`XlaBuilder::CustomCall`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Call a user-provided function within a computation.
-
-<b> `CustomCall(target_name, args..., shape)` </b>
-
-| Arguments     | Type                   | Semantics                         |
-| ------------- | ---------------------- | --------------------------------- |
-| `target_name` | `string`               | Name of the function. A call      |
-:               :                        : instruction will be emitted which :
-:               :                        : targets this symbol name.         :
-| `args`        | sequence of N `XlaOp`s | N arguments of arbitrary type,    |
-:               :                        : which will be passed to the       :
-:               :                        : function.                         :
-| `shape`       | `Shape`                | Output shape of the function      |
-
-The function signature is the same, regardless of the arity or type of args:
-
-```
-extern "C" void target_name(void* out, void** in);
-```
-
-For example, if CustomCall is used as follows:
-
-```
-let x = f32[2] {1,2};
-let y = f32[2x3] {{10, 20, 30}, {40, 50, 60}};
-
-CustomCall("myfunc", {x, y}, f32[3x3])
-```
-
-Here is an example of an implementation of `myfunc`:
-
-```
-extern "C" void myfunc(void* out, void** in) {
-  float (&x)[2] = *static_cast<float(*)[2]>(in[0]);
-  float (&y)[2][3] = *static_cast<float(*)[2][3]>(in[1]);
-  EXPECT_EQ(1, x[0]);
-  EXPECT_EQ(2, x[1]);
-  EXPECT_EQ(10, y[0][0]);
-  EXPECT_EQ(20, y[0][1]);
-  EXPECT_EQ(30, y[0][2]);
-  EXPECT_EQ(40, y[1][0]);
-  EXPECT_EQ(50, y[1][1]);
-  EXPECT_EQ(60, y[1][2]);
-  float (&z)[3][3] = *static_cast<float(*)[3][3]>(out);
-  z[0][0] = x[1] + y[1][0];
-  // ...
-}
-```
-
-The user-provided function must not have side-effects and its execution must be
-idempotent.
-
-> Note: The opaque nature of the user-provided function restricts optimization
-> opportunities for the compiler. Try to express your computation in terms of
-> native XLA ops whenever possible; only use CustomCall as a last resort.
-
-## Dot
-
-See also
-[`XlaBuilder::Dot`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-<b> `Dot(lhs, rhs)` </b>
-
-Arguments | Type    | Semantics
---------- | ------- | ---------------
-`lhs`     | `XlaOp` | array of type T
-`rhs`     | `XlaOp` | array of type T
-
-The exact semantics of this operation depend on the ranks of the operands:
-
-| Input                   | Output                | Semantics               |
-| ----------------------- | --------------------- | ----------------------- |
-| vector [n] `dot` vector | scalar                | vector dot product      |
-: [n]                     :                       :                         :
-| matrix [m x k] `dot`    | vector [m]            | matrix-vector           |
-: vector [k]              :                       : multiplication          :
-| matrix [m x k] `dot`    | matrix [m x n]        | matrix-matrix           |
-: matrix [k x n]          :                       : multiplication          :
-
-The operation performs sum of products over the last dimension of `lhs` and the
-one-before-last dimension of `rhs`. These are the "contracted" dimensions. The
-contracted dimensions of `lhs` and `rhs` must be of the same size. In practice,
-it can be used to perform dot products between vectors, vector/matrix
-multiplications or matrix/matrix multiplications.
-
-## DotGeneral
-
-See also
-[`XlaBuilder::DotGeneral`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-<b> `DotGeneral(lhs, rhs, dimension_numbers)` </b>
-
-Arguments           | Type                  | Semantics
-------------------- | --------------------- | ---------------
-`lhs`               | `XlaOp`               | array of type T
-`rhs`               | `XlaOp`               | array of type T
-`dimension_numbers` | `DotDimensionNumbers` | array of type T
-
-As Dot, but allows contracting and batch dimension numbers to be specified for
-both the 'lhs' and 'rhs'.
-
-| DotDimensionNumbers Fields | Type                    | Semantics
-| --------- | ----------------------- | ---------------
-| 'lhs_contracting_dimensions' | repeated int64 | 'lhs' contracting dimension numbers |
-| 'rhs_contracting_dimensions' | repeated int64 | 'rhs' contracting dimension numbers |
-| 'lhs_batch_dimensions' | repeated int64 | 'lhs' batch dimension numbers |
-| 'rhs_batch_dimensions' | repeated int64 | 'rhs' batch dimension numbers |
-
-DotGeneral performs the sum of products over contracting dimensions specified
-in 'dimension_numbers'.
-
-Associated contracting dimension numbers from the 'lhs' and 'rhs' do not need
-to be the same, but must be listed in the same order in both
-'lhs/rhs_contracting_dimensions' arrays and have the same dimension sizes.
-There must be exactly one contracting dimension on both 'lhs' and 'rhs'.
-
-Example with contracting dimension numbers:
-
-```
-lhs = { {1.0, 2.0, 3.0},
-        {4.0, 5.0, 6.0} }
-
-rhs = { {1.0, 1.0, 1.0},
-        {2.0, 2.0, 2.0} }
-
-DotDimensionNumbers dnums;
-dnums.add_lhs_contracting_dimensions(1);
-dnums.add_rhs_contracting_dimensions(1);
-
-DotGeneral(lhs, rhs, dnums) -> { {6.0, 12.0},
-                                 {15.0, 30.0} }
-```
-
-Associated batch dimension numbers from the 'lhs' and 'rhs' must have the same
-dimension number, must be listed in the same order in both arrays, must
-have the same dimension sizes, and must be ordered before contracting and
-non-contracting/non-batch dimension numbers.
-
-Example with batch dimension numbers (batch size 2, 2x2 matrices):
-
-```
-lhs = { { {1.0, 2.0},
-          {3.0, 4.0} },
-        { {5.0, 6.0},
-          {7.0, 8.0} } }
-
-rhs = { { {1.0, 0.0},
-          {0.0, 1.0} },
-        { {1.0, 0.0},
-          {0.0, 1.0} } }
-
-DotDimensionNumbers dnums;
-dnums.add_lhs_contracting_dimensions(2);
-dnums.add_rhs_contracting_dimensions(1);
-dnums.add_lhs_batch_dimensions(0);
-dnums.add_rhs_batch_dimensions(0);
-
-DotGeneral(lhs, rhs, dnums) -> { { {1.0, 2.0},
-                                   {3.0, 4.0} },
-                                 { {5.0, 6.0},
-                                   {7.0, 8.0} } }
-```
-
-| Input                               | Output            | Semantics        |
-| ----------------------------------- | ----------------- | ---------------- |
-| [b0, m, k] `dot` [b0, k, n]         | [b0, m, n]        |  batch matmul    |
-| [b0, b1, m, k] `dot` [b0, b1, k, n] | [b0, b1, m, n]    |  batch matmul    |
-
-It follows that the resulting dimension number starts with the batch dimension,
-then the 'lhs' non-contracting/non-batch dimension, and finally the 'rhs'
-non-contracting/non-batch dimension.
-
-## DynamicSlice
-
-See also
-[`XlaBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-DynamicSlice extracts a sub-array from the input array at dynamic
-`start_indices`. The size of the slice in each dimension is passed in
-`size_indices`, which specify the end point of exclusive slice intervals in each
-dimension: [start, start + size). The shape of `start_indices` must be rank ==
-1, with dimension size equal to the rank of `operand`.
-
-<b> `DynamicSlice(operand, start_indices, size_indices)` </b>
-
-| Arguments       | Type                | Semantics                           |
-| --------------- | ------------------- | ----------------------------------- |
-| `operand`       | `XlaOp`             | N dimensional array of type T       |
-| `start_indices` | `XlaOp`             | Rank 1 array of N integers          |
-:                 :                     : containing the starting indices of  :
-:                 :                     : the slice for each dimension. Value :
-:                 :                     : must be greater than or equal to    :
-:                 :                     : zero.                               :
-| `size_indices`  | `ArraySlice<int64>` | List of N integers containing the   |
-:                 :                     : slice size for each dimension. Each :
-:                 :                     : value must be strictly greater than :
-:                 :                     : zero, and start + size must be less :
-:                 :                     : than or equal to the size of the    :
-:                 :                     : dimension to avoid wrapping modulo  :
-:                 :                     : dimension size.                     :
-
-The effective slice indices are computed by applying the following
-transformation for each index `i` in `[1, N)` before performing the slice:
-
-```
-start_indices[i] = clamp(start_indices[i], 0, operand.dimension_size[i] - size_indices[i])
-```
-
-This ensures that the extracted slice is always in-bounds with respect to the
-operand array. If the slice is in-bounds before the transformation is applied,
-the transformation has no effect.
-
-1-dimensional example:
-
-```
-let a = {0.0, 1.0, 2.0, 3.0, 4.0}
-let s = {2}
-
-DynamicSlice(a, s, {2}) produces:
-  {2.0, 3.0}
-```
-
-2-dimensional example:
-
-```
-let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
-let s = {2, 1}
-
-DynamicSlice(b, s, {2, 2}) produces:
-  { { 7.0,  8.0},
-    {10.0, 11.0} }
-```
-## DynamicUpdateSlice
-
-See also
-[`XlaBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-DynamicUpdateSlice generates a result which is the value of the input array
-`operand`, with a slice `update` overwritten at `start_indices`.
-The shape of `update` determines the shape of the sub-array of the result which
-is updated.
-The shape of `start_indices` must be rank == 1, with dimension size equal to
-the rank of `operand`.
-
-<b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
-
-| Arguments       | Type    | Semantics                                        |
-| --------------- | ------- | ------------------------------------------------ |
-| `operand`       | `XlaOp` | N dimensional array of type T                    |
-| `update`        | `XlaOp` | N dimensional array of type T containing the     |
-:                 :         : slice update. Each dimension of update shape     :
-:                 :         : must be strictly greater than zero, and start +  :
-:                 :         : update must be less than or equal to the operand :
-:                 :         : size for each dimension to avoid generating      :
-:                 :         : out-of-bounds update indices.                    :
-| `start_indices` | `XlaOp` | Rank 1 array of N integers containing the        |
-:                 :         : starting indices of the slice for each           :
-:                 :         : dimension. Value must be greater than or equal   :
-:                 :         : to zero.                                         :
-
-The effective slice indices are computed by applying the following
-transformation for each index `i` in `[1, N)` before performing the slice:
-
-```
-start_indices[i] = clamp(start_indices[i], 0, operand.dimension_size[i] - update.dimension_size[i])
-```
-
-This ensures that the updated slice is always in-bounds with respect to the
-operand array. If the slice is in-bounds before the transformation is applied,
-the transformation has no effect.
-
-1-dimensional example:
-
-```
-let a = {0.0, 1.0, 2.0, 3.0, 4.0}
-let u = {5.0, 6.0}
-let s = {2}
-
-DynamicUpdateSlice(a, u, s) produces:
-  {0.0, 1.0, 5.0, 6.0, 4.0}
-```
-
-2-dimensional example:
-
-```
-let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
-let u =
- { {12.0,  13.0},
-   {14.0,  15.0},
-   {16.0,  17.0} }
-
-let s = {1, 1}
-
-DynamicUpdateSlice(b, u, s) produces:
- { {0.0,  1.0,  2.0},
-   {3.0, 12.0, 13.0},
-   {6.0, 14.0, 15.0},
-   {9.0, 16.0, 17.0} }
-```
-
-## Element-wise binary arithmetic operations
-
-See also
-[`XlaBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-A set of element-wise binary arithmetic operations is supported.
-
-<b> `Op(lhs, rhs)` </b>
-
-Where `Op` is one of `Add` (addition), `Sub` (subtraction), `Mul`
-(multiplication), `Div` (division), `Rem` (remainder), `Max` (maximum), `Min`
-(minimum), `LogicalAnd` (logical AND), or `LogicalOr` (logical OR).
-
-Arguments | Type    | Semantics
---------- | ------- | ----------------------------------------
-`lhs`     | `XlaOp` | left-hand-side operand: array of type T
-`rhs`     | `XlaOp` | right-hand-side operand: array of type T
-
-The arguments' shapes have to be either similar or compatible. See the
-[broadcasting](../../performance/xla/broadcasting.md) documentation about what it means for shapes to
-be compatible. The result of an operation has a shape which is the result of
-broadcasting the two input arrays. In this variant, operations between arrays of
-different ranks are *not* supported, unless one of the operands is a scalar.
-
-When `Op` is `Rem`, the sign of the result is taken from the dividend, and the
-absolute value of the result is always less than the divisor's absolute value.
-
-Integer division overflow (signed/unsigned division/remainder by zero or signed
-divison/remainder of `INT_SMIN` with `-1`) produces an implementation defined
-value.
-
-An alternative variant with different-rank broadcasting support exists for these
-operations:
-
-<b> `Op(lhs, rhs, broadcast_dimensions)` </b>
-
-Where `Op` is the same as above. This variant of the operation should be used
-for arithmetic operations between arrays of different ranks (such as adding a
-matrix to a vector).
-
-The additional `broadcast_dimensions` operand is a slice of integers used to
-expand the rank of the lower-rank operand up to the rank of the higher-rank
-operand. `broadcast_dimensions` maps the dimensions of the lower-rank shape to
-the dimensions of the higher-rank shape. The unmapped dimensions of the expanded
-shape are filled with dimensions of size one. Degenerate-dimension broadcasting
-then broadcasts the shapes along these degenerate dimensions to equalize the
-shapes of both operands. The semantics are described in detail on the
-[broadcasting page](../../performance/xla/broadcasting.md).
-
-## Element-wise comparison operations
-
-See also
-[`XlaBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-A set of standard element-wise binary comparison operations is supported. Note
-that standard IEEE 754 floating-point comparison semantics apply when comparing
-floating-point types.
-
-<b> `Op(lhs, rhs)` </b>
-
-Where `Op` is one of `Eq` (equal-to), `Ne` (not equal-to), `Ge`
-(greater-or-equal-than), `Gt` (greater-than), `Le` (less-or-equal-than), `Lt`
-(less-than).
-
-Arguments | Type    | Semantics
---------- | ------- | ----------------------------------------
-`lhs`     | `XlaOp` | left-hand-side operand: array of type T
-`rhs`     | `XlaOp` | right-hand-side operand: array of type T
-
-The arguments' shapes have to be either similar or compatible. See the
-[broadcasting](../../performance/xla/broadcasting.md) documentation about what it means for shapes to
-be compatible. The result of an operation has a shape which is the result of
-broadcasting the two input arrays with the element type `PRED`. In this variant,
-operations between arrays of different ranks are *not* supported, unless one of
-the operands is a scalar.
-
-An alternative variant with different-rank broadcasting support exists for these
-operations:
-
-<b> `Op(lhs, rhs, broadcast_dimensions)` </b>
-
-Where `Op` is the same as above. This variant of the operation should be used
-for comparison operations between arrays of different ranks (such as adding a
-matrix to a vector).
-
-The additional `broadcast_dimensions` operand is a slice of integers specifying
-the dimensions to use for broadcasting the operands. The semantics are described
-in detail on the [broadcasting page](../../performance/xla/broadcasting.md).
-
-## Element-wise unary functions
-
-XlaBuilder supports these element-wise unary functions:
-
-<b>`Abs(operand)`</b> Element-wise abs `x -> |x|`.
-
-<b>`Ceil(operand)`</b> Element-wise ceil `x -> ⌈x⌉`.
-
-<b>`Cos(operand)`</b> Element-wise cosine `x -> cos(x)`.
-
-<b>`Exp(operand)`</b> Element-wise natural exponential `x -> e^x`.
-
-<b>`Floor(operand)`</b> Element-wise floor `x -> ⌊x⌋`.
-
-<b>`IsFinite(operand)`</b> Tests whether each element of `operand` is finite,
-i.e., is not positive or negative infinity, and is not `NaN`. Returns an array
-of `PRED` values with the same shape as the input, where each element is `true`
-if and only if the corresponding input element is finite.
-
-<b>`Log(operand)`</b> Element-wise natural logarithm `x -> ln(x)`.
-
-<b>`LogicalNot(operand)`</b> Element-wise logical not `x -> !(x)`.
-
-<b>`Neg(operand)`</b> Element-wise negation `x -> -x`.
-
-<b>`Sign(operand)`</b> Element-wise sign operation `x -> sgn(x)` where
-
-$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ 0 & x = 0\\ 1 & x > 0 \end{cases}$$
-
-using the comparison operator of the element type of `operand`.
-
-<b>`Tanh(operand)`</b> Element-wise hyperbolic tangent `x -> tanh(x)`.
-
-
-Arguments | Type    | Semantics
---------- | ------- | ---------------------------
-`operand` | `XlaOp` | The operand to the function
-
-The function is applied to each element in the `operand` array, resulting in an
-array with the same shape. It is allowed for `operand` to be a scalar (rank 0).
-
-## Gather
-
-The XLA gather operation stitches together several slices (each slice at a
-potentially different runtime offset) of an input array.
-
-### General Semantics
-
-See also
-[`XlaBuilder::Gather`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-For a more intuitive description, see the "Informal Description" section below.
-
-<b> `gather(operand, start_indices, offset_dims, collapsed_slice_dims, slice_sizes, start_index_map)` </b>
-
-|Arguments         | Type                    | Semantics                       |
-|----------------- | ----------------------- | --------------------------------|
-|`operand`         | `XlaOp`                 | The array we’re gathering       |
-:                  :                         : from.                           :
-|`start_indices`   | `XlaOp`                 | Array containing the starting  |
-:                  :                         : indices of the slices we gather.:
-|`index_vector_dim` | `int64`                | The dimension in                |
-:                  :                         : `start_indices` that "contains" :
-:                  :                         : the starting indices.  See      :
-:                  :                         : below for a detailed            :
-:                  :                         : description.                    :
-|`offset_dims`     | `ArraySlice<int64>`     | The set of dimensions in  the   :
-:                  :                         : output shape that offset into a :
-:                  :                         : array sliced from operand.     :
-|`slice_sizes`     | `ArraySlice<int64>`      | `slice_sizes[i]` is the bounds |
-:                  :                          : for the slice on dimension `i`.:
-|`collapsed_slice_dims` | `ArraySlice<int64>` | The set of dimensions in each  :
-|                  :                          | slice that are collapsed away. :
-|                  :                          | These dimensions must have size:
-|                  :                          | 1.                             |
-|`start_index_map` | `ArraySlice<int64>`      | A map that describes how to map|
-:                  :                          : indices in `start_indices` to  :
-:                  :                          : to legal indices into operand. :
-
-For convenience, we label dimensions in the output array not in `offset_dims`
-as `batch_dims`.
-
-The output is an array of rank `batch_dims.size` + `operand.rank` -
-`collapsed_slice_dims`.size.
-
-If `index_vector_dim` is equal to `start_indices.rank` we implicitly consider
-`start_indices` to have a trailing `1` dimension (i.e. if `start_indices` was of
-shape `[6,7]` and `index_vector_dim` is `2` then we implicitly consider the
-shape of `start_indices` to be `[6,7,1]`).
-
-The bounds for the output array along dimension `i` is computed as follows:
-
-  1. If `i` is present in `batch_dims` (i.e. is equal to `batch_dims[k]` for
-     some `k`) then we pick the corresponding dimension bounds out of
-     `start_indices.shape`, skipping `index_vector_dim` (i.e. pick
-     `start_indices.shape.dims`[`k`] if `k` < `index_vector_dim` and
-     `start_indices.shape.dims`[`k`+`1`] otherwise).
-
-  2. If `i` is present in `offset_dims` (i.e. equal to `offset_dims`[`k`] for
-     some `k`) then we pick the corresponding bound out of `slice_sizes` after
-     accounting for `collapsed_slice_dims` (i.e. we pick
-     `adjusted_slice_sizes`[`k`] where `adjusted_slice_sizes` is `slice_sizes`
-     with the bounds at indices `collapsed_slice_dims` removed).
-
-Formally, the operand index `In` corresponding to an output index `Out` is
-computed as follows:
-
-  1. Let `G` = { `Out`[`k`] for `k` in `batch_dims` }.  Use `G` to slice out
-     vector `S` such that `S`[`i`] = `start_indices`[Combine(`G`, `i`)] where
-     Combine(A, b) inserts b at position `index_vector_dim` into A.  Note that
-     this is well defined even if `G` is empty -- if `G` is empty then `S` =
-     `start_indices`.
-
-  2. Create a starting index, `S`<sub>`in`</sub>, into `operand` using `S` by
-     scattering `S` using `start_index_map`.  More precisely:
-       1. `S`<sub>`in`</sub>[`start_index_map`[`k`]] = `S`[`k`] if `k` <
-          `start_index_map.size`.
-       2. `S`<sub>`in`</sub>[`_`] = `0` otherwise.
-
-  3. Create an index `O`<sub>`in`</sub> into `operand` by scattering the indices
-     at the offset dimensions in `Out` according to the `collapsed_slice_dims`
-     set.  More precisely:
-       1. `O`<sub>`in`</sub>[`expand_offset_dims`(`k`)] =
-          `Out`[`offset_dims`[`k`]] if `k` < `offset_dims.size`
-          (`expand_offset_dims` is defined below).
-       2. `O`<sub>`in`</sub>[`_`] = `0` otherwise.
-  4. `In` is `O`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
-     addition.
-
-`expand_offset_dims` is the monotonic function with domain [`0`, `offset.size`)
-and range [`0`, `operand.rank`) \ `collapsed_slice_dims`.  So if, e.g.,
-`offset.size` is `4`, `operand.rank` is `6` and `collapsed_slice_dims` is {`0`,
-`2`} then `expand_offset_dims` is {`0`→`1`, `1`→`3`, `2`→`4`, `3`→`5`}.
-
-### Informal Description and Examples
-
-Informally, every index `Out` in the output array corresponds to an element `E`
-in the operand array, computed as follows:
-
-  - We use the batch dimensions in `Out` to look up a starting index from
-    `start_indices`.
-
-  - We use `start_index_map` to map the starting index (which may have size less
-    than operand.rank) to a "full" starting index into operand.
-
-  - We dynamic-slice out a slice with size `slice_sizes` using the full starting
-    index.
-
-  - We reshape the slice by collapsing the `collapsed_slice_dims` dimensions.
-    Since all collapsed slice dimensions have to have bound 1 this reshape is
-    always legal.
-
-  - We use the offset dimensions in `Out` to index into this slice to get the
-    input element, `E`, corresponding to output index `Out`.
-
-`index_vector_dim` is set to `start_indices.rank` - `1` in all of the
-examples that follow.  More interesting values for `index_vector_dim` does not
-change the operation fundamentally, but makes the visual representation more
-cumbersome.
-
-To get an intuition on how all of the above fits together, let's look at an
-example that gathers 5 slices of shape `[8,6]` from a `[16,11]` array.  The
-position of a slice into the `[16,11]` array can be represented as an index
-vector of shape `S64[2]`, so the set of 5 positions can be represented as a
-`S64[5,2]` array.
-
-The behavior of the gather operation can then be depicted as an index
-transformation that takes [`G`,`O`<sub>`0`</sub>,`O`<sub>`1`</sub>], an index in
-the output shape, and maps it to an element in the input array in the following
-way:
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/ops_xla_gather_0.svg">
-</div>
-
-We first select an (`X`,`Y`) vector from the gather indices array using `G`.
-The element in the output array at index
-[`G`,`O`<sub>`0`</sub>,`O`<sub>`1`</sub>] is then the element in the input
-array at index [`X`+`O`<sub>`0`</sub>,`Y`+`O`<sub>`1`</sub>].
-
-`slice_sizes` is `[8,6]`, which decides the range of W<sub>`0`</sub> and
-W<sub>`1`</sub>, and this in turn decides the bounds of the slice.
-
-This gather operation acts as a batch dynamic slice with `G` as the batch
-dimension.
-
-The gather indices may be multidimensional.  For instance, a more general
-version of the example above using a "gather indices" array of shape `[4,5,2]`
-would translate indices like this:
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/ops_xla_gather_1.svg">
-</div>
-
-Again, this acts as a batch dynamic slice `G`<sub>`0`</sub> and
-`G`<sub>`1`</sub> as the batch dimensions.  The slice size is still `[8,6]`.
-
-The gather operation in XLA generalizes the informal semantics outlined above in
-the following ways:
-
- 1. We can configure which dimensions in the output shape are the offset
-    dimensions (dimensions containing `O`<sub>`0`</sub>, `O`<sub>`1`</sub> in
-    the last example).  The output batch dimensions (dimensions containing
-    `G`<sub>`0`</sub>, `G`<sub>`1`</sub> in the last example) are defined to be
-    the output dimensions that are not offset dimensions.
-
- 2. The number of output offset dimensions explicitly present in the output
-    shape may be smaller than the input rank.  These "missing" dimensions, which
-    are listed explicitly as `collapsed_slice_dims`, must have a slice size of
-    `1`.  Since they have a slice size of `1` the only valid index for them is
-    `0` and eliding them does not introduce ambiguity.
-
- 3. The slice extracted from the "Gather Indices" array ((`X`, `Y`) in the last
-    example) may have fewer elements than the input array rank, and an explicit
-    mapping dictates how the index should be expanded to have the same rank as
-    the input.
-
-As a final example, we use (2) and (3) to implement `tf.gather_nd`:
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/ops_xla_gather_2.svg">
-</div>
-
-`G`<sub>`0`</sub> and `G`<sub>`1`</sub> are used to slice out a starting index
-from the gather indices array as usual, except the starting index has only one
-element, `X`.  Similarly, there is only one output offset index with the value
-`O`<sub>`0`</sub>.  However, before being used as indices into the input array,
-these are expanded in accordance to "Gather Index Mapping" (`start_index_map` in
-the formal description) and "Offset Mapping" (`expand_offset_dims` in the formal
-description) into [`0`,`O`<sub>`0`</sub>] and [`X`,`0`] respectively, adding up
-to [`X`,`O`<sub>`0`</sub>].  In other words, the output index
-[`G`<sub>`0`</sub>,`G`<sub>`1`</sub>,`O`<sub>`0`</sub>] maps to the input index
-[`GatherIndices`[`G`<sub>`0`</sub>,`G`<sub>`1`</sub>,`0`],`X`] which gives us
-the semantics for `tf.gather_nd`.
-
-`slice_sizes` for this case is `[1,11]`.  Intuitively this means that every
-index `X` in the gather indices array picks an entire row and the result is the
-concatenation of all these rows.
-
-## GetTupleElement
-
-See also
-[`XlaBuilder::GetTupleElement`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Indexes into a tuple with a compile-time-constant value.
-
-The value must be a compile-time-constant so that shape inference can determine
-the type of the resulting value.
-
-This is analogous to `std::get<int N>(t)` in C++. Conceptually:
-
-```
-let v: f32[10] = f32[10]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-let s: s32 = 5;
-let t: (f32[10], s32) = tuple(v, s);
-let element_1: s32 = gettupleelement(t, 1);  // Inferred shape matches s32.
-```
-
-See also `tf.tuple`.
-
-## Infeed
-
-See also
-[`XlaBuilder::Infeed`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-<b> `Infeed(shape)` </b>
-
-| Argument | Type    | Semantics                                             |
-| -------- | ------- | ----------------------------------------------------- |
-| `shape`  | `Shape` | Shape of the data read from the Infeed interface. The |
-:          :         : layout field of the shape must be set to match the    :
-:          :         : layout of the data sent to the device; otherwise its  :
-:          :         : behavior is undefined.                                :
-
-Reads a single data item from the implicit Infeed streaming interface of the
-device, interpreting the data as the given shape and its layout, and returns a
-`XlaOp` of the data. Multiple Infeed operations are allowed in a
-computation, but there must be a total order among the Infeed operations. For
-example, two Infeeds in the code below have a total order since there is a
-dependency between the while loops.
-
-```
-result1 = while (condition, init = init_value) {
-  Infeed(shape)
-}
-
-result2 = while (condition, init = result1) {
-  Infeed(shape)
-}
-```
-
-Nested tuple shapes are not supported. For an empty tuple shape, the Infeed
-operation is effectively a no-op and proceeds without reading any data from the
-Infeed of the device.
-
-> Note: We plan to allow multiple Infeed operations without a total order, in
-> which case the compiler will provide information about how the Infeed
-> operations are serialized in the compiled program.
-
-## Iota
-
-<b> `Iota()` </b>
-
-Builds a constant literal on device rather than a potentially large host
-transfer.  Creates a rank 1 tensor of values starting at zero and incrementing
-by one.
-
-Arguments          | Type            | Semantics
------------------- | --------------- | ---------------------------
-`type`             | `PrimitiveType` | type U
-`size`             | `int64`         | The number of elements in the tensor.
-
-## Map
-
-See also
-[`XlaBuilder::Map`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-<b> `Map(operands..., computation)` </b>
-
-| Arguments         | Type                   | Semantics                      |
-| ----------------- | ---------------------- | ------------------------------ |
-| `operands`        | sequence of N `XlaOp`s | N arrays of types T_0..T_{N-1} |
-| `computation`     | `XlaComputation`       | computation of type `T_0, T_1, |
-:                   :                        : ..., T_{N + M -1} -> S` with N :
-:                   :                        : parameters of type T and M of  :
-:                   :                        : arbitrary type                 :
-| `dimensions`      | `int64` array          | array of map dimensions        |
-
-Applies a scalar function over the given `operands` arrays, producing an array
-of the same dimensions where each element is the result of the mapped function
-applied to the corresponding elements in the input arrays.
-
-The mapped function is an arbitrary computation with the restriction that it has
-N inputs of scalar type `T` and a single output with type `S`. The output has
-the same dimensions as the operands except that the element type T is replaced
-with S.
-
-For example: `Map(op1, op2, op3, computation, par1)` maps `elem_out <-
-computation(elem1, elem2, elem3, par1)` at each (multi-dimensional) index in the
-input arrays to produce the output array.
-
-## Pad
-
-See also
-[`XlaBuilder::Pad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-<b> `Pad(operand, padding_value, padding_config)` </b>
-
-| Arguments        | Type            | Semantics                               |
-| ---------------- | --------------- | --------------------------------------- |
-| `operand`        | `XlaOp`         | array of type `T`                       |
-| `padding_value`  | `XlaOp`         | scalar of type `T` to fill in the added |
-:                  :                 : padding                                 :
-| `padding_config` | `PaddingConfig` | padding amount on both edges (low,      |
-:                  :                 : high) and between the elements of each  :
-:                  :                 : dimension                               :
-
-Expands the given `operand` array by padding around the array as well as between
-the elements of the array with the given `padding_value`. `padding_config`
-specifies the amount of edge padding and the interior padding for each
-dimension.
-
-`PaddingConfig` is a repeated field of `PaddingConfigDimension`, which contains
-three fields for each dimension: `edge_padding_low`, `edge_padding_high`, and
-`interior_padding`. `edge_padding_low` and `edge_padding_high` specify the
-amount of padding added at the low-end (next to index 0) and the high-end (next
-to the highest index) of each dimension respectively. The amount of edge padding
-can be negative -- the absolute value of negative padding indicates the number
-of elements to remove from the specified dimension. `interior_padding` specifies
-the amount of padding added between any two elements in each dimension. Interior
-padding occurs logically before edge padding, so in the case of negative edge
-padding elements are removed from the interior-padded operand. This operation is
-a no-op if the edge padding pairs are all (0, 0) and the interior padding values
-are all 0. The figure below shows examples of different `edge_padding` and
-`interior_padding` values for a two-dimensional array.
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="https://www.tensorflow.org/images/ops_pad.png">
-</div>
-
-## Recv
-
-See also
-[`XlaBuilder::Recv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-<b> `Recv(shape, channel_handle)` </b>
-
-| Arguments        | Type            | Semantics                            |
-| ---------------- | --------------- | ------------------------------------ |
-| `shape`          | `Shape`         | shape of the data to receive         |
-| `channel_handle` | `ChannelHandle` | unique identifier for each send/recv pair |
-
-Receives data of the given shape from a `Send` instruction in another
-computation that shares the same channel handle. Returns a
-XlaOp for the received data.
-
-The client API of `Recv` operation represents synchronous communication.
-However, the instruction is internally decomposed into 2 HLO instructions
-(`Recv` and `RecvDone`) to enable asynchronous data transfers. See also
-[`HloInstruction::CreateRecv` and `HloInstruction::CreateRecvDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h).
-
-<b>`Recv(const Shape& shape, int64 channel_id)`</b>
-
-Allocates resources required to receive data from a `Send` instruction with the
-same channel_id. Returns a context for the allocated resources, which is used
-by a following `RecvDone` instruction to wait for the completion of the data
-transfer. The context is a tuple of {receive buffer (shape), request identifier
-(U32)} and it can only be used by a `RecvDone` instruction.
-
-<b> `RecvDone(HloInstruction context)` </b>
-
-Given a context created by a `Recv` instruction, waits for the data transfer to
-complete and returns the received data.
-
-## Reduce
-
-See also
-[`XlaBuilder::Reduce`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Applies a reduction function to one or more arrays in parallel.
-
-<b> `Reduce(operands..., init_values..., computation, dimensions)` </b>
-
-Arguments     | Type                  | Semantics
-------------- | --------------------- | ---------------------------------------
-`operands`    | Sequence of N `XlaOp` | N arrays of types `T_0, ..., T_N`.
-`init_values` | Sequence of N `XlaOp` | N scalars of types `T_0, ..., T_N`.
-`computation` | `XlaComputation`      | computation of type
-              :                       : `T_0, ..., T_N, T_0, ..., T_N -> Collate(T_0, ..., T_N)`
-`dimensions`  | `int64` array         | unordered array of dimensions to reduce
-
-Where:
-* N is required to be greater or equal to 1.
-* All input arrays must have the same dimensions.
-* If `N = 1`, `Collate(T)` is `T`.
-* If `N > 1`, `Collate(T_0, ..., T_N)` is a tuple of `N` elements of type `T`.
-
-The output of the op is `Collate(Q_0, ..., Q_N)` where `Q_i` is an array of type
-`T_i`, the dimensions of which are described below.
-
-This operation reduces one or more dimensions of each input array into scalars.
-The rank of each returned array is `rank(operand) - len(dimensions)`.
-`init_value` is the initial value used for every reduction and may be inserted
-anywhere during computation by the back-end. In most cases, `init_value` is an
-identity of the reduction function (for example, 0 for addition). The applied
-`computation` is always passed the `init_value` on the left-hand side.
-
-The evaluation order of the reduction function is arbitrary and may be
-non-deterministic. Therefore, the reduction function should not be overly
-sensitive to reassociation.
-
-Some reduction functions like addition are not strictly associative for floats.
-However, if the range of the data is limited, floating-point addition is close
-enough to being associative for most practical uses. It is possible to conceive
-of some completely non-associative reductions, however, and these will produce
-incorrect or unpredictable results in XLA reductions.
-
-As an example, when reducing across one dimension in a single 1D array with
-values [10, 11, 12, 13], with reduction function `f` (this is `computation`)
-then that could be computed as
-
-`f(10, f(11, f(12, f(init_value, 13)))`
-
-but there are also many other possibilities, e.g.
-
-`f(init_value, f(f(10, f(init_value, 11)), f(f(init_value, 12), f(init_value, 13))))`
-
-The following is a rough pseudo-code example of how reduction could be
-implemented, using summation as the reduction computation with an initial value
-of 0.
-
-```python
-result_shape <- remove all dims in dimensions from operand_shape
-
-# Iterate over all elements in result_shape. The number of r's here is equal
-# to the rank of the result
-for r0 in range(result_shape[0]), r1 in range(result_shape[1]), ...:
-  # Initialize this result element
-  result[r0, r1...] <- 0
-
-  # Iterate over all the reduction dimensions
-  for d0 in range(dimensions[0]), d1 in range(dimensions[1]), ...:
-    # Increment the result element with the value of the operand's element.
-    # The index of the operand's element is constructed from all ri's and di's
-    # in the right order (by construction ri's and di's together index over the
-    # whole operand shape).
-    result[r0, r1...] += operand[ri... di]
-```
-
-Here's an example of reducing a 2D array (matrix). The shape has rank 2,
-dimension 0 of size 2 and dimension 1 of size 3:
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="https://www.tensorflow.org/images/ops_2d_matrix.png">
-</div>
-
-Results of reducing dimensions 0 or 1 with an "add" function:
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_from_2d_matrix.png">
-</div>
-
-Note that both reduction results are 1D arrays. The diagram shows one as column
-and another as row just for visual convenience.
-
-For a more complex example, here is a 3D array. Its rank is 3, dimension 0 of
-size 4, dimension 1 of size 2 and dimension 2 of size 3. For simplicity, the
-values 1 to 6 are replicated across dimension 0.
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_from_3d_matrix.png">
-</div>
-
-Similarly to the 2D example, we can reduce just one dimension. If we reduce
-dimension 0, for example, we get a rank-2 array where all values across
-dimension 0 were folded into a scalar:
-
-```text
-|  4   8  12 |
-| 16  20  24 |
-```
-
-If we reduce dimension 2, we also get a rank-2 array where all values across
-dimension 2 were folded into a scalar:
-
-```text
-| 6  15 |
-| 6  15 |
-| 6  15 |
-| 6  15 |
-```
-
-Note that the relative order between the remaining dimensions in the input is
-preserved in the output, but some dimensions may get assigned new numbers (since
-the rank changes).
-
-We can also reduce multiple dimensions. Add-reducing dimensions 0 and 1 produces
-the 1D array `| 20 28 36 |`.
-
-Reducing the 3D array over all its dimensions produces the scalar `84`.
-
-When `N > 1`, reduce function application is slightly more complex, as it is
-applied simultaneously to all inputs. For example, consider the following
-reduction function, which can be used to compute the max and the argmax of a
-a 1-D tensor in parallel:
-
-```
-f: (Float, Int, Float, Int) -> Float, Int
-f(max, argmax, value, index):
-  if value >= argmax:
-    return (value, index)
-  else:
-    return (max, argmax)
-```
-
-For 1-D Input arrays `V = Float[N], K = Int[N]`, and init values
-`I_V = Float, I_K =  Int`, the result `f_(N-1)` of reducing across the only
-input dimension is equivalent to the following recursive application:
-```
-f_0 = f(I_V, I_K, V_0, K_0)
-f_1 = f(f_0.first, f_0.second, V_1, K_1)
-...
-f_(N-1) = f(f_(N-2).first, f_(N-2).second, V_(N-1), K_(N-1))
-```
-
-Applying this reduction to an array of values, and an array of sequential
-indices (i.e. iota), will co-iterate over the arrays, and return a tuple
-containing the maximal value and the matching index.
-
-## ReducePrecision
-
-See also
-[`XlaBuilder::ReducePrecision`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Models the effect of converting floating-point values to a lower-precision
-format (such as IEEE-FP16) and back to the original format.  The number of
-exponent and mantissa bits in the lower-precision format can be specified
-arbitrarily, although all bit sizes may not be supported on all hardware
-implementations.
-
-<b> `ReducePrecision(operand, mantissa_bits, exponent_bits)` </b>
-
-Arguments       | Type    | Semantics
---------------- | ------- | -------------------------------------------------
-`operand`       | `XlaOp` | array of floating-point type `T`.
-`exponent_bits` | `int32` | number of exponent bits in lower-precision format
-`mantissa_bits` | `int32` | number of mantissa bits in lower-precision format
-
-The result is an array of type `T`.  The input values are rounded to the nearest
-value representable with the given number of mantissa bits (using "ties to even"
-semantics), and any values that exceed the range specified by the number of
-exponent bits are clamped to positive or negative infinity.  `NaN` values are
-retained, although they may be converted to canonical `NaN` values.
-
-The lower-precision format must have at least one exponent bit (in order to
-distinguish a zero value from an infinity, since both have a zero mantissa), and
-must have a non-negative number of mantissa bits.  The number of exponent or
-mantissa bits may exceed the corresponding value for type `T`; the corresponding
-portion of the conversion is then simply a no-op.
-
-## ReduceWindow
-
-See also
-[`XlaBuilder::ReduceWindow`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Applies a reduction function to all elements in each window of the input
-multi-dimensional array, producing an output multi-dimensional array with the
-same number of elements as the number of valid positions of the window. A
-pooling layer can be expressed as a `ReduceWindow`. Similar to
-[`Reduce`](#reduce), the applied `computation` is always passed the `init_value`
-on the left-hand side.
-
-<b> `ReduceWindow(operand, init_value, computation, window_dimensions,
-window_strides, padding)` </b>
-
-| Arguments           | Type                | Semantics                        |
-| ------------------- | ------------------- | -------------------------------- |
-| `operand`           | `XlaOp`             | N dimensional array containing   |
-:                     :                     : elements of type T. This is the  :
-:                     :                     : base area on which the window is :
-:                     :                     : placed.                          :
-| `init_value`        | `XlaOp`             | Starting value for the           |
-:                     :                     : reduction. See [Reduce](#reduce) :
-:                     :                     : for details.                     :
-| `computation`       | `XlaComputation`    | Reduction function of type `T, T |
-:                     :                     : -> T`, to apply to all elements  :
-:                     :                     : in each window                   :
-| `window_dimensions` | `ArraySlice<int64>` | array of integers for window     |
-:                     :                     : dimension values                 :
-| `window_strides`    | `ArraySlice<int64>` | array of integers for window     |
-:                     :                     : stride values                    :
-| `padding`           | `Padding`           | padding type for window          |
-:                     :                     : (Padding\:\:kSame or             :
-:                     :                     : Padding\:\:kValid)               :
-
-Below code and figure shows an example of using `ReduceWindow`. Input is a
-matrix of size [4x6] and both window_dimensions and window_stride_dimensions are
-[2x3].
-
-```
-// Create a computation for the reduction (maximum).
-XlaComputation max;
-{
-  XlaBuilder builder(client_, "max");
-  auto y = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "y");
-  auto x = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "x");
-  builder.Max(y, x);
-  max = builder.Build().ConsumeValueOrDie();
-}
-
-// Create a ReduceWindow computation with the max reduction computation.
-XlaBuilder builder(client_, "reduce_window_2x3");
-auto shape = ShapeUtil::MakeShape(F32, {4, 6});
-auto input = builder.Parameter(0, shape, "input");
-builder.ReduceWindow(
-    input, *max,
-    /*init_val=*/builder.ConstantLiteral(LiteralUtil::MinValue(F32)),
-    /*window_dimensions=*/{2, 3},
-    /*window_stride_dimensions=*/{2, 3},
-    Padding::kValid);
-```
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_window.png">
-</div>
-
-Stride of 1 in a dimension specifies that the position of a window in the
-dimension is 1 element away from its adjacent window. In order to specify that
-no windows overlap with each other, window_stride_dimensions should be equal to
-window_dimensions. The figure below illustrates the use of two different stride
-values. Padding is applied to each dimension of the input and the calculations
-are the same as though the input came in with the dimensions it has after
-padding.
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:75%" src="https://www.tensorflow.org/images/ops_reduce_window_stride.png">
-</div>
-
-The evaluation order of the reduction function is arbitrary and may be
-non-deterministic. Therefore, the reduction function should not be overly
-sensitive to reassociation. See the discussion about associativity in the
-context of [`Reduce`](#reduce) for more details.
-
-## Reshape
-
-See also
-[`XlaBuilder::Reshape`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
-and the [`Collapse`](#collapse) operation.
-
-Reshapes the dimensions of an array into a new configuration.
-
-<b> `Reshape(operand, new_sizes)` </b>
-<b> `Reshape(operand, dimensions, new_sizes)` </b>
-
-Arguments    | Type           | Semantics
------------- | -------------- | ---------------------------------------
-`operand`    | `XlaOp`        | array of type T
-`dimensions` | `int64` vector | order in which dimensions are collapsed
-`new_sizes`  | `int64` vector | vector of sizes of new dimensions
-
-Conceptually, reshape first flattens an array into a one-dimensional vector of
-data values, and then refines this vector into a new shape. The input arguments
-are an arbitrary array of type T, a compile-time-constant vector of dimension
-indices, and a compile-time-constant vector of dimension sizes for the result.
-The values in the `dimension` vector, if given, must be a permutation of all of
-T's dimensions; the default if not given is `{0, ..., rank - 1}`. The order of
-the dimensions in `dimensions` is from slowest-varying dimension (most major) to
-fastest-varying dimension (most minor) in the loop nest which collapses the
-input array into a single dimension. The `new_sizes` vector determines the size
-of the output array. The value at index 0 in `new_sizes` is the size of
-dimension 0, the value at index 1 is the size of dimension 1, and so on. The
-product of the `new_size` dimensions must equal the product of the operand's
-dimension sizes. When refining the collapsed array into the multidimensional
-array defined by `new_sizes`, the dimensions in `new_sizes` are ordered from
-slowest varying (most major) and to fastest varying (most minor).
-
-For example, let v be an array of 24 elements:
-
-```
-let v = f32[4x2x3] {{{10, 11, 12}, {15, 16, 17}},
-                    {{20, 21, 22}, {25, 26, 27}},
-                    {{30, 31, 32}, {35, 36, 37}},
-                    {{40, 41, 42}, {45, 46, 47}}};
-
-In-order collapse:
-let v012_24 = Reshape(v, {0,1,2}, {24});
-then v012_24 == f32[24] {10, 11, 12, 15, 16, 17, 20, 21, 22, 25, 26, 27,
-                         30, 31, 32, 35, 36, 37, 40, 41, 42, 45, 46, 47};
-
-let v012_83 = Reshape(v, {0,1,2}, {8,3});
-then v012_83 == f32[8x3] {{10, 11, 12}, {15, 16, 17},
-                          {20, 21, 22}, {25, 26, 27},
-                          {30, 31, 32}, {35, 36, 37},
-                          {40, 41, 42}, {45, 46, 47}};
-
-Out-of-order collapse:
-let v021_24 = Reshape(v, {1,2,0}, {24});
-then v012_24 == f32[24]  {10, 20, 30, 40, 11, 21, 31, 41, 12, 22, 32, 42,
-                          15, 25, 35, 45, 16, 26, 36, 46, 17, 27, 37, 47};
-
-let v021_83 = Reshape(v, {1,2,0}, {8,3});
-then v021_83 == f32[8x3] {{10, 20, 30}, {40, 11, 21},
-                          {31, 41, 12}, {22, 32, 42},
-                          {15, 25, 35}, {45, 16, 26},
-                          {36, 46, 17}, {27, 37, 47}};
-
-
-let v021_262 = Reshape(v, {1,2,0}, {2,6,2});
-then v021_262 == f32[2x6x2] {{{10, 20}, {30, 40},
-                              {11, 21}, {31, 41},
-                              {12, 22}, {32, 42}},
-                             {{15, 25}, {35, 45},
-                              {16, 26}, {36, 46},
-                              {17, 27}, {37, 47}}};
-```
-
-As a special case, reshape can transform a single-element array to a scalar and
-vice versa. For example,
-
-```
-Reshape(f32[1x1] {{5}}, {0,1}, {}) == 5;
-Reshape(5, {}, {1,1}) == f32[1x1] {{5}};
-```
-
-## Rev (reverse)
-
-See also
-[`XlaBuilder::Rev`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-<b>`Rev(operand, dimensions)`</b>
-
-Arguments    | Type                | Semantics
------------- | ------------------- | ---------------------
-`operand`    | `XlaOp`             | array of type T
-`dimensions` | `ArraySlice<int64>` | dimensions to reverse
-
-Reverses the order of elements in the `operand` array along the specified
-`dimensions`, generating an output array of the same shape. Each element of the
-operand array at a multidimensional index is stored into the output array at a
-transformed index. The multidimensional index is transformed by reversing the
-index in each dimension to be reversed (i.e., if a dimension of size N is one of
-the reversing dimensions, its index i is transformed into N - 1 - i).
-
-One use for the `Rev` operation is to reverse the convolution weight array along
-the two window dimensions during the gradient computation in neural networks.
-
-## RngNormal
-
-See also
-[`XlaBuilder::RngNormal`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Constructs an output of a given shape with random numbers generated following
-the $$N(\mu, \sigma)$$ normal distribution. The parameters $$\mu$$ and
-$$\sigma$$, and output shape have to have a floating point elemental type. The
-parameters furthermore have to be scalar valued.
-
-<b>`RngNormal(mu, sigma, shape)`</b>
-
-| Arguments | Type    | Semantics                                           |
-| --------- | ------- | --------------------------------------------------- |
-| `mu`      | `XlaOp` | Scalar of type T specifying mean of generated       |
-:           :         : numbers                                   :
-| `sigma`   | `XlaOp` | Scalar of type T specifying standard deviation of   |
-:           :         : generated numbers                                   :
-| `shape`   | `Shape` | Output shape of type T                              |
-
-## RngUniform
-
-See also
-[`XlaBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Constructs an output of a given shape with random numbers generated following
-the uniform distribution over the interval $$[a,b)$$. The parameters and output
-element type have to be a boolean type, an integral type or a floating point
-types, and the types have to be consistent. The CPU and GPU backends currently
-only support F64, F32, F16, BF16, S64, U64, S32 and U32. Furthermore, the
-parameters need to be scalar valued. If $$b <= a$$ the result is
-implementation-defined.
-
-<b>`RngUniform(a, b, shape)`</b>
-
-| Arguments | Type                    | Semantics                         |
-| --------- | ----------------------- | --------------------------------- |
-| `a`       | `XlaOp`                 | Scalar of type T specifying lower |
-:           :                         : limit of interval                 :
-| `b`       | `XlaOp`                 | Scalar of type T specifying upper |
-:           :                         : limit of interval                 :
-| `shape`   | `Shape`                 | Output shape of type T            |
-
-## Scatter
-
-The XLA scatter operation generates a result which is the value of the input
-tensor `operand`, with several slices (at indices specified by
-`scatter_indices`) updated with the values in `updates` using
-`update_computation`.
-
-See also
-[`XlaBuilder::Scatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-<b> `scatter(operand, scatter_indices, updates, update_computation, index_vector_dim, update_window_dims, inserted_window_dims, scatter_dims_to_operand_dims)` </b>
-
-|Arguments         | Type                   | Semantics                        |
-|------------------|------------------------|----------------------------------|
-|`operand`         | `XlaOp`                | Tensor to be scattered into.     |
-|`scatter_indices` | `XlaOp`                | Tensor containing the starting   |
-:                  :                        : indices of the slices that must  :
-:                  :                        : be scattered to.                 :
-|`updates`         | `XlaOp`                | Tensor containing the values that|
-:                  :                        : must be used for scattering.     :
-|`update_computation`| `XlaComputation`     | Computation to be used for       |
-:                  :                        : combining the existing values in :
-:                  :                        : the input tensor and the updates :
-:                  :                        : during scatter. This computation :
-:                  :                        : should be of type `T, T -> T`.   :
-|`index_vector_dim`| `int64`                | The dimension in                 |
-:                  :                        : `scatter_indices` that contains  :
-:                  :                        : the starting indices.            :
-|`update_window_dims`| `ArraySlice<int64>`  | The set of dimensions in         |
-:                  :                        : `updates` shape that are _window :
-:                  :                        : dimensions_.                     :
-|`inserted_window_dims`| `ArraySlice<int64>`| The set of _window dimensions_   |
-:                  :                        : that must be inserted into       :
-:                  :                        : `updates` shape.                 :
-|`scatter_dims_to_operand_dims`| `ArraySlice<int64>`  | A dimensions map from  |
-:                  :                        : the scatter indices to the       :
-:                  :                        : operand index space. This array  :
-:                  :                        : is interpreted as mapping `i` to :
-:                  :                        : `scatter_dims_to_operand_dims[i]`:
-:                  :                        : . It has to be one-to-one and    :
-:                  :                        : total.                           :
-
-If `index_vector_dim` is equal to `scatter_indices.rank` we implicitly consider
-`scatter_indices` to have a trailing `1` dimension.
-
-We define `update_scatter_dims` of type `ArraySlice<int64>` as the set of
-dimensions in `updates` shape that are not in `update_window_dims`, in ascending
-order.
-
-The arguments of scatter should follow these constraints:
-
-  - `updates` tensor must be of rank `update_window_dims.size +
-  scatter_indices.rank - 1`.
-
-  - Bounds of dimension `i` in `updates` must conform to the following:
-      - If `i` is present in `update_window_dims` (i.e. equal to
-        `update_window_dims`[`k`] for some `k`), then the bound of dimension
-        `i` in `updates` must not exceed the corresponding bound of `operand`
-        after accounting for the `inserted_window_dims` (i.e.
-        `adjusted_window_bounds`[`k`], where `adjusted_window_bounds` contains
-        the bounds of `operand` with the bounds at indices
-        `inserted_window_dims` removed).
-      - If `i` is present in `update_scatter_dims` (i.e. equal to
-        `update_scatter_dims`[`k`] for some `k`), then the bound of dimension
-        `i` in `updates` must be equal to the corresponding bound of
-        `scatter_indices`, skipping `index_vector_dim` (i.e.
-        `scatter_indices.shape.dims`[`k`], if `k` < `index_vector_dim` and
-        `scatter_indices.shape.dims`[`k+1`] otherwise).
-
-  - `update_window_dims` must be in ascending order, not have any repeating
-    dimension numbers, and be in the range `[0, updates.rank)`.
-
-  - `inserted_window_dims` must be in ascending order, not have any
-    repeating dimension numbers, and be in the range `[0, operand.rank)`.
-
-  - `scatter_dims_to_operand_dims.size` must be equal to
-    `scatter_indices`[`index_vector_dim`], and its values must be in the range
-    `[0, operand.rank)`.
-
-For a given index `U` in the `updates` tensor, the corresponding index `I` in
-the `operand` tensor into which this update has to be applied is computed as
-follows:
-
-  1. Let `G` = { `U`[`k`] for `k` in `update_scatter_dims` }. Use `G` to look up
-     an index vector `S` in the `scatter_indices` tensor such that `S`[`i`] =
-     `scatter_indices`[Combine(`G`, `i`)] where Combine(A, b) inserts b at
-     positions `index_vector_dim` into A.
-  2. Create an index `S`<sub>`in`</sub> into `operand` using `S` by scattering
-     `S` using the `scatter_dims_to_operand_dims` map. More formally:
-       1. `S`<sub>`in`</sub>[`scatter_dims_to_operand_dims`[`k`]] = `S`[`k`] if
-          `k` < `scatter_dims_to_operand_dims.size`.
-       2. `S`<sub>`in`</sub>[`_`] = `0` otherwise.
-  3. Create an index `W`<sub>`in`</sub> into `operand` by scattering the indices
-     at `update_window_dims` in `U` according to `inserted_window_dims`.
-     More formally:
-       1. `W`<sub>`in`</sub>[`window_dims_to_operand_dims`(`k`)] = `U`[`k`] if
-          `k` < `update_window_dims.size`, where `window_dims_to_operand_dims`
-          is the monotonic function with domain [`0`, `update_window_dims.size`)
-          and range [`0`, `operand.rank`) \\ `inserted_window_dims`. (For
-          example, if `update_window_dims.size` is `4`, `operand.rank` is `6`,
-          and `inserted_window_dims` is {`0`, `2`} then
-          `window_dims_to_operand_dims` is {`0`→`1`, `1`→`3`, `2`→`4`,
-          `3`→`5`}).
-       2. `W`<sub>`in`</sub>[`_`] = `0` otherwise.
-  4. `I` is `W`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
-     addition.
-
-In summary, the scatter operation can be defined as follows.
-
-   - Initialize `output` with `operand`, i.e. for all indices `O` in the
-     `operand` tensor:\
-       `output`[`O`] = `operand`[`O`]
-   - For every index `U` in the `updates` tensor and the corresponding index `O`
-     in the `operand` tensor:\
-       `output`[`O`] = `update_computation`(`output`[`O`], `updates`[`U`])
-
-The order in which updates are applied is non-deterministic. So, when multiple
-indices in `updates` refer to the same index in `operand`, the corresponding
-value in `output` will be non-deterministic.
-
-Note that the first parameter that is passed into the `update_computation` will
-always be the current value from the `output` tensor and the second parameter
-will always be the value from the `updates` tensor. This is important
-specifically for cases when the `update_computation` is _not commutative_.
-
-Informally, the scatter op can be viewed as an _inverse_ of the gather op, i.e.
-the scatter op updates the elements in the input that are extracted by the
-corresponding gather op.
-
-For a detailed informal description and examples, refer to the
-"Informal Description" section under `Gather`.
-
-## Select
-
-See also
-[`XlaBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Constructs an output array from elements of two input arrays, based on the
-values of a predicate array.
-
-<b> `Select(pred, on_true, on_false)` </b>
-
-Arguments  | Type    | Semantics
----------- | ------- | ------------------
-`pred`     | `XlaOp` | array of type PRED
-`on_true`  | `XlaOp` | array of type T
-`on_false` | `XlaOp` | array of type T
-
-The arrays `on_true` and `on_false` must have the same shape. This is also the
-shape of the output array. The array `pred` must have the same dimensionality as
-`on_true` and `on_false`, with the `PRED` element type.
-
-For each element `P` of `pred`, the corresponding element of the output array is
-taken from `on_true` if the value of `P` is `true`, and from `on_false` if the
-value of `P` is `false`. As a restricted form of [broadcasting]
-(broadcasting.md), `pred` can be a scalar of type `PRED`. In this case, the
-output array is taken wholly from `on_true` if `pred` is `true`, and from
-`on_false` if `pred` is `false`.
-
-Example with non-scalar `pred`:
-
-```
-let pred: PRED[4] = {true, false, false, true};
-let v1: s32[4] = {1, 2, 3, 4};
-let v2: s32[4] = {100, 200, 300, 400};
-==>
-Select(pred, v1, v2) = s32[4]{1, 200, 300, 4};
-```
-
-Example with scalar `pred`:
-
-```
-let pred: PRED = true;
-let v1: s32[4] = {1, 2, 3, 4};
-let v2: s32[4] = {100, 200, 300, 400};
-==>
-Select(pred, v1, v2) = s32[4]{1, 2, 3, 4};
-```
-
-Selections between tuples are supported. Tuples are considered to be scalar
-types for this purpose. If `on_true` and `on_false` are tuples (which must have
-the same shape!) then `pred` has to be a scalar of type `PRED`.
-
-## SelectAndScatter
-
-See also
-[`XlaBuilder::SelectAndScatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-This operation can be considered as a composite operation that first computes
-`ReduceWindow` on the `operand` array to select an element from each window, and
-then scatters the `source` array to the indices of the selected elements to
-construct an output array with the same shape as the operand array. The binary
-`select` function is used to select an element from each window by applying it
-across each window, and it is called with the property that the first
-parameter's index vector is lexicographically less than the second parameter's
-index vector. The `select` function returns `true` if the first parameter is
-selected and returns `false` if the second parameter is selected, and the
-function must hold transitivity (i.e., if `select(a, b)` and `select(b, c)` are
-`true`, then `select(a, c)` is also `true`) so that the selected element does
-not depend on the order of the elements traversed for a given window.
-
-The function `scatter` is applied at each selected index in the output array. It
-takes two scalar parameters:
-
-1.  Current value at the selected index in the output array
-2.  The scatter value from `source` that applies to the selected index
-
-It combines the two parameters and returns a scalar value that's used to update
-the value at the selected index in the output array. Initially, all indices of
-the output array are set to `init_value`.
-
-The output array has the same shape as the `operand` array and the `source`
-array must have the same shape as the result of applying a `ReduceWindow`
-operation on the `operand` array. `SelectAndScatter` can be used to
-backpropagate the gradient values for a pooling layer in a neural network.
-
-<b>`SelectAndScatter(operand, select, window_dimensions, window_strides,
-padding, source, init_value, scatter)`</b>
-
-| Arguments           | Type                | Semantics                        |
-| ------------------- | ------------------- | -------------------------------- |
-| `operand`           | `XlaOp`             | array of type T over which the   |
-:                     :                     : windows slide                    :
-| `select`            | `XlaComputation`    | binary computation of type `T, T |
-:                     :                     : -> PRED`, to apply to all        :
-:                     :                     : elements in each window; returns :
-:                     :                     : `true` if the first parameter is :
-:                     :                     : selected and returns `false` if  :
-:                     :                     : the second parameter is selected :
-| `window_dimensions` | `ArraySlice<int64>` | array of integers for window     |
-:                     :                     : dimension values                 :
-| `window_strides`    | `ArraySlice<int64>` | array of integers for window     |
-:                     :                     : stride values                    :
-| `padding`           | `Padding`           | padding type for window          |
-:                     :                     : (Padding\:\:kSame or             :
-:                     :                     : Padding\:\:kValid)               :
-| `source`            | `XlaOp`             | array of type T with the values  |
-:                     :                     : to scatter                       :
-| `init_value`        | `XlaOp`             | scalar value of type T for the   |
-:                     :                     : initial value of the output      :
-:                     :                     : array                            :
-| `scatter`           | `XlaComputation`    | binary computation of type `T, T |
-:                     :                     : -> T`, to apply each scatter     :
-:                     :                     : source element with its          :
-:                     :                     : destination element              :
-
-The figure below shows examples of using `SelectAndScatter`, with the `select`
-function computing the maximal value among its parameters. Note that when the
-windows overlap, as in the figure (2) below, an index of the `operand` array may
-be selected multiple times by different windows. In the figure, the element of
-value 9 is selected by both of the top windows (blue and red) and the binary
-addition `scatter` function produces the output element of value 8 (2 + 6).
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%"
-    src="https://www.tensorflow.org/images/ops_scatter_to_selected_window_element.png">
-</div>
-
-The evaluation order of the `scatter` function is arbitrary and may be
-non-deterministic. Therefore, the `scatter` function should not be overly
-sensitive to reassociation. See the discussion about associativity in the
-context of [`Reduce`](#reduce) for more details.
-
-## Send
-
-See also
-[`XlaBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-<b> `Send(operand, channel_handle)` </b>
-
-Arguments        | Type            | Semantics
----------------- | --------------- | -----------------------------------------
-`operand`        | `XlaOp`         | data to send (array of type T)
-`channel_handle` | `ChannelHandle` | unique identifier for each send/recv pair
-
-Sends the given operand data to a `Recv` instruction in another computation
-that shares the same channel handle. Does not return any data.
-
-Similar to the `Recv` operation, the client API of `Send` operation represents
-synchronous communication, and is internally decomposed into 2 HLO instructions
-(`Send` and `SendDone`) to enable asynchronous data transfers. See also
-[`HloInstruction::CreateSend` and `HloInstruction::CreateSendDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h).
-
-<b>`Send(HloInstruction operand, int64 channel_id)`</b>
-
-Initiates an asynchronous transfer of the operand to the resources allocated by
-the `Recv` instruction with the same channel id. Returns a context, which is
-used by a following `SendDone` instruction to wait for the completion of the
-data transfer. The context is a tuple of {operand (shape), request identifier
-(U32)} and it can only be used by a `SendDone` instruction.
-
-<b> `SendDone(HloInstruction context)` </b>
-
-Given a context created by a `Send` instruction, waits for the data transfer to
-complete.  The instruction does not return any data.
-
-<b> Scheduling of channel instructions </b>
-
-The execution order of the 4 instructions for each channel (`Recv`, `RecvDone`,
-`Send`, `SendDone`) is as below.
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:70%" src="../../images/send_recv_order.png">
-</div>
-
-* `Recv` happens before `Send`
-* `Send` happens before `RecvDone`
-* `Recv` happens before `RecvDone`
-* `Send` happens before `SendDone`
-
-When the backend compilers generate a linear schedule for each computation that
-communicates via channel instructions, there must not be cycles across the
-computations. For example, below schedules lead to deadlocks.
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/send_recv_schedule.png">
-</div>
-
-## Slice
-
-See also
-[`XlaBuilder::Slice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-Slicing extracts a sub-array from the input array. The sub-array is of the same
-rank as the input and contains the values inside a bounding box within the input
-array where the dimensions and indices of the bounding box are given as
-arguments to the slice operation.
-
-<b> `Slice(operand, start_indices, limit_indices)` </b>
-
-| Arguments       | Type                | Semantics                            |
-| --------------- | ------------------- | ------------------------------------ |
-| `operand`       | `XlaOp`             | N dimensional array of type T        |
-| `start_indices` | `ArraySlice<int64>` | List of N integers containing the    |
-:                 :                     : starting indices of the slice for    :
-:                 :                     : each dimension. Values must be       :
-:                 :                     : greater than or equal to zero.       :
-| `limit_indices` | `ArraySlice<int64>` | List of N integers containing the    |
-:                 :                     : ending indices (exclusive) for the   :
-:                 :                     : slice for each dimension. Each value :
-:                 :                     : must be greater than or equal to the :
-:                 :                     : respective `start_indices` value for :
-:                 :                     : the dimension and less than or equal :
-:                 :                     : to the size of the dimension.        :
-
-1-dimensional example:
-
-```
-let a = {0.0, 1.0, 2.0, 3.0, 4.0}
-Slice(a, {2}, {4}) produces:
-  {2.0, 3.0}
-```
-
-2-dimensional example:
-
-```
-let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
-
-Slice(b, {2, 1}, {4, 3}) produces:
-  { { 7.0,  8.0},
-    {10.0, 11.0} }
-```
-
-## Sort
-
-See also
-[`XlaBuilder::Sort`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-There are two versions of the Sort instruction: a single-operand and a
-two-operand version.
-
-<b>`Sort(operand)`</b>
-
-Arguments   | Type    | Semantics
------------ | ------- | --------------------
-`operand`   | `XlaOp` | The operand to sort.
-`dimension` | `int64` | The dimension along which to sort.
-
-Sorts the elements in the operand in ascending order along the provided
-dimension. For example, for a rank-2 (matrix) operand, a `dimension` value of 0
-will sort each column independently, and a `dimension` value of 1 will sort each
-row independently. If the operand's elements have floating point type, and the
-operand contains NaN elements, the order of elements in the output is
-implementation-defined.
-
-<b>`Sort(key, value)`</b>
-
-Sorts both the key and the value operands. The keys are sorted as in the
-single-operand version. The values are sorted according to the order of their
-corresponding keys. For example, if the inputs are `keys = [3, 1]` and
-`values = [42, 50]`, then the output of the sort is the tuple 
-`{[1, 3], [50, 42]}`.
-
-The sort is not guaranteed to be stable, that is, if the keys array contains
-duplicates, the order of their corresponding values may not be preserved.
-
-Arguments   | Type    | Semantics
------------ | ------- | -------------------
-`keys`      | `XlaOp` | The sort keys.
-`values`    | `XlaOp` | The values to sort.
-`dimension` | `int64` | The dimension along which to sort.
-
-The `keys` and `values` must have the same dimensions, but may have different
-element types.
-
-## Transpose
-
-See also the `tf.reshape` operation.
-
-<b>`Transpose(operand)`</b>
-
-Arguments     | Type                | Semantics
-------------- | ------------------- | ------------------------------
-`operand`     | `XlaOp`             | The operand to transpose.
-`permutation` | `ArraySlice<int64>` | How to permute the dimensions.
-
-
-Permutes the operand dimensions with the given permutation, so
-`∀ i . 0 ≤ i < rank ⇒ input_dimensions[permutation[i]] = output_dimensions[i]`.
-
-This is the same as Reshape(operand, permutation,
-                            Permute(permutation, operand.shape.dimensions)).
-
-## Tuple
-
-See also
-[`XlaBuilder::Tuple`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-A tuple containing a variable number of data handles, each of which has its own
-shape.
-
-This is analogous to `std::tuple` in C++. Conceptually:
-
-```
-let v: f32[10] = f32[10]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-let s: s32 = 5;
-let t: (f32[10], s32) = tuple(v, s);
-```
-
-Tuples can be deconstructed (accessed) via the [`GetTupleElement`]
-(#gettupleelement) operation.
-
-## While
-
-See also
-[`XlaBuilder::While`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
-
-<b> `While(condition, body, init)` </b>
-
-| Arguments   | Type             | Semantics                                |
-| ----------- | ---------------- | ---------------------------------------- |
-| `condition` | `XlaComputation` | XlaComputation of type `T -> PRED` which |
-:             :                  : defines the termination condition of the :
-:             :                  : loop.                                    :
-| `body`      | `XlaComputation` | XlaComputation of type `T -> T` which    |
-:             :                  : defines the body of the loop.            :
-| `init`      | `T`              | Initial value for the parameter of       |
-:             :                  : `condition` and `body`.                  :
-
-Sequentially executes the `body` until the `condition` fails. This is similar to
-a typical while loop in many other languages except for the differences and
-restrictions listed below.
-
-*   A `While` node returns a value of type `T`, which is the result from the
-    last execution of the `body`.
-*   The shape of the type `T` is statically determined and must be the same
-    across all iterations.
-
-The T parameters of the computations are initialized with the `init` value in
-the first iteration and are automatically updated to the new result from `body`
-in each subsequent iteration.
-
-One main use case of the `While` node is to implement the repeated execution of
-training in neural networks. Simplified pseudocode is shown below with a graph
-that represents the computation. The code can be found in
-[`while_test.cc`](https://www.tensorflow.org/code/tensorflow/compiler/xla/tests/while_test.cc).
-The type `T` in this example is a `Tuple` consisting of an `int32` for the
-iteration count and a `vector[10]` for the accumulator. For 1000 iterations, the
-loop keeps adding a constant vector to the accumulator.
-
-```
-// Pseudocode for the computation.
-init = {0, zero_vector[10]} // Tuple of int32 and float[10].
-result = init;
-while (result(0) < 1000) {
-  iteration = result(0) + 1;
-  new_vector = result(1) + constant_vector[10];
-  result = {iteration, new_vector};
-}
-```
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="https://www.tensorflow.org/images/ops_while.png">
-</div>
-- 
GitLab


From 57d31aa599c83014397a22bbb8f1a27a33b0ade3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Oct 2018 22:30:20 -0700
Subject: [PATCH 458/570] Remove dependency on epsilon for diagonal shampoo.

PiperOrigin-RevId: 215857772
---
 .../contrib/opt/python/training/shampoo.py       | 16 +++++++++++-----
 .../contrib/opt/python/training/shampoo_test.py  |  8 ++++----
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/shampoo.py b/tensorflow/contrib/opt/python/training/shampoo.py
index f161521b97..e542f46892 100644
--- a/tensorflow/contrib/opt/python/training/shampoo.py
+++ b/tensorflow/contrib/opt/python/training/shampoo.py
@@ -108,7 +108,8 @@ class ShampooOptimizer(optimizer.Optimizer):
       precond_update_interval: We should update the preconditioners after
                                this many steps. Default = 1. Usually less than
                                svd_interval.
-      epsilon:  epsilon * I_n is added to each mat_gbar_j for stability
+      epsilon:  epsilon * I_n is added to each mat_gbar_j for stability for
+                non-diagonal version of shampoo.
       alpha:  total power of the preconditioners.
       use_iterative_root: should the optimizer use SVD (faster) or the
                           iterative root method (for TPU) for finding the
@@ -394,15 +395,20 @@ class ShampooOptimizer(optimizer.Optimizer):
           assert self._mat_gbar_decay == 1.0
           mat_g_updated = state_ops.scatter_add(mat_g, indices,
                                                 mat_gbar_weight_t * grad_outer)
-          mat_h = math_ops.pow(
-              array_ops.gather(mat_g_updated, indices) + self._epsilon,
-              neg_alpha)
+          mat_g_updated_slice = array_ops.gather(mat_g_updated, indices)
+          mat_h = array_ops.where(
+              math_ops.greater(mat_g_updated_slice, 0),
+              math_ops.pow(mat_g_updated_slice, neg_alpha),
+              array_ops.zeros_like(mat_g_updated_slice))
         else:
           mat_g_updated = self._weighted_average(mat_g,
                                                  self._mat_gbar_decay,
                                                  mat_gbar_decay_t,
                                                  mat_gbar_weight_t * grad_outer)
-          mat_h = math_ops.pow(mat_g_updated + self._epsilon, neg_alpha)
+          mat_h = array_ops.where(
+              math_ops.greater(mat_g_updated, 0),
+              math_ops.pow(mat_g_updated, neg_alpha),
+              array_ops.zeros_like(mat_g_updated))
 
         # Need to do the transpose to ensure that the tensor becomes
         # a d_{i+1} x ... x d_n x d_0 x ... d_i tensor as described above.
diff --git a/tensorflow/contrib/opt/python/training/shampoo_test.py b/tensorflow/contrib/opt/python/training/shampoo_test.py
index a2fd8fbd87..e88c8221a0 100644
--- a/tensorflow/contrib/opt/python/training/shampoo_test.py
+++ b/tensorflow/contrib/opt/python/training/shampoo_test.py
@@ -279,7 +279,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
       # Update rule is var = var - lr * gg^{-0.5} * grad
       # lr = 1
       mat_g = (grad_np * grad_np)
-      new_val_np = init_var_np - np.power(mat_g + RIDGE_EPSILON, -0.5) * grad_np
+      new_val_np = init_var_np - np.power(mat_g, -0.5) * grad_np
 
       self.assertAllCloseAccordingToType(
           new_val_np, new_val, atol=TOLERANCE, rtol=TOLERANCE)
@@ -288,7 +288,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
       new_val = sess.run(var)
 
       mat_g += (grad_np_2 * grad_np_2)
-      new_val_np -= np.power(mat_g + RIDGE_EPSILON, -0.5) * grad_np_2
+      new_val_np -= np.power(mat_g, -0.5) * grad_np_2
 
       self.assertAllCloseAccordingToType(
           new_val_np, new_val, atol=TOLERANCE, rtol=TOLERANCE)
@@ -339,7 +339,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
 
       mat_g1 = np.sum(
           grad_np * grad_np, axis=1, keepdims=True) / grad_np.shape[0]
-      mat_left = np.power(mat_g1 + RIDGE_EPSILON, -0.25)
+      mat_left = np.power(mat_g1, -0.25)
       mat_g2 = np.dot(grad_np.transpose(), grad_np) / grad_np.shape[1]
       mat_right = np_power(mat_g2 + RIDGE_EPSILON * np.eye(size[1]), -0.25)
       new_val_np = init_var_np - np.dot(grad_np * mat_left, mat_right)
@@ -353,7 +353,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
 
       mat_g1 += np.sum(
           grad_np_2 * grad_np_2, axis=1, keepdims=True) / grad_np_2.shape[0]
-      mat_left = np.power(mat_g1 + RIDGE_EPSILON, -0.25)
+      mat_left = np.power(mat_g1, -0.25)
       mat_g2 += np.dot(grad_np_2.transpose(), grad_np_2) / grad_np_2.shape[1]
       mat_right = np_power(mat_g2 + RIDGE_EPSILON * np.eye(size[1]), -0.25)
       new_val_np -= np.dot(grad_np_2 * mat_left, mat_right)
-- 
GitLab


From 3b94d75a9e10ef8ef33760d0ef6aad326e1353ba Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 01:22:02 -0700
Subject: [PATCH 459/570] Merge the different LSTM EvalFloat/EvalHybrid calls
 into a single file.

PiperOrigin-RevId: 215870962
---
 tensorflow/contrib/lite/kernels/BUILD         |  13 +-
 .../kernels/bidirectional_sequence_lstm.cc    | 333 +------
 .../lite/kernels/internal/kernel_utils.cc     | 598 ------------
 .../lite/kernels/internal/kernel_utils.h      | 184 ----
 tensorflow/contrib/lite/kernels/lstm.cc       | 300 +-----
 tensorflow/contrib/lite/kernels/lstm_eval.cc  | 909 ++++++++++++++++++
 tensorflow/contrib/lite/kernels/lstm_eval.h   |  79 ++
 .../kernels/unidirectional_sequence_lstm.cc   | 310 +-----
 8 files changed, 1061 insertions(+), 1665 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/lstm_eval.cc
 create mode 100644 tensorflow/contrib/lite/kernels/lstm_eval.h

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 95e387814d..68636fb070 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -234,11 +234,11 @@ cc_library(
         ":activation_functor",
         ":eigen_support",
         ":kernel_util",
+        ":lstm_eval",
         ":op_macros",
         ":padding",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite:util",
         "//tensorflow/contrib/lite/c:c_api_internal",
         "//tensorflow/contrib/lite/kernels:gemm_support",
         "//tensorflow/contrib/lite/kernels/internal:audio_utils",
@@ -254,6 +254,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "lstm_eval",
+    srcs = ["lstm_eval.cc"],
+    hdrs = ["lstm_eval.h"],
+    deps = [
+        "//tensorflow/contrib/lite/c:c_api_internal",
+        "//tensorflow/contrib/lite/kernels/internal:kernel_utils",
+        "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
+    ],
+)
+
 cc_library(
     name = "builtin_ops",
     srcs = ["register.cc"],
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
index 0532528f52..a326827b1e 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/lstm_eval.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
 namespace tflite {
@@ -694,330 +695,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-TfLiteStatus EvalFloat(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
-    const TfLiteTensor* aux_input_to_input_weights,
-    const TfLiteTensor* aux_input_to_forget_weights,
-    const TfLiteTensor* aux_input_to_cell_weights,
-    const TfLiteTensor* aux_input_to_output_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, bool forward_sequence, int output_offset,
-    TfLiteTensor* scratch_buffer, TfLiteTensor* activation_state,
-    TfLiteTensor* cell_state, TfLiteTensor* output) {
-  const int max_time = input->dims->data[0];
-  const int n_batch = input->dims->data[1];
-  const int n_input = input->dims->data[2];
-  const int aux_input_size = (aux_input) ? aux_input->dims->data[2] : 0;
-
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  // Index the scratch buffers pointers to the global scratch buffer.
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  const float* input_to_input_weights_ptr =
-      (use_cifg) ? nullptr : input_to_input_weights->data.f;
-  const float* recurrent_to_input_weights_ptr =
-      (use_cifg) ? nullptr : recurrent_to_input_weights->data.f;
-  const float* input_gate_bias_ptr =
-      (use_cifg) ? nullptr : input_gate_bias->data.f;
-  const float* cell_to_input_weights_ptr =
-      (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr;
-  const float* cell_to_forget_weights_ptr =
-      (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
-  const float* cell_to_output_weights_ptr =
-      (use_peephole) ? cell_to_output_weights->data.f : nullptr;
-  const float* projection_weights_ptr =
-      (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-
-  float* aux_input_ptr = nullptr;
-  float* aux_input_to_input_weights_ptr = nullptr;
-  float* aux_input_to_forget_weights_ptr = nullptr;
-  float* aux_input_to_cell_weights_ptr = nullptr;
-  float* aux_input_to_output_weights_ptr = nullptr;
-  if (aux_input_size > 0) {
-    aux_input_ptr = aux_input->data.f;
-    aux_input_to_input_weights_ptr = aux_input_to_input_weights->data.f;
-    aux_input_to_forget_weights_ptr = aux_input_to_forget_weights->data.f;
-    aux_input_to_cell_weights_ptr = aux_input_to_cell_weights->data.f;
-    aux_input_to_output_weights_ptr = aux_input_to_output_weights->data.f;
-  }
-
-  // Loop through the sequence.
-  const int input_step = n_batch * n_input;
-  const int output_step = n_batch * output->dims->data[2];
-  for (int t = 0; t < max_time; t++) {
-    // If this is the forward_sequence, step forward, otherwise step backwards.
-    const int t_rel = forward_sequence ? t : max_time - t - 1;
-    const float* input_ptr = input->data.f + t_rel * input_step;
-    float* output_ptr_time =
-        output->data.f + t_rel * output_step + output_offset;
-
-    kernel_utils::LstmStepWithAuxInput(
-        input_ptr, input_to_input_weights_ptr, input_to_forget_weights->data.f,
-        input_to_cell_weights->data.f, input_to_output_weights->data.f,
-        aux_input_ptr, aux_input_to_input_weights_ptr,
-        aux_input_to_forget_weights_ptr, aux_input_to_cell_weights_ptr,
-        aux_input_to_output_weights_ptr, recurrent_to_input_weights_ptr,
-        recurrent_to_forget_weights->data.f, recurrent_to_cell_weights->data.f,
-        recurrent_to_output_weights->data.f, cell_to_input_weights_ptr,
-        cell_to_forget_weights_ptr, cell_to_output_weights_ptr,
-        input_gate_bias_ptr, forget_gate_bias->data.f, cell_bias->data.f,
-        output_gate_bias->data.f, projection_weights_ptr, projection_bias_ptr,
-        params, n_batch, n_cell, n_input, aux_input_size, n_output,
-        activation_state->data.f, cell_state->data.f, input_gate_scratch,
-        forget_gate_scratch, cell_scratch, output_gate_scratch,
-        output_ptr_time);
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalHybrid(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
-    const TfLiteTensor* aux_input_to_input_weights,
-    const TfLiteTensor* aux_input_to_forget_weights,
-    const TfLiteTensor* aux_input_to_cell_weights,
-    const TfLiteTensor* aux_input_to_output_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, bool forward_sequence, int output_offset,
-    TfLiteTensor* scratch_buffer, TfLiteTensor* scaling_factors,
-    TfLiteTensor* prod_scaling_factors, TfLiteTensor* recovered_cell_weights,
-    TfLiteTensor* input_quantized, TfLiteTensor* aux_input_quantized,
-    TfLiteTensor* output_state_quantized, TfLiteTensor* cell_state_quantized,
-    TfLiteTensor* output_state, TfLiteTensor* cell_state,
-    TfLiteTensor* output) {
-  const int max_time = input->dims->data[0];
-  const int n_batch = input->dims->data[1];
-  const int n_input = input->dims->data[2];
-  const int aux_input_size = (aux_input) ? aux_input->dims->data[2] : 0;
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  int8_t* input_to_input_weights_ptr = nullptr;
-  float input_to_input_weights_scale = 1.0f;
-  int8_t* recurrent_to_input_weights_ptr = nullptr;
-  float recurrent_to_input_weights_scale = 1.0f;
-  float* input_gate_bias_ptr = nullptr;
-  if (!use_cifg) {
-    input_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
-    recurrent_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
-    input_gate_bias_ptr = input_gate_bias->data.f;
-    input_to_input_weights_scale = input_to_input_weights->params.scale;
-    recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
-  }
-
-  int8_t* cell_to_input_weights_ptr = nullptr;
-  int8_t* cell_to_forget_weights_ptr = nullptr;
-  int8_t* cell_to_output_weights_ptr = nullptr;
-  float cell_to_input_weights_scale = 1.0f;
-  float cell_to_forget_weights_scale = 1.0f;
-  float cell_to_output_weights_scale = 1.0f;
-  if (use_peephole) {
-    if (!use_cifg) {
-      cell_to_input_weights_ptr =
-          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
-      cell_to_input_weights_scale = cell_to_input_weights->params.scale;
-    }
-    cell_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
-    cell_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
-    cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
-    cell_to_output_weights_scale = cell_to_output_weights->params.scale;
-  }
-
-  const int8_t* projection_weights_ptr =
-      (projection_weights == nullptr)
-          ? nullptr
-          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
-  const float projection_weights_scale =
-      (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-
-  // Required tensors, pointers are non-null.
-  const int8_t* input_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
-  const float input_to_forget_weights_scale =
-      input_to_forget_weights->params.scale;
-  const int8_t* input_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
-  const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
-  const int8_t* input_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
-  const float input_to_output_weights_scale =
-      input_to_output_weights->params.scale;
-  const int8_t* recurrent_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
-  const float recurrent_to_forget_weights_scale =
-      recurrent_to_forget_weights->params.scale;
-  const int8_t* recurrent_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
-  const float recurrent_to_cell_weights_scale =
-      recurrent_to_cell_weights->params.scale;
-  const int8_t* recurrent_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
-  const float recurrent_to_output_weights_scale =
-      recurrent_to_output_weights->params.scale;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* output_state_ptr = output_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-
-  // Temporary storage for quantized values and scaling factors.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
-  int8_t* quantized_aux_input_ptr =
-      (aux_input_quantized == nullptr)
-          ? nullptr
-          : reinterpret_cast<int8_t*>(aux_input_quantized->data.uint8);
-  int8_t* quantized_output_state_ptr =
-      reinterpret_cast<int8_t*>(output_state_quantized->data.uint8);
-  int8_t* quantized_cell_state_ptr =
-      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
-  float* scaling_factors_ptr = scaling_factors->data.f;
-  float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
-  float* recovered_cell_weights_ptr = recovered_cell_weights->data.f;
-
-  // Auxiliary input and weights.
-  float* aux_input_ptr = nullptr;
-  int8_t* aux_input_to_input_weights_ptr = nullptr;
-  int8_t* aux_input_to_forget_weights_ptr = nullptr;
-  int8_t* aux_input_to_cell_weights_ptr = nullptr;
-  int8_t* aux_input_to_output_weights_ptr = nullptr;
-  float aux_input_to_input_weights_scale = 0.0f;
-  float aux_input_to_forget_weights_scale = 0.0f;
-  float aux_input_to_cell_weights_scale = 0.0f;
-  float aux_input_to_output_weights_scale = 0.0f;
-  if (aux_input_size > 0) {
-    aux_input_ptr = aux_input->data.f;
-    aux_input_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_input_weights->data.uint8);
-    aux_input_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_forget_weights->data.uint8);
-    aux_input_to_cell_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_cell_weights->data.uint8);
-    aux_input_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_output_weights->data.uint8);
-    aux_input_to_input_weights_scale = aux_input_to_input_weights->params.scale;
-    aux_input_to_forget_weights_scale =
-        aux_input_to_forget_weights->params.scale;
-    aux_input_to_cell_weights_scale = aux_input_to_cell_weights->params.scale;
-    aux_input_to_output_weights_scale =
-        aux_input_to_output_weights->params.scale;
-  }
-
-  // Feed the sequence into the LSTM step-by-step.
-  const int input_step = n_batch * n_input;
-  const int output_step = n_batch * output->dims->data[2];
-  for (int t = 0; t < max_time; t++) {
-    // If this is the forward_sequence, step forward, otherwise step backwards.
-    const int t_rel = forward_sequence ? t : max_time - t - 1;
-    const float* input_ptr = input->data.f + t_rel * input_step;
-    float* output_ptr = output->data.f + t_rel * output_step + output_offset;
-
-    kernel_utils::LstmStepWithAuxInput(
-        input_ptr, input_to_input_weights_ptr, input_to_input_weights_scale,
-        input_to_forget_weights_ptr, input_to_forget_weights_scale,
-        input_to_cell_weights_ptr, input_to_cell_weights_scale,
-        input_to_output_weights_ptr, input_to_output_weights_scale,
-        aux_input_ptr, aux_input_to_input_weights_ptr,
-        aux_input_to_input_weights_scale, aux_input_to_forget_weights_ptr,
-        aux_input_to_forget_weights_scale, aux_input_to_cell_weights_ptr,
-        aux_input_to_cell_weights_scale, aux_input_to_output_weights_ptr,
-        aux_input_to_output_weights_scale, recurrent_to_input_weights_ptr,
-        recurrent_to_input_weights_scale, recurrent_to_forget_weights_ptr,
-        recurrent_to_forget_weights_scale, recurrent_to_cell_weights_ptr,
-        recurrent_to_cell_weights_scale, recurrent_to_output_weights_ptr,
-        recurrent_to_output_weights_scale, cell_to_input_weights_ptr,
-        cell_to_input_weights_scale, cell_to_forget_weights_ptr,
-        cell_to_forget_weights_scale, cell_to_output_weights_ptr,
-        cell_to_output_weights_scale, input_gate_bias_ptr, forget_gate_bias_ptr,
-        cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
-        projection_weights_scale, projection_bias_ptr, params, n_batch, n_cell,
-        n_input, aux_input_size, n_output, input_gate_scratch,
-        forget_gate_scratch, cell_scratch, output_gate_scratch,
-        scaling_factors_ptr, prod_scaling_factors_ptr,
-        recovered_cell_weights_ptr, quantized_input_ptr,
-        quantized_aux_input_ptr, quantized_output_state_ptr,
-        quantized_cell_state_ptr, output_state_ptr, cell_state_ptr, output_ptr);
-  }
-
-  return kTfLiteOk;
-}
-
 // The LSTM Op engine.
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>(
@@ -1157,7 +834,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   switch (fw_input_to_output_weights->type) {
     case kTfLiteFloat32: {
-      TfLiteStatus fw_pass_status = EvalFloat(
+      TfLiteStatus fw_pass_status = lstm_eval::EvalFloat(
           input, fw_input_to_input_weights, fw_input_to_forget_weights,
           fw_input_to_cell_weights, fw_input_to_output_weights,
           fw_recurrent_to_input_weights, fw_recurrent_to_forget_weights,
@@ -1172,7 +849,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           fw_activation_state, fw_cell_state, fw_output);
       TF_LITE_ENSURE_OK(context, fw_pass_status);
 
-      TfLiteStatus bw_pass_status = EvalFloat(
+      TfLiteStatus bw_pass_status = lstm_eval::EvalFloat(
           input, bw_input_to_input_weights, bw_input_to_forget_weights,
           bw_input_to_cell_weights, bw_input_to_output_weights,
           bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights,
@@ -1208,7 +885,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TfLiteTensor* recovered_cell_weights =
           GetTemporary(context, node, kRecoveredCellWeights);
 
-      TfLiteStatus fw_pass_status = EvalHybrid(
+      TfLiteStatus fw_pass_status = lstm_eval::EvalHybrid(
           input, fw_input_to_input_weights, fw_input_to_forget_weights,
           fw_input_to_cell_weights, fw_input_to_output_weights,
           fw_recurrent_to_input_weights, fw_recurrent_to_forget_weights,
@@ -1226,7 +903,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           fw_output);
       TF_LITE_ENSURE_OK(context, fw_pass_status);
 
-      TfLiteStatus bw_pass_status = EvalHybrid(
+      TfLiteStatus bw_pass_status = lstm_eval::EvalHybrid(
           input, bw_input_to_input_weights, bw_input_to_forget_weights,
           bw_input_to_cell_weights, bw_input_to_output_weights,
           bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights,
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
index 56e9367878..083e5839bd 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
@@ -169,603 +169,5 @@ void RnnBatchStep(
                                         hidden_state_ptr_batch);
 }
 
-void LstmStep(
-    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
-    const float* input_to_forget_weights_ptr,
-    const float* input_to_cell_weights_ptr,
-    const float* input_to_output_weights_ptr,
-    const float* recurrent_to_input_weights_ptr,
-    const float* recurrent_to_forget_weights_ptr,
-    const float* recurrent_to_cell_weights_ptr,
-    const float* recurrent_to_output_weights_ptr,
-    const float* cell_to_input_weights_ptr,
-    const float* cell_to_forget_weights_ptr,
-    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
-    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
-    int n_batch, int n_cell, int n_input, int n_output, float* output_state_ptr,
-    float* cell_state_ptr, float* input_gate_scratch,
-    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
-    float* output_ptr_batch) {
-  LstmStepWithAuxInput(
-      input_ptr_batch, input_to_input_weights_ptr, input_to_forget_weights_ptr,
-      input_to_cell_weights_ptr, input_to_output_weights_ptr,
-      /*aux_input_ptr_batch=*/nullptr,
-      /*aux_input_to_input_weights_ptr=*/nullptr,
-      /*aux_input_to_forget_weights_ptr=*/nullptr,
-      /*aux_input_to_cell_weights_ptr=*/nullptr,
-      /*aux_input_to_output_weights_ptr=*/nullptr,
-      recurrent_to_input_weights_ptr, recurrent_to_forget_weights_ptr,
-      recurrent_to_cell_weights_ptr, recurrent_to_output_weights_ptr,
-      cell_to_input_weights_ptr, cell_to_forget_weights_ptr,
-      cell_to_output_weights_ptr, input_gate_bias_ptr, forget_gate_bias_ptr,
-      cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
-      projection_bias_ptr, params, n_batch, n_cell, n_input, /*n_aux_input=*/0,
-      n_output, output_state_ptr, cell_state_ptr, input_gate_scratch,
-      forget_gate_scratch, cell_scratch, output_gate_scratch, output_ptr_batch);
-}
-
-void LstmStepWithAuxInput(
-    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
-    const float* input_to_forget_weights_ptr,
-    const float* input_to_cell_weights_ptr,
-    const float* input_to_output_weights_ptr, const float* aux_input_ptr_batch,
-    const float* aux_input_to_input_weights_ptr,
-    const float* aux_input_to_forget_weights_ptr,
-    const float* aux_input_to_cell_weights_ptr,
-    const float* aux_input_to_output_weights_ptr,
-    const float* recurrent_to_input_weights_ptr,
-    const float* recurrent_to_forget_weights_ptr,
-    const float* recurrent_to_cell_weights_ptr,
-    const float* recurrent_to_output_weights_ptr,
-    const float* cell_to_input_weights_ptr,
-    const float* cell_to_forget_weights_ptr,
-    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
-    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
-    int n_batch, int n_cell, int n_input, int n_aux_input, int n_output,
-    float* output_state_ptr, float* cell_state_ptr, float* input_gate_scratch,
-    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
-    float* output_ptr_batch) {
-  // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
-  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
-  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
-  // Initialize scratch buffers with bias.
-  if (!use_cifg) {
-    tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch,
-                                          input_gate_scratch);
-  }
-  tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
-                                        forget_gate_scratch);
-  tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
-                                        cell_scratch);
-  tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
-                                        output_gate_scratch);
-
-  // For each batch and cell: compute input_weight * input.
-  if (!use_cifg) {
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_input_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-        input_gate_scratch, /*result_stride=*/1);
-  }
-
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_forget_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-      forget_gate_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_cell_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-      cell_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_output_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-      output_gate_scratch, /*result_stride=*/1);
-
-  // If auxiliary input is available then compute aux_input_weight * aux_input
-  if (aux_input_ptr_batch != nullptr) {
-    if (!use_cifg) {
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          aux_input_to_input_weights_ptr, n_cell, n_aux_input,
-          aux_input_ptr_batch, n_batch, input_gate_scratch,
-          /*result_stride=*/1);
-    }
-
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        aux_input_to_forget_weights_ptr, n_cell, n_aux_input,
-        aux_input_ptr_batch, n_batch, forget_gate_scratch, /*result_stride=*/1);
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        aux_input_to_cell_weights_ptr, n_cell, n_aux_input, aux_input_ptr_batch,
-        n_batch, cell_scratch, /*result_stride=*/1);
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        aux_input_to_output_weights_ptr, n_cell, n_aux_input,
-        aux_input_ptr_batch, n_batch, output_gate_scratch, /*result_stride=*/1);
-  }
-
-  // For each batch and cell: compute recurrent_weight * output_state.
-  if (!use_cifg) {
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_input_weights_ptr, n_cell, n_output, output_state_ptr,
-        n_batch, input_gate_scratch, /*result_stride=*/1);
-  }
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_forget_weights_ptr, n_cell, n_output, output_state_ptr,
-      n_batch, forget_gate_scratch,
-      /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_cell_weights_ptr, n_cell, n_output, output_state_ptr,
-      n_batch, cell_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_output_weights_ptr, n_cell, n_output, output_state_ptr,
-      n_batch, output_gate_scratch,
-      /*result_stride=*/1);
-
-  // For each batch and cell: update input gate.
-  if (!use_cifg) {
-    if (use_peephole) {
-      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-          cell_to_input_weights_ptr, n_cell, cell_state_ptr, n_batch,
-          input_gate_scratch);
-    }
-    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
-                                       input_gate_scratch);
-  }
-
-  // For each batch and cell: update forget gate.
-  if (use_peephole) {
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        cell_to_forget_weights_ptr, n_cell, cell_state_ptr, n_batch,
-        forget_gate_scratch);
-  }
-  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
-                                     forget_gate_scratch);
-
-  // For each batch and cell: update the cell.
-  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
-                                         n_batch * n_cell, cell_state_ptr);
-  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
-                                        params->activation, cell_scratch);
-  if (use_cifg) {
-    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
-                             forget_gate_scratch);
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  } else {
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  }
-  if (params->cell_clip > 0.0) {
-    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
-                             params->cell_clip, cell_state_ptr);
-  }
-
-  // For each batch and cell: update the output gate.
-  if (use_peephole) {
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        cell_to_output_weights_ptr, n_cell, cell_state_ptr, n_batch,
-        output_gate_scratch);
-  }
-  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
-                                     output_gate_scratch);
-  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                        params->activation, cell_scratch);
-  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
-                                         n_batch * n_cell, output_gate_scratch);
-
-  // For each batch: update the projection and output_state.
-  const bool use_projection_weight = (projection_weights_ptr != nullptr);
-  const bool use_projection_bias = (projection_bias_ptr != nullptr);
-  if (use_projection_weight) {
-    if (use_projection_bias) {
-      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
-                                            n_batch, output_ptr_batch);
-    } else {
-      tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch,
-        output_ptr_batch, /*result_stride=*/1);
-    if (params->proj_clip > 0.0) {
-      tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output,
-                               params->proj_clip, output_ptr_batch);
-    }
-  } else {
-    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
-                             output_ptr_batch);
-  }
-  tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
-                           output_state_ptr);
-}
-
-void LstmStep(
-    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
-    float input_to_input_weights_scale,
-    const int8_t* input_to_forget_weights_ptr,
-    float input_to_forget_weights_scale,
-    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
-    const int8_t* input_to_output_weights_ptr,
-    float input_to_output_weights_scale,
-    const int8_t* recurrent_to_input_weights_ptr,
-    float recurrent_to_input_weights_scale,
-    const int8_t* recurrent_to_forget_weights_ptr,
-    float recurrent_to_forget_weights_scale,
-    const int8_t* recurrent_to_cell_weights_ptr,
-    float recurrent_to_cell_weights_scale,
-    const int8_t* recurrent_to_output_weights_ptr,
-    float recurrent_to_output_weights_scale,
-    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
-    const int8_t* cell_to_forget_weights_ptr,
-    float cell_to_forget_weights_scale,
-    const int8_t* cell_to_output_weights_ptr,
-    float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
-    float projection_weights_scale, const float* projection_bias_ptr,
-    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
-    int n_output, float* input_gate_scratch, float* forget_gate_scratch,
-    float* cell_scratch, float* output_gate_scratch, float* scaling_factors,
-    float* product_scaling_factors, float* recovered_cell_weights,
-    int8_t* quantized_input_ptr_batch, int8_t* quantized_output_state_ptr,
-    int8_t* quantized_cell_state_ptr, float* output_state_ptr,
-    float* cell_state_ptr, float* output_ptr_batch) {
-  LstmStepWithAuxInput(
-      input_ptr_batch, input_to_input_weights_ptr, input_to_input_weights_scale,
-      input_to_forget_weights_ptr, input_to_forget_weights_scale,
-      input_to_cell_weights_ptr, input_to_cell_weights_scale,
-      input_to_output_weights_ptr, input_to_output_weights_scale,
-      /*aux_input_ptr_batch=*/nullptr,
-      /*aux_input_to_input_weights_ptr=*/nullptr,
-      /*aux_input_to_input_weights_scale=*/0.0f,
-      /*aux_input_to_forget_weights_ptr=*/nullptr,
-      /*aux_input_to_forget_weights_scale=*/0.0f,
-      /*aux_input_to_cell_weights_ptr=*/nullptr,
-      /*aux_input_to_cell_weights_scale=*/0.0f,
-      /*aux_input_to_output_weights_ptr=*/nullptr,
-      /*aux_input_to_output_weights_scale=*/0.0f,
-      recurrent_to_input_weights_ptr, recurrent_to_input_weights_scale,
-      recurrent_to_forget_weights_ptr, recurrent_to_forget_weights_scale,
-      recurrent_to_cell_weights_ptr, recurrent_to_cell_weights_scale,
-      recurrent_to_output_weights_ptr, recurrent_to_output_weights_scale,
-      cell_to_input_weights_ptr, cell_to_input_weights_scale,
-      cell_to_forget_weights_ptr, cell_to_forget_weights_scale,
-      cell_to_output_weights_ptr, cell_to_output_weights_scale,
-      input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr,
-      output_gate_bias_ptr, projection_weights_ptr, projection_weights_scale,
-      projection_bias_ptr, params, n_batch, n_cell, n_input,
-      /*n_aux_input=*/0, n_output, input_gate_scratch, forget_gate_scratch,
-      cell_scratch, output_gate_scratch, scaling_factors,
-      product_scaling_factors, recovered_cell_weights,
-      quantized_input_ptr_batch,
-      /*quantized_aux_input_ptr_batch=*/nullptr, quantized_output_state_ptr,
-      quantized_cell_state_ptr, output_state_ptr, cell_state_ptr,
-      output_ptr_batch);
-    }
-
-    void LstmStepWithAuxInput(
-        const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
-        float input_to_input_weights_scale,
-        const int8_t* input_to_forget_weights_ptr,
-        float input_to_forget_weights_scale,
-        const int8_t* input_to_cell_weights_ptr,
-        float input_to_cell_weights_scale,
-        const int8_t* input_to_output_weights_ptr,
-        float input_to_output_weights_scale, const float* aux_input_ptr_batch,
-        const int8_t* aux_input_to_input_weights_ptr,
-        float aux_input_to_input_weights_scale,
-        const int8_t* aux_input_to_forget_weights_ptr,
-        float aux_input_to_forget_weights_scale,
-        const int8_t* aux_input_to_cell_weights_ptr,
-        float aux_input_to_cell_weights_scale,
-        const int8_t* aux_input_to_output_weights_ptr,
-        float aux_input_to_output_weights_scale,
-        const int8_t* recurrent_to_input_weights_ptr,
-        float recurrent_to_input_weights_scale,
-        const int8_t* recurrent_to_forget_weights_ptr,
-        float recurrent_to_forget_weights_scale,
-        const int8_t* recurrent_to_cell_weights_ptr,
-        float recurrent_to_cell_weights_scale,
-        const int8_t* recurrent_to_output_weights_ptr,
-        float recurrent_to_output_weights_scale,
-        const int8_t* cell_to_input_weights_ptr,
-        float cell_to_input_weights_scale,
-        const int8_t* cell_to_forget_weights_ptr,
-        float cell_to_forget_weights_scale,
-        const int8_t* cell_to_output_weights_ptr,
-        float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
-        const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-        const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
-        float projection_weights_scale, const float* projection_bias_ptr,
-        const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
-        int n_aux_input, int n_output, float* input_gate_scratch,
-        float* forget_gate_scratch, float* cell_scratch,
-        float* output_gate_scratch, float* scaling_factors,
-        float* product_scaling_factors, float* recovered_cell_weights,
-        int8_t* quantized_input_ptr_batch,
-        int8_t* quantized_aux_input_ptr_batch,
-        int8_t* quantized_output_state_ptr, int8_t* quantized_cell_state_ptr,
-        float* output_state_ptr, float* cell_state_ptr,
-        float* output_ptr_batch) {
-      // Since we have already checked that weights are all there or none, we
-      // can check the existense of only one to the get the condition.
-      const bool use_cifg = (input_to_input_weights_ptr == nullptr);
-      const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
-      // Initialize scratch buffers with bias.
-      if (!use_cifg) {
-        tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell,
-                                              n_batch, input_gate_scratch);
-      }
-      tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell,
-                                            n_batch, forget_gate_scratch);
-      tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
-                                            cell_scratch);
-      tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell,
-                                            n_batch, output_gate_scratch);
-
-      if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
-        // Save quantization and matmul computation for all zero input.
-        float unused_min, unused_max;
-        for (int b = 0; b < n_batch; ++b) {
-          const int offset = b * n_input;
-          tensor_utils::SymmetricQuantizeFloats(
-              input_ptr_batch + offset, n_input,
-              quantized_input_ptr_batch + offset, &unused_min, &unused_max,
-              &scaling_factors[b]);
-        }
-        // For each batch and cell: compute input_weight * input.
-        if (!use_cifg) {
-          for (int b = 0; b < n_batch; ++b) {
-            product_scaling_factors[b] =
-                scaling_factors[b] * input_to_input_weights_scale;
-          }
-          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-              input_to_input_weights_ptr, n_cell, n_input,
-              quantized_input_ptr_batch, product_scaling_factors, n_batch,
-              input_gate_scratch, /*result_stride=*/1);
-        }
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * input_to_forget_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            input_to_forget_weights_ptr, n_cell, n_input,
-            quantized_input_ptr_batch, product_scaling_factors, n_batch,
-            forget_gate_scratch,
-            /*result_stride=*/1);
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * input_to_cell_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            input_to_cell_weights_ptr, n_cell, n_input,
-            quantized_input_ptr_batch, product_scaling_factors, n_batch,
-            cell_scratch, /*result_stride=*/1);
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * input_to_output_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            input_to_output_weights_ptr, n_cell, n_input,
-            quantized_input_ptr_batch, product_scaling_factors, n_batch,
-            output_gate_scratch,
-            /*result_stride=*/1);
-      }
-
-      if (aux_input_ptr_batch != nullptr &&
-          !tensor_utils::IsZeroVector(aux_input_ptr_batch, n_batch * n_input)) {
-        // Save quantization and matmul computation for all zero input.
-        float unused_min, unused_max;
-        for (int b = 0; b < n_batch; ++b) {
-          const int offset = b * n_input;
-          tensor_utils::SymmetricQuantizeFloats(
-              aux_input_ptr_batch + offset, n_input,
-              quantized_aux_input_ptr_batch + offset, &unused_min, &unused_max,
-              &scaling_factors[b]);
-        }
-        // For each batch and cell: compute input_weight * input.
-        if (!use_cifg) {
-          for (int b = 0; b < n_batch; ++b) {
-            product_scaling_factors[b] =
-                scaling_factors[b] * aux_input_to_input_weights_scale;
-          }
-          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-              aux_input_to_input_weights_ptr, n_cell, n_input,
-              quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
-              input_gate_scratch, /*result_stride=*/1);
-        }
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * aux_input_to_forget_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            aux_input_to_forget_weights_ptr, n_cell, n_input,
-            quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
-            forget_gate_scratch, /*result_stride=*/1);
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * aux_input_to_cell_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            aux_input_to_cell_weights_ptr, n_cell, n_input,
-            quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
-            cell_scratch, /*result_stride=*/1);
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * aux_input_to_output_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            aux_input_to_output_weights_ptr, n_cell, n_input,
-            quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
-            output_gate_scratch, /*result_stride=*/1);
-      }
-
-      if (!tensor_utils::IsZeroVector(output_state_ptr, n_batch * n_output)) {
-        // Save quantization and matmul computation for all zero input.
-        float unused_min, unused_max;
-        for (int b = 0; b < n_batch; ++b) {
-          const int offset = b * n_output;
-          tensor_utils::SymmetricQuantizeFloats(
-              output_state_ptr + offset, n_output,
-              quantized_output_state_ptr + offset, &unused_min, &unused_max,
-              &scaling_factors[b]);
-        }
-        // For each batch and cell: compute recurrent_weight * output_state.
-        if (!use_cifg) {
-          for (int b = 0; b < n_batch; ++b) {
-            product_scaling_factors[b] =
-                scaling_factors[b] * recurrent_to_input_weights_scale;
-          }
-          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-              recurrent_to_input_weights_ptr, n_cell, n_output,
-              quantized_output_state_ptr, product_scaling_factors, n_batch,
-              input_gate_scratch, /*result_stride=*/1);
-        }
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * recurrent_to_forget_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            recurrent_to_forget_weights_ptr, n_cell, n_output,
-            quantized_output_state_ptr, product_scaling_factors, n_batch,
-            forget_gate_scratch, /*result_stride=*/1);
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * recurrent_to_cell_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            recurrent_to_cell_weights_ptr, n_cell, n_output,
-            quantized_output_state_ptr, product_scaling_factors, n_batch,
-            cell_scratch, /*result_stride=*/1);
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * recurrent_to_output_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            recurrent_to_output_weights_ptr, n_cell, n_output,
-            quantized_output_state_ptr, product_scaling_factors, n_batch,
-            output_gate_scratch, /*result_stride=*/1);
-      }
-
-      // Save quantization and matmul computation for all zero input.
-      bool is_cell_state_all_zeros =
-          tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
-
-      // For each batch and cell: update input gate.
-      if (!use_cifg) {
-        if (use_peephole && !is_cell_state_all_zeros) {
-          tensor_utils::VectorScalarMultiply(cell_to_input_weights_ptr, n_cell,
-                                             cell_to_input_weights_scale,
-                                             recovered_cell_weights);
-          tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-              recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
-              input_gate_scratch);
-        }
-        tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
-                                           input_gate_scratch);
-      }
-
-      // For each batch and cell: update forget gate.
-      if (use_peephole && !is_cell_state_all_zeros) {
-        tensor_utils::VectorScalarMultiply(cell_to_forget_weights_ptr, n_cell,
-                                           cell_to_forget_weights_scale,
-                                           recovered_cell_weights);
-        tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-            recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
-            forget_gate_scratch);
-      }
-      tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
-                                         forget_gate_scratch);
-
-      // For each batch and cell: update the cell.
-      tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch,
-                                             cell_state_ptr, n_batch * n_cell,
-                                             cell_state_ptr);
-      tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
-                                            params->activation, cell_scratch);
-      if (use_cifg) {
-        tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
-                                 forget_gate_scratch);
-        tensor_utils::VectorVectorCwiseProductAccumulate(
-            cell_scratch, forget_gate_scratch, n_batch * n_cell,
-            cell_state_ptr);
-      } else {
-        tensor_utils::VectorVectorCwiseProductAccumulate(
-            cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
-      }
-      if (params->cell_clip > 0.0) {
-        tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
-                                 params->cell_clip, cell_state_ptr);
-      }
-
-      is_cell_state_all_zeros =
-          tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
-      // For each batch and cell: update the output gate.
-      if (use_peephole && !is_cell_state_all_zeros) {
-        tensor_utils::VectorScalarMultiply(cell_to_output_weights_ptr, n_cell,
-                                           cell_to_output_weights_scale,
-                                           recovered_cell_weights);
-        tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-            recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
-            output_gate_scratch);
-      }
-      tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
-                                         output_gate_scratch);
-      tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                            params->activation, cell_scratch);
-      tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
-                                             n_batch * n_cell,
-                                             output_gate_scratch);
-
-      // For each batch: update the projection and output_state.
-      const bool use_projection_weight = (projection_weights_ptr != nullptr);
-      const bool use_projection_bias = (projection_bias_ptr != nullptr);
-      if (use_projection_weight) {
-        if (use_projection_bias) {
-          tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
-                                                n_batch, output_ptr_batch);
-        } else {
-          tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
-        }
-        if (!tensor_utils::IsZeroVector(output_gate_scratch,
-                                        n_batch * n_cell)) {
-          // Save quantization and matmul computation for all zero input.
-          float unused_min, unused_max;
-          for (int b = 0; b < n_batch; ++b) {
-            const int offset = b * n_cell;
-            tensor_utils::SymmetricQuantizeFloats(
-                output_gate_scratch + offset, n_cell,
-                quantized_cell_state_ptr + offset, &unused_min, &unused_max,
-                &scaling_factors[b]);
-          }
-          for (int b = 0; b < n_batch; ++b) {
-            product_scaling_factors[b] =
-                scaling_factors[b] * projection_weights_scale;
-          }
-          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-              projection_weights_ptr, n_output, n_cell,
-              quantized_cell_state_ptr, product_scaling_factors, n_batch,
-              output_ptr_batch,
-              /*result_stride=*/1);
-        }
-        if (params->proj_clip > 0.0) {
-          tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output,
-                                   params->proj_clip, output_ptr_batch);
-        }
-      } else {
-        tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
-                                 output_ptr_batch);
-      }
-      tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
-                               output_state_ptr);
-    }
-
 }  // namespace kernel_utils
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
index b5558cce55..74e0a4a53d 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
@@ -76,190 +76,6 @@ void RnnBatchStep(
     int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
     float* hidden_state_ptr_batch, float* output_ptr_batch);
 
-// Performs an LSTM batch inference step for input specified by input_ptr_batch.
-// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and
-// biases (*_bias_ptr), and buffers (*_scratch), along with additional
-// parameters:
-//  - params: various LSTM params including activation, clipping, etc.,
-//  - n_batch: size of batch,
-//  - n_cell: number of cells (or units),
-//  - n_input: the input size,
-//  - n_output: the output size.
-//
-// The pointers to the cell and output state and the output are updated.
-//
-// The pointers with the suffix "_batch" point to data aligned in batch_major
-// order, and each step processes batch_size many inputs from input_ptr_batch,
-// and updates batch_size many cell and output states.
-void LstmStep(
-    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
-    const float* input_to_forget_weights_ptr,
-    const float* input_to_cell_weights_ptr,
-    const float* input_to_output_weights_ptr,
-    const float* recurrent_to_input_weights_ptr,
-    const float* recurrent_to_forget_weights_ptr,
-    const float* recurrent_to_cell_weights_ptr,
-    const float* recurrent_to_output_weights_ptr,
-    const float* cell_to_input_weights_ptr,
-    const float* cell_to_forget_weights_ptr,
-    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
-    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
-    int n_batch, int n_cell, int n_input, int n_output, float* output_state_ptr,
-    float* cell_state_ptr, float* input_gate_scratch,
-    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
-    float* output_ptr_batch);
-
-// Same as above but includes an auxiliary input with the corresponding weights.
-void LstmStepWithAuxInput(
-    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
-    const float* input_to_forget_weights_ptr,
-    const float* input_to_cell_weights_ptr,
-    const float* input_to_output_weights_ptr, const float* aux_input_ptr_batch,
-    const float* aux_input_to_input_weights_ptr,
-    const float* aux_input_to_forget_weights_ptr,
-    const float* aux_input_to_cell_weights_ptr,
-    const float* aux_input_to_output_weights_ptr,
-    const float* recurrent_to_input_weights_ptr,
-    const float* recurrent_to_forget_weights_ptr,
-    const float* recurrent_to_cell_weights_ptr,
-    const float* recurrent_to_output_weights_ptr,
-    const float* cell_to_input_weights_ptr,
-    const float* cell_to_forget_weights_ptr,
-    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
-    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
-    int n_batch, int n_cell, int n_input, int n_aux_input, int n_output,
-    float* output_state_ptr, float* cell_state_ptr, float* input_gate_scratch,
-    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
-    float* output_ptr_batch);
-
-// Same as above but with quantized weight matrices. In detail:
-// Input of size 'n_batch * n_input':
-//   input_ptr_batch
-//
-// LSTM weights:
-// Quantized input weights of size 'n_cell * n_input':
-//   input_to_input_weights            - optional (can be nullptr)
-//   input_to_forget_weights
-//   input_to_cell_weights
-//   input_to_input_weights
-// Quantized recurrent weights of size 'n_cell * n_output':
-//   recurrent_to_input_weights        - optional
-//   recurrent_to_forget_weights
-//   recurrent_to_cell_weights
-//   recurrent_to_input_weights
-// Quantized peephole weights of size 'n_cell', representing diagonal matrices.
-//   cell_to_input_weights             - optional
-//   cell_to_cell_weights              - optional
-//   cell_to_output_weights            - optional
-// Quantized projection weights of size 'n_output * n_cell'
-//   projection_weights_ptr            - optional
-// Weight scales (scalars) for each of the weights above.
-//   input_to_input_weights_scale      - optional
-//   input_to_forget_weights_scale
-//   input_to_cell_weights_scale
-//   input_to_output_weights_scale
-//   recurrent_to_input_weights_scale  - optional
-//   recurrent_to_forget_weights_scale
-//   recurrent_to_cell_weights_scale
-//   recurrent_to_output_weights_scale
-//   cell_to_input_weights_scale,
-//   cell_to_forget_weights_scale,
-//   cell_to_output_weights_scale,
-//   projection_weights_scale          - optional
-// Gate biases of size 'n_cell':
-//   input_gate_bias_ptr               - optional
-//   forget_gate_bias_ptr
-//   cell_gate_bias_ptr
-//   output_gate_bias_ptr
-//
-// Temporary pre-allocated storage for quantized values:
-//   quantized_input_ptr_batch (same size as input_ptr_batch)
-//   quantized_output_state_ptr (same size as output_state_ptr)
-//   quantized_cell_state_ptr (same size as cell_state_ptr)
-// Temporary pre-allocated storage for recovered values:
-//   recovered_cell_weights (same size as cell_to_*_weights)
-//
-// Outputs:
-//   output_state_ptr - size 'n_batch * n_output'
-//   cell_state_ptr   - size 'n_batch * n_cell'
-//   output_ptr_batch - size 'n_batch * n_output'
-void LstmStep(
-    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
-    float input_to_input_weights_scale,
-    const int8_t* input_to_forget_weights_ptr,
-    float input_to_forget_weights_scale,
-    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
-    const int8_t* input_to_output_weights_ptr,
-    float input_to_output_weights_scale,
-    const int8_t* recurrent_to_input_weights_ptr,
-    float recurrent_to_input_weights_scale,
-    const int8_t* recurrent_to_forget_weights_ptr,
-    float recurrent_to_forget_weights_scale,
-    const int8_t* recurrent_to_cell_weights_ptr,
-    float recurrent_to_cell_weights_scale,
-    const int8_t* recurrent_to_output_weights_ptr,
-    float recurrent_to_output_weights_scale,
-    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
-    const int8_t* cell_to_forget_weights_ptr,
-    float cell_to_forget_weights_scale,
-    const int8_t* cell_to_output_weights_ptr,
-    float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
-    float projection_weights_scale, const float* projection_bias_ptr,
-    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
-    int n_output, float* input_gate_scratch, float* forget_gate_scratch,
-    float* cell_scratch, float* output_gate_scratch, float* scaling_factors,
-    float* product_scaling_factors, float* recovered_cell_weights,
-    int8_t* quantized_input_ptr_batch, int8_t* quantized_output_state_ptr,
-    int8_t* quantized_cell_state_ptr, float* output_state_ptr,
-    float* cell_state_ptr, float* output_ptr_batch);
-
-void LstmStepWithAuxInput(
-    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
-    float input_to_input_weights_scale,
-    const int8_t* input_to_forget_weights_ptr,
-    float input_to_forget_weights_scale,
-    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
-    const int8_t* input_to_output_weights_ptr,
-    float input_to_output_weights_scale, const float* aux_input_ptr_batch,
-    const int8_t* aux_input_to_input_weights_ptr,
-    float aux_input_to_input_weights_scale,
-    const int8_t* aux_input_to_forget_weights_ptr,
-    float aux_input_to_forget_weights_scale,
-    const int8_t* aux_input_to_cell_weights_ptr,
-    float aux_input_to_cell_weights_scale,
-    const int8_t* aux_input_to_output_weights_ptr,
-    float aux_input_to_output_weights_scale,
-    const int8_t* recurrent_to_input_weights_ptr,
-    float recurrent_to_input_weights_scale,
-    const int8_t* recurrent_to_forget_weights_ptr,
-    float recurrent_to_forget_weights_scale,
-    const int8_t* recurrent_to_cell_weights_ptr,
-    float recurrent_to_cell_weights_scale,
-    const int8_t* recurrent_to_output_weights_ptr,
-    float recurrent_to_output_weights_scale,
-    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
-    const int8_t* cell_to_forget_weights_ptr,
-    float cell_to_forget_weights_scale,
-    const int8_t* cell_to_output_weights_ptr,
-    float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
-    float projection_weights_scale, const float* projection_bias_ptr,
-    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
-    int n_aux_input, int n_output, float* input_gate_scratch,
-    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
-    float* scaling_factors, float* product_scaling_factors,
-    float* recovered_cell_weights, int8_t* quantized_input_ptr_batch,
-    int8_t* quantized_aux_input_ptr_batch, int8_t* quantized_output_state_ptr,
-    int8_t* quantized_cell_state_ptr, float* output_state_ptr,
-    float* cell_state_ptr, float* output_ptr_batch);
-
 }  // namespace kernel_utils
 }  // namespace tflite
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc
index 5b996d00bc..16d67a1a93 100644
--- a/tensorflow/contrib/lite/kernels/lstm.cc
+++ b/tensorflow/contrib/lite/kernels/lstm.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/kernels/internal/tensor.h"
 #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/lstm_eval.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
 namespace tflite {
@@ -424,263 +425,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-// The LSTM Op engine.
-TfLiteStatus EvalFloat(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, TfLiteTensor* scratch_buffer,
-    TfLiteTensor* activation_state, TfLiteTensor* cell_state,
-    TfLiteTensor* output) {
-  const int n_batch = input->dims->data[0];
-  const int n_input = input->dims->data[1];
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  const float* input_to_input_weights_ptr =
-      (use_cifg) ? nullptr : input_to_input_weights->data.f;
-  const float* recurrent_to_input_weights_ptr =
-      (use_cifg) ? nullptr : recurrent_to_input_weights->data.f;
-  const float* input_gate_bias_ptr =
-      (use_cifg) ? nullptr : input_gate_bias->data.f;
-  const float* cell_to_input_weights_ptr =
-      (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr;
-  const float* cell_to_forget_weights_ptr =
-      (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
-  const float* cell_to_output_weights_ptr =
-      (use_peephole) ? cell_to_output_weights->data.f : nullptr;
-  const float* projection_weights_ptr =
-      (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-
-  // Required tensors, pointers are non-null.
-  const float* input_ptr_batch = input->data.f;
-  const float* input_to_forget_weights_ptr = input_to_forget_weights->data.f;
-  const float* input_to_cell_weights_ptr = input_to_cell_weights->data.f;
-  const float* input_to_output_weights_ptr = input_to_output_weights->data.f;
-  const float* recurrent_to_forget_weights_ptr =
-      recurrent_to_forget_weights->data.f;
-  const float* recurrent_to_cell_weights_ptr =
-      recurrent_to_cell_weights->data.f;
-  const float* recurrent_to_output_weights_ptr =
-      recurrent_to_output_weights->data.f;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* activation_state_ptr = activation_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-  float* output_ptr_batch = output->data.f;
-
-  kernel_utils::LstmStep(
-      input_ptr_batch, input_to_input_weights_ptr, input_to_forget_weights_ptr,
-      input_to_cell_weights_ptr, input_to_output_weights_ptr,
-      recurrent_to_input_weights_ptr, recurrent_to_forget_weights_ptr,
-      recurrent_to_cell_weights_ptr, recurrent_to_output_weights_ptr,
-      cell_to_input_weights_ptr, cell_to_forget_weights_ptr,
-      cell_to_output_weights_ptr, input_gate_bias_ptr, forget_gate_bias_ptr,
-      cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
-      projection_bias_ptr, params, n_batch, n_cell, n_input, n_output,
-      activation_state_ptr, cell_state_ptr, input_gate_scratch,
-      forget_gate_scratch, cell_scratch, output_gate_scratch, output_ptr_batch);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalHybrid(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, TfLiteTensor* scratch_buffer,
-    TfLiteTensor* scaling_factors, TfLiteTensor* prod_scaling_factors,
-    TfLiteTensor* recovered_cell_weights, TfLiteTensor* input_quantized,
-    TfLiteTensor* activation_state_quantized,
-    TfLiteTensor* cell_state_quantized, TfLiteTensor* activation_state,
-    TfLiteTensor* cell_state, TfLiteTensor* output) {
-  const int n_batch = input->dims->data[0];
-  const int n_input = input->dims->data[1];
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  int8_t* input_to_input_weights_ptr = nullptr;
-  float input_to_input_weights_scale = 1.0f;
-  int8_t* recurrent_to_input_weights_ptr = nullptr;
-  float recurrent_to_input_weights_scale = 1.0f;
-  float* input_gate_bias_ptr = nullptr;
-  if (!use_cifg) {
-    input_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
-    recurrent_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
-    input_gate_bias_ptr = input_gate_bias->data.f;
-    input_to_input_weights_scale = input_to_input_weights->params.scale;
-    recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
-  }
-
-  int8_t* cell_to_input_weights_ptr = nullptr;
-  int8_t* cell_to_forget_weights_ptr = nullptr;
-  int8_t* cell_to_output_weights_ptr = nullptr;
-  float cell_to_input_weights_scale = 1.0f;
-  float cell_to_forget_weights_scale = 1.0f;
-  float cell_to_output_weights_scale = 1.0f;
-  if (use_peephole) {
-    if (!use_cifg) {
-      cell_to_input_weights_ptr =
-          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
-      cell_to_input_weights_scale = cell_to_input_weights->params.scale;
-    }
-    cell_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
-    cell_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
-    cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
-    cell_to_output_weights_scale = cell_to_output_weights->params.scale;
-  }
-
-  const int8_t* projection_weights_ptr =
-      (projection_weights == nullptr)
-          ? nullptr
-          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
-  const float projection_weights_scale =
-      (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-
-  // Required tensors, pointers are non-null.
-  const float* input_ptr_batch = input->data.f;
-  const int8_t* input_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
-  const float input_to_forget_weights_scale =
-      input_to_forget_weights->params.scale;
-  const int8_t* input_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
-  const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
-  const int8_t* input_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
-  const float input_to_output_weights_scale =
-      input_to_output_weights->params.scale;
-  const int8_t* recurrent_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
-  const float recurrent_to_forget_weights_scale =
-      recurrent_to_forget_weights->params.scale;
-  const int8_t* recurrent_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
-  const float recurrent_to_cell_weights_scale =
-      recurrent_to_cell_weights->params.scale;
-  const int8_t* recurrent_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
-  const float recurrent_to_output_weights_scale =
-      recurrent_to_output_weights->params.scale;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* activation_state_ptr = activation_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-  float* output_ptr_batch = output->data.f;
-
-  // Temporary storage for quantized values and scaling factors.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
-  int8_t* quantized_activation_state_ptr =
-      reinterpret_cast<int8_t*>(activation_state_quantized->data.uint8);
-  int8_t* quantized_cell_state_ptr =
-      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
-  float* scaling_factors_ptr = scaling_factors->data.f;
-  float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
-  float* recovered_cell_weights_ptr = recovered_cell_weights->data.f;
-
-  kernel_utils::LstmStep(
-      input_ptr_batch, input_to_input_weights_ptr, input_to_input_weights_scale,
-      input_to_forget_weights_ptr, input_to_forget_weights_scale,
-      input_to_cell_weights_ptr, input_to_cell_weights_scale,
-      input_to_output_weights_ptr, input_to_output_weights_scale,
-      recurrent_to_input_weights_ptr, recurrent_to_input_weights_scale,
-      recurrent_to_forget_weights_ptr, recurrent_to_forget_weights_scale,
-      recurrent_to_cell_weights_ptr, recurrent_to_cell_weights_scale,
-      recurrent_to_output_weights_ptr, recurrent_to_output_weights_scale,
-      cell_to_input_weights_ptr, cell_to_input_weights_scale,
-      cell_to_forget_weights_ptr, cell_to_forget_weights_scale,
-      cell_to_output_weights_ptr, cell_to_output_weights_scale,
-      input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr,
-      output_gate_bias_ptr, projection_weights_ptr, projection_weights_scale,
-      projection_bias_ptr, params, n_batch, n_cell, n_input, n_output,
-      input_gate_scratch, forget_gate_scratch, cell_scratch,
-      output_gate_scratch, scaling_factors_ptr, prod_scaling_factors_ptr,
-      recovered_cell_weights_ptr, quantized_input_ptr,
-      quantized_activation_state_ptr, quantized_cell_state_ptr,
-      activation_state_ptr, cell_state_ptr, output_ptr_batch);
-
-  return kTfLiteOk;
-}
-
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
   OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
@@ -738,15 +482,21 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // TODO(mirkov): add a check that weights are all uint8s or all floats.
   switch (input_to_output_weights->type) {
     case kTfLiteFloat32: {
-      return EvalFloat(input, input_to_input_weights, input_to_forget_weights,
-                       input_to_cell_weights, input_to_output_weights,
-                       recurrent_to_input_weights, recurrent_to_forget_weights,
-                       recurrent_to_cell_weights, recurrent_to_output_weights,
-                       cell_to_input_weights, cell_to_forget_weights,
-                       cell_to_output_weights, input_gate_bias,
-                       forget_gate_bias, cell_bias, output_gate_bias,
-                       projection_weights, projection_bias, params,
-                       scratch_buffer, activation_state, cell_state, output);
+      return lstm_eval::EvalFloat(
+          input, input_to_input_weights, input_to_forget_weights,
+          input_to_cell_weights, input_to_output_weights,
+          recurrent_to_input_weights, recurrent_to_forget_weights,
+          recurrent_to_cell_weights, recurrent_to_output_weights,
+          cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
+          /*aux_input=*/nullptr,
+          /*aux_input_to_input_weights=*/nullptr,
+          /*aux_input_to_forget_weights=*/nullptr,
+          /*aux_input_to_cell_weights=*/nullptr,
+          /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
+          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
+          projection_bias, params, /*forward_sequence=*/true,
+          /*output_offset=*/0, scratch_buffer, activation_state, cell_state,
+          output);
     }
     case kTfLiteUInt8: {
       TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
@@ -759,17 +509,23 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           GetTemporary(context, node, /*index=*/5);
       TfLiteTensor* recovered_cell_weights =
           GetTemporary(context, node, /*index=*/6);
-      return EvalHybrid(
+      return lstm_eval::EvalHybrid(
           input, input_to_input_weights, input_to_forget_weights,
           input_to_cell_weights, input_to_output_weights,
           recurrent_to_input_weights, recurrent_to_forget_weights,
           recurrent_to_cell_weights, recurrent_to_output_weights,
           cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
-          input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias,
-          projection_weights, projection_bias, params, scratch_buffer,
-          scaling_factors, prod_scaling_factors, recovered_cell_weights,
-          input_quantized, activation_state_quantized, cell_state_quantized,
-          activation_state, cell_state, output);
+          /*aux_input=*/nullptr,
+          /*aux_input_to_input_weights=*/nullptr,
+          /*aux_input_to_forget_weights=*/nullptr,
+          /*aux_input_to_cell_weights=*/nullptr,
+          /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
+          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
+          projection_bias, params, /*forward_sequence=*/true,
+          /*output_offset=*/0, scratch_buffer, scaling_factors,
+          prod_scaling_factors, recovered_cell_weights, input_quantized,
+          /*aux_input_quantized=*/nullptr, activation_state_quantized,
+          cell_state_quantized, activation_state, cell_state, output);
     }
     default:
       context->ReportError(context, "Type %d is not currently supported.",
diff --git a/tensorflow/contrib/lite/kernels/lstm_eval.cc b/tensorflow/contrib/lite/kernels/lstm_eval.cc
new file mode 100644
index 0000000000..c6c21eb085
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/lstm_eval.cc
@@ -0,0 +1,909 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/lstm_eval.h"
+
+#include <stdint.h>
+
+#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace lstm_eval {
+
+namespace {
+
+// Performs an LSTM batch inference step for input specified by input_ptr_batch.
+// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and
+// biases (*_bias_ptr), and buffers (*_scratch), along with additional
+// parameters:
+//  - params: various LSTM params including activation, clipping, etc.,
+//  - n_batch: size of batch,
+//  - n_cell: number of cells (or units),
+//  - n_input: the input size,
+//  - n_output: the output size.
+//
+// The pointers to the cell and output state and the output are updated.
+//
+// The pointers with the suffix "_batch" point to data aligned in batch_major
+// order, and each step processes batch_size many inputs from input_ptr_batch,
+// and updates batch_size many cell and output states.
+inline void LstmStepWithAuxInput(
+    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
+    const float* input_to_forget_weights_ptr,
+    const float* input_to_cell_weights_ptr,
+    const float* input_to_output_weights_ptr, const float* aux_input_ptr_batch,
+    const float* aux_input_to_input_weights_ptr,
+    const float* aux_input_to_forget_weights_ptr,
+    const float* aux_input_to_cell_weights_ptr,
+    const float* aux_input_to_output_weights_ptr,
+    const float* recurrent_to_input_weights_ptr,
+    const float* recurrent_to_forget_weights_ptr,
+    const float* recurrent_to_cell_weights_ptr,
+    const float* recurrent_to_output_weights_ptr,
+    const float* cell_to_input_weights_ptr,
+    const float* cell_to_forget_weights_ptr,
+    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
+    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
+    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
+    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
+    int n_batch, int n_cell, int n_input, int n_aux_input, int n_output,
+    float* output_state_ptr, float* cell_state_ptr, float* input_gate_scratch,
+    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
+    float* output_ptr_batch) {
+  // Since we have already checked that weights are all there or none, we can
+  // check the existense of only one to the get the condition.
+  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
+  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
+  // Initialize scratch buffers with bias.
+  if (!use_cifg) {
+    tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch,
+                                          input_gate_scratch);
+  }
+  tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
+                                        forget_gate_scratch);
+  tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
+                                        cell_scratch);
+  tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
+                                        output_gate_scratch);
+
+  // For each batch and cell: compute input_weight * input.
+  if (!use_cifg) {
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_input_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
+        input_gate_scratch, /*result_stride=*/1);
+  }
+
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_to_forget_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
+      forget_gate_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_to_cell_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
+      cell_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_to_output_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
+      output_gate_scratch, /*result_stride=*/1);
+
+  // If auxiliary input is available then compute aux_input_weight * aux_input
+  if (aux_input_ptr_batch != nullptr) {
+    if (!use_cifg) {
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          aux_input_to_input_weights_ptr, n_cell, n_aux_input,
+          aux_input_ptr_batch, n_batch, input_gate_scratch,
+          /*result_stride=*/1);
+    }
+
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_forget_weights_ptr, n_cell, n_aux_input,
+        aux_input_ptr_batch, n_batch, forget_gate_scratch, /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_cell_weights_ptr, n_cell, n_aux_input, aux_input_ptr_batch,
+        n_batch, cell_scratch, /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_output_weights_ptr, n_cell, n_aux_input,
+        aux_input_ptr_batch, n_batch, output_gate_scratch, /*result_stride=*/1);
+  }
+
+  // For each batch and cell: compute recurrent_weight * output_state.
+  if (!use_cifg) {
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_input_weights_ptr, n_cell, n_output, output_state_ptr,
+        n_batch, input_gate_scratch, /*result_stride=*/1);
+  }
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_to_forget_weights_ptr, n_cell, n_output, output_state_ptr,
+      n_batch, forget_gate_scratch,
+      /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_to_cell_weights_ptr, n_cell, n_output, output_state_ptr,
+      n_batch, cell_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_to_output_weights_ptr, n_cell, n_output, output_state_ptr,
+      n_batch, output_gate_scratch,
+      /*result_stride=*/1);
+
+  // For each batch and cell: update input gate.
+  if (!use_cifg) {
+    if (use_peephole) {
+      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+          cell_to_input_weights_ptr, n_cell, cell_state_ptr, n_batch,
+          input_gate_scratch);
+    }
+    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
+                                       input_gate_scratch);
+  }
+
+  // For each batch and cell: update forget gate.
+  if (use_peephole) {
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        cell_to_forget_weights_ptr, n_cell, cell_state_ptr, n_batch,
+        forget_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
+                                     forget_gate_scratch);
+
+  // For each batch and cell: update the cell.
+  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
+                                         n_batch * n_cell, cell_state_ptr);
+  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  if (use_cifg) {
+    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
+                             forget_gate_scratch);
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
+  } else {
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
+  }
+  if (params->cell_clip > 0.0) {
+    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
+                             params->cell_clip, cell_state_ptr);
+  }
+
+  // For each batch and cell: update the output gate.
+  if (use_peephole) {
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        cell_to_output_weights_ptr, n_cell, cell_state_ptr, n_batch,
+        output_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
+                                     output_gate_scratch);
+  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
+                                         n_batch * n_cell, output_gate_scratch);
+
+  // For each batch: update the projection and output_state.
+  const bool use_projection_weight = (projection_weights_ptr != nullptr);
+  const bool use_projection_bias = (projection_bias_ptr != nullptr);
+  if (use_projection_weight) {
+    if (use_projection_bias) {
+      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
+                                            n_batch, output_ptr_batch);
+    } else {
+      tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch,
+        output_ptr_batch, /*result_stride=*/1);
+    if (params->proj_clip > 0.0) {
+      tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output,
+                               params->proj_clip, output_ptr_batch);
+    }
+  } else {
+    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
+                             output_ptr_batch);
+  }
+  tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
+                           output_state_ptr);
+}
+
+// Same as above but with quantized weight matrices. In detail:
+// Input of size 'n_batch * n_input':
+//   input_ptr_batch
+//
+// LSTM weights:
+// Quantized input weights of size 'n_cell * n_input':
+//   input_to_input_weights            - optional (can be nullptr)
+//   input_to_forget_weights
+//   input_to_cell_weights
+//   input_to_input_weights
+// Quantized recurrent weights of size 'n_cell * n_output':
+//   recurrent_to_input_weights        - optional
+//   recurrent_to_forget_weights
+//   recurrent_to_cell_weights
+//   recurrent_to_input_weights
+// Quantized peephole weights of size 'n_cell', representing diagonal matrices.
+//   cell_to_input_weights             - optional
+//   cell_to_cell_weights              - optional
+//   cell_to_output_weights            - optional
+// Quantized projection weights of size 'n_output * n_cell'
+//   projection_weights_ptr            - optional
+// Weight scales (scalars) for each of the weights above.
+//   input_to_input_weights_scale      - optional
+//   input_to_forget_weights_scale
+//   input_to_cell_weights_scale
+//   input_to_output_weights_scale
+//   recurrent_to_input_weights_scale  - optional
+//   recurrent_to_forget_weights_scale
+//   recurrent_to_cell_weights_scale
+//   recurrent_to_output_weights_scale
+//   cell_to_input_weights_scale,
+//   cell_to_forget_weights_scale,
+//   cell_to_output_weights_scale,
+//   projection_weights_scale          - optional
+// Gate biases of size 'n_cell':
+//   input_gate_bias_ptr               - optional
+//   forget_gate_bias_ptr
+//   cell_gate_bias_ptr
+//   output_gate_bias_ptr
+//
+// Temporary pre-allocated storage for quantized values:
+//   quantized_input_ptr_batch (same size as input_ptr_batch)
+//   quantized_output_state_ptr (same size as output_state_ptr)
+//   quantized_cell_state_ptr (same size as cell_state_ptr)
+// Temporary pre-allocated storage for recovered values:
+//   recovered_cell_weights (same size as cell_to_*_weights)
+//
+// Outputs:
+//   output_state_ptr - size 'n_batch * n_output'
+//   cell_state_ptr   - size 'n_batch * n_cell'
+//   output_ptr_batch - size 'n_batch * n_output'
+inline void LstmStepWithAuxInput(
+    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
+    float input_to_input_weights_scale,
+    const int8_t* input_to_forget_weights_ptr,
+    float input_to_forget_weights_scale,
+    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
+    const int8_t* input_to_output_weights_ptr,
+    float input_to_output_weights_scale, const float* aux_input_ptr_batch,
+    const int8_t* aux_input_to_input_weights_ptr,
+    float aux_input_to_input_weights_scale,
+    const int8_t* aux_input_to_forget_weights_ptr,
+    float aux_input_to_forget_weights_scale,
+    const int8_t* aux_input_to_cell_weights_ptr,
+    float aux_input_to_cell_weights_scale,
+    const int8_t* aux_input_to_output_weights_ptr,
+    float aux_input_to_output_weights_scale,
+    const int8_t* recurrent_to_input_weights_ptr,
+    float recurrent_to_input_weights_scale,
+    const int8_t* recurrent_to_forget_weights_ptr,
+    float recurrent_to_forget_weights_scale,
+    const int8_t* recurrent_to_cell_weights_ptr,
+    float recurrent_to_cell_weights_scale,
+    const int8_t* recurrent_to_output_weights_ptr,
+    float recurrent_to_output_weights_scale,
+    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
+    const int8_t* cell_to_forget_weights_ptr,
+    float cell_to_forget_weights_scale,
+    const int8_t* cell_to_output_weights_ptr,
+    float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
+    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
+    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
+    float projection_weights_scale, const float* projection_bias_ptr,
+    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
+    int n_aux_input, int n_output, float* input_gate_scratch,
+    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
+    float* scaling_factors, float* product_scaling_factors,
+    float* recovered_cell_weights, int8_t* quantized_input_ptr_batch,
+    int8_t* quantized_aux_input_ptr_batch, int8_t* quantized_output_state_ptr,
+    int8_t* quantized_cell_state_ptr, float* output_state_ptr,
+    float* cell_state_ptr, float* output_ptr_batch) {
+  // Since we have already checked that weights are all there or none, we
+  // can check the existense of only one to the get the condition.
+  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
+  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
+  // Initialize scratch buffers with bias.
+  if (!use_cifg) {
+    tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch,
+                                          input_gate_scratch);
+  }
+  tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
+                                        forget_gate_scratch);
+  tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
+                                        cell_scratch);
+  tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
+                                        output_gate_scratch);
+
+  if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
+    // Save quantization and matmul computation for all zero input.
+    float unused_min, unused_max;
+    for (int b = 0; b < n_batch; ++b) {
+      const int offset = b * n_input;
+      tensor_utils::SymmetricQuantizeFloats(
+          input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset,
+          &unused_min, &unused_max, &scaling_factors[b]);
+    }
+    // For each batch and cell: compute input_weight * input.
+    if (!use_cifg) {
+      for (int b = 0; b < n_batch; ++b) {
+        product_scaling_factors[b] =
+            scaling_factors[b] * input_to_input_weights_scale;
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          input_to_input_weights_ptr, n_cell, n_input,
+          quantized_input_ptr_batch, product_scaling_factors, n_batch,
+          input_gate_scratch, /*result_stride=*/1);
+    }
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * input_to_forget_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_forget_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
+        product_scaling_factors, n_batch, forget_gate_scratch,
+        /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * input_to_cell_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_cell_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
+        product_scaling_factors, n_batch, cell_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * input_to_output_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_output_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
+        product_scaling_factors, n_batch, output_gate_scratch,
+        /*result_stride=*/1);
+  }
+
+  if (aux_input_ptr_batch != nullptr &&
+      !tensor_utils::IsZeroVector(aux_input_ptr_batch, n_batch * n_input)) {
+    // Save quantization and matmul computation for all zero input.
+    float unused_min, unused_max;
+    for (int b = 0; b < n_batch; ++b) {
+      const int offset = b * n_input;
+      tensor_utils::SymmetricQuantizeFloats(
+          aux_input_ptr_batch + offset, n_input,
+          quantized_aux_input_ptr_batch + offset, &unused_min, &unused_max,
+          &scaling_factors[b]);
+    }
+    // For each batch and cell: compute input_weight * input.
+    if (!use_cifg) {
+      for (int b = 0; b < n_batch; ++b) {
+        product_scaling_factors[b] =
+            scaling_factors[b] * aux_input_to_input_weights_scale;
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          aux_input_to_input_weights_ptr, n_cell, n_input,
+          quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
+          input_gate_scratch, /*result_stride=*/1);
+    }
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * aux_input_to_forget_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_forget_weights_ptr, n_cell, n_input,
+        quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
+        forget_gate_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * aux_input_to_cell_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_cell_weights_ptr, n_cell, n_input,
+        quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
+        cell_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * aux_input_to_output_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_output_weights_ptr, n_cell, n_input,
+        quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
+        output_gate_scratch, /*result_stride=*/1);
+  }
+
+  if (!tensor_utils::IsZeroVector(output_state_ptr, n_batch * n_output)) {
+    // Save quantization and matmul computation for all zero input.
+    float unused_min, unused_max;
+    for (int b = 0; b < n_batch; ++b) {
+      const int offset = b * n_output;
+      tensor_utils::SymmetricQuantizeFloats(output_state_ptr + offset, n_output,
+                                            quantized_output_state_ptr + offset,
+                                            &unused_min, &unused_max,
+                                            &scaling_factors[b]);
+    }
+    // For each batch and cell: compute recurrent_weight * output_state.
+    if (!use_cifg) {
+      for (int b = 0; b < n_batch; ++b) {
+        product_scaling_factors[b] =
+            scaling_factors[b] * recurrent_to_input_weights_scale;
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          recurrent_to_input_weights_ptr, n_cell, n_output,
+          quantized_output_state_ptr, product_scaling_factors, n_batch,
+          input_gate_scratch, /*result_stride=*/1);
+    }
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * recurrent_to_forget_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_forget_weights_ptr, n_cell, n_output,
+        quantized_output_state_ptr, product_scaling_factors, n_batch,
+        forget_gate_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * recurrent_to_cell_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_cell_weights_ptr, n_cell, n_output,
+        quantized_output_state_ptr, product_scaling_factors, n_batch,
+        cell_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * recurrent_to_output_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_output_weights_ptr, n_cell, n_output,
+        quantized_output_state_ptr, product_scaling_factors, n_batch,
+        output_gate_scratch, /*result_stride=*/1);
+  }
+
+  // Save quantization and matmul computation for all zero input.
+  bool is_cell_state_all_zeros =
+      tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
+
+  // For each batch and cell: update input gate.
+  if (!use_cifg) {
+    if (use_peephole && !is_cell_state_all_zeros) {
+      tensor_utils::VectorScalarMultiply(cell_to_input_weights_ptr, n_cell,
+                                         cell_to_input_weights_scale,
+                                         recovered_cell_weights);
+      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+          recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
+          input_gate_scratch);
+    }
+    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
+                                       input_gate_scratch);
+  }
+
+  // For each batch and cell: update forget gate.
+  if (use_peephole && !is_cell_state_all_zeros) {
+    tensor_utils::VectorScalarMultiply(cell_to_forget_weights_ptr, n_cell,
+                                       cell_to_forget_weights_scale,
+                                       recovered_cell_weights);
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
+        forget_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
+                                     forget_gate_scratch);
+
+  // For each batch and cell: update the cell.
+  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
+                                         n_batch * n_cell, cell_state_ptr);
+  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  if (use_cifg) {
+    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
+                             forget_gate_scratch);
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
+  } else {
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
+  }
+  if (params->cell_clip > 0.0) {
+    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
+                             params->cell_clip, cell_state_ptr);
+  }
+
+  is_cell_state_all_zeros =
+      tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
+  // For each batch and cell: update the output gate.
+  if (use_peephole && !is_cell_state_all_zeros) {
+    tensor_utils::VectorScalarMultiply(cell_to_output_weights_ptr, n_cell,
+                                       cell_to_output_weights_scale,
+                                       recovered_cell_weights);
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
+        output_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
+                                     output_gate_scratch);
+  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
+                                         n_batch * n_cell, output_gate_scratch);
+
+  // For each batch: update the projection and output_state.
+  const bool use_projection_weight = (projection_weights_ptr != nullptr);
+  const bool use_projection_bias = (projection_bias_ptr != nullptr);
+  if (use_projection_weight) {
+    if (use_projection_bias) {
+      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
+                                            n_batch, output_ptr_batch);
+    } else {
+      tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
+    }
+    if (!tensor_utils::IsZeroVector(output_gate_scratch, n_batch * n_cell)) {
+      // Save quantization and matmul computation for all zero input.
+      float unused_min, unused_max;
+      for (int b = 0; b < n_batch; ++b) {
+        const int offset = b * n_cell;
+        tensor_utils::SymmetricQuantizeFloats(
+            output_gate_scratch + offset, n_cell,
+            quantized_cell_state_ptr + offset, &unused_min, &unused_max,
+            &scaling_factors[b]);
+      }
+      for (int b = 0; b < n_batch; ++b) {
+        product_scaling_factors[b] =
+            scaling_factors[b] * projection_weights_scale;
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          projection_weights_ptr, n_output, n_cell, quantized_cell_state_ptr,
+          product_scaling_factors, n_batch, output_ptr_batch,
+          /*result_stride=*/1);
+    }
+    if (params->proj_clip > 0.0) {
+      tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output,
+                               params->proj_clip, output_ptr_batch);
+    }
+  } else {
+    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
+                             output_ptr_batch);
+  }
+  tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
+                           output_state_ptr);
+}
+}  // namespace
+
+TfLiteStatus EvalFloat(
+    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
+    const TfLiteTensor* input_to_forget_weights,
+    const TfLiteTensor* input_to_cell_weights,
+    const TfLiteTensor* input_to_output_weights,
+    const TfLiteTensor* recurrent_to_input_weights,
+    const TfLiteTensor* recurrent_to_forget_weights,
+    const TfLiteTensor* recurrent_to_cell_weights,
+    const TfLiteTensor* recurrent_to_output_weights,
+    const TfLiteTensor* cell_to_input_weights,
+    const TfLiteTensor* cell_to_forget_weights,
+    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
+    const TfLiteTensor* aux_input_to_input_weights,
+    const TfLiteTensor* aux_input_to_forget_weights,
+    const TfLiteTensor* aux_input_to_cell_weights,
+    const TfLiteTensor* aux_input_to_output_weights,
+    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
+    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
+    const TfLiteLSTMParams* params, bool forward_sequence, int output_offset,
+    TfLiteTensor* scratch_buffer, TfLiteTensor* activation_state,
+    TfLiteTensor* cell_state, TfLiteTensor* output) {
+  const int max_time = (input->dims->size == 2) ? 1 : input->dims->data[0];
+  const int n_batch = input->dims->data[input->dims->size - 2];
+  const int n_input = input->dims->data[input->dims->size - 1];
+  const int aux_input_size =
+      (aux_input) ? aux_input->dims->data[aux_input->dims->size - 1] : 0;
+
+  // n_cell and n_output will be the same size when there is no projection.
+  const int n_cell = input_to_output_weights->dims->data[0];
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that weights are all there or none, we can
+  // check the existense of only one to the get the condition.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool use_peephole = (cell_to_output_weights != nullptr);
+
+  // Index the scratch buffers pointers to the global scratch buffer.
+  float* input_gate_scratch = nullptr;
+  float* cell_scratch = nullptr;
+  float* forget_gate_scratch = nullptr;
+  float* output_gate_scratch = nullptr;
+  if (use_cifg) {
+    cell_scratch = scratch_buffer->data.f;
+    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+  } else {
+    input_gate_scratch = scratch_buffer->data.f;
+    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
+  }
+
+  // Check optional tensors, the respective pointers can be null.
+  const float* input_to_input_weights_ptr =
+      (use_cifg) ? nullptr : input_to_input_weights->data.f;
+  const float* recurrent_to_input_weights_ptr =
+      (use_cifg) ? nullptr : recurrent_to_input_weights->data.f;
+  const float* input_gate_bias_ptr =
+      (use_cifg) ? nullptr : input_gate_bias->data.f;
+  const float* cell_to_input_weights_ptr =
+      (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr;
+  const float* cell_to_forget_weights_ptr =
+      (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
+  const float* cell_to_output_weights_ptr =
+      (use_peephole) ? cell_to_output_weights->data.f : nullptr;
+  const float* projection_weights_ptr =
+      (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
+  const float* projection_bias_ptr =
+      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+
+  float* aux_input_ptr = nullptr;
+  float* aux_input_to_input_weights_ptr = nullptr;
+  float* aux_input_to_forget_weights_ptr = nullptr;
+  float* aux_input_to_cell_weights_ptr = nullptr;
+  float* aux_input_to_output_weights_ptr = nullptr;
+  if (aux_input_size > 0) {
+    aux_input_ptr = aux_input->data.f;
+    aux_input_to_input_weights_ptr = aux_input_to_input_weights->data.f;
+    aux_input_to_forget_weights_ptr = aux_input_to_forget_weights->data.f;
+    aux_input_to_cell_weights_ptr = aux_input_to_cell_weights->data.f;
+    aux_input_to_output_weights_ptr = aux_input_to_output_weights->data.f;
+  }
+
+  // Loop through the sequence.
+  const int input_step = n_batch * n_input;
+  const int output_step = n_batch * output->dims->data[output->dims->size - 1];
+  for (int t = 0; t < max_time; t++) {
+    // If this is the forward_sequence, step forward, otherwise step backwards.
+    const int t_rel = forward_sequence ? t : max_time - t - 1;
+    const float* input_ptr = input->data.f + t_rel * input_step;
+    float* output_ptr_time =
+        output->data.f + t_rel * output_step + output_offset;
+
+    LstmStepWithAuxInput(
+        input_ptr, input_to_input_weights_ptr, input_to_forget_weights->data.f,
+        input_to_cell_weights->data.f, input_to_output_weights->data.f,
+        aux_input_ptr, aux_input_to_input_weights_ptr,
+        aux_input_to_forget_weights_ptr, aux_input_to_cell_weights_ptr,
+        aux_input_to_output_weights_ptr, recurrent_to_input_weights_ptr,
+        recurrent_to_forget_weights->data.f, recurrent_to_cell_weights->data.f,
+        recurrent_to_output_weights->data.f, cell_to_input_weights_ptr,
+        cell_to_forget_weights_ptr, cell_to_output_weights_ptr,
+        input_gate_bias_ptr, forget_gate_bias->data.f, cell_bias->data.f,
+        output_gate_bias->data.f, projection_weights_ptr, projection_bias_ptr,
+        params, n_batch, n_cell, n_input, aux_input_size, n_output,
+        activation_state->data.f, cell_state->data.f, input_gate_scratch,
+        forget_gate_scratch, cell_scratch, output_gate_scratch,
+        output_ptr_time);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalHybrid(
+    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
+    const TfLiteTensor* input_to_forget_weights,
+    const TfLiteTensor* input_to_cell_weights,
+    const TfLiteTensor* input_to_output_weights,
+    const TfLiteTensor* recurrent_to_input_weights,
+    const TfLiteTensor* recurrent_to_forget_weights,
+    const TfLiteTensor* recurrent_to_cell_weights,
+    const TfLiteTensor* recurrent_to_output_weights,
+    const TfLiteTensor* cell_to_input_weights,
+    const TfLiteTensor* cell_to_forget_weights,
+    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
+    const TfLiteTensor* aux_input_to_input_weights,
+    const TfLiteTensor* aux_input_to_forget_weights,
+    const TfLiteTensor* aux_input_to_cell_weights,
+    const TfLiteTensor* aux_input_to_output_weights,
+    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
+    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
+    const TfLiteLSTMParams* params, bool forward_sequence, int output_offset,
+    TfLiteTensor* scratch_buffer, TfLiteTensor* scaling_factors,
+    TfLiteTensor* prod_scaling_factors, TfLiteTensor* recovered_cell_weights,
+    TfLiteTensor* input_quantized, TfLiteTensor* aux_input_quantized,
+    TfLiteTensor* output_state_quantized, TfLiteTensor* cell_state_quantized,
+    TfLiteTensor* output_state, TfLiteTensor* cell_state,
+    TfLiteTensor* output) {
+  const int max_time = (input->dims->size == 2) ? 1 : input->dims->data[0];
+  const int n_batch = input->dims->data[input->dims->size - 2];
+  const int n_input = input->dims->data[input->dims->size - 1];
+  const int aux_input_size =
+      (aux_input) ? aux_input->dims->data[aux_input->dims->size - 1] : 0;
+  // n_cell and n_output will be the same size when there is no projection.
+  const int n_cell = input_to_output_weights->dims->data[0];
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that weights are all there or none, we can
+  // check the existence of only one to get the condition.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool use_peephole = (cell_to_output_weights != nullptr);
+
+  float* input_gate_scratch = nullptr;
+  float* cell_scratch = nullptr;
+  float* forget_gate_scratch = nullptr;
+  float* output_gate_scratch = nullptr;
+  if (use_cifg) {
+    cell_scratch = scratch_buffer->data.f;
+    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+  } else {
+    input_gate_scratch = scratch_buffer->data.f;
+    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
+  }
+
+  // Check optional tensors, the respective pointers can be null.
+  int8_t* input_to_input_weights_ptr = nullptr;
+  float input_to_input_weights_scale = 1.0f;
+  int8_t* recurrent_to_input_weights_ptr = nullptr;
+  float recurrent_to_input_weights_scale = 1.0f;
+  float* input_gate_bias_ptr = nullptr;
+  if (!use_cifg) {
+    input_to_input_weights_ptr =
+        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
+    recurrent_to_input_weights_ptr =
+        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
+    input_gate_bias_ptr = input_gate_bias->data.f;
+    input_to_input_weights_scale = input_to_input_weights->params.scale;
+    recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
+  }
+
+  int8_t* cell_to_input_weights_ptr = nullptr;
+  int8_t* cell_to_forget_weights_ptr = nullptr;
+  int8_t* cell_to_output_weights_ptr = nullptr;
+  float cell_to_input_weights_scale = 1.0f;
+  float cell_to_forget_weights_scale = 1.0f;
+  float cell_to_output_weights_scale = 1.0f;
+  if (use_peephole) {
+    if (!use_cifg) {
+      cell_to_input_weights_ptr =
+          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
+      cell_to_input_weights_scale = cell_to_input_weights->params.scale;
+    }
+    cell_to_forget_weights_ptr =
+        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
+    cell_to_output_weights_ptr =
+        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
+    cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
+    cell_to_output_weights_scale = cell_to_output_weights->params.scale;
+  }
+
+  const int8_t* projection_weights_ptr =
+      (projection_weights == nullptr)
+          ? nullptr
+          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
+  const float projection_weights_scale =
+      (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
+  const float* projection_bias_ptr =
+      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+
+  // Required tensors, pointers are non-null.
+  const int8_t* input_to_forget_weights_ptr =
+      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
+  const float input_to_forget_weights_scale =
+      input_to_forget_weights->params.scale;
+  const int8_t* input_to_cell_weights_ptr =
+      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
+  const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
+  const int8_t* input_to_output_weights_ptr =
+      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
+  const float input_to_output_weights_scale =
+      input_to_output_weights->params.scale;
+  const int8_t* recurrent_to_forget_weights_ptr =
+      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
+  const float recurrent_to_forget_weights_scale =
+      recurrent_to_forget_weights->params.scale;
+  const int8_t* recurrent_to_cell_weights_ptr =
+      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
+  const float recurrent_to_cell_weights_scale =
+      recurrent_to_cell_weights->params.scale;
+  const int8_t* recurrent_to_output_weights_ptr =
+      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
+  const float recurrent_to_output_weights_scale =
+      recurrent_to_output_weights->params.scale;
+  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
+  const float* cell_bias_ptr = cell_bias->data.f;
+  const float* output_gate_bias_ptr = output_gate_bias->data.f;
+
+  float* output_state_ptr = output_state->data.f;
+  float* cell_state_ptr = cell_state->data.f;
+
+  // Temporary storage for quantized values and scaling factors.
+  int8_t* quantized_input_ptr =
+      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+  int8_t* quantized_aux_input_ptr =
+      (aux_input_quantized == nullptr)
+          ? nullptr
+          : reinterpret_cast<int8_t*>(aux_input_quantized->data.uint8);
+  int8_t* quantized_output_state_ptr =
+      reinterpret_cast<int8_t*>(output_state_quantized->data.uint8);
+  int8_t* quantized_cell_state_ptr =
+      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
+  float* scaling_factors_ptr = scaling_factors->data.f;
+  float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
+  float* recovered_cell_weights_ptr = recovered_cell_weights->data.f;
+
+  // Auxiliary input and weights.
+  float* aux_input_ptr = nullptr;
+  int8_t* aux_input_to_input_weights_ptr = nullptr;
+  int8_t* aux_input_to_forget_weights_ptr = nullptr;
+  int8_t* aux_input_to_cell_weights_ptr = nullptr;
+  int8_t* aux_input_to_output_weights_ptr = nullptr;
+  float aux_input_to_input_weights_scale = 0.0f;
+  float aux_input_to_forget_weights_scale = 0.0f;
+  float aux_input_to_cell_weights_scale = 0.0f;
+  float aux_input_to_output_weights_scale = 0.0f;
+  if (aux_input_size > 0) {
+    aux_input_ptr = aux_input->data.f;
+    aux_input_to_input_weights_ptr =
+        reinterpret_cast<int8_t*>(aux_input_to_input_weights->data.uint8);
+    aux_input_to_forget_weights_ptr =
+        reinterpret_cast<int8_t*>(aux_input_to_forget_weights->data.uint8);
+    aux_input_to_cell_weights_ptr =
+        reinterpret_cast<int8_t*>(aux_input_to_cell_weights->data.uint8);
+    aux_input_to_output_weights_ptr =
+        reinterpret_cast<int8_t*>(aux_input_to_output_weights->data.uint8);
+    aux_input_to_input_weights_scale = aux_input_to_input_weights->params.scale;
+    aux_input_to_forget_weights_scale =
+        aux_input_to_forget_weights->params.scale;
+    aux_input_to_cell_weights_scale = aux_input_to_cell_weights->params.scale;
+    aux_input_to_output_weights_scale =
+        aux_input_to_output_weights->params.scale;
+  }
+
+  // Feed the sequence into the LSTM step-by-step.
+  const int input_step = n_batch * n_input;
+  const int output_step = n_batch * output->dims->data[output->dims->size - 1];
+  for (int t = 0; t < max_time; t++) {
+    // If this is the forward_sequence, step forward, otherwise step backwards.
+    const int t_rel = forward_sequence ? t : max_time - t - 1;
+    const float* input_ptr = input->data.f + t_rel * input_step;
+    float* output_ptr = output->data.f + t_rel * output_step + output_offset;
+
+    LstmStepWithAuxInput(
+        input_ptr, input_to_input_weights_ptr, input_to_input_weights_scale,
+        input_to_forget_weights_ptr, input_to_forget_weights_scale,
+        input_to_cell_weights_ptr, input_to_cell_weights_scale,
+        input_to_output_weights_ptr, input_to_output_weights_scale,
+        aux_input_ptr, aux_input_to_input_weights_ptr,
+        aux_input_to_input_weights_scale, aux_input_to_forget_weights_ptr,
+        aux_input_to_forget_weights_scale, aux_input_to_cell_weights_ptr,
+        aux_input_to_cell_weights_scale, aux_input_to_output_weights_ptr,
+        aux_input_to_output_weights_scale, recurrent_to_input_weights_ptr,
+        recurrent_to_input_weights_scale, recurrent_to_forget_weights_ptr,
+        recurrent_to_forget_weights_scale, recurrent_to_cell_weights_ptr,
+        recurrent_to_cell_weights_scale, recurrent_to_output_weights_ptr,
+        recurrent_to_output_weights_scale, cell_to_input_weights_ptr,
+        cell_to_input_weights_scale, cell_to_forget_weights_ptr,
+        cell_to_forget_weights_scale, cell_to_output_weights_ptr,
+        cell_to_output_weights_scale, input_gate_bias_ptr, forget_gate_bias_ptr,
+        cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
+        projection_weights_scale, projection_bias_ptr, params, n_batch, n_cell,
+        n_input, aux_input_size, n_output, input_gate_scratch,
+        forget_gate_scratch, cell_scratch, output_gate_scratch,
+        scaling_factors_ptr, prod_scaling_factors_ptr,
+        recovered_cell_weights_ptr, quantized_input_ptr,
+        quantized_aux_input_ptr, quantized_output_state_ptr,
+        quantized_cell_state_ptr, output_state_ptr, cell_state_ptr, output_ptr);
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace lstm_eval
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/lstm_eval.h b/tensorflow/contrib/lite/kernels/lstm_eval.h
new file mode 100644
index 0000000000..adf8cf0f64
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/lstm_eval.h
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_LSTM_EVAL_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_LSTM_EVAL_H_
+
+#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace lstm_eval {
+
+TfLiteStatus EvalFloat(
+    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
+    const TfLiteTensor* input_to_forget_weights,
+    const TfLiteTensor* input_to_cell_weights,
+    const TfLiteTensor* input_to_output_weights,
+    const TfLiteTensor* recurrent_to_input_weights,
+    const TfLiteTensor* recurrent_to_forget_weights,
+    const TfLiteTensor* recurrent_to_cell_weights,
+    const TfLiteTensor* recurrent_to_output_weights,
+    const TfLiteTensor* cell_to_input_weights,
+    const TfLiteTensor* cell_to_forget_weights,
+    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
+    const TfLiteTensor* aux_input_to_input_weights,
+    const TfLiteTensor* aux_input_to_forget_weights,
+    const TfLiteTensor* aux_input_to_cell_weights,
+    const TfLiteTensor* aux_input_to_output_weights,
+    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
+    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
+    const TfLiteLSTMParams* params, bool forward_sequence, int output_offset,
+    TfLiteTensor* scratch_buffer, TfLiteTensor* activation_state,
+    TfLiteTensor* cell_state, TfLiteTensor* output);
+
+TfLiteStatus EvalHybrid(
+    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
+    const TfLiteTensor* input_to_forget_weights,
+    const TfLiteTensor* input_to_cell_weights,
+    const TfLiteTensor* input_to_output_weights,
+    const TfLiteTensor* recurrent_to_input_weights,
+    const TfLiteTensor* recurrent_to_forget_weights,
+    const TfLiteTensor* recurrent_to_cell_weights,
+    const TfLiteTensor* recurrent_to_output_weights,
+    const TfLiteTensor* cell_to_input_weights,
+    const TfLiteTensor* cell_to_forget_weights,
+    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
+    const TfLiteTensor* aux_input_to_input_weights,
+    const TfLiteTensor* aux_input_to_forget_weights,
+    const TfLiteTensor* aux_input_to_cell_weights,
+    const TfLiteTensor* aux_input_to_output_weights,
+    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
+    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
+    const TfLiteLSTMParams* params, bool forward_sequence, int output_offset,
+    TfLiteTensor* scratch_buffer, TfLiteTensor* scaling_factors,
+    TfLiteTensor* prod_scaling_factors, TfLiteTensor* recovered_cell_weights,
+    TfLiteTensor* input_quantized, TfLiteTensor* aux_input_quantized,
+    TfLiteTensor* output_state_quantized, TfLiteTensor* cell_state_quantized,
+    TfLiteTensor* output_state, TfLiteTensor* cell_state, TfLiteTensor* output);
+
+}  // namespace lstm_eval
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_LSTM_EVAL_H_
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
index 63817bd886..ec9cf38b83 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/lstm_eval.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
 namespace tflite {
@@ -429,273 +430,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-// The LSTM Op engine.
-TfLiteStatus EvalFloat(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, TfLiteTensor* scratch_buffer,
-    TfLiteTensor* activation_state, TfLiteTensor* cell_state,
-    TfLiteTensor* output) {
-  const int max_time = input->dims->data[0];
-  const int n_batch = input->dims->data[1];
-  const int n_input = input->dims->data[2];
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  const float* input_to_input_weights_ptr =
-      (use_cifg) ? nullptr : input_to_input_weights->data.f;
-  const float* recurrent_to_input_weights_ptr =
-      (use_cifg) ? nullptr : recurrent_to_input_weights->data.f;
-  const float* input_gate_bias_ptr =
-      (use_cifg) ? nullptr : input_gate_bias->data.f;
-  const float* cell_to_input_weights_ptr =
-      (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr;
-  const float* cell_to_forget_weights_ptr =
-      (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
-  const float* cell_to_output_weights_ptr =
-      (use_peephole) ? cell_to_output_weights->data.f : nullptr;
-  const float* projection_weights_ptr =
-      (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-
-  // Required tensors, pointers are non-null.
-  const float* input_to_forget_weights_ptr = input_to_forget_weights->data.f;
-  const float* input_to_cell_weights_ptr = input_to_cell_weights->data.f;
-  const float* input_to_output_weights_ptr = input_to_output_weights->data.f;
-  const float* recurrent_to_forget_weights_ptr =
-      recurrent_to_forget_weights->data.f;
-  const float* recurrent_to_cell_weights_ptr =
-      recurrent_to_cell_weights->data.f;
-  const float* recurrent_to_output_weights_ptr =
-      recurrent_to_output_weights->data.f;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* activation_state_ptr = activation_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-
-  // Feed the sequence into the LSTM step-by-step.
-  for (int t = 0; t < max_time; t++) {
-    const float* input_ptr_batch = input->data.f + t * n_batch * n_input;
-    float* output_ptr_batch = output->data.f + t * n_batch * n_output;
-
-    kernel_utils::LstmStep(
-        input_ptr_batch, input_to_input_weights_ptr,
-        input_to_forget_weights_ptr, input_to_cell_weights_ptr,
-        input_to_output_weights_ptr, recurrent_to_input_weights_ptr,
-        recurrent_to_forget_weights_ptr, recurrent_to_cell_weights_ptr,
-        recurrent_to_output_weights_ptr, cell_to_input_weights_ptr,
-        cell_to_forget_weights_ptr, cell_to_output_weights_ptr,
-        input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr,
-        output_gate_bias_ptr, projection_weights_ptr, projection_bias_ptr,
-        params, n_batch, n_cell, n_input, n_output, activation_state_ptr,
-        cell_state_ptr, input_gate_scratch, forget_gate_scratch, cell_scratch,
-        output_gate_scratch, output_ptr_batch);
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalHybrid(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, TfLiteTensor* scratch_buffer,
-    TfLiteTensor* scaling_factors, TfLiteTensor* prod_scaling_factors,
-    TfLiteTensor* recovered_cell_weights, TfLiteTensor* input_quantized,
-    TfLiteTensor* activation_state_quantized,
-    TfLiteTensor* cell_state_quantized, TfLiteTensor* activation_state,
-    TfLiteTensor* cell_state, TfLiteTensor* output) {
-  const int max_time = input->dims->data[0];
-  const int n_batch = input->dims->data[1];
-  const int n_input = input->dims->data[2];
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  int8_t* input_to_input_weights_ptr = nullptr;
-  float input_to_input_weights_scale = 1.0f;
-  int8_t* recurrent_to_input_weights_ptr = nullptr;
-  float recurrent_to_input_weights_scale = 1.0f;
-  float* input_gate_bias_ptr = nullptr;
-  if (!use_cifg) {
-    input_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
-    recurrent_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
-    input_gate_bias_ptr = input_gate_bias->data.f;
-    input_to_input_weights_scale = input_to_input_weights->params.scale;
-    recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
-  }
-
-  int8_t* cell_to_input_weights_ptr = nullptr;
-  int8_t* cell_to_forget_weights_ptr = nullptr;
-  int8_t* cell_to_output_weights_ptr = nullptr;
-  float cell_to_input_weights_scale = 1.0f;
-  float cell_to_forget_weights_scale = 1.0f;
-  float cell_to_output_weights_scale = 1.0f;
-  if (use_peephole) {
-    if (!use_cifg) {
-      cell_to_input_weights_ptr =
-          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
-      cell_to_input_weights_scale = cell_to_input_weights->params.scale;
-    }
-    cell_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
-    cell_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
-    cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
-    cell_to_output_weights_scale = cell_to_output_weights->params.scale;
-  }
-
-  const int8_t* projection_weights_ptr =
-      (projection_weights == nullptr)
-          ? nullptr
-          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
-  float projection_weights_scale =
-      (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-
-  // Required tensors, pointers are non-null.
-  const int8_t* input_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
-  const float input_to_forget_weights_scale =
-      input_to_forget_weights->params.scale;
-  const int8_t* input_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
-  const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
-  const int8_t* input_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
-  const float input_to_output_weights_scale =
-      input_to_output_weights->params.scale;
-  const int8_t* recurrent_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
-  const float recurrent_to_forget_weights_scale =
-      recurrent_to_forget_weights->params.scale;
-  const int8_t* recurrent_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
-  const float recurrent_to_cell_weights_scale =
-      recurrent_to_cell_weights->params.scale;
-  const int8_t* recurrent_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
-  const float recurrent_to_output_weights_scale =
-      recurrent_to_output_weights->params.scale;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* activation_state_ptr = activation_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-
-  // Temporary storage for quantized values and scaling factors.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
-  int8_t* quantized_activation_state_ptr =
-      reinterpret_cast<int8_t*>(activation_state_quantized->data.uint8);
-  int8_t* quantized_cell_state_ptr =
-      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
-  float* scaling_factors_ptr = scaling_factors->data.f;
-  float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
-  float* recovered_cell_weights_ptr = recovered_cell_weights->data.f;
-
-  // Feed the sequence into the LSTM step-by-step.
-  for (int t = 0; t < max_time; t++) {
-    const float* input_ptr_batch = input->data.f + t * n_batch * n_input;
-    float* output_ptr_batch = output->data.f + t * n_batch * n_output;
-
-    kernel_utils::LstmStep(
-        input_ptr_batch, input_to_input_weights_ptr,
-        input_to_input_weights_scale, input_to_forget_weights_ptr,
-        input_to_forget_weights_scale, input_to_cell_weights_ptr,
-        input_to_cell_weights_scale, input_to_output_weights_ptr,
-        input_to_output_weights_scale, recurrent_to_input_weights_ptr,
-        recurrent_to_input_weights_scale, recurrent_to_forget_weights_ptr,
-        recurrent_to_forget_weights_scale, recurrent_to_cell_weights_ptr,
-        recurrent_to_cell_weights_scale, recurrent_to_output_weights_ptr,
-        recurrent_to_output_weights_scale, cell_to_input_weights_ptr,
-        cell_to_input_weights_scale, cell_to_forget_weights_ptr,
-        cell_to_forget_weights_scale, cell_to_output_weights_ptr,
-        cell_to_output_weights_scale, input_gate_bias_ptr, forget_gate_bias_ptr,
-        cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
-        projection_weights_scale, projection_bias_ptr, params, n_batch, n_cell,
-        n_input, n_output, input_gate_scratch, forget_gate_scratch,
-        cell_scratch, output_gate_scratch, scaling_factors_ptr,
-        prod_scaling_factors_ptr, recovered_cell_weights_ptr,
-        quantized_input_ptr, quantized_activation_state_ptr,
-        quantized_cell_state_ptr, activation_state_ptr, cell_state_ptr,
-        output_ptr_batch);
-  }
-  return kTfLiteOk;
-}
-
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
@@ -750,15 +484,21 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   switch (input_to_output_weights->type) {
     case kTfLiteFloat32: {
-      return EvalFloat(input, input_to_input_weights, input_to_forget_weights,
-                       input_to_cell_weights, input_to_output_weights,
-                       recurrent_to_input_weights, recurrent_to_forget_weights,
-                       recurrent_to_cell_weights, recurrent_to_output_weights,
-                       cell_to_input_weights, cell_to_forget_weights,
-                       cell_to_output_weights, input_gate_bias,
-                       forget_gate_bias, cell_bias, output_gate_bias,
-                       projection_weights, projection_bias, params,
-                       scratch_buffer, activation_state, cell_state, output);
+      return lstm_eval::EvalFloat(
+          input, input_to_input_weights, input_to_forget_weights,
+          input_to_cell_weights, input_to_output_weights,
+          recurrent_to_input_weights, recurrent_to_forget_weights,
+          recurrent_to_cell_weights, recurrent_to_output_weights,
+          cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
+          /*aux_input=*/nullptr,
+          /*aux_input_to_input_weights=*/nullptr,
+          /*aux_input_to_forget_weights=*/nullptr,
+          /*aux_input_to_cell_weights=*/nullptr,
+          /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
+          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
+          projection_bias, params, /*forward_sequence=*/true,
+          /*output_offset=*/0, scratch_buffer, activation_state, cell_state,
+          output);
     }
     case kTfLiteUInt8: {
       TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
@@ -771,17 +511,23 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           GetTemporary(context, node, /*index=*/5);
       TfLiteTensor* recovered_cell_weights =
           GetTemporary(context, node, /*index=*/6);
-      return EvalHybrid(
+      return lstm_eval::EvalHybrid(
           input, input_to_input_weights, input_to_forget_weights,
           input_to_cell_weights, input_to_output_weights,
           recurrent_to_input_weights, recurrent_to_forget_weights,
           recurrent_to_cell_weights, recurrent_to_output_weights,
           cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
-          input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias,
-          projection_weights, projection_bias, params, scratch_buffer,
-          scaling_factors, prod_scaling_factors, recovered_cell_weights,
-          input_quantized, activation_state_quantized, cell_state_quantized,
-          activation_state, cell_state, output);
+          /*aux_input=*/nullptr,
+          /*aux_input_to_input_weights=*/nullptr,
+          /*aux_input_to_forget_weights=*/nullptr,
+          /*aux_input_to_cell_weights=*/nullptr,
+          /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
+          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
+          projection_bias, params, /*forward_sequence=*/true,
+          /*output_offset=*/0, scratch_buffer, scaling_factors,
+          prod_scaling_factors, recovered_cell_weights, input_quantized,
+          /*aux_input_quantized=*/nullptr, activation_state_quantized,
+          cell_state_quantized, activation_state, cell_state, output);
     }
     default:
       context->ReportError(context, "Type %d is not currently supported.",
-- 
GitLab


From 123de2797a4348c963b597096762085bfa09eab1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 02:01:34 -0700
Subject: [PATCH 460/570] compat: Update forward compatibility horizon to
 2018-10-05

PiperOrigin-RevId: 215874612
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 76e08610ba..8f4e8e0b98 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 4)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 5)
 
 
 @tf_export("compat.forward_compatible")
-- 
GitLab


From 92c8a77ba480bf4aeddea412cc1d2988f6ad81cd Mon Sep 17 00:00:00 2001
From: HyoukJoong Lee <hyouklee@google.com>
Date: Fri, 5 Oct 2018 07:46:22 -0700
Subject: [PATCH 461/570] Use absl::Span for HloModuleGroupMetadata

PiperOrigin-RevId: 215905026
---
 .../compiler/xla/service/hlo_module_group_metadata.cc     | 2 +-
 .../compiler/xla/service/hlo_module_group_metadata.h      | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index 83352ef91b..b4aac4c807 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -59,7 +59,7 @@ string HloModuleGroupMetadata::TrackedInstruction::ToString() const {
 }
 
 /* static */ StatusOr<std::unique_ptr<HloModuleGroupMetadata>>
-HloModuleGroupMetadata::Build(const std::vector<HloModule*>& modules) {
+HloModuleGroupMetadata::Build(absl::Span<HloModule* const> modules) {
   auto metadata = absl::make_unique<HloModuleGroupMetadata>(modules);
   TF_RETURN_IF_ERROR(metadata->Build());
   return std::move(metadata);
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index 0311b73207..928df0f5a7 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -102,14 +102,14 @@ class HloModuleGroupMetadata {
     HloInstruction* recv_done = nullptr;
   };
 
-  explicit HloModuleGroupMetadata(const std::vector<HloModule*>& modules)
-      : modules_(modules) {}
+  explicit HloModuleGroupMetadata(absl::Span<HloModule* const> modules)
+      : modules_(modules.begin(), modules.end()) {}
 
   ~HloModuleGroupMetadata() = default;
 
   // Build and return the metadata for the given modules.
   static StatusOr<std::unique_ptr<HloModuleGroupMetadata>> Build(
-      const std::vector<HloModule*>& modules);
+      absl::Span<HloModule* const> modules);
 
   // Returns true if the instruction is one of the 4 channel instructions (Send,
   // Recv, SendDone, RecvDone).
@@ -274,7 +274,7 @@ class HloModuleGroupMetadata {
   int64 max_channel_id_ = -1;
 
   // The modules that this metadata was built from.
-  const std::vector<HloModule*>& modules_;
+  const std::vector<HloModule*> modules_;
 
   absl::flat_hash_map<HloModule*, std::unique_ptr<TuplePointsToAnalysis>>
       points_to_analyses_;
-- 
GitLab


From 388ed2929ea024adcfb76ea9ddd78a38a87470b7 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 5 Oct 2018 08:03:19 -0700
Subject: [PATCH 462/570] [TF:XLA] Move broadcasting code out of BroadcastTo op
 into a common helper library.

Change XlaBinaryOp::Broadcast to use the BroadcastTo lowering, since it produces fewer extraneous reshapes and transposes. Even if the reshapes and transposes would later optimize away, this yields more readable output and makes life easier for HLO rewrites that run early.

Change in preparation for removing reshapes from SoftmaxCrossEntropyWithLogits.

PiperOrigin-RevId: 215906847
---
 tensorflow/compiler/tf2xla/kernels/BUILD      |  1 +
 .../compiler/tf2xla/kernels/binary_ops.cc     | 10 +-
 .../tf2xla/kernels/broadcast_to_op.cc         | 63 +------------
 .../compiler/tf2xla/kernels/cwise_ops.cc      | 57 +++---------
 .../compiler/tf2xla/kernels/cwise_ops.h       |  3 +-
 tensorflow/compiler/tf2xla/lib/BUILD          | 16 ++++
 tensorflow/compiler/tf2xla/lib/broadcast.cc   | 93 +++++++++++++++++++
 tensorflow/compiler/tf2xla/lib/broadcast.h    | 32 +++++++
 8 files changed, 165 insertions(+), 110 deletions(-)
 create mode 100644 tensorflow/compiler/tf2xla/lib/broadcast.cc
 create mode 100644 tensorflow/compiler/tf2xla/lib/broadcast.h

diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 95a010a119..224e5ea123 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -121,6 +121,7 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/lib:batch_dot",
+        "//tensorflow/compiler/tf2xla/lib:broadcast",
         "//tensorflow/compiler/tf2xla/lib:cholesky",
         "//tensorflow/compiler/tf2xla/lib:qr",
         "//tensorflow/compiler/tf2xla/lib:random",
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index a988d3c33e..47e517a657 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -64,7 +64,7 @@ XLA_MAKE_BINARY(Complex, xla::Complex(lhs, rhs, extend_dimensions));
 // }
 static xla::XlaOp DivNoNanImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
                                xla::XlaOp y, const BCast& broadcast_helper) {
-  std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper);
+  std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
   auto zero = XlaHelpers::Zero(b, dtype);
   auto y_equals_0 = xla::Eq(y, zero);
   auto zeros = xla::ZerosLike(x);
@@ -84,7 +84,7 @@ XLA_MAKE_BINARY(DivNoNan,
 // }
 static xla::XlaOp FloorDivImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
                                xla::XlaOp y, const BCast& broadcast_helper) {
-  std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper);
+  std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
   if (DataTypeIsUnsigned(dtype)) {
     return xla::Div(x, y);
   }
@@ -105,7 +105,7 @@ XLA_MAKE_BINARY(FloorDiv,
 
 static xla::XlaOp XlogyImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
                             xla::XlaOp y, const BCast& broadcast_helper) {
-  std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper);
+  std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
   auto zero = XlaHelpers::Zero(b, dtype);
   auto is_zero = xla::Eq(x, zero);
   return xla::Select(is_zero, zero, xla::Mul(x, xla::Log(y)));
@@ -114,7 +114,7 @@ XLA_MAKE_BINARY(Xlogy, XlogyImpl(b, input_type(0), lhs, rhs, broadcast_helper));
 
 static xla::XlaOp XdivyImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
                             xla::XlaOp y, const BCast& broadcast_helper) {
-  std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper);
+  std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
   auto zero = XlaHelpers::Zero(b, dtype);
   auto is_zero = xla::Eq(x, zero);
   return xla::Select(is_zero, zero, xla::Div(x, y));
@@ -126,7 +126,7 @@ XLA_MAKE_BINARY(Xdivy, XdivyImpl(b, input_type(0), lhs, rhs, broadcast_helper));
 // return (x < T(0)) == (y < T(0)) ? trunc_mod : std::fmod(trunc_mod + y, y);
 static xla::XlaOp FloorModImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
                                xla::XlaOp y, const BCast& broadcast_helper) {
-  std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper);
+  std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
   auto zero = XlaHelpers::Zero(b, dtype);
   auto same_sign = xla::Eq(xla::Lt(x, zero), xla::Lt(y, zero));
   auto trunc_mod = xla::Rem(x, y);
diff --git a/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc
index 696c1c39be..9bb11fb67e 100644
--- a/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc
@@ -13,16 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "absl/algorithm/container.h"
-#include "tensorflow/compiler/tf2xla/shape_util.h"
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/lib/broadcast.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 namespace {
@@ -37,59 +32,9 @@ class BroadcastToOp : public XlaOpKernel {
     TensorShape output_shape;
     OP_REQUIRES_OK(context, context->ConstantInputAsShape(1, &output_shape));
 
-    OP_REQUIRES(context, input_shape.dims() <= output_shape.dims(),
-                errors::InvalidArgument(
-                    "Input rank (", input_shape.dims(),
-                    ") must be less than or equal to the output rank (",
-                    output_shape.dims(), ")"));
-
-    auto input_dims = input_shape.dim_sizes();
-    auto output_dims = output_shape.dim_sizes();
-
-    // Broadcasting is done right-to-left on right-aligned dimensions; reverse
-    // the two vectors so elements to be broadcast are aligned.
-    absl::c_reverse(input_dims);
-    absl::c_reverse(output_dims);
-
-    std::vector<int64> broadcast_dims;
-    std::vector<int64> broadcast_shape;
-    for (int i = 0; i < output_shape.dims(); ++i) {
-      if (i < input_shape.dims()) {
-        OP_REQUIRES(
-            context,
-            (output_dims[i] == 0 && input_dims[i] == 0) ||
-                (input_dims[i] != 0 && output_dims[i] % input_dims[i] == 0),
-            errors::InvalidArgument("invalid shape to broadcast from ",
-                                    input_shape.DebugString(), " to ",
-                                    output_shape.DebugString()));
-
-        broadcast_dims.push_back(broadcast_shape.size());
-        if (output_dims[i] == input_dims[i]) {
-          broadcast_shape.push_back(output_dims[i]);
-        } else if (output_dims[i] != input_dims[i]) {
-          // Add dimensions [I, O/I], which we will later flatten to just
-          // [O]. We must do this in two phases since XLA broadcasting does not
-          // support tiling.
-          broadcast_shape.push_back(input_dims[i]);
-          broadcast_shape.push_back(output_dims[i] / input_dims[i]);
-        }
-      } else {
-        broadcast_shape.push_back(output_dims[i]);
-      }
-    }
-    absl::c_reverse(broadcast_dims);
-    int broadcast_shape_size = broadcast_shape.size();
-    for (int64& broadcast_dim : broadcast_dims) {
-      broadcast_dim = broadcast_shape_size - broadcast_dim - 1;
-    }
-    absl::c_reverse(broadcast_shape);
-    xla::XlaOp output = xla::Reshape(
-        xla::BroadcastInDim(context->Input(0),
-                            xla::ShapeUtil::MakeShape(
-                                context->input_xla_type(0), broadcast_shape),
-                            broadcast_dims),
-        output_shape.dim_sizes());
-    context->SetOutput(0, output);
+    auto output = BroadcastTo(context->Input(0), output_shape.dim_sizes());
+    OP_REQUIRES_OK(context, output.status());
+    context->SetOutput(0, output.ValueOrDie());
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc
index ef1015552d..234f7b4a01 100644
--- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
 
+#include "tensorflow/compiler/tf2xla/lib/broadcast.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -39,7 +40,8 @@ void XlaBinaryOp::Compile(XlaOpKernelContext* ctx) {
   // compute valid broadcast shapes, but rely below on XLA to
   // automatically perform the broadcast assuming its valid shapes are
   // a superset of TensorFlow's valid shapes.
-  BCast bcast(BCast::FromShape(lhs_shape), BCast::FromShape(rhs_shape));
+  BCast bcast(BCast::FromShape(lhs_shape), BCast::FromShape(rhs_shape),
+              /*fewer_dims_optimization=*/false);
   if (!bcast.IsValid()) {
     ctx->SetStatus(errors::InvalidArgument("Incompatible shapes: ",
                                            lhs_shape.DebugString(), " vs. ",
@@ -86,51 +88,18 @@ void XlaBinaryOp::Compile(XlaOpKernelContext* ctx) {
 }
 
 /* static */ std::pair<xla::XlaOp, xla::XlaOp> XlaBinaryOp::Broadcast(
-    xla::XlaBuilder* builder, const xla::XlaOp& lhs, const xla::XlaOp& rhs,
-    const BCast& broadcast_helper) {
-  // Manually construct the broadcasting since MapN does not do
-  // automatic broadcasting. The bcast helper ensures that
-  // lhs.reshape(bcast.x_reshape()).broadcast(bcast.x_bcast()) and
-  // rhs.reshape(bcast.y_reshape()).broadcast(bcast.y_bcast()) have
-  // the same shape, so can be operated on by MapN.
-
-  // First reshape the inputs, which should be a metadata-only
-  // operation since we are flattening the dimensions in order.
-  auto lhs_shaped = xla::Reshape(lhs, broadcast_helper.x_reshape());
-  auto rhs_shaped = xla::Reshape(rhs, broadcast_helper.y_reshape());
-
-  // Next broadcast the necessary input dimensions. We rely on the
-  // XLA optimizer to be smart about the fact that we are asking
-  // it to broadcast size 1 on some of these dimensions, to avoid
-  // adding complexity to this code.
-  auto lhs_broadcast = xla::Broadcast(lhs_shaped, broadcast_helper.x_bcast());
-  int lhs_size = broadcast_helper.x_bcast().size();
-  auto rhs_broadcast = xla::Broadcast(rhs_shaped, broadcast_helper.y_bcast());
-  int rhs_size = broadcast_helper.y_bcast().size();
-
-  // Now reshape them to the correct output shape. After the
-  // broadcast each side is twice as wide as it should be, since the
-  // broadcast dimensions were prepended to the shape. Reshape
-  // flattening each original dimension with the prepended broadcast
-  // dimension. E.g. if we started out with lhs_shaped with shape
-  // [5,2,3] and x_bcast was [2,1,7] then lhs_broadcast would have
-  // shape [2,1,7,5,2,3] and we want to reshape it to [10,2,21].
-  std::vector<int64> lhs_reorder;
-  for (int i = 0; i < lhs_size; ++i) {
-    lhs_reorder.push_back(i);
-    lhs_reorder.push_back(i + lhs_size);
+    xla::XlaOp lhs, xla::XlaOp rhs, const BCast& broadcast_helper) {
+  auto lhs_output = BroadcastTo(lhs, broadcast_helper.output_shape());
+  if (!lhs_output.ok()) {
+    xla::XlaOp error = lhs.builder()->ReportError(lhs_output.status());
+    return {error, error};
   }
-  auto lhs_output =
-      xla::Reshape(lhs_broadcast, lhs_reorder, broadcast_helper.output_shape());
-  std::vector<int64> rhs_reorder;
-  for (int i = 0; i < rhs_size; ++i) {
-    rhs_reorder.push_back(i);
-    rhs_reorder.push_back(i + rhs_size);
+  auto rhs_output = BroadcastTo(rhs, broadcast_helper.output_shape());
+  if (!rhs_output.ok()) {
+    xla::XlaOp error = rhs.builder()->ReportError(rhs_output.status());
+    return {error, error};
   }
-  auto rhs_output =
-      xla::Reshape(rhs_broadcast, rhs_reorder, broadcast_helper.output_shape());
-
-  return {lhs_output, rhs_output};
+  return {lhs_output.ValueOrDie(), rhs_output.ValueOrDie()};
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
index 6653944a91..516ead4bfe 100644
--- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
@@ -67,8 +67,7 @@ class XlaBinaryOp : public XlaOpKernel {
   // 'broadcast_helper', yielding arguments 'lhs' and 'rhs' that have the same
   // shape.
   static std::pair<xla::XlaOp, xla::XlaOp> Broadcast(
-      xla::XlaBuilder* builder, const xla::XlaOp& lhs, const xla::XlaOp& rhs,
-      const BCast& broadcast_helper);
+      xla::XlaOp lhs, xla::XlaOp rhs, const BCast& broadcast_helper);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 8597e7f139..1ce3930fd1 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -31,6 +31,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "broadcast",
+    srcs = ["broadcast.cc"],
+    hdrs = ["broadcast.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
 cc_library(
     name = "cholesky",
     srcs = ["cholesky.cc"],
diff --git a/tensorflow/compiler/tf2xla/lib/broadcast.cc b/tensorflow/compiler/tf2xla/lib/broadcast.cc
new file mode 100644
index 0000000000..3e402ef855
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/broadcast.cc
@@ -0,0 +1,93 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/broadcast.h"
+
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace tensorflow {
+
+xla::StatusOr<xla::XlaOp> BroadcastTo(xla::XlaOp input,
+                                      absl::Span<int64 const> output_dims) {
+  xla::XlaBuilder* builder = input.builder();
+  TF_ASSIGN_OR_RETURN(xla::Shape input_shape, builder->GetShape(input));
+  absl::Span<int64 const> input_dims =
+      xla::AsInt64Slice(input_shape.dimensions());
+
+  if (input_dims == output_dims) {
+    return input;
+  }
+
+  if (input_dims.size() > output_dims.size()) {
+    return errors::InvalidArgument(
+        "Input shape (", xla::ShapeUtil::HumanString(input_shape),
+        ") must have rank less than or equal to the output shape [",
+        absl::StrJoin(output_dims, ","), "]");
+  }
+
+  std::vector<int64> broadcast_dims;
+  std::vector<int64> broadcast_shape;
+  auto input_it = input_dims.rbegin();
+  for (auto output_it = output_dims.rbegin(); output_it != output_dims.rend();
+       ++output_it) {
+    if (input_it != input_dims.rend()) {
+      if (!(*output_it == 0 && *input_it == 0) &&
+          !(*input_it != 0 && *output_it % *input_it == 0)) {
+        return errors::InvalidArgument("Invalid shape broadcast from ",
+                                       xla::ShapeUtil::HumanString(input_shape),
+                                       " to [", absl::StrJoin(output_dims, ","),
+                                       "]");
+      }
+
+      broadcast_dims.push_back(broadcast_shape.size());
+      if (*output_it == *input_it) {
+        broadcast_shape.push_back(*output_it);
+      } else if (*output_it != *input_it) {
+        // Add dimensions [I, O/I], which we will later flatten to just
+        // [O]. We must do this in two phases since XLA broadcasting does not
+        // support tiling.
+        broadcast_shape.push_back(*input_it);
+        broadcast_shape.push_back(*output_it / *input_it);
+      }
+      ++input_it;
+    } else {
+      broadcast_shape.push_back(*output_it);
+    }
+  }
+  TF_RET_CHECK(input_it == input_dims.rend());
+
+  absl::c_reverse(broadcast_dims);
+  int broadcast_shape_size = broadcast_shape.size();
+  for (int64& broadcast_dim : broadcast_dims) {
+    broadcast_dim = broadcast_shape_size - broadcast_dim - 1;
+  }
+  absl::c_reverse(broadcast_shape);
+  xla::XlaOp output = xla::BroadcastInDim(
+      input,
+      xla::ShapeUtil::MakeShape(input_shape.element_type(), broadcast_shape),
+      broadcast_dims);
+  if (broadcast_shape != output_dims) {
+    output = xla::Reshape(output, output_dims);
+  }
+  return output;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/broadcast.h b/tensorflow/compiler/tf2xla/lib/broadcast.h
new file mode 100644
index 0000000000..591e696f06
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/broadcast.h
@@ -0,0 +1,32 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BROADCAST_H_
+#define TENSORFLOW_COMPILER_TF2XLA_LIB_BROADCAST_H_
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace tensorflow {
+
+// Broadcasts 'input' up to shape 'output_dims', using TensorFlow broadcasting
+// rules. Supports broadcasting a dimension of size x to size x*y, i.e., tiling.
+xla::StatusOr<xla::XlaOp> BroadcastTo(xla::XlaOp input,
+                                      absl::Span<int64 const> output_dims);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_BROADCAST_H_
-- 
GitLab


From cea6b4959152981ab778001f30ff9ad87bb4fc9e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 08:34:28 -0700
Subject: [PATCH 463/570] Relax some unnecessary 4D array restrictions

PiperOrigin-RevId: 215910400
---
 tensorflow/contrib/lite/kernels/internal/types.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 64a39dd2a2..c6bc6074d4 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -268,8 +268,9 @@ class RuntimeShape {
   // This creates a shape padded to the desired size with the specified value.
   RuntimeShape(int new_shape_size, const RuntimeShape& shape, int pad_value)
       : size_(0) {
+    // If the following check fails, it is likely because a 4D-only kernel is
+    // being used with an array of larger dimension count.
     TFLITE_CHECK_GE(new_shape_size, shape.DimensionsCount());
-    TFLITE_CHECK_LE(new_shape_size, kMaxSmallSize);
     Resize(new_shape_size);
     const int size_increase = new_shape_size - shape.DimensionsCount();
     for (int i = 0; i < size_increase; ++i) {
-- 
GitLab


From 53faa313b7628cd8c9fbb836544cc6482cafb7a4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 08:46:54 -0700
Subject: [PATCH 464/570] Switch NCCL to build from open source (version
 2.3.5-5) by default.

Note to users manually patching ptxas from a later toolkit version:
Building NCCL requires the same version of ptxas and nvlink.

PiperOrigin-RevId: 215911973
---
 configure.py                        |   17 +-
 tensorflow/workspace.bzl            |   10 +-
 third_party/gpus/cuda_configure.bzl | 1979 ++++++++++++++-------------
 third_party/nccl/LICENSE            |  231 +---
 third_party/nccl/archive.BUILD      |  179 +++
 third_party/nccl/build_defs.bzl.tpl |  210 +++
 third_party/nccl/nccl_archive.BUILD |   68 -
 third_party/nccl/nccl_configure.bzl |  214 +--
 8 files changed, 1592 insertions(+), 1316 deletions(-)
 create mode 100644 third_party/nccl/archive.BUILD
 create mode 100644 third_party/nccl/build_defs.bzl.tpl
 delete mode 100644 third_party/nccl/nccl_archive.BUILD

diff --git a/configure.py b/configure.py
index a88fdb3555..65b4622995 100644
--- a/configure.py
+++ b/configure.py
@@ -35,7 +35,6 @@ except ImportError:
 
 _DEFAULT_CUDA_VERSION = '9.0'
 _DEFAULT_CUDNN_VERSION = '7'
-_DEFAULT_NCCL_VERSION = '2.2'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,7.0'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
 _DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
@@ -1109,18 +1108,17 @@ def set_tf_nccl_install_path(environ_cp):
     raise ValueError('Currently NCCL is only supported on Linux platforms.')
 
   ask_nccl_version = (
-      'Please specify the NCCL version you want to use. If NCCL %s is not '
-      'installed, then you can use version 1.3 that can be fetched '
-      'automatically but it may have worse performance with multiple GPUs. '
-      '[Default is %s]: ') % (_DEFAULT_NCCL_VERSION, _DEFAULT_NCCL_VERSION)
+      'Please specify the locally installed NCCL version you want to use. '
+      '[Default is to use https://github.com/nvidia/nccl]: ')
 
   for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
     tf_nccl_version = get_from_env_or_user_or_default(
-        environ_cp, 'TF_NCCL_VERSION', ask_nccl_version, _DEFAULT_NCCL_VERSION)
-    tf_nccl_version = reformat_version_sequence(str(tf_nccl_version), 1)
+        environ_cp, 'TF_NCCL_VERSION', ask_nccl_version, '')
+
+    if not tf_nccl_version:
+      break  # No need to get install path, building the open source code.
 
-    if tf_nccl_version == '1':
-      break  # No need to get install path, NCCL 1 is a GitHub repo.
+    tf_nccl_version = reformat_version_sequence(str(tf_nccl_version), 1)
 
     # Look with ldconfig first if we can find the library in paths
     # like /usr/lib/x86_64-linux-gnu and the header file in the corresponding
@@ -1232,7 +1230,6 @@ def set_tf_nccl_install_path(environ_cp):
   environ_cp['TF_NCCL_VERSION'] = tf_nccl_version
   write_action_env_to_bazelrc('TF_NCCL_VERSION', tf_nccl_version)
 
-
 def get_native_cuda_compute_capabilities(environ_cp):
   """Get native cuda compute capabilities.
 
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 72f3fd0cf8..8df41f96b8 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -585,12 +585,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "nccl_archive",
-        build_file = clean_dep("//third_party:nccl/nccl_archive.BUILD"),
-        sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
-        strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
+        build_file = clean_dep("//third_party:nccl/archive.BUILD"),
+        sha256 = "19132b5127fa8e02d95a09795866923f04064c8f1e0770b2b42ab551408882a4",
+        strip_prefix = "nccl-f93fe9bfd94884cec2ba711897222e0df5569a53",
         urls = [
-            "https://mirror.bazel.build/github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
-            "https://github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
+            "https://mirror.bazel.build/github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz",
+            "https://github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz",
         ],
     )
 
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 69f4599c16..831a3067b2 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -126,118 +126,141 @@ load(
 )
 
 def _get_python_bin(repository_ctx):
-    """Gets the python bin path."""
-    python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
-    if python_bin != None:
-        return python_bin
-    python_bin_name = "python.exe" if _is_windows(repository_ctx) else "python"
-    python_bin_path = repository_ctx.which(python_bin_name)
-    if python_bin_path != None:
-        return str(python_bin_path)
-    auto_configure_fail("Cannot find python in PATH, please make sure " +
-                        "python is installed and add its directory in PATH, or --define " +
-                        "%s='/something/else'.\nPATH=%s" % (
-                            _PYTHON_BIN_PATH,
-                            repository_ctx.os.environ.get("PATH", ""),
-                        ))
+  """Gets the python bin path."""
+  python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
+  if python_bin != None:
+    return python_bin
+  python_bin_name = "python.exe" if _is_windows(repository_ctx) else "python"
+  python_bin_path = repository_ctx.which(python_bin_name)
+  if python_bin_path != None:
+    return str(python_bin_path)
+  auto_configure_fail(
+      "Cannot find python in PATH, please make sure " +
+      "python is installed and add its directory in PATH, or --define " +
+      "%s='/something/else'.\nPATH=%s" % (
+          _PYTHON_BIN_PATH,
+          repository_ctx.os.environ.get("PATH", ""),
+      ))
+
 
 def _get_nvcc_tmp_dir_for_windows(repository_ctx):
-    """Return the tmp directory for nvcc to generate intermediate source files."""
-    escaped_tmp_dir = escape_string(
-        get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace("\\", "\\\\"),
-    )
-    return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir"
+  """Return the tmp directory for nvcc to generate intermediate source files."""
+  escaped_tmp_dir = escape_string(
+      get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
+          "\\", "\\\\"),)
+  return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir"
 
-def _get_msvc_compiler(repository_ctx):
-    vc_path = find_vc_path(repository_ctx)
-    return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/")
 
-def _get_win_cuda_defines(repository_ctx):
-    """Return CROSSTOOL defines for Windows"""
-
-    # If we are not on Windows, return empty vaules for Windows specific fields.
-    # This ensures the CROSSTOOL file parser is happy.
-    if not _is_windows(repository_ctx):
-        return {
-            "%{msvc_env_tmp}": "",
-            "%{msvc_env_path}": "",
-            "%{msvc_env_include}": "",
-            "%{msvc_env_lib}": "",
-            "%{msvc_cl_path}": "",
-            "%{msvc_ml_path}": "",
-            "%{msvc_link_path}": "",
-            "%{msvc_lib_path}": "",
-            "%{cxx_builtin_include_directory}": "",
-        }
-
-    vc_path = find_vc_path(repository_ctx)
-    if not vc_path:
-        auto_configure_fail("Visual C++ build tools not found on your machine." +
-                            "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using")
-        return {}
-
-    env = setup_vc_env_vars(repository_ctx, vc_path)
-    escaped_paths = escape_string(env["PATH"])
-    escaped_include_paths = escape_string(env["INCLUDE"])
-    escaped_lib_paths = escape_string(env["LIB"])
-    escaped_tmp_dir = escape_string(
-        get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace("\\", "\\\\"),
-    )
+def _get_msvc_compiler(repository_ctx):
+  vc_path = find_vc_path(repository_ctx)
+  return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/")
 
-    msvc_cl_path = "windows/msvc_wrapper_for_nvcc.bat"
-    msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace("\\", "/")
-    msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace("\\", "/")
-    msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace("\\", "/")
 
-    # nvcc will generate some temporary source files under %{nvcc_tmp_dir}
-    # The generated files are guranteed to have unique name, so they can share the same tmp directory
-    escaped_cxx_include_directories = ["cxx_builtin_include_directory: \"%s\"" % _get_nvcc_tmp_dir_for_windows(repository_ctx)]
-    for path in escaped_include_paths.split(";"):
-        if path:
-            escaped_cxx_include_directories.append("cxx_builtin_include_directory: \"%s\"" % path)
+def _get_win_cuda_defines(repository_ctx):
+  """Return CROSSTOOL defines for Windows"""
 
+  # If we are not on Windows, return empty vaules for Windows specific fields.
+  # This ensures the CROSSTOOL file parser is happy.
+  if not _is_windows(repository_ctx):
     return {
-        "%{msvc_env_tmp}": escaped_tmp_dir,
-        "%{msvc_env_path}": escaped_paths,
-        "%{msvc_env_include}": escaped_include_paths,
-        "%{msvc_env_lib}": escaped_lib_paths,
-        "%{msvc_cl_path}": msvc_cl_path,
-        "%{msvc_ml_path}": msvc_ml_path,
-        "%{msvc_link_path}": msvc_link_path,
-        "%{msvc_lib_path}": msvc_lib_path,
-        "%{cxx_builtin_include_directory}": "\n".join(escaped_cxx_include_directories),
+        "%{msvc_env_tmp}": "",
+        "%{msvc_env_path}": "",
+        "%{msvc_env_include}": "",
+        "%{msvc_env_lib}": "",
+        "%{msvc_cl_path}": "",
+        "%{msvc_ml_path}": "",
+        "%{msvc_link_path}": "",
+        "%{msvc_lib_path}": "",
+        "%{cxx_builtin_include_directory}": "",
     }
 
+  vc_path = find_vc_path(repository_ctx)
+  if not vc_path:
+    auto_configure_fail(
+        "Visual C++ build tools not found on your machine." +
+        "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using"
+    )
+    return {}
+
+  env = setup_vc_env_vars(repository_ctx, vc_path)
+  escaped_paths = escape_string(env["PATH"])
+  escaped_include_paths = escape_string(env["INCLUDE"])
+  escaped_lib_paths = escape_string(env["LIB"])
+  escaped_tmp_dir = escape_string(
+      get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
+          "\\", "\\\\"),)
+
+  msvc_cl_path = "windows/msvc_wrapper_for_nvcc.bat"
+  msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace(
+      "\\", "/")
+  msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace(
+      "\\", "/")
+  msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace(
+      "\\", "/")
+
+  # nvcc will generate some temporary source files under %{nvcc_tmp_dir}
+  # The generated files are guranteed to have unique name, so they can share the same tmp directory
+  escaped_cxx_include_directories = [
+      "cxx_builtin_include_directory: \"%s\"" %
+      _get_nvcc_tmp_dir_for_windows(repository_ctx)
+  ]
+  for path in escaped_include_paths.split(";"):
+    if path:
+      escaped_cxx_include_directories.append(
+          "cxx_builtin_include_directory: \"%s\"" % path)
+
+  return {
+      "%{msvc_env_tmp}":
+          escaped_tmp_dir,
+      "%{msvc_env_path}":
+          escaped_paths,
+      "%{msvc_env_include}":
+          escaped_include_paths,
+      "%{msvc_env_lib}":
+          escaped_lib_paths,
+      "%{msvc_cl_path}":
+          msvc_cl_path,
+      "%{msvc_ml_path}":
+          msvc_ml_path,
+      "%{msvc_link_path}":
+          msvc_link_path,
+      "%{msvc_lib_path}":
+          msvc_lib_path,
+      "%{cxx_builtin_include_directory}":
+          "\n".join(escaped_cxx_include_directories),
+  }
+
 # TODO(dzc): Once these functions have been factored out of Bazel's
 # cc_configure.bzl, load them from @bazel_tools instead.
 # BEGIN cc_configure common functions.
 def find_cc(repository_ctx):
-    """Find the C++ compiler."""
-    if _is_windows(repository_ctx):
-        return _get_msvc_compiler(repository_ctx)
-
-    if _use_cuda_clang(repository_ctx):
-        target_cc_name = "clang"
-        cc_path_envvar = _CLANG_CUDA_COMPILER_PATH
-        if _flag_enabled(repository_ctx, _TF_DOWNLOAD_CLANG):
-            return "extra_tools/bin/clang"
-    else:
-        target_cc_name = "gcc"
-        cc_path_envvar = _GCC_HOST_COMPILER_PATH
-    cc_name = target_cc_name
-
-    if cc_path_envvar in repository_ctx.os.environ:
-        cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip()
-        if cc_name_from_env:
-            cc_name = cc_name_from_env
-    if cc_name.startswith("/"):
-        # Absolute path, maybe we should make this supported by our which function.
-        return cc_name
-    cc = repository_ctx.which(cc_name)
-    if cc == None:
-        fail(("Cannot find {}, either correct your path or set the {}" +
-              " environment variable").format(target_cc_name, cc_path_envvar))
-    return cc
+  """Find the C++ compiler."""
+  if _is_windows(repository_ctx):
+    return _get_msvc_compiler(repository_ctx)
+
+  if _use_cuda_clang(repository_ctx):
+    target_cc_name = "clang"
+    cc_path_envvar = _CLANG_CUDA_COMPILER_PATH
+    if _flag_enabled(repository_ctx, _TF_DOWNLOAD_CLANG):
+      return "extra_tools/bin/clang"
+  else:
+    target_cc_name = "gcc"
+    cc_path_envvar = _GCC_HOST_COMPILER_PATH
+  cc_name = target_cc_name
+
+  if cc_path_envvar in repository_ctx.os.environ:
+    cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip()
+    if cc_name_from_env:
+      cc_name = cc_name_from_env
+  if cc_name.startswith("/"):
+    # Absolute path, maybe we should make this supported by our which function.
+    return cc_name
+  cc = repository_ctx.which(cc_name)
+  if cc == None:
+    fail(("Cannot find {}, either correct your path or set the {}" +
+          " environment variable").format(target_cc_name, cc_path_envvar))
+  return cc
+
 
 _INC_DIR_MARKER_BEGIN = "#include <...>"
 
@@ -246,80 +269,82 @@ _OSX_FRAMEWORK_SUFFIX = " (framework directory)"
 _OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX)
 
 def _cxx_inc_convert(path):
-    """Convert path returned by cc -E xc++ in a complete path."""
-    path = path.strip()
-    if path.endswith(_OSX_FRAMEWORK_SUFFIX):
-        path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()
-    return path
+  """Convert path returned by cc -E xc++ in a complete path."""
+  path = path.strip()
+  if path.endswith(_OSX_FRAMEWORK_SUFFIX):
+    path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()
+  return path
+
 
 def _normalize_include_path(repository_ctx, path):
-    """Normalizes include paths before writing them to the crosstool.
+  """Normalizes include paths before writing them to the crosstool.
 
     If path points inside the 'crosstool' folder of the repository, a relative
     path is returned.
     If path points outside the 'crosstool' folder, an absolute path is returned.
     """
-    path = str(repository_ctx.path(path))
-    crosstool_folder = str(repository_ctx.path(".").get_child("crosstool"))
+  path = str(repository_ctx.path(path))
+  crosstool_folder = str(repository_ctx.path(".").get_child("crosstool"))
+
+  if path.startswith(crosstool_folder):
+    # We drop the path to "$REPO/crosstool" and a trailing path separator.
+    return path[len(crosstool_folder) + 1:]
+  return path
 
-    if path.startswith(crosstool_folder):
-        # We drop the path to "$REPO/crosstool" and a trailing path separator.
-        return path[len(crosstool_folder) + 1:]
-    return path
 
 def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp):
-    """Compute the list of default C or C++ include directories."""
-    if lang_is_cpp:
-        lang = "c++"
-    else:
-        lang = "c"
-    result = repository_ctx.execute([cc, "-E", "-x" + lang, "-", "-v"])
-    index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN)
-    if index1 == -1:
-        return []
-    index1 = result.stderr.find("\n", index1)
-    if index1 == -1:
-        return []
-    index2 = result.stderr.rfind("\n ")
-    if index2 == -1 or index2 < index1:
-        return []
-    index2 = result.stderr.find("\n", index2 + 1)
-    if index2 == -1:
-        inc_dirs = result.stderr[index1 + 1:]
-    else:
-        inc_dirs = result.stderr[index1 + 1:index2].strip()
+  """Compute the list of default C or C++ include directories."""
+  if lang_is_cpp:
+    lang = "c++"
+  else:
+    lang = "c"
+  result = repository_ctx.execute([cc, "-E", "-x" + lang, "-", "-v"])
+  index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN)
+  if index1 == -1:
+    return []
+  index1 = result.stderr.find("\n", index1)
+  if index1 == -1:
+    return []
+  index2 = result.stderr.rfind("\n ")
+  if index2 == -1 or index2 < index1:
+    return []
+  index2 = result.stderr.find("\n", index2 + 1)
+  if index2 == -1:
+    inc_dirs = result.stderr[index1 + 1:]
+  else:
+    inc_dirs = result.stderr[index1 + 1:index2].strip()
+
+  return [
+      _normalize_include_path(repository_ctx, _cxx_inc_convert(p))
+      for p in inc_dirs.split("\n")
+  ]
 
-    return [
-        _normalize_include_path(repository_ctx, _cxx_inc_convert(p))
-        for p in inc_dirs.split("\n")
-    ]
 
 def get_cxx_inc_directories(repository_ctx, cc):
-    """Compute the list of default C and C++ include directories."""
-
-    # For some reason `clang -xc` sometimes returns include paths that are
-    # different from the ones from `clang -xc++`. (Symlink and a dir)
-    # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists
-    includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True)
-    includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False)
-
-    includes_cpp_set = depset(includes_cpp)
-    return includes_cpp + [
-        inc
-        for inc in includes_c
-        if inc not in includes_cpp_set
-    ]
+  """Compute the list of default C and C++ include directories."""
+
+  # For some reason `clang -xc` sometimes returns include paths that are
+  # different from the ones from `clang -xc++`. (Symlink and a dir)
+  # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists
+  includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True)
+  includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False)
+
+  includes_cpp_set = depset(includes_cpp)
+  return includes_cpp + [
+      inc for inc in includes_c if inc not in includes_cpp_set
+  ]
+
 
 def auto_configure_fail(msg):
-    """Output failure message when cuda configuration fails."""
-    red = "\033[0;31m"
-    no_color = "\033[0m"
-    fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg))
+  """Output failure message when cuda configuration fails."""
+  red = "\033[0;31m"
+  no_color = "\033[0m"
+  fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg))
 
 # END cc_configure common functions (see TODO above).
 
 def _host_compiler_includes(repository_ctx, cc):
-    """Generates the cxx_builtin_include_directory entries for gcc inc dirs.
+  """Generates the cxx_builtin_include_directory entries for gcc inc dirs.
 
     Args:
       repository_ctx: The repository context.
@@ -330,14 +355,15 @@ def _host_compiler_includes(repository_ctx, cc):
       host compiler include directories, which can be added to the CROSSTOOL
       file.
     """
-    inc_dirs = get_cxx_inc_directories(repository_ctx, cc)
-    inc_entries = []
-    for inc_dir in inc_dirs:
-        inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
-    return "\n".join(inc_entries)
+  inc_dirs = get_cxx_inc_directories(repository_ctx, cc)
+  inc_entries = []
+  for inc_dir in inc_dirs:
+    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
+  return "\n".join(inc_entries)
+
 
 def _cuda_include_path(repository_ctx, cuda_config):
-    """Generates the cxx_builtin_include_directory entries for cuda inc dirs.
+  """Generates the cxx_builtin_include_directory entries for cuda inc dirs.
 
     Args:
       repository_ctx: The repository context.
@@ -348,39 +374,41 @@ def _cuda_include_path(repository_ctx, cuda_config):
       host compiler include directories, which can be added to the CROSSTOOL
       file.
     """
-    nvcc_path = repository_ctx.path("%s/bin/nvcc%s" %
-                                    (
-                                        cuda_config.cuda_toolkit_path,
-                                        ".exe" if cuda_config.cpu_value == "Windows" else "",
-                                    ))
-    result = repository_ctx.execute([
-        nvcc_path,
-        "-v",
-        "/dev/null",
-        "-o",
-        "/dev/null",
-    ])
-    target_dir = ""
-    for one_line in result.stderr.splitlines():
-        if one_line.startswith("#$ _TARGET_DIR_="):
-            target_dir = (cuda_config.cuda_toolkit_path + "/" +
-                          one_line.replace("#$ _TARGET_DIR_=", "") + "/include")
-    inc_entries = []
-    if target_dir != "":
-        inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % target_dir)
-    default_include = cuda_config.cuda_toolkit_path + "/include"
-    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" %
-                       default_include)
-    return "\n".join(inc_entries)
+  nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (
+      cuda_config.cuda_toolkit_path,
+      ".exe" if cuda_config.cpu_value == "Windows" else "",
+  ))
+  result = repository_ctx.execute([
+      nvcc_path,
+      "-v",
+      "/dev/null",
+      "-o",
+      "/dev/null",
+  ])
+  target_dir = ""
+  for one_line in result.stderr.splitlines():
+    if one_line.startswith("#$ _TARGET_DIR_="):
+      target_dir = (
+          cuda_config.cuda_toolkit_path + "/" + one_line.replace(
+              "#$ _TARGET_DIR_=", "") + "/include")
+  inc_entries = []
+  if target_dir != "":
+    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % target_dir)
+  default_include = cuda_config.cuda_toolkit_path + "/include"
+  inc_entries.append(
+      "  cxx_builtin_include_directory: \"%s\"" % default_include)
+  return "\n".join(inc_entries)
+
 
 def _enable_cuda(repository_ctx):
-    if "TF_NEED_CUDA" in repository_ctx.os.environ:
-        enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip()
-        return enable_cuda == "1"
-    return False
+  if "TF_NEED_CUDA" in repository_ctx.os.environ:
+    enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip()
+    return enable_cuda == "1"
+  return False
+
 
-def _cuda_toolkit_path(repository_ctx):
-    """Finds the cuda toolkit directory.
+def cuda_toolkit_path(repository_ctx):
+  """Finds the cuda toolkit directory.
 
     Args:
       repository_ctx: The repository context.
@@ -388,27 +416,31 @@ def _cuda_toolkit_path(repository_ctx):
     Returns:
       A speculative real path of the cuda toolkit install directory.
     """
-    cuda_toolkit_path = _DEFAULT_CUDA_TOOLKIT_PATH
-    if _CUDA_TOOLKIT_PATH in repository_ctx.os.environ:
-        cuda_toolkit_path = repository_ctx.os.environ[_CUDA_TOOLKIT_PATH].strip()
-    if not repository_ctx.path(cuda_toolkit_path).exists:
-        auto_configure_fail("Cannot find cuda toolkit path.")
-    return str(repository_ctx.path(cuda_toolkit_path).realpath)
+  cuda_toolkit_path = _DEFAULT_CUDA_TOOLKIT_PATH
+  if _CUDA_TOOLKIT_PATH in repository_ctx.os.environ:
+    cuda_toolkit_path = repository_ctx.os.environ[_CUDA_TOOLKIT_PATH].strip()
+  if not repository_ctx.path(cuda_toolkit_path).exists:
+    auto_configure_fail("Cannot find cuda toolkit path.")
+  return str(repository_ctx.path(cuda_toolkit_path).realpath)
+
 
 def _cudnn_install_basedir(repository_ctx):
-    """Finds the cudnn install directory."""
-    cudnn_install_path = _DEFAULT_CUDNN_INSTALL_PATH
-    if _CUDNN_INSTALL_PATH in repository_ctx.os.environ:
-        cudnn_install_path = repository_ctx.os.environ[_CUDNN_INSTALL_PATH].strip()
-    if not repository_ctx.path(cudnn_install_path).exists:
-        auto_configure_fail("Cannot find cudnn install path.")
-    return cudnn_install_path
+  """Finds the cudnn install directory."""
+  cudnn_install_path = _DEFAULT_CUDNN_INSTALL_PATH
+  if _CUDNN_INSTALL_PATH in repository_ctx.os.environ:
+    cudnn_install_path = repository_ctx.os.environ[_CUDNN_INSTALL_PATH].strip()
+  if not repository_ctx.path(cudnn_install_path).exists:
+    auto_configure_fail("Cannot find cudnn install path.")
+  return cudnn_install_path
+
 
 def matches_version(environ_version, detected_version):
-    """Checks whether the user-specified version matches the detected version.
+  """Checks whether the user-specified version matches the detected version.
 
-    This function performs a weak matching so that if the user specifies only the
-    major or major and minor versions, the versions are still considered matching
+    This function performs a weak matching so that if the user specifies only
+    the
+    major or major and minor versions, the versions are still considered
+    matching
     if the version parts match. To illustrate:
 
         environ_version  detected_version  result
@@ -424,25 +456,25 @@ def matches_version(environ_version, detected_version):
         variables.
       detected_version: The version autodetected from the CUDA installation on
         the system.
-
     Returns: True if user-specified version matches detected version and False
       otherwise.
-    """
-    environ_version_parts = environ_version.split(".")
-    detected_version_parts = detected_version.split(".")
-    if len(detected_version_parts) < len(environ_version_parts):
-        return False
-    for i, part in enumerate(detected_version_parts):
-        if i >= len(environ_version_parts):
-            break
-        if part != environ_version_parts[i]:
-            return False
-    return True
+  """
+  environ_version_parts = environ_version.split(".")
+  detected_version_parts = detected_version.split(".")
+  if len(detected_version_parts) < len(environ_version_parts):
+    return False
+  for i, part in enumerate(detected_version_parts):
+    if i >= len(environ_version_parts):
+      break
+    if part != environ_version_parts[i]:
+      return False
+  return True
+
 
 _NVCC_VERSION_PREFIX = "Cuda compilation tools, release "
 
 def _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value):
-    """Detects the version of CUDA installed on the system.
+  """Detects the version of CUDA installed on the system.
 
     Args:
       repository_ctx: The repository context.
@@ -452,64 +484,61 @@ def _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value):
       String containing the version of CUDA.
     """
 
-    # Run nvcc --version and find the line containing the CUDA version.
-    nvcc_path = repository_ctx.path("%s/bin/nvcc%s" %
-                                    (
-                                        cuda_toolkit_path,
-                                        ".exe" if cpu_value == "Windows" else "",
-                                    ))
-    if not nvcc_path.exists:
-        auto_configure_fail("Cannot find nvcc at %s" % str(nvcc_path))
-    result = repository_ctx.execute([str(nvcc_path), "--version"])
-    if result.stderr:
-        auto_configure_fail("Error running nvcc --version: %s" % result.stderr)
-    lines = result.stdout.splitlines()
-    version_line = lines[len(lines) - 1]
-    if version_line.find(_NVCC_VERSION_PREFIX) == -1:
-        auto_configure_fail(
-            "Could not parse CUDA version from nvcc --version. Got: %s" %
-            result.stdout,
-        )
-
-    # Parse the CUDA version from the line containing the CUDA version.
-    prefix_removed = version_line.replace(_NVCC_VERSION_PREFIX, "")
-    parts = prefix_removed.split(",")
-    if len(parts) != 2 or len(parts[0]) < 2:
-        auto_configure_fail(
-            "Could not parse CUDA version from nvcc --version. Got: %s" %
-            result.stdout,
-        )
-    full_version = parts[1].strip()
-    if full_version.startswith("V"):
-        full_version = full_version[1:]
-
-    # Check whether TF_CUDA_VERSION was set by the user and fail if it does not
-    # match the detected version.
-    environ_version = ""
-    if _TF_CUDA_VERSION in repository_ctx.os.environ:
-        environ_version = repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
-    if environ_version and not matches_version(environ_version, full_version):
-        auto_configure_fail(
-            ("CUDA version detected from nvcc (%s) does not match " +
-             "TF_CUDA_VERSION (%s)") % (full_version, environ_version),
-        )
-
-    # We only use the version consisting of the major and minor version numbers.
-    version_parts = full_version.split(".")
-    if len(version_parts) < 2:
-        auto_configure_fail("CUDA version detected from nvcc (%s) is incomplete.")
-    if cpu_value == "Windows":
-        version = "64_%s%s" % (version_parts[0], version_parts[1])
-    else:
-        version = "%s.%s" % (version_parts[0], version_parts[1])
-    return version
+  # Run nvcc --version and find the line containing the CUDA version.
+  nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (
+      cuda_toolkit_path,
+      ".exe" if cpu_value == "Windows" else "",
+  ))
+  if not nvcc_path.exists:
+    auto_configure_fail("Cannot find nvcc at %s" % str(nvcc_path))
+  result = repository_ctx.execute([str(nvcc_path), "--version"])
+  if result.stderr:
+    auto_configure_fail("Error running nvcc --version: %s" % result.stderr)
+  lines = result.stdout.splitlines()
+  version_line = lines[len(lines) - 1]
+  if version_line.find(_NVCC_VERSION_PREFIX) == -1:
+    auto_configure_fail(
+        "Could not parse CUDA version from nvcc --version. Got: %s" %
+        result.stdout,)
+
+  # Parse the CUDA version from the line containing the CUDA version.
+  prefix_removed = version_line.replace(_NVCC_VERSION_PREFIX, "")
+  parts = prefix_removed.split(",")
+  if len(parts) != 2 or len(parts[0]) < 2:
+    auto_configure_fail(
+        "Could not parse CUDA version from nvcc --version. Got: %s" %
+        result.stdout,)
+  full_version = parts[1].strip()
+  if full_version.startswith("V"):
+    full_version = full_version[1:]
+
+  # Check whether TF_CUDA_VERSION was set by the user and fail if it does not
+  # match the detected version.
+  environ_version = ""
+  if _TF_CUDA_VERSION in repository_ctx.os.environ:
+    environ_version = repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
+  if environ_version and not matches_version(environ_version, full_version):
+    auto_configure_fail(
+        ("CUDA version detected from nvcc (%s) does not match " +
+         "TF_CUDA_VERSION (%s)") % (full_version, environ_version),)
+
+  # We only use the version consisting of the major and minor version numbers.
+  version_parts = full_version.split(".")
+  if len(version_parts) < 2:
+    auto_configure_fail("CUDA version detected from nvcc (%s) is incomplete.")
+  if cpu_value == "Windows":
+    version = "64_%s%s" % (version_parts[0], version_parts[1])
+  else:
+    version = "%s.%s" % (version_parts[0], version_parts[1])
+  return version
+
 
 _DEFINE_CUDNN_MAJOR = "#define CUDNN_MAJOR"
 _DEFINE_CUDNN_MINOR = "#define CUDNN_MINOR"
 _DEFINE_CUDNN_PATCHLEVEL = "#define CUDNN_PATCHLEVEL"
 
 def find_cuda_define(repository_ctx, header_dir, header_file, define):
-    """Returns the value of a #define in a header file.
+  """Returns the value of a #define in a header file.
 
     Greps through a header file and returns the value of the specified #define.
     If the #define is not found, then raise an error.
@@ -524,52 +553,52 @@ def find_cuda_define(repository_ctx, header_dir, header_file, define):
       The value of the #define found in the header.
     """
 
-    # Confirm location of the header and grep for the line defining the macro.
-    h_path = repository_ctx.path("%s/%s" % (header_dir, header_file))
-    if not h_path.exists:
-        auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
-    result = repository_ctx.execute(
-        # Grep one more lines as some #defines are splitted into two lines.
-        ["grep", "--color=never", "-A1", "-E", define, str(h_path)],
-    )
-    if result.stderr:
-        auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
-
-    # Parse the version from the line defining the macro.
-    if result.stdout.find(define) == -1:
-        auto_configure_fail("Cannot find line containing '%s' in %s" %
-                            (define, h_path))
-
-    # Split results to lines
-    lines = result.stdout.split("\n")
-    num_lines = len(lines)
-    for l in range(num_lines):
-        line = lines[l]
-        if define in line:  # Find the line with define
-            version = line
-            if l != num_lines - 1 and line[-1] == "\\":  # Add next line, if multiline
-                version = version[:-1] + lines[l + 1]
-            break
-
-    # Remove any comments
-    version = version.split("//")[0]
-
-    # Remove define name
-    version = version.replace(define, "").strip()
-
-    # Remove the code after the version number.
-    version_end = version.find(" ")
-    if version_end != -1:
-        if version_end == 0:
-            auto_configure_fail(
-                "Cannot extract the version from line containing '%s' in %s" %
-                (define, str(h_path)),
-            )
-        version = version[:version_end].strip()
-    return version
+  # Confirm location of the header and grep for the line defining the macro.
+  h_path = repository_ctx.path("%s/%s" % (header_dir, header_file))
+  if not h_path.exists:
+    auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
+  result = repository_ctx.execute(
+      # Grep one more lines as some #defines are splitted into two lines.
+      ["grep", "--color=never", "-A1", "-E", define,
+       str(h_path)],)
+  if result.stderr:
+    auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
+
+  # Parse the version from the line defining the macro.
+  if result.stdout.find(define) == -1:
+    auto_configure_fail(
+        "Cannot find line containing '%s' in %s" % (define, h_path))
+
+  # Split results to lines
+  lines = result.stdout.split("\n")
+  num_lines = len(lines)
+  for l in range(num_lines):
+    line = lines[l]
+    if define in line:  # Find the line with define
+      version = line
+      if l != num_lines - 1 and line[-1] == "\\":  # Add next line, if multiline
+        version = version[:-1] + lines[l + 1]
+      break
+
+  # Remove any comments
+  version = version.split("//")[0]
+
+  # Remove define name
+  version = version.replace(define, "").strip()
+
+  # Remove the code after the version number.
+  version_end = version.find(" ")
+  if version_end != -1:
+    if version_end == 0:
+      auto_configure_fail(
+          "Cannot extract the version from line containing '%s' in %s" %
+          (define, str(h_path)),)
+    version = version[:version_end].strip()
+  return version
+
 
 def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
-    """Detects the version of cuDNN installed on the system.
+  """Detects the version of cuDNN installed on the system.
 
     Args:
       repository_ctx: The repository context.
@@ -579,68 +608,68 @@ def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
     Returns:
       A string containing the version of cuDNN.
     """
-    cudnn_header_dir = _find_cudnn_header_dir(
-        repository_ctx,
-        cudnn_install_basedir,
-    )
-    major_version = find_cuda_define(
-        repository_ctx,
-        cudnn_header_dir,
-        "cudnn.h",
-        _DEFINE_CUDNN_MAJOR,
-    )
-    minor_version = find_cuda_define(
-        repository_ctx,
-        cudnn_header_dir,
-        "cudnn.h",
-        _DEFINE_CUDNN_MINOR,
-    )
-    patch_version = find_cuda_define(
-        repository_ctx,
-        cudnn_header_dir,
-        "cudnn.h",
-        _DEFINE_CUDNN_PATCHLEVEL,
-    )
-    full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
-
-    # Check whether TF_CUDNN_VERSION was set by the user and fail if it does not
-    # match the detected version.
-    environ_version = ""
-    if _TF_CUDNN_VERSION in repository_ctx.os.environ:
-        environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
-    if environ_version and not matches_version(environ_version, full_version):
-        cudnn_h_path = repository_ctx.path("%s/include/cudnn.h" %
-                                           cudnn_install_basedir)
-        auto_configure_fail(
-            ("cuDNN version detected from %s (%s) does not match " +
-             "TF_CUDNN_VERSION (%s)") %
-            (str(cudnn_h_path), full_version, environ_version),
-        )
-
-    # We only use the major version since we use the libcudnn libraries that are
-    # only versioned with the major version (e.g. libcudnn.so.5).
-    version = major_version
-    if cpu_value == "Windows":
-        version = "64_" + version
-    return version
-
-def _compute_capabilities(repository_ctx):
-    """Returns a list of strings representing cuda compute capabilities."""
-    if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ:
-        return _DEFAULT_CUDA_COMPUTE_CAPABILITIES
-    capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES]
-    capabilities = capabilities_str.split(",")
-    for capability in capabilities:
-        # Workaround for Skylark's lack of support for regex. This check should
-        # be equivalent to checking:
-        #     if re.match("[0-9]+.[0-9]+", capability) == None:
-        parts = capability.split(".")
-        if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit():
-            auto_configure_fail("Invalid compute capability: %s" % capability)
-    return capabilities
+  cudnn_header_dir = _find_cudnn_header_dir(
+      repository_ctx,
+      cudnn_install_basedir,
+  )
+  major_version = find_cuda_define(
+      repository_ctx,
+      cudnn_header_dir,
+      "cudnn.h",
+      _DEFINE_CUDNN_MAJOR,
+  )
+  minor_version = find_cuda_define(
+      repository_ctx,
+      cudnn_header_dir,
+      "cudnn.h",
+      _DEFINE_CUDNN_MINOR,
+  )
+  patch_version = find_cuda_define(
+      repository_ctx,
+      cudnn_header_dir,
+      "cudnn.h",
+      _DEFINE_CUDNN_PATCHLEVEL,
+  )
+  full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+
+  # Check whether TF_CUDNN_VERSION was set by the user and fail if it does not
+  # match the detected version.
+  environ_version = ""
+  if _TF_CUDNN_VERSION in repository_ctx.os.environ:
+    environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
+  if environ_version and not matches_version(environ_version, full_version):
+    cudnn_h_path = repository_ctx.path(
+        "%s/include/cudnn.h" % cudnn_install_basedir)
+    auto_configure_fail(("cuDNN version detected from %s (%s) does not match " +
+                         "TF_CUDNN_VERSION (%s)") %
+                        (str(cudnn_h_path), full_version, environ_version),)
+
+  # We only use the major version since we use the libcudnn libraries that are
+  # only versioned with the major version (e.g. libcudnn.so.5).
+  version = major_version
+  if cpu_value == "Windows":
+    version = "64_" + version
+  return version
+
+
+def compute_capabilities(repository_ctx):
+  """Returns a list of strings representing cuda compute capabilities."""
+  if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ:
+    return _DEFAULT_CUDA_COMPUTE_CAPABILITIES
+  capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES]
+  capabilities = capabilities_str.split(",")
+  for capability in capabilities:
+    # Workaround for Skylark's lack of support for regex. This check should
+    # be equivalent to checking:
+    #     if re.match("[0-9]+.[0-9]+", capability) == None:
+    parts = capability.split(".")
+    if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit():
+      auto_configure_fail("Invalid compute capability: %s" % capability)
+  return capabilities
+
 
 def get_cpu_value(repository_ctx):
-    """Returns the name of the host operating system.
+  """Returns the name of the host operating system.
 
     Args:
       repository_ctx: The repository context.
@@ -648,20 +677,22 @@ def get_cpu_value(repository_ctx):
     Returns:
       A string containing the name of the host operating system.
     """
-    os_name = repository_ctx.os.name.lower()
-    if os_name.startswith("mac os"):
-        return "Darwin"
-    if os_name.find("windows") != -1:
-        return "Windows"
-    result = repository_ctx.execute(["uname", "-s"])
-    return result.stdout.strip()
+  os_name = repository_ctx.os.name.lower()
+  if os_name.startswith("mac os"):
+    return "Darwin"
+  if os_name.find("windows") != -1:
+    return "Windows"
+  result = repository_ctx.execute(["uname", "-s"])
+  return result.stdout.strip()
+
 
 def _is_windows(repository_ctx):
-    """Returns true if the host operating system is windows."""
-    return get_cpu_value(repository_ctx) == "Windows"
+  """Returns true if the host operating system is windows."""
+  return get_cpu_value(repository_ctx) == "Windows"
+
 
 def _lib_name(lib, cpu_value, version = "", static = False):
-    """Constructs the platform-specific name of a library.
+  """Constructs the platform-specific name of a library.
 
     Args:
       lib: The name of the library, such as "cudart"
@@ -672,23 +703,24 @@ def _lib_name(lib, cpu_value, version = "", static = False):
     Returns:
       The platform-specific name of the library.
     """
-    if cpu_value in ("Linux", "FreeBSD"):
-        if static:
-            return "lib%s.a" % lib
-        else:
-            if version:
-                version = ".%s" % version
-            return "lib%s.so%s" % (lib, version)
-    elif cpu_value == "Windows":
-        return "%s.lib" % lib
-    elif cpu_value == "Darwin":
-        if static:
-            return "lib%s.a" % lib
-        elif version:
-            version = ".%s" % version
-        return "lib%s%s.dylib" % (lib, version)
+  if cpu_value in ("Linux", "FreeBSD"):
+    if static:
+      return "lib%s.a" % lib
     else:
-        auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
+      if version:
+        version = ".%s" % version
+      return "lib%s.so%s" % (lib, version)
+  elif cpu_value == "Windows":
+    return "%s.lib" % lib
+  elif cpu_value == "Darwin":
+    if static:
+      return "lib%s.a" % lib
+    elif version:
+      version = ".%s" % version
+    return "lib%s%s.dylib" % (lib, version)
+  else:
+    auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
+
 
 def _find_cuda_lib(
         lib,
@@ -697,7 +729,7 @@ def _find_cuda_lib(
         basedir,
         version = "",
         static = False):
-    """Finds the given CUDA or cuDNN library on the system.
+  """Finds the given CUDA or cuDNN library on the system.
 
     Args:
       lib: The name of the library, such as "cudart"
@@ -712,15 +744,16 @@ def _find_cuda_lib(
         file_name: The basename of the library found on the system.
         path: The full path to the library.
     """
-    file_name = _lib_name(lib, cpu_value, version, static)
-    for relative_path in CUDA_LIB_PATHS:
-        path = repository_ctx.path("%s/%s%s" % (basedir, relative_path, file_name))
-        if path.exists:
-            return struct(file_name = file_name, path = str(path.realpath))
-    auto_configure_fail("Cannot find cuda library %s" % file_name)
+  file_name = _lib_name(lib, cpu_value, version, static)
+  for relative_path in CUDA_LIB_PATHS:
+    path = repository_ctx.path("%s/%s%s" % (basedir, relative_path, file_name))
+    if path.exists:
+      return struct(file_name=file_name, path=str(path.realpath))
+  auto_configure_fail("Cannot find cuda library %s" % file_name)
+
 
 def _find_cupti_header_dir(repository_ctx, cuda_config):
-    """Returns the path to the directory containing cupti.h
+  """Returns the path to the directory containing cupti.h
 
     On most systems, the cupti library is not installed in the same directory as
     the other CUDA libraries but rather in a special extras/CUPTI directory.
@@ -732,14 +765,17 @@ def _find_cupti_header_dir(repository_ctx, cuda_config):
     Returns:
       The path of the directory containing the cupti header.
     """
-    cuda_toolkit_path = cuda_config.cuda_toolkit_path
-    for relative_path in CUPTI_HEADER_PATHS:
-        if repository_ctx.path("%s/%scupti.h" % (cuda_toolkit_path, relative_path)).exists:
-            return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-    auto_configure_fail("Cannot find cupti.h under %s" % ", ".join([cuda_toolkit_path + "/" + s for s in CUPTI_HEADER_PATHS]))
+  cuda_toolkit_path = cuda_config.cuda_toolkit_path
+  for relative_path in CUPTI_HEADER_PATHS:
+    if repository_ctx.path(
+        "%s/%scupti.h" % (cuda_toolkit_path, relative_path)).exists:
+      return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
+  auto_configure_fail("Cannot find cupti.h under %s" % ", ".join(
+      [cuda_toolkit_path + "/" + s for s in CUPTI_HEADER_PATHS]))
+
 
 def _find_cupti_lib(repository_ctx, cuda_config):
-    """Finds the cupti library on the system.
+  """Finds the cupti library on the system.
 
     On most systems, the cupti library is not installed in the same directory as
     the other CUDA libraries but rather in a special extras/CUPTI directory.
@@ -753,23 +789,23 @@ def _find_cupti_lib(repository_ctx, cuda_config):
         file_name: The basename of the library found on the system.
         path: The full path to the library.
     """
-    file_name = _lib_name(
-        "cupti",
-        cuda_config.cpu_value,
-        cuda_config.cuda_version,
-    )
-    cuda_toolkit_path = cuda_config.cuda_toolkit_path
-    for relative_path in CUPTI_LIB_PATHS:
-        path = repository_ctx.path(
-            "%s/%s%s" % (cuda_toolkit_path, relative_path, file_name),
-        )
-        if path.exists:
-            return struct(file_name = file_name, path = str(path.realpath))
+  file_name = _lib_name(
+      "cupti",
+      cuda_config.cpu_value,
+      cuda_config.cuda_version,
+  )
+  cuda_toolkit_path = cuda_config.cuda_toolkit_path
+  for relative_path in CUPTI_LIB_PATHS:
+    path = repository_ctx.path(
+        "%s/%s%s" % (cuda_toolkit_path, relative_path, file_name),)
+    if path.exists:
+      return struct(file_name=file_name, path=str(path.realpath))
+
+  auto_configure_fail("Cannot find cupti library %s" % file_name)
 
-    auto_configure_fail("Cannot find cupti library %s" % file_name)
 
 def _find_libs(repository_ctx, cuda_config):
-    """Returns the CUDA and cuDNN libraries on the system.
+  """Returns the CUDA and cuDNN libraries on the system.
 
     Args:
       repository_ctx: The repository context.
@@ -778,64 +814,75 @@ def _find_libs(repository_ctx, cuda_config):
     Returns:
       Map of library names to structs of filename and path.
     """
-    cpu_value = cuda_config.cpu_value
-    return {
-        "cuda": _find_cuda_lib("cuda", repository_ctx, cpu_value, cuda_config.cuda_toolkit_path),
-        "cudart": _find_cuda_lib(
-            "cudart",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cuda_toolkit_path,
-            cuda_config.cuda_version,
-        ),
-        "cudart_static": _find_cuda_lib(
-            "cudart_static",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cuda_toolkit_path,
-            cuda_config.cuda_version,
-            static = True,
-        ),
-        "cublas": _find_cuda_lib(
-            "cublas",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cuda_toolkit_path,
-            cuda_config.cuda_version,
-        ),
-        "cusolver": _find_cuda_lib(
-            "cusolver",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cuda_toolkit_path,
-            cuda_config.cuda_version,
-        ),
-        "curand": _find_cuda_lib(
-            "curand",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cuda_toolkit_path,
-            cuda_config.cuda_version,
-        ),
-        "cufft": _find_cuda_lib(
-            "cufft",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cuda_toolkit_path,
-            cuda_config.cuda_version,
-        ),
-        "cudnn": _find_cuda_lib(
-            "cudnn",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cudnn_install_basedir,
-            cuda_config.cudnn_version,
-        ),
-        "cupti": _find_cupti_lib(repository_ctx, cuda_config),
-    }
+  cpu_value = cuda_config.cpu_value
+  return {
+      "cuda":
+          _find_cuda_lib("cuda", repository_ctx, cpu_value,
+                         cuda_config.cuda_toolkit_path),
+      "cudart":
+          _find_cuda_lib(
+              "cudart",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              cuda_config.cuda_version,
+          ),
+      "cudart_static":
+          _find_cuda_lib(
+              "cudart_static",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              cuda_config.cuda_version,
+              static=True,
+          ),
+      "cublas":
+          _find_cuda_lib(
+              "cublas",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              cuda_config.cuda_version,
+          ),
+      "cusolver":
+          _find_cuda_lib(
+              "cusolver",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              cuda_config.cuda_version,
+          ),
+      "curand":
+          _find_cuda_lib(
+              "curand",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              cuda_config.cuda_version,
+          ),
+      "cufft":
+          _find_cuda_lib(
+              "cufft",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              cuda_config.cuda_version,
+          ),
+      "cudnn":
+          _find_cuda_lib(
+              "cudnn",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cudnn_install_basedir,
+              cuda_config.cudnn_version,
+          ),
+      "cupti":
+          _find_cupti_lib(repository_ctx, cuda_config),
+  }
+
 
 def _find_cuda_include_path(repository_ctx, cuda_config):
-    """Returns the path to the directory containing cuda.h
+  """Returns the path to the directory containing cuda.h
 
     Args:
       repository_ctx: The repository context.
@@ -844,14 +891,16 @@ def _find_cuda_include_path(repository_ctx, cuda_config):
     Returns:
       The path of the directory containing the CUDA headers.
     """
-    cuda_toolkit_path = cuda_config.cuda_toolkit_path
-    for relative_path in CUDA_INCLUDE_PATHS:
-        if repository_ctx.path("%s/%scuda.h" % (cuda_toolkit_path, relative_path)).exists:
-            return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-    auto_configure_fail("Cannot find cuda.h under %s" % cuda_toolkit_path)
+  cuda_toolkit_path = cuda_config.cuda_toolkit_path
+  for relative_path in CUDA_INCLUDE_PATHS:
+    if repository_ctx.path(
+        "%s/%scuda.h" % (cuda_toolkit_path, relative_path)).exists:
+      return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
+  auto_configure_fail("Cannot find cuda.h under %s" % cuda_toolkit_path)
+
 
 def _find_cudnn_header_dir(repository_ctx, cudnn_install_basedir):
-    """Returns the path to the directory containing cudnn.h
+  """Returns the path to the directory containing cudnn.h
 
     Args:
       repository_ctx: The repository context.
@@ -861,15 +910,17 @@ def _find_cudnn_header_dir(repository_ctx, cudnn_install_basedir):
     Returns:
       The path of the directory containing the cudnn header.
     """
-    for relative_path in CUDA_INCLUDE_PATHS:
-        if repository_ctx.path("%s/%scudnn.h" % (cudnn_install_basedir, relative_path)).exists:
-            return ("%s/%s" % (cudnn_install_basedir, relative_path))[:-1]
-    if repository_ctx.path("/usr/include/cudnn.h").exists:
-        return "/usr/include"
-    auto_configure_fail("Cannot find cudnn.h under %s" % cudnn_install_basedir)
+  for relative_path in CUDA_INCLUDE_PATHS:
+    if repository_ctx.path(
+        "%s/%scudnn.h" % (cudnn_install_basedir, relative_path)).exists:
+      return ("%s/%s" % (cudnn_install_basedir, relative_path))[:-1]
+  if repository_ctx.path("/usr/include/cudnn.h").exists:
+    return "/usr/include"
+  auto_configure_fail("Cannot find cudnn.h under %s" % cudnn_install_basedir)
+
 
 def _find_nvvm_libdevice_dir(repository_ctx, cuda_config):
-    """Returns the path to the directory containing libdevice in bitcode format.
+  """Returns the path to the directory containing libdevice in bitcode format.
 
     Args:
       repository_ctx: The repository context.
@@ -878,19 +929,23 @@ def _find_nvvm_libdevice_dir(repository_ctx, cuda_config):
     Returns:
       The path of the directory containing the CUDA headers.
     """
-    cuda_toolkit_path = cuda_config.cuda_toolkit_path
-    for libdevice_file in NVVM_LIBDEVICE_FILES:
-        for relative_path in NVVM_LIBDEVICE_PATHS:
-            if repository_ctx.path("%s/%s%s" % (cuda_toolkit_path, relative_path, libdevice_file)).exists:
-                return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-    auto_configure_fail("Cannot find libdevice*.bc files under %s" % cuda_toolkit_path)
+  cuda_toolkit_path = cuda_config.cuda_toolkit_path
+  for libdevice_file in NVVM_LIBDEVICE_FILES:
+    for relative_path in NVVM_LIBDEVICE_PATHS:
+      if repository_ctx.path("%s/%s%s" % (cuda_toolkit_path, relative_path,
+                                          libdevice_file)).exists:
+        return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
+  auto_configure_fail(
+      "Cannot find libdevice*.bc files under %s" % cuda_toolkit_path)
+
 
 def _cudart_static_linkopt(cpu_value):
-    """Returns additional platform-specific linkopts for cudart."""
-    return "" if cpu_value == "Darwin" else "\"-lrt\","
+  """Returns additional platform-specific linkopts for cudart."""
+  return "" if cpu_value == "Darwin" else "\"-lrt\","
+
 
 def _get_cuda_config(repository_ctx):
-    """Detects and returns information about the CUDA installation on the system.
+  """Detects and returns information about the CUDA installation on the system.
 
     Args:
       repository_ctx: The repository context.
@@ -904,35 +959,39 @@ def _get_cuda_config(repository_ctx):
         compute_capabilities: A list of the system's CUDA compute capabilities.
         cpu_value: The name of the host operating system.
     """
-    cpu_value = get_cpu_value(repository_ctx)
-    cuda_toolkit_path = _cuda_toolkit_path(repository_ctx)
-    cuda_version = _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value)
-    cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
-    cudnn_version = _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value)
-    return struct(
-        cuda_toolkit_path = cuda_toolkit_path,
-        cudnn_install_basedir = cudnn_install_basedir,
-        cuda_version = cuda_version,
-        cudnn_version = cudnn_version,
-        compute_capabilities = _compute_capabilities(repository_ctx),
-        cpu_value = cpu_value,
-    )
+  cpu_value = get_cpu_value(repository_ctx)
+  toolkit_path = cuda_toolkit_path(repository_ctx)
+  cuda_version = _cuda_version(repository_ctx, toolkit_path, cpu_value)
+  cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
+  cudnn_version = _cudnn_version(repository_ctx, cudnn_install_basedir,
+                                 cpu_value)
+  return struct(
+      cuda_toolkit_path=toolkit_path,
+      cudnn_install_basedir=cudnn_install_basedir,
+      cuda_version=cuda_version,
+      cudnn_version=cudnn_version,
+      compute_capabilities=compute_capabilities(repository_ctx),
+      cpu_value=cpu_value,
+  )
+
 
 def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
-    if not out:
-        out = tpl.replace(":", "/")
-    repository_ctx.template(
-        out,
-        Label("//third_party/gpus/%s.tpl" % tpl),
-        substitutions,
-    )
+  if not out:
+    out = tpl.replace(":", "/")
+  repository_ctx.template(
+      out,
+      Label("//third_party/gpus/%s.tpl" % tpl),
+      substitutions,
+  )
+
 
 def _file(repository_ctx, label):
-    repository_ctx.template(
-        label.replace(":", "/"),
-        Label("//third_party/gpus/%s.tpl" % label),
-        {},
-    )
+  repository_ctx.template(
+      label.replace(":", "/"),
+      Label("//third_party/gpus/%s.tpl" % label),
+      {},
+  )
+
 
 _DUMMY_CROSSTOOL_BZL_FILE = """
 def error_gpu_disabled():
@@ -960,81 +1019,99 @@ error_gpu_disabled()
 """
 
 def _create_dummy_repository(repository_ctx):
-    cpu_value = get_cpu_value(repository_ctx)
+  cpu_value = get_cpu_value(repository_ctx)
+
+  # Set up BUILD file for cuda/.
+  _tpl(
+      repository_ctx,
+      "cuda:build_defs.bzl",
+      {
+          "%{cuda_is_configured}": "False",
+          "%{cuda_extra_copts}": "[]",
+      },
+  )
+  _tpl(
+      repository_ctx,
+      "cuda:BUILD",
+      {
+          "%{cuda_driver_lib}":
+              _lib_name("cuda", cpu_value),
+          "%{cudart_static_lib}":
+              _lib_name(
+                  "cudart_static",
+                  cpu_value,
+                  static=True,
+              ),
+          "%{cudart_static_linkopt}":
+              _cudart_static_linkopt(cpu_value),
+          "%{cudart_lib}":
+              _lib_name("cudart", cpu_value),
+          "%{cublas_lib}":
+              _lib_name("cublas", cpu_value),
+          "%{cusolver_lib}":
+              _lib_name("cusolver", cpu_value),
+          "%{cudnn_lib}":
+              _lib_name("cudnn", cpu_value),
+          "%{cufft_lib}":
+              _lib_name("cufft", cpu_value),
+          "%{curand_lib}":
+              _lib_name("curand", cpu_value),
+          "%{cupti_lib}":
+              _lib_name("cupti", cpu_value),
+          "%{cuda_include_genrules}":
+              "",
+          "%{cuda_headers}":
+              "",
+      },
+  )
 
-    # Set up BUILD file for cuda/.
-    _tpl(
-        repository_ctx,
-        "cuda:build_defs.bzl",
-        {
-            "%{cuda_is_configured}": "False",
-            "%{cuda_extra_copts}": "[]",
-        },
-    )
-    _tpl(
-        repository_ctx,
-        "cuda:BUILD",
-        {
-            "%{cuda_driver_lib}": _lib_name("cuda", cpu_value),
-            "%{cudart_static_lib}": _lib_name(
-                "cudart_static",
-                cpu_value,
-                static = True,
-            ),
-            "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value),
-            "%{cudart_lib}": _lib_name("cudart", cpu_value),
-            "%{cublas_lib}": _lib_name("cublas", cpu_value),
-            "%{cusolver_lib}": _lib_name("cusolver", cpu_value),
-            "%{cudnn_lib}": _lib_name("cudnn", cpu_value),
-            "%{cufft_lib}": _lib_name("cufft", cpu_value),
-            "%{curand_lib}": _lib_name("curand", cpu_value),
-            "%{cupti_lib}": _lib_name("cupti", cpu_value),
-            "%{cuda_include_genrules}": "",
-            "%{cuda_headers}": "",
-        },
-    )
+  # Create dummy files for the CUDA toolkit since they are still required by
+  # tensorflow/core/platform/default/build_config:cuda.
+  repository_ctx.file("cuda/cuda/include/cuda.h", "")
+  repository_ctx.file("cuda/cuda/include/cublas.h", "")
+  repository_ctx.file("cuda/cuda/include/cudnn.h", "")
+  repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h", "")
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cuda", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart", cpu_value))
+  repository_ctx.file(
+      "cuda/cuda/lib/%s" % _lib_name("cudart_static", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublas", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusolver", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudnn", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("curand", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cufft", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cupti", cpu_value))
+
+  # Set up cuda_config.h, which is used by
+  # tensorflow/stream_executor/dso_loader.cc.
+  _tpl(
+      repository_ctx,
+      "cuda:cuda_config.h",
+      {
+          "%{cuda_version}":
+              _DEFAULT_CUDA_VERSION,
+          "%{cudnn_version}":
+              _DEFAULT_CUDNN_VERSION,
+          "%{cuda_compute_capabilities}":
+              ",".join([
+                  "CudaVersion(\"%s\")" % c
+                  for c in _DEFAULT_CUDA_COMPUTE_CAPABILITIES
+              ]),
+          "%{cuda_toolkit_path}":
+              _DEFAULT_CUDA_TOOLKIT_PATH,
+      },
+      "cuda/cuda/cuda_config.h",
+  )
 
-    # Create dummy files for the CUDA toolkit since they are still required by
-    # tensorflow/core/platform/default/build_config:cuda.
-    repository_ctx.file("cuda/cuda/include/cuda.h", "")
-    repository_ctx.file("cuda/cuda/include/cublas.h", "")
-    repository_ctx.file("cuda/cuda/include/cudnn.h", "")
-    repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h", "")
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cuda", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart_static", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublas", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusolver", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudnn", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("curand", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cufft", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cupti", cpu_value))
-
-    # Set up cuda_config.h, which is used by
-    # tensorflow/stream_executor/dso_loader.cc.
-    _tpl(
-        repository_ctx,
-        "cuda:cuda_config.h",
-        {
-            "%{cuda_version}": _DEFAULT_CUDA_VERSION,
-            "%{cudnn_version}": _DEFAULT_CUDNN_VERSION,
-            "%{cuda_compute_capabilities}": ",".join([
-                "CudaVersion(\"%s\")" % c
-                for c in _DEFAULT_CUDA_COMPUTE_CAPABILITIES
-            ]),
-            "%{cuda_toolkit_path}": _DEFAULT_CUDA_TOOLKIT_PATH,
-        },
-        "cuda/cuda/cuda_config.h",
-    )
+  # If cuda_configure is not configured to build with GPU support, and the user
+  # attempts to build with --config=cuda, add a dummy build rule to intercept
+  # this and fail with an actionable error message.
+  repository_ctx.file(
+      "crosstool/error_gpu_disabled.bzl",
+      _DUMMY_CROSSTOOL_BZL_FILE,
+  )
+  repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
 
-    # If cuda_configure is not configured to build with GPU support, and the user
-    # attempts to build with --config=cuda, add a dummy build rule to intercept
-    # this and fail with an actionable error message.
-    repository_ctx.file(
-        "crosstool/error_gpu_disabled.bzl",
-        _DUMMY_CROSSTOOL_BZL_FILE,
-    )
-    repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
 
 def _execute(
         repository_ctx,
@@ -1042,35 +1119,35 @@ def _execute(
         error_msg = None,
         error_details = None,
         empty_stdout_fine = False):
-    """Executes an arbitrary shell command.
+  """Executes an arbitrary shell command.
 
     Args:
       repository_ctx: the repository_ctx object
       cmdline: list of strings, the command to execute
       error_msg: string, a summary of the error if the command fails
       error_details: string, details about the error or steps to fix it
-      empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise
-        it's an error
-    Return:
-      the result of repository_ctx.execute(cmdline)
-    """
-    result = repository_ctx.execute(cmdline)
-    if result.stderr or not (empty_stdout_fine or result.stdout):
-        auto_configure_fail(
-            "\n".join([
-                error_msg.strip() if error_msg else "Repository command failed",
-                result.stderr.strip(),
-                error_details if error_details else "",
-            ]),
-        )
-    return result
+      empty_stdout_fine: bool, if True, an empty stdout result is fine,
+        otherwise it's an error
+    Return: the result of repository_ctx.execute(cmdline)
+  """
+  result = repository_ctx.execute(cmdline)
+  if result.stderr or not (empty_stdout_fine or result.stdout):
+    auto_configure_fail(
+        "\n".join([
+            error_msg.strip() if error_msg else "Repository command failed",
+            result.stderr.strip(),
+            error_details if error_details else "",
+        ]),)
+  return result
+
 
 def _norm_path(path):
-    """Returns a path with '/' and remove the trailing slash."""
-    path = path.replace("\\", "/")
-    if path[-1] == "/":
-        path = path[:-1]
-    return path
+  """Returns a path with '/' and remove the trailing slash."""
+  path = path.replace("\\", "/")
+  if path[-1] == "/":
+    path = path[:-1]
+  return path
+
 
 def symlink_genrule_for_dir(
         repository_ctx,
@@ -1079,167 +1156,174 @@ def symlink_genrule_for_dir(
         genrule_name,
         src_files = [],
         dest_files = []):
-    """Returns a genrule to symlink(or copy if on Windows) a set of files.
+  """Returns a genrule to symlink(or copy if on Windows) a set of files.
 
     If src_dir is passed, files will be read from the given directory; otherwise
     we assume files are in src_files and dest_files
     """
-    if src_dir != None:
-        src_dir = _norm_path(src_dir)
-        dest_dir = _norm_path(dest_dir)
-        files = "\n".join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
-
-        # Create a list with the src_dir stripped to use for outputs.
-        dest_files = files.replace(src_dir, "").splitlines()
-        src_files = files.splitlines()
-    command = []
-    if not _is_windows(repository_ctx):
-        # We clear folders that might have been generated previously to avoid
-        # undesired inclusions
-        command.append('if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi')
-        command.append('if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi')
-        command.append('if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi')
-        command.append('if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi')
-    outs = []
-    for i in range(len(dest_files)):
-        if dest_files[i] != "":
-            # If we have only one file to link we do not want to use the dest_dir, as
-            # $(@D) will include the full path to the file.
-            dest = "$(@D)/" + dest_dir + dest_files[i] if len(dest_files) != 1 else "$(@D)/" + dest_files[i]
-
-            # Copy the headers to create a sandboxable setup.
-            cmd = "cp -f"
-            command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
-            outs.append('        "' + dest_dir + dest_files[i] + '",')
-    genrule = _genrule(
-        src_dir,
-        genrule_name,
-        " && ".join(command),
-        "\n".join(outs),
-    )
-    return genrule
+  if src_dir != None:
+    src_dir = _norm_path(src_dir)
+    dest_dir = _norm_path(dest_dir)
+    files = "\n".join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
+
+    # Create a list with the src_dir stripped to use for outputs.
+    dest_files = files.replace(src_dir, "").splitlines()
+    src_files = files.splitlines()
+  command = []
+  if not _is_windows(repository_ctx):
+    # We clear folders that might have been generated previously to avoid
+    # undesired inclusions
+    command.append('if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi')
+    command.append('if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi')
+    command.append('if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi')
+    command.append('if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi')
+  outs = []
+  for i in range(len(dest_files)):
+    if dest_files[i] != "":
+      # If we have only one file to link we do not want to use the dest_dir, as
+      # $(@D) will include the full path to the file.
+      dest = "$(@D)/" + dest_dir + dest_files[i] if len(
+          dest_files) != 1 else "$(@D)/" + dest_files[i]
+
+      # Copy the headers to create a sandboxable setup.
+      cmd = "cp -f"
+      command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
+      outs.append('        "' + dest_dir + dest_files[i] + '",')
+  genrule = _genrule(
+      src_dir,
+      genrule_name,
+      " && ".join(command),
+      "\n".join(outs),
+  )
+  return genrule
+
 
 def _genrule(src_dir, genrule_name, command, outs):
-    """Returns a string with a genrule.
+  """Returns a string with a genrule.
 
     Genrule executes the given command and produces the given outputs.
     """
-    return (
-        "genrule(\n" +
-        '    name = "' +
-        genrule_name + '",\n' +
-        "    outs = [\n" +
-        outs +
-        "\n    ],\n" +
-        '    cmd = """\n' +
-        command +
-        '\n   """,\n' +
-        ")\n"
-    )
+  return (
+      "genrule(\n" + '    name = "' + genrule_name + '",\n' + "    outs = [\n" +
+      outs + "\n    ],\n" + '    cmd = """\n' + command + '\n   """,\n' + ")\n")
+
 
 def _read_dir(repository_ctx, src_dir):
-    """Returns a string with all files in a directory.
+  """Returns a string with all files in a directory.
 
     Finds all files inside a directory, traversing subfolders and following
     symlinks. The returned string contains the full path of all files
     separated by line breaks.
     """
-    if _is_windows(repository_ctx):
-        src_dir = src_dir.replace("/", "\\")
-        find_result = _execute(
-            repository_ctx,
-            ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
-            empty_stdout_fine = True,
-        )
+  if _is_windows(repository_ctx):
+    src_dir = src_dir.replace("/", "\\")
+    find_result = _execute(
+        repository_ctx,
+        ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
+        empty_stdout_fine=True,
+    )
+
+    # src_files will be used in genrule.outs where the paths must
+    # use forward slashes.
+    result = find_result.stdout.replace("\\", "/")
+  else:
+    find_result = _execute(
+        repository_ctx,
+        ["find", src_dir, "-follow", "-type", "f"],
+        empty_stdout_fine=True,
+    )
+    result = find_result.stdout
+  return result
 
-        # src_files will be used in genrule.outs where the paths must
-        # use forward slashes.
-        result = find_result.stdout.replace("\\", "/")
-    else:
-        find_result = _execute(
-            repository_ctx,
-            ["find", src_dir, "-follow", "-type", "f"],
-            empty_stdout_fine = True,
-        )
-        result = find_result.stdout
-    return result
 
 def _flag_enabled(repository_ctx, flag_name):
-    if flag_name in repository_ctx.os.environ:
-        value = repository_ctx.os.environ[flag_name].strip()
-        return value == "1"
-    return False
+  if flag_name in repository_ctx.os.environ:
+    value = repository_ctx.os.environ[flag_name].strip()
+    return value == "1"
+  return False
+
 
 def _use_cuda_clang(repository_ctx):
-    return _flag_enabled(repository_ctx, "TF_CUDA_CLANG")
+  return _flag_enabled(repository_ctx, "TF_CUDA_CLANG")
+
 
 def _compute_cuda_extra_copts(repository_ctx, compute_capabilities):
-    if _use_cuda_clang(repository_ctx):
-        capability_flags = ["--cuda-gpu-arch=sm_" +
-                            cap.replace(".", "") for cap in compute_capabilities]
-    else:
-        # Capabilities are handled in the "crosstool_wrapper_driver_is_not_gcc" for nvcc
-        capability_flags = []
-    return str(capability_flags)
+  if _use_cuda_clang(repository_ctx):
+    capability_flags = [
+        "--cuda-gpu-arch=sm_" + cap.replace(".", "")
+        for cap in compute_capabilities
+    ]
+  else:
+    # Capabilities are handled in the "crosstool_wrapper_driver_is_not_gcc" for nvcc
+    # TODO(csigg): Make this consistent with cuda clang and pass to crosstool.
+    capability_flags = []
+  return str(capability_flags)
+
 
 def _create_local_cuda_repository(repository_ctx):
-    """Creates the repository containing files set up to build with CUDA."""
-    cuda_config = _get_cuda_config(repository_ctx)
+  """Creates the repository containing files set up to build with CUDA."""
+  cuda_config = _get_cuda_config(repository_ctx)
 
-    cuda_include_path = _find_cuda_include_path(repository_ctx, cuda_config)
-    cudnn_header_dir = _find_cudnn_header_dir(
-        repository_ctx,
-        cuda_config.cudnn_install_basedir,
-    )
-    cupti_header_dir = _find_cupti_header_dir(repository_ctx, cuda_config)
-    nvvm_libdevice_dir = _find_nvvm_libdevice_dir(repository_ctx, cuda_config)
-
-    # Set up symbolic links for the cuda toolkit by creating genrules to do
-    # symlinking. We create one genrule for each directory we want to track under
-    # cuda_toolkit_path
-    cuda_toolkit_path = cuda_config.cuda_toolkit_path
-    genrules = [symlink_genrule_for_dir(
-        repository_ctx,
-        cuda_include_path,
-        "cuda/include",
-        "cuda-include",
-    )]
-    genrules.append(symlink_genrule_for_dir(
-        repository_ctx,
-        nvvm_libdevice_dir,
-        "cuda/nvvm/libdevice",
-        "cuda-nvvm",
-    ))
-    genrules.append(symlink_genrule_for_dir(
-        repository_ctx,
-        cupti_header_dir,
-        "cuda/extras/CUPTI/include",
-        "cuda-extras",
-    ))
-
-    cuda_libs = _find_libs(repository_ctx, cuda_config)
-    cuda_lib_src = []
-    cuda_lib_dest = []
-    for lib in cuda_libs.values():
-        cuda_lib_src.append(lib.path)
-        cuda_lib_dest.append("cuda/lib/" + lib.file_name)
-    genrules.append(symlink_genrule_for_dir(
-        repository_ctx,
-        None,
-        "",
-        "cuda-lib",
-        cuda_lib_src,
-        cuda_lib_dest,
-    ))
-
-    # Set up the symbolic links for cudnn if cndnn was not installed to
-    # CUDA_TOOLKIT_PATH.
-    included_files = _read_dir(repository_ctx, cuda_include_path).replace(
-        cuda_include_path,
-        "",
-    ).splitlines()
-    if "/cudnn.h" not in included_files:
-        genrules.append(symlink_genrule_for_dir(
+  cuda_include_path = _find_cuda_include_path(repository_ctx, cuda_config)
+  cudnn_header_dir = _find_cudnn_header_dir(
+      repository_ctx,
+      cuda_config.cudnn_install_basedir,
+  )
+  cupti_header_dir = _find_cupti_header_dir(repository_ctx, cuda_config)
+  nvvm_libdevice_dir = _find_nvvm_libdevice_dir(repository_ctx, cuda_config)
+
+  # Set up symbolic links for the cuda toolkit by creating genrules to do
+  # symlinking. We create one genrule for each directory we want to track under
+  # cuda_toolkit_path
+  cuda_toolkit_path = cuda_config.cuda_toolkit_path
+  genrules = [
+      symlink_genrule_for_dir(
+          repository_ctx,
+          cuda_include_path,
+          "cuda/include",
+          "cuda-include",
+      )
+  ]
+  genrules.append(
+      symlink_genrule_for_dir(
+          repository_ctx,
+          nvvm_libdevice_dir,
+          "cuda/nvvm/libdevice",
+          "cuda-nvvm",
+      ))
+  genrules.append(
+      symlink_genrule_for_dir(
+          repository_ctx,
+          cupti_header_dir,
+          "cuda/extras/CUPTI/include",
+          "cuda-extras",
+      ))
+
+  cuda_libs = _find_libs(repository_ctx, cuda_config)
+  cuda_lib_src = []
+  cuda_lib_dest = []
+  for lib in cuda_libs.values():
+    cuda_lib_src.append(lib.path)
+    cuda_lib_dest.append("cuda/lib/" + lib.file_name)
+  genrules.append(
+      symlink_genrule_for_dir(
+          repository_ctx,
+          None,
+          "",
+          "cuda-lib",
+          cuda_lib_src,
+          cuda_lib_dest,
+      ))
+
+  # Set up the symbolic links for cudnn if cndnn was not installed to
+  # CUDA_TOOLKIT_PATH.
+  included_files = _read_dir(repository_ctx, cuda_include_path).replace(
+      cuda_include_path,
+      "",
+  ).splitlines()
+  if "/cudnn.h" not in included_files:
+    genrules.append(
+        symlink_genrule_for_dir(
             repository_ctx,
             None,
             "cuda/include/",
@@ -1247,204 +1331,229 @@ def _create_local_cuda_repository(repository_ctx):
             [cudnn_header_dir + "/cudnn.h"],
             ["cudnn.h"],
         ))
-    else:
-        genrules.append(
-            "filegroup(\n" +
-            '    name = "cudnn-include",\n' +
-            "    srcs = [],\n" +
-            ")\n",
-        )
-
-    # Set up BUILD file for cuda/
-    _tpl(
-        repository_ctx,
-        "cuda:build_defs.bzl",
-        {
-            "%{cuda_is_configured}": "True",
-            "%{cuda_extra_copts}": _compute_cuda_extra_copts(
-                repository_ctx,
-                cuda_config.compute_capabilities,
-            ),
-        },
-    )
-    _tpl(
-        repository_ctx,
-        "cuda:BUILD.windows" if _is_windows(repository_ctx) else "cuda:BUILD",
-        {
-            "%{cuda_driver_lib}": cuda_libs["cuda"].file_name,
-            "%{cudart_static_lib}": cuda_libs["cudart_static"].file_name,
-            "%{cudart_static_linkopt}": _cudart_static_linkopt(
-                cuda_config.cpu_value,
-            ),
-            "%{cudart_lib}": cuda_libs["cudart"].file_name,
-            "%{cublas_lib}": cuda_libs["cublas"].file_name,
-            "%{cusolver_lib}": cuda_libs["cusolver"].file_name,
-            "%{cudnn_lib}": cuda_libs["cudnn"].file_name,
-            "%{cufft_lib}": cuda_libs["cufft"].file_name,
-            "%{curand_lib}": cuda_libs["curand"].file_name,
-            "%{cupti_lib}": cuda_libs["cupti"].file_name,
-            "%{cuda_include_genrules}": "\n".join(genrules),
-            "%{cuda_headers}": ('":cuda-include",\n' +
-                                '        ":cudnn-include",'),
-        },
-        "cuda/BUILD",
-    )
-
-    is_cuda_clang = _use_cuda_clang(repository_ctx)
+  else:
+    genrules.append(
+        "filegroup(\n" + '    name = "cudnn-include",\n' + "    srcs = [],\n" +
+        ")\n",)
+
+  # Set up BUILD file for cuda/
+  _tpl(
+      repository_ctx,
+      "cuda:build_defs.bzl",
+      {
+          "%{cuda_is_configured}":
+              "True",
+          "%{cuda_extra_copts}":
+              _compute_cuda_extra_copts(
+                  repository_ctx,
+                  cuda_config.compute_capabilities,
+              ),
+      },
+  )
+  _tpl(
+      repository_ctx,
+      "cuda:BUILD.windows" if _is_windows(repository_ctx) else "cuda:BUILD",
+      {
+          "%{cuda_driver_lib}":
+              cuda_libs["cuda"].file_name,
+          "%{cudart_static_lib}":
+              cuda_libs["cudart_static"].file_name,
+          "%{cudart_static_linkopt}":
+              _cudart_static_linkopt(cuda_config.cpu_value,),
+          "%{cudart_lib}":
+              cuda_libs["cudart"].file_name,
+          "%{cublas_lib}":
+              cuda_libs["cublas"].file_name,
+          "%{cusolver_lib}":
+              cuda_libs["cusolver"].file_name,
+          "%{cudnn_lib}":
+              cuda_libs["cudnn"].file_name,
+          "%{cufft_lib}":
+              cuda_libs["cufft"].file_name,
+          "%{curand_lib}":
+              cuda_libs["curand"].file_name,
+          "%{cupti_lib}":
+              cuda_libs["cupti"].file_name,
+          "%{cuda_include_genrules}":
+              "\n".join(genrules),
+          "%{cuda_headers}": ('":cuda-include",\n' + '        ":cudnn-include",'
+                             ),
+      },
+      "cuda/BUILD",
+  )
 
-    should_download_clang = is_cuda_clang and _flag_enabled(
-        repository_ctx,
-        _TF_DOWNLOAD_CLANG,
-    )
-    if should_download_clang:
-        download_clang(repository_ctx, "crosstool/extra_tools")
-
-    # Set up crosstool/
-    cc = find_cc(repository_ctx)
-    cc_fullpath = cc if not should_download_clang else "crosstool/" + cc
-
-    host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath)
-    cuda_defines = {}
-    # Bazel sets '-B/usr/bin' flag to workaround build errors on RHEL (see
-    # https://github.com/bazelbuild/bazel/issues/760).
-    # However, this stops our custom clang toolchain from picking the provided
-    # LLD linker, so we're only adding '-B/usr/bin' when using non-downloaded
-    # toolchain.
-    # TODO: when bazel stops adding '-B/usr/bin' by default, remove this
-    #       flag from the CROSSTOOL completely (see
-    #       https://github.com/bazelbuild/bazel/issues/5634)
-    if should_download_clang:
-      cuda_defines["%{linker_bin_path_flag}"] = ""
-    else:
-      cuda_defines["%{linker_bin_path_flag}"] = 'flag: "-B/usr/bin"'
+  is_cuda_clang = _use_cuda_clang(repository_ctx)
 
-    if is_cuda_clang:
-        cuda_defines["%{host_compiler_path}"] = str(cc)
-        cuda_defines["%{host_compiler_warnings}"] = """
+  should_download_clang = is_cuda_clang and _flag_enabled(
+      repository_ctx,
+      _TF_DOWNLOAD_CLANG,
+  )
+  if should_download_clang:
+    download_clang(repository_ctx, "crosstool/extra_tools")
+
+  # Set up crosstool/
+  cc = find_cc(repository_ctx)
+  cc_fullpath = cc if not should_download_clang else "crosstool/" + cc
+
+  host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath)
+  cuda_defines = {}
+  # Bazel sets '-B/usr/bin' flag to workaround build errors on RHEL (see
+  # https://github.com/bazelbuild/bazel/issues/760).
+  # However, this stops our custom clang toolchain from picking the provided
+  # LLD linker, so we're only adding '-B/usr/bin' when using non-downloaded
+  # toolchain.
+  # TODO: when bazel stops adding '-B/usr/bin' by default, remove this
+  #       flag from the CROSSTOOL completely (see
+  #       https://github.com/bazelbuild/bazel/issues/5634)
+  if should_download_clang:
+    cuda_defines["%{linker_bin_path_flag}"] = ""
+  else:
+    cuda_defines["%{linker_bin_path_flag}"] = 'flag: "-B/usr/bin"'
+
+  if is_cuda_clang:
+    cuda_defines["%{host_compiler_path}"] = str(cc)
+    cuda_defines["%{host_compiler_warnings}"] = """
         # Some parts of the codebase set -Werror and hit this warning, so
         # switch it off for now.
         flag: "-Wno-invalid-partial-specialization"
     """
-        cuda_defines["%{host_compiler_includes}"] = host_compiler_includes
-        _tpl(repository_ctx, "crosstool:BUILD", {"%{linker_files}": ":empty", "%{win_linker_files}": ":empty"})
-        repository_ctx.file("crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", "")
-        repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.py", "")
-        repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.bat", "")
-    else:
-        cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc"
-        cuda_defines["%{host_compiler_warnings}"] = ""
-
-        # nvcc has the system include paths built in and will automatically
-        # search them; we cannot work around that, so we add the relevant cuda
-        # system paths to the allowed compiler specific include paths.
-        cuda_defines["%{host_compiler_includes}"] = (
-            host_compiler_includes + "\n" +
-            _cuda_include_path(repository_ctx, cuda_config) +
-            "\n  cxx_builtin_include_directory: \"%s\"" % cupti_header_dir +
-            "\n  cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir)
-        nvcc_path = str(repository_ctx.path("%s/bin/nvcc%s" %
-                                            (
-                                                cuda_config.cuda_toolkit_path,
-                                                ".exe" if _is_windows(repository_ctx) else "",
-                                            )))
-        _tpl(
-            repository_ctx,
-            "crosstool:BUILD",
-            {
-                "%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc",
-                "%{win_linker_files}": ":windows_msvc_wrapper_files",
-            },
-        )
-        wrapper_defines = {
-            "%{cpu_compiler}": str(cc),
-            "%{cuda_version}": cuda_config.cuda_version,
-            "%{nvcc_path}": nvcc_path,
-            "%{gcc_host_compiler_path}": str(cc),
-            "%{cuda_compute_capabilities}": ", ".join(
-                ["\"%s\"" % c for c in cuda_config.compute_capabilities],
-            ),
-            "%{nvcc_tmp_dir}": _get_nvcc_tmp_dir_for_windows(repository_ctx),
-        }
-        _tpl(
-            repository_ctx,
-            "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
-            wrapper_defines,
-        )
-        _tpl(
-            repository_ctx,
-            "crosstool:windows/msvc_wrapper_for_nvcc.py",
-            wrapper_defines,
-        )
-        _tpl(
-            repository_ctx,
-            "crosstool:windows/msvc_wrapper_for_nvcc.bat",
-            {
-                "%{python_binary}": _get_python_bin(repository_ctx),
-            },
-        )
-
+    cuda_defines["%{host_compiler_includes}"] = host_compiler_includes
+    _tpl(repository_ctx, "crosstool:BUILD", {
+        "%{linker_files}": ":empty",
+        "%{win_linker_files}": ":empty"
+    })
+    repository_ctx.file(
+        "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", "")
+    repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.py", "")
+    repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.bat", "")
+  else:
+    cuda_defines[
+        "%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc"
+    cuda_defines["%{host_compiler_warnings}"] = ""
+
+    # nvcc has the system include paths built in and will automatically
+    # search them; we cannot work around that, so we add the relevant cuda
+    # system paths to the allowed compiler specific include paths.
+    cuda_defines["%{host_compiler_includes}"] = (
+        host_compiler_includes + "\n" + _cuda_include_path(
+            repository_ctx, cuda_config) +
+        "\n  cxx_builtin_include_directory: \"%s\"" % cupti_header_dir +
+        "\n  cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir)
+    nvcc_path = str(
+        repository_ctx.path("%s/bin/nvcc%s" % (
+            cuda_config.cuda_toolkit_path,
+            ".exe" if _is_windows(repository_ctx) else "",
+        )))
     _tpl(
         repository_ctx,
-        "crosstool:CROSSTOOL",
-        cuda_defines + _get_win_cuda_defines(repository_ctx),
-        out = "crosstool/CROSSTOOL",
+        "crosstool:BUILD",
+        {
+            "%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc",
+            "%{win_linker_files}": ":windows_msvc_wrapper_files",
+        },
     )
-
-    # Set up cuda_config.h, which is used by
-    # tensorflow/stream_executor/dso_loader.cc.
+    wrapper_defines = {
+        "%{cpu_compiler}":
+            str(cc),
+        "%{cuda_version}":
+            cuda_config.cuda_version,
+        "%{nvcc_path}":
+            nvcc_path,
+        "%{gcc_host_compiler_path}":
+            str(cc),
+        "%{cuda_compute_capabilities}":
+            ", ".join(
+                ["\"%s\"" % c for c in cuda_config.compute_capabilities],),
+        "%{nvcc_tmp_dir}":
+            _get_nvcc_tmp_dir_for_windows(repository_ctx),
+    }
     _tpl(
         repository_ctx,
-        "cuda:cuda_config.h",
-        {
-            "%{cuda_version}": cuda_config.cuda_version,
-            "%{cudnn_version}": cuda_config.cudnn_version,
-            "%{cuda_compute_capabilities}": ",".join(
-                [
-                    "CudaVersion(\"%s\")" % c
-                    for c in cuda_config.compute_capabilities
-                ],
-            ),
-            "%{cuda_toolkit_path}": cuda_config.cuda_toolkit_path,
-        },
-        "cuda/cuda/cuda_config.h",
+        "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
+        wrapper_defines,
     )
-
-def _create_remote_cuda_repository(repository_ctx, remote_config_repo):
-    """Creates pointers to a remotely configured repo set up to build with CUDA."""
     _tpl(
         repository_ctx,
-        "cuda:build_defs.bzl",
-        {
-            "%{cuda_is_configured}": "True",
-            "%{cuda_extra_copts}": _compute_cuda_extra_copts(
-                repository_ctx,
-                _compute_capabilities(repository_ctx),
-            ),
-        },
+        "crosstool:windows/msvc_wrapper_for_nvcc.py",
+        wrapper_defines,
     )
     _tpl(
         repository_ctx,
-        "cuda:remote.BUILD",
+        "crosstool:windows/msvc_wrapper_for_nvcc.bat",
         {
-            "%{remote_cuda_repo}": remote_config_repo,
+            "%{python_binary}": _get_python_bin(repository_ctx),
         },
-        "cuda/BUILD",
     )
-    _tpl(repository_ctx, "crosstool:remote.BUILD", {
-        "%{remote_cuda_repo}": remote_config_repo,
-    }, "crosstool/BUILD")
+
+  _tpl(
+      repository_ctx,
+      "crosstool:CROSSTOOL",
+      cuda_defines + _get_win_cuda_defines(repository_ctx),
+      out="crosstool/CROSSTOOL",
+  )
+
+  # Set up cuda_config.h, which is used by
+  # tensorflow/stream_executor/dso_loader.cc.
+  _tpl(
+      repository_ctx,
+      "cuda:cuda_config.h",
+      {
+          "%{cuda_version}":
+              cuda_config.cuda_version,
+          "%{cudnn_version}":
+              cuda_config.cudnn_version,
+          "%{cuda_compute_capabilities}":
+              ",".join([
+                  "CudaVersion(\"%s\")" % c
+                  for c in cuda_config.compute_capabilities
+              ],),
+          "%{cuda_toolkit_path}":
+              cuda_config.cuda_toolkit_path,
+      },
+      "cuda/cuda/cuda_config.h",
+  )
+
+
+def _create_remote_cuda_repository(repository_ctx, remote_config_repo):
+  """Creates pointers to a remotely configured repo set up to build with CUDA."""
+  _tpl(
+      repository_ctx,
+      "cuda:build_defs.bzl",
+      {
+          "%{cuda_is_configured}":
+              "True",
+          "%{cuda_extra_copts}":
+              _compute_cuda_extra_copts(
+                  repository_ctx,
+                  compute_capabilities(repository_ctx),
+              ),
+      },
+  )
+  _tpl(
+      repository_ctx,
+      "cuda:remote.BUILD",
+      {
+          "%{remote_cuda_repo}": remote_config_repo,
+      },
+      "cuda/BUILD",
+  )
+  _tpl(repository_ctx, "crosstool:remote.BUILD", {
+      "%{remote_cuda_repo}": remote_config_repo,
+  }, "crosstool/BUILD")
+
 
 def _cuda_autoconf_impl(repository_ctx):
-    """Implementation of the cuda_autoconf repository rule."""
-    if not _enable_cuda(repository_ctx):
-        _create_dummy_repository(repository_ctx)
-    elif _TF_CUDA_CONFIG_REPO in repository_ctx.os.environ:
-        _create_remote_cuda_repository(
-            repository_ctx,
-            repository_ctx.os.environ[_TF_CUDA_CONFIG_REPO],
-        )
-    else:
-        _create_local_cuda_repository(repository_ctx)
+  """Implementation of the cuda_autoconf repository rule."""
+  if not _enable_cuda(repository_ctx):
+    _create_dummy_repository(repository_ctx)
+  elif _TF_CUDA_CONFIG_REPO in repository_ctx.os.environ:
+    _create_remote_cuda_repository(
+        repository_ctx,
+        repository_ctx.os.environ[_TF_CUDA_CONFIG_REPO],
+    )
+  else:
+    _create_local_cuda_repository(repository_ctx)
+
 
 cuda_configure = repository_rule(
     implementation = _cuda_autoconf_impl,
diff --git a/third_party/nccl/LICENSE b/third_party/nccl/LICENSE
index 146d9b765c..b958518186 100644
--- a/third_party/nccl/LICENSE
+++ b/third_party/nccl/LICENSE
@@ -1,203 +1,30 @@
-Copyright 2018 The TensorFlow Authors.  All rights reserved.
 
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright 2018, The TensorFlow Authors.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
+ Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+    Laboratory, the U.S. Department of Energy, nor the names of their
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ The U.S. Department of Energy funded the development of this software
+ under subcontract 7078610 with Lawrence Berkeley National Laboratory.
diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD
new file mode 100644
index 0000000000..f57f04c75e
--- /dev/null
+++ b/third_party/nccl/archive.BUILD
@@ -0,0 +1,179 @@
+# NVIDIA NCCL 2
+# A package of optimized primitives for collective multi-GPU communication.
+
+licenses(["restricted"])
+
+exports_files(["LICENSE.txt"])
+
+load(
+    "@local_config_nccl//:build_defs.bzl",
+    "device_link",
+    "gen_nccl_h",
+    "nccl_library",
+    "rdc_copts",
+)
+load(
+    "@local_config_cuda//cuda:build_defs.bzl",
+    "cuda_default_copts",
+)
+
+# Generate the nccl.h header file.
+gen_nccl_h(
+    name = "nccl_h",
+    output = "src/nccl.h",
+    template = "src/nccl.h.in",
+)
+
+nccl_library(
+    name = "src_hdrs",
+    hdrs = [
+        "src/nccl.h",
+        # src/include/common_coll.h #includes "collectives/collectives.h".
+        # All other #includes of collectives.h are patched in process_srcs.
+        "src/collectives/collectives.h",
+    ],
+    strip_include_prefix = "src",
+)
+
+nccl_library(
+    name = "include_hdrs",
+    hdrs = glob(["src/include/*.h"]),
+    strip_include_prefix = "src/include",
+)
+
+filegroup(
+    name = "device_hdrs",
+    srcs = glob(["src/collectives/device/*.h"]),
+)
+
+filegroup(
+    name = "device_srcs",
+    srcs = [
+        "src/collectives/device/all_gather.cu",
+        "src/collectives/device/all_reduce.cu",
+        "src/collectives/device/broadcast.cu",
+        "src/collectives/device/reduce.cu",
+        "src/collectives/device/reduce_scatter.cu",
+    ],
+)
+
+nccl_library(
+    name = "sum",
+    srcs = [
+        ":device_hdrs",
+        ":device_srcs",
+    ],
+    copts = ["-DNCCL_OP=0"] + rdc_copts(),
+    prefix = "sum_",
+    deps = [
+        ":src_hdrs",
+        ":include_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+    linkstatic = True,
+)
+
+nccl_library(
+    name = "prod",
+    srcs = [
+        ":device_hdrs",
+        ":device_srcs",
+    ],
+    copts = ["-DNCCL_OP=1"] + rdc_copts(),
+    prefix = "_prod",
+    deps = [
+        ":src_hdrs",
+        ":include_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+    linkstatic = True,
+)
+
+nccl_library(
+    name = "min",
+    srcs = [
+        ":device_hdrs",
+        ":device_srcs",
+    ],
+    copts = ["-DNCCL_OP=2"] + rdc_copts(),
+    prefix = "min_",
+    deps = [
+        ":src_hdrs",
+        ":include_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+    linkstatic = True,
+)
+
+nccl_library(
+    name = "max",
+    srcs = [
+        ":device_hdrs",
+        ":device_srcs",
+    ],
+    copts = ["-DNCCL_OP=3"] + rdc_copts(),
+    prefix = "max_",
+    deps = [
+        ":src_hdrs",
+        ":include_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+    linkstatic = True,
+)
+
+nccl_library(
+    name = "functions",
+    srcs = [
+        ":device_hdrs",
+        "src/collectives/device/functions.cu",
+    ],
+    copts = rdc_copts(),
+    deps = [
+        ":src_hdrs",
+        ":include_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+    linkstatic = True,
+)
+
+device_link(
+    name = "device_code",
+    srcs = [
+        ":functions",
+        ":max",
+        ":min",
+        ":prod",
+        ":sum",
+    ],
+)
+
+# Primary NCCL target.
+nccl_library(
+    name = "nccl",
+    srcs = glob(
+        include = ["src/**/*.cu"],
+        # Exclude device-library code.
+        exclude = ["src/collectives/device/**"],
+    ) + [
+        # Required for header inclusion checking (see
+        # http://docs.bazel.build/versions/master/be/c-cpp.html#hdrs).
+        # Files in src/ which #include "nccl.h" load it from there rather than
+        # from the virtual includes directory.
+        "src/nccl.h",
+    ],
+    hdrs = ["src/nccl.h"],
+    include_prefix = "third_party/nccl",
+    strip_include_prefix = "src",
+    copts = cuda_default_copts(),
+    deps = [
+        ":device_code",
+        ":functions",
+        ":include_hdrs",
+        ":max",
+        ":min",
+        ":prod",
+        ":src_hdrs",
+        ":sum",
+    ],
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/nccl/build_defs.bzl.tpl b/third_party/nccl/build_defs.bzl.tpl
new file mode 100644
index 0000000000..ede1d3dad5
--- /dev/null
+++ b/third_party/nccl/build_defs.bzl.tpl
@@ -0,0 +1,210 @@
+"""Repository rule for NCCL."""
+
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
+
+def _gen_nccl_h_impl(ctx):
+    """Creates nccl.h from a template."""
+    ctx.actions.expand_template(
+        output = ctx.outputs.output,
+        template = ctx.file.template,
+        substitutions = {
+            "${nccl:Major}": "2",
+            "${nccl:Minor}": "3",
+            "${nccl:Patch}": "5",
+            "${nccl:Suffix}": "",
+            "${nccl:Version}": "2305",
+        },
+    )
+gen_nccl_h = rule(
+    implementation = _gen_nccl_h_impl,
+    attrs = {
+        "template": attr.label(allow_single_file = True),
+        "output": attr.output(),
+    },
+)
+"""Creates the NCCL header file."""
+
+
+def _process_srcs_impl(ctx):
+    """Appends .cc to .cu files, patches include directives."""
+    files = []
+    for src in ctx.files.srcs:
+        if not src.is_source:
+          # Process only once, specifically "src/nccl.h".
+          files.append(src)
+          continue
+        name = src.basename
+        if src.extension == "cu":
+            name = ctx.attr.prefix + name + ".cc"
+        file = ctx.actions.declare_file(name, sibling = src)
+        ctx.actions.expand_template(
+            output = file,
+            template = src,
+            substitutions = {
+                "\"collectives.h": "\"collectives/collectives.h",
+                "\"../collectives.h": "\"collectives/collectives.h",
+                "#if __CUDACC_VER_MAJOR__":
+                    "#if defined __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__",
+                # Substitutions are applied in order.
+                "std::nullptr_t": "nullptr_t",
+                "nullptr_t": "std::nullptr_t",
+            },
+        )
+        files.append(file)
+    return [DefaultInfo(files = depset(files))]
+_process_srcs = rule(
+    implementation = _process_srcs_impl,
+    attrs = {
+        "srcs": attr.label_list(allow_files = True),
+        "prefix": attr.string(default = ""),
+    },
+)
+"""Processes the NCCL srcs so they can be compiled with bazel and clang."""
+
+
+def nccl_library(name, srcs=None, hdrs=None, prefix=None, **kwargs):
+    """Processes the srcs and hdrs and creates a cc_library."""
+
+    _process_srcs(
+        name = name + "_srcs",
+        srcs = srcs,
+        prefix = prefix,
+    )
+    _process_srcs(
+        name = name + "_hdrs",
+        srcs = hdrs,
+    )
+
+    native.cc_library(
+        name = name,
+        srcs = [name + "_srcs"] if srcs else [],
+        hdrs = [name + "_hdrs"] if hdrs else [],
+        **kwargs
+    )
+
+
+def rdc_copts():
+    """Returns copts for compiling relocatable device code."""
+
+    # The global functions can not have a lower register count than the
+    # device functions. This is enforced by setting a fixed register count.
+    # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
+    maxrregcount = "-maxrregcount=96"
+
+    return cuda_default_copts() + select({
+          "@local_config_cuda//cuda:using_nvcc": [
+              "-nvcc_options",
+              "relocatable-device-code=true",
+              "-nvcc_options",
+              "ptxas-options=" + maxrregcount,
+          ],
+          "@local_config_cuda//cuda:using_clang": [
+              "-fcuda-rdc",
+              "-Xcuda-ptxas",
+              maxrregcount,
+          ],
+          "//conditions:default": [],
+      }) + ["-fvisibility=hidden"]
+
+
+def _filter_impl(ctx):
+    suffix = ctx.attr.suffix
+    files = [src for src in ctx.files.srcs if src.path.endswith(suffix)]
+    return [DefaultInfo(files = depset(files))]
+_filter = rule(
+    implementation = _filter_impl,
+    attrs = {
+        "srcs": attr.label_list(allow_files = True),
+        "suffix": attr.string(),
+    },
+)
+"""Filters the srcs to the ones ending with suffix."""
+
+
+def _gen_link_src_impl(ctx):
+    ctx.actions.expand_template(
+        output = ctx.outputs.output,
+        template = ctx.file.template,
+        substitutions = {
+            "REGISTERLINKBINARYFILE": '"%s"' % ctx.file.register_hdr.short_path,
+            "FATBINFILE": '"%s"' % ctx.file.fatbin_hdr.short_path,
+        },
+    )
+_gen_link_src = rule(
+    implementation = _gen_link_src_impl,
+    attrs = {
+        "register_hdr": attr.label(allow_single_file = True),
+        "fatbin_hdr": attr.label(allow_single_file = True),
+        "template": attr.label(allow_single_file = True),
+        "output": attr.output(),
+    },
+)
+"""Patches the include directives for the link.stub file."""
+
+
+def device_link(name, srcs):
+    """Links seperately compiled relocatable device code into a cc_library."""
+
+    # From .a and .pic.a archives, just use the latter.
+    _filter(
+        name = name + "_pic_a",
+        srcs = srcs,
+        suffix = ".pic.a",
+    )
+
+    # Device-link to cubins for each architecture.
+    images = []
+    cubins = []
+    for arch in %{gpu_architectures}:
+        cubin = "%s_%s.cubin" % (name, arch)
+        register_hdr = "%s_%s.h" % (name, arch)
+        nvlink = "@local_config_nccl//:nvlink"
+        cmd = ("$(location %s) --cpu-arch=X86_64 " % nvlink +
+            "--arch=%s $(SRCS) " % arch +
+            "--register-link-binaries=$(location %s) " % register_hdr +
+            "--output-file=$(location %s)" % cubin)
+        native.genrule(
+            name = "%s_%s" % (name, arch),
+            outs = [register_hdr, cubin],
+            srcs = [name + "_pic_a"],
+            cmd = cmd,
+            tools = [nvlink],
+        )
+        images.append("--image=profile=%s,file=$(location %s)" % (arch, cubin))
+        cubins.append(cubin)
+
+    # Generate fatbin header from all cubins.
+    fatbin_hdr = name + ".fatbin.h"
+    fatbinary = "@local_config_nccl//:cuda/bin/fatbinary"
+    cmd = ("PATH=$$CUDA_TOOLKIT_PATH/bin:$$PATH " + # for bin2c
+          "$(location %s) -64 --cmdline=--compile-only --link " % fatbinary +
+          "--compress-all %s --create=%%{name}.fatbin " % " ".join(images) +
+          "--embedded-fatbin=$@")
+    native.genrule(
+        name = name + "_fatbin_h",
+        outs = [fatbin_hdr],
+        srcs = cubins,
+        cmd = cmd,
+        tools = [fatbinary],
+    )
+
+    # Generate the source file #including the headers generated above.
+    _gen_link_src(
+        name = name + "_cc",
+        # Include just the last one, they are equivalent.
+        register_hdr = register_hdr,
+        fatbin_hdr = fatbin_hdr,
+        template = "@local_config_nccl//:cuda/bin/crt/link.stub",
+        output = name + ".cc",
+    )
+
+    # Compile the source file into the cc_library.
+    native.cc_library(
+        name = name,
+        srcs = [name + "_cc"],
+        textual_hdrs = [register_hdr, fatbin_hdr],
+        deps = [
+            "@local_config_cuda//cuda:cuda_headers",
+            "@local_config_cuda//cuda:cudart_static",
+        ],
+    )
diff --git a/third_party/nccl/nccl_archive.BUILD b/third_party/nccl/nccl_archive.BUILD
deleted file mode 100644
index a05899e38d..0000000000
--- a/third_party/nccl/nccl_archive.BUILD
+++ /dev/null
@@ -1,68 +0,0 @@
-# NVIDIA nccl
-# A package of optimized primitives for collective multi-GPU communication.
-
-licenses(["notice"])  # BSD
-
-exports_files(["LICENSE.txt"])
-
-load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda")
-
-SRCS = [
-    "src/all_gather.cu",
-    "src/all_reduce.cu",
-    "src/broadcast.cu",
-    "src/core.cu",
-    "src/libwrap.cu",
-    "src/reduce.cu",
-    "src/reduce_scatter.cu",
-]
-
-# Copy .cu to .cu.cc so they can be in srcs of cc_library.
-[
-    genrule(
-        name = "gen_" + src,
-        srcs = [src],
-        outs = [src + ".cc"],
-        cmd = "cp $(location " + src + ") $(location " + src + ".cc)",
-    )
-    for src in SRCS
-]
-
-SRCS_CU_CC = [src + ".cc" for src in SRCS]
-
-cc_library(
-    name = "nccl",
-    srcs = if_cuda(SRCS_CU_CC + glob(["src/*.h"])),
-    hdrs = if_cuda(["src/nccl.h"]),
-    copts = [
-        "-DCUDA_MAJOR=0",
-        "-DCUDA_MINOR=0",
-        "-DNCCL_MAJOR=0",
-        "-DNCCL_MINOR=0",
-        "-DNCCL_PATCH=0",
-        "-Iexternal/nccl_archive/src",
-        "-O3",
-    ] + cuda_default_copts(),
-    include_prefix = "third_party/nccl",
-    linkopts = select({
-        "@org_tensorflow//tensorflow:android": [
-            "-pie",
-        ],
-        "@org_tensorflow//tensorflow:darwin": [
-            "-Wl,-framework",
-            "-Wl,CoreFoundation",
-            "-Wl,-framework",
-            "-Wl,Security",
-        ],
-        "@org_tensorflow//tensorflow:ios": [],
-        "@org_tensorflow//tensorflow:windows": [
-            "-DEFAULTLIB:ws2_32.lib",
-        ],
-        "//conditions:default": [
-            "-lrt",
-        ],
-    }),
-    strip_include_prefix = "src",
-    visibility = ["//visibility:public"],
-    deps = ["@local_config_cuda//cuda:cuda_headers"],
-)
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
index d78fe8f3aa..7f00df0962 100644
--- a/third_party/nccl/nccl_configure.bzl
+++ b/third_party/nccl/nccl_configure.bzl
@@ -11,12 +11,16 @@
 load(
     "//third_party/gpus:cuda_configure.bzl",
     "auto_configure_fail",
+    "compute_capabilities",
+    "cuda_toolkit_path",
     "find_cuda_define",
     "matches_version",
 )
 
-_NCCL_INSTALL_PATH = "NCCL_INSTALL_PATH"
+_CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH"
 _NCCL_HDR_PATH = "NCCL_HDR_PATH"
+_NCCL_INSTALL_PATH = "NCCL_INSTALL_PATH"
+_TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES"
 _TF_NCCL_VERSION = "TF_NCCL_VERSION"
 _TF_NCCL_CONFIG_REPO = "TF_NCCL_CONFIG_REPO"
 
@@ -37,6 +41,12 @@ cc_library(
 """
 
 _NCCL_ARCHIVE_BUILD_CONTENT = """
+exports_files([
+    "cuda/bin/crt/link.stub",
+    "cuda/bin/fatbinary",
+    "nvlink",
+])
+
 filegroup(
   name = "LICENSE",
   data = ["@nccl_archive//:LICENSE.txt"],
@@ -50,113 +60,125 @@ alias(
 )
 """
 
-# Local build results in dynamic link and the license should not be included.
-_NCCL_REMOTE_BUILD_TEMPLATE = Label("//third_party/nccl:remote.BUILD.tpl")
-_NCCL_LOCAL_BUILD_TEMPLATE = Label("//third_party/nccl:system.BUILD.tpl")
+def _label(file):
+    return Label("//third_party/nccl:{}".format(file))
 
 def _find_nccl_header(repository_ctx, nccl_install_path):
-  """Finds the NCCL header on the system.
-
-  Args:
-    repository_ctx: The repository context.
-    nccl_install_path: The NCCL library install directory.
+    """Finds the NCCL header on the system.
 
-  Returns:
-    The path to the NCCL header.
-  """
-  header_path = repository_ctx.path("%s/include/nccl.h" % nccl_install_path)
-  if not header_path.exists:
-    auto_configure_fail("Cannot find %s" % str(header_path))
-  return header_path
+    Args:
+      repository_ctx: The repository context.
+      nccl_install_path: The NCCL library install directory.
 
+    Returns:
+      The path to the NCCL header.
+    """
+    header_path = repository_ctx.path("%s/include/nccl.h" % nccl_install_path)
+    if not header_path.exists:
+        auto_configure_fail("Cannot find %s" % str(header_path))
+    return header_path
 
 def _check_nccl_version(repository_ctx, nccl_install_path, nccl_hdr_path, nccl_version):
-  """Checks whether the header file matches the specified version of NCCL.
-
-  Args:
-    repository_ctx: The repository context.
-    nccl_install_path: The NCCL library install directory.
-    nccl_version: The expected NCCL version.
-
-  Returns:
-    A string containing the library version of NCCL.
-  """
-  header_path = repository_ctx.path("%s/nccl.h" % nccl_hdr_path)
-  if not header_path.exists:
-    header_path = _find_nccl_header(repository_ctx, nccl_install_path)
-  header_dir = str(header_path.realpath.dirname)
-  major_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
-                                   _DEFINE_NCCL_MAJOR)
-  minor_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
-                                   _DEFINE_NCCL_MINOR)
-  patch_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
-                                   _DEFINE_NCCL_PATCH)
-  header_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
-  if not matches_version(nccl_version, header_version):
-    auto_configure_fail(
-        ("NCCL library version detected from %s/nccl.h (%s) does not match " +
-         "TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
-        (header_dir, header_version, nccl_version))
-
-
-def _find_nccl_lib(repository_ctx, nccl_install_path, nccl_version):
-  """Finds the given NCCL library on the system.
-
-  Args:
-    repository_ctx: The repository context.
-    nccl_install_path: The NCCL library installation directory.
-    nccl_version: The version of NCCL library files as returned
-      by _nccl_version.
-
-  Returns:
-    The path to the NCCL library.
-  """
-  lib_path = repository_ctx.path("%s/lib/libnccl.so.%s" % (nccl_install_path,
-                                                           nccl_version))
-  if not lib_path.exists:
-    auto_configure_fail("Cannot find NCCL library %s" % str(lib_path))
-  return lib_path
-
+    """Checks whether the header file matches the specified version of NCCL.
+
+    Args:
+      repository_ctx: The repository context.
+      nccl_install_path: The NCCL library install directory.
+      nccl_hdr_path: The NCCL header path.
+      nccl_version: The expected NCCL version.
+
+    Returns:
+      A string containing the library version of NCCL.
+    """
+    header_path = repository_ctx.path("%s/nccl.h" % nccl_hdr_path)
+    if not header_path.exists:
+        header_path = _find_nccl_header(repository_ctx, nccl_install_path)
+    header_dir = str(header_path.realpath.dirname)
+    major_version = find_cuda_define(
+        repository_ctx,
+        header_dir,
+        "nccl.h",
+        _DEFINE_NCCL_MAJOR,
+    )
+    minor_version = find_cuda_define(
+        repository_ctx,
+        header_dir,
+        "nccl.h",
+        _DEFINE_NCCL_MINOR,
+    )
+    patch_version = find_cuda_define(
+        repository_ctx,
+        header_dir,
+        "nccl.h",
+        _DEFINE_NCCL_PATCH,
+    )
+    header_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+    if not matches_version(nccl_version, header_version):
+        auto_configure_fail(
+            ("NCCL library version detected from %s/nccl.h (%s) does not match " +
+             "TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
+            (header_dir, header_version, nccl_version),
+        )
 
 def _nccl_configure_impl(repository_ctx):
-  """Implementation of the nccl_configure repository rule."""
-  if _TF_NCCL_VERSION not in repository_ctx.os.environ:
-    # Add a dummy build file to make bazel query happy.
-    repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT)
-    return
-
-  if _TF_NCCL_CONFIG_REPO in repository_ctx.os.environ:
-    # Forward to the pre-configured remote repository.
-    repository_ctx.template("BUILD", _NCCL_REMOTE_BUILD_TEMPLATE, {
-        "%{target}": repository_ctx.os.environ[_TF_NCCL_CONFIG_REPO],
-    })
-    return
-
-  nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
-  if matches_version("1", nccl_version):
-    # Alias to GitHub target from @nccl_archive.
-    if not matches_version(nccl_version, "1.3"):
-      auto_configure_fail(
-          "NCCL from GitHub must use version 1.3 (got %s)" % nccl_version)
-    repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
-  else:
-    # Create target for locally installed NCCL.
-    nccl_install_path = repository_ctx.os.environ[_NCCL_INSTALL_PATH].strip()
-    nccl_hdr_path = repository_ctx.os.environ[_NCCL_HDR_PATH].strip()
-    _check_nccl_version(repository_ctx, nccl_install_path, nccl_hdr_path, nccl_version)
-    repository_ctx.template("BUILD", _NCCL_LOCAL_BUILD_TEMPLATE, {
-        "%{version}": nccl_version,
-        "%{install_path}": nccl_install_path,
-        "%{hdr_path}": nccl_hdr_path,
-    })
-
+    """Implementation of the nccl_configure repository rule."""
+    if _TF_NCCL_VERSION not in repository_ctx.os.environ:
+        # Add a dummy build file to make bazel query happy.
+        repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT)
+        return
+
+    if _TF_NCCL_CONFIG_REPO in repository_ctx.os.environ:
+        # Forward to the pre-configured remote repository.
+        repository_ctx.template("BUILD", _label("remote.BUILD.tpl"), {
+            "%{target}": repository_ctx.os.environ[_TF_NCCL_CONFIG_REPO],
+        })
+        return
+
+    nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
+    if nccl_version == "":
+        # Alias to open source build from @nccl_archive.
+        repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
+
+        # TODO(csigg): implement and reuse in cuda_configure.bzl.
+        gpu_architectures = [
+            "sm_" + capability.replace(".", "")
+            for capability in compute_capabilities(repository_ctx)
+        ]
+
+        # Round-about way to make the list unique.
+        gpu_architectures = dict(zip(gpu_architectures, gpu_architectures)).keys()
+        repository_ctx.template("build_defs.bzl", _label("build_defs.bzl.tpl"), {
+            "%{gpu_architectures}": str(gpu_architectures),
+        })
+
+        repository_ctx.symlink(cuda_toolkit_path(repository_ctx), "cuda")
+
+        # Temporary work-around for setups which symlink ptxas to a newer
+        # version. The versions of nvlink and ptxas need to agree, so we find
+        # nvlink next to the real location of ptxas. This is only temporary and
+        # will be removed again soon.
+        nvlink_dir = repository_ctx.path("cuda/bin/ptxas").realpath.dirname
+        repository_ctx.symlink(nvlink_dir.get_child("nvlink"), "nvlink")
+    else:
+        # Create target for locally installed NCCL.
+        nccl_install_path = repository_ctx.os.environ[_NCCL_INSTALL_PATH].strip()
+        nccl_hdr_path = repository_ctx.os.environ[_NCCL_HDR_PATH].strip()
+        _check_nccl_version(repository_ctx, nccl_install_path, nccl_hdr_path, nccl_version)
+        repository_ctx.template("BUILD", _label("system.BUILD.tpl"), {
+            "%{version}": nccl_version,
+            "%{install_path}": nccl_install_path,
+            "%{hdr_path}": nccl_hdr_path,
+        })
 
 nccl_configure = repository_rule(
-    implementation=_nccl_configure_impl,
-    environ=[
-        _NCCL_INSTALL_PATH,
+    implementation = _nccl_configure_impl,
+    environ = [
+        _CUDA_TOOLKIT_PATH,
         _NCCL_HDR_PATH,
+        _NCCL_INSTALL_PATH,
         _TF_NCCL_VERSION,
+        _TF_CUDA_COMPUTE_CAPABILITIES,
+        _TF_NCCL_CONFIG_REPO,
     ],
 )
 """Detects and configures the NCCL configuration.
-- 
GitLab


From d258207f1583df4faa452265b051879af6c15dac Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 08:55:53 -0700
Subject: [PATCH 465/570] BEGIN_PUBLIC Automated rollback of PR #21945
 END_PUBLIC Automated rollback of commit
 863f61412fcc654840c6b67473b742ea4e5e964e. Revert #21945.

PiperOrigin-RevId: 215913175
---
 tensorflow/python/ops/array_ops.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index e3e4d5f910..4be9c532f4 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1407,13 +1407,8 @@ def transpose(a, perm=None, name="transpose", conjugate=False):
         gen_array_ops.conjugate_transpose
         if (conjugate and a.dtype.is_complex) else gen_array_ops.transpose)
     if perm is None:
-      a = ops.convert_to_tensor(a, name="a")
-      if not a.get_shape().ndims:
-        rank = gen_array_ops.rank(a)
-        perm = (rank - 1) - gen_math_ops._range(0, rank, 1)
-      else:
-        rank = a.get_shape().ndims
-        perm = (rank - 1) - np.arange(rank)
+      rank = gen_array_ops.rank(a)
+      perm = (rank - 1) - gen_math_ops._range(0, rank, 1)
       ret = transpose_fn(a, perm, name=name)
       # NOTE(mrry): Setting the shape explicitly because
       #   reverse is not handled by the shape function.
-- 
GitLab


From 5a43e01ef0f8cb86d836a4d1c08a246630e26f8c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 09:29:00 -0700
Subject: [PATCH 466/570] Update XlaSort to match the underlying HLO.

PiperOrigin-RevId: 215917470
---
 tensorflow/compiler/tests/sort_ops_test.py    | 18 ++++++++++++++-
 .../compiler/tf2xla/kernels/sort_ops.cc       | 17 +++++++++++++-
 tensorflow/compiler/tf2xla/ops/xla_ops.cc     | 23 ++++++++++++++++++-
 tensorflow/compiler/tf2xla/python/xla.py      | 12 ++++++----
 .../compiler/xla/service/hlo_verifier.cc      |  2 +-
 5 files changed, 63 insertions(+), 9 deletions(-)

diff --git a/tensorflow/compiler/tests/sort_ops_test.py b/tensorflow/compiler/tests/sort_ops_test.py
index dbf4beb693..57f0ab7a9e 100644
--- a/tensorflow/compiler/tests/sort_ops_test.py
+++ b/tensorflow/compiler/tests/sort_ops_test.py
@@ -48,13 +48,29 @@ class XlaSortOpTest(xla_test.XLATestCase):
         self.assertAllClose(v, result, rtol=1e-3)
 
   def testSort(self):
-    supported_types = set([dtypes.bfloat16.as_numpy_dtype, np.float32])
+    supported_types = set(
+        [dtypes.bfloat16.as_numpy_dtype, np.float32, np.int32, np.uint32])
     for dtype in supported_types.intersection(self.numeric_types):
       x = np.arange(101, dtype=dtype)
       np.random.shuffle(x)
       self._assertOpOutputMatchesExpected(
           xla.sort, [x], expected=[np.arange(101, dtype=dtype)])
 
+  def testKeyValueSort(self):
+    supported_types = set(
+        [dtypes.bfloat16.as_numpy_dtype, np.float32, np.int32, np.uint32])
+    for key_type in supported_types.intersection(self.numeric_types):
+      for value_type in supported_types.intersection(self.numeric_types):
+        x = np.arange(101, dtype=key_type)
+        np.random.shuffle(x)
+        y = (-x).astype(value_type)
+        self._assertOpOutputMatchesExpected(
+            xla.key_value_sort, [x, y],
+            expected=[
+                np.arange(101, dtype=key_type),
+                -np.arange(101, dtype=value_type)
+            ])
+
   def testTopK(self):
     supported_types = set(
         [dtypes.bfloat16.as_numpy_dtype, np.float32, np.int32, np.uint32])
diff --git a/tensorflow/compiler/tf2xla/kernels/sort_ops.cc b/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
index aaeeae01cc..45f03d8c21 100644
--- a/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
@@ -25,11 +25,26 @@ class XlaSortOp : public XlaOpKernel {
   explicit XlaSortOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
 
   void Compile(XlaOpKernelContext* context) override {
-    context->SetOutput(0, xla::Sort(context->Input(0)));
+    context->SetOutput(0, xla::Sort(context->Input("input")));
   }
 };
 
 REGISTER_XLA_OP(Name("XlaSort"), XlaSortOp);
 
+class XlaKeyValueSortOp : public XlaOpKernel {
+ public:
+  explicit XlaKeyValueSortOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    xla::XlaOp result =
+        xla::Sort(context->Input("keys"), context->Input("values"));
+    context->SetOutput(0, xla::GetTupleElement(result, 0));
+    context->SetOutput(1, xla::GetTupleElement(result, 1));
+  }
+};
+
+REGISTER_XLA_OP(Name("XlaKeyValueSort"), XlaKeyValueSortOp);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
index 733eeed3c6..557911553d 100644
--- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
@@ -354,12 +354,33 @@ Wraps the XLA Sort operator, documented at
  https://www.tensorflow.org/performance/xla/operation_semantics#sort
 .
 
-Sorts a tensor. Currently only rank 1 sorts in ascending order are supported.
+Sorts a tensor. Currently only sorts in ascending order are supported.
 
 input: A `Tensor` of type T.
 output: A `Tensor` of type T.
 )doc");
 
+REGISTER_OP("XlaKeyValueSort")
+    .Input("keys: K")
+    .Input("values: V")
+    .Output("sorted_keys: K")
+    .Output("sorted_values: V")
+    .Attr("K: realnumbertype")
+    .Attr("V: type")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Wraps the XLA Sort operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#sort
+.
+
+Sorts a tensor. Currently only sorts in ascending order are supported.
+
+keys: A `Tensor` of type K.
+values: A `Tensor` of type V.
+sorted_keys: A `Tensor` of type K.
+sorted_values: A `Tensor` of type V.
+)doc");
+
 // TODO(b/37549631) setting the While Op to always be stateful is too
 // conservative.
 REGISTER_OP("XlaWhile")
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
index 27dd18a9bb..bc7924c371 100644
--- a/tensorflow/compiler/tf2xla/python/xla.py
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -212,9 +212,9 @@ bitcast_convert_type = array_ops.bitcast
 
 def broadcast(x, dims, name=None):
   x = ops.convert_to_tensor(x)
-  shape = array_ops.concat(
-      [constant_op.constant(dims),
-       array_ops.shape(x)], axis=0)
+  shape = array_ops.concat([constant_op.constant(dims),
+                            array_ops.shape(x)],
+                           axis=0)
   return array_ops.broadcast_to(x, shape, name=name)
 
 
@@ -332,12 +332,13 @@ def reduce_window(operand,
     init: a scalar tensor representing the initial value for the reduction
     reducer: a reduction function that combines a pair of scalars.
     window_dimensions: shape of the window, as a list of integers
-    window_strides: inter-window strides, as a list of integers. Optional;
-      if omitted, defaults to strides of 1.
+    window_strides: inter-window strides, as a list of integers. Optional; if
+      omitted, defaults to strides of 1.
     padding: padding to apply to 'operand'. List of (low, high) pairs of
       integers that specify the padding to apply before and after each
       dimension. Optional; if omitted, defaults to no padding.
     name: the operator name, or None.
+
   Returns:
     A tensor that represents the output of the reduce_window operator.
   """
@@ -377,4 +378,5 @@ def slice(x, start_dims, limit_dims, strides):
 
 
 sort = gen_xla_ops.xla_sort
+key_value_sort = gen_xla_ops.xla_key_value_sort
 while_loop = gen_xla_ops.xla_while
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index b5498bb936..c22ee03388 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -548,6 +548,7 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) {
     case HloOpcode::kTupleSelect:
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
+    case HloOpcode::kSort:
     case HloOpcode::kTuple:
     case HloOpcode::kWhile:
       break;
@@ -1153,7 +1154,6 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
   TF_RETURN_IF_ERROR(VerifyHloStructure(module));
   TF_RETURN_IF_ERROR(VerifySendsAndRecvs(*module));
 
-
   for (auto* computation : module->computations()) {
     std::unique_ptr<ShapeVerifier> shape_verifier = shape_verifier_factory_();
     TF_RETURN_IF_ERROR(computation->Accept(shape_verifier.get()));
-- 
GitLab


From 8b7c789e7401fe56b4f648a04f675a3cb69119e5 Mon Sep 17 00:00:00 2001
From: Jing Li <jingli@google.com>
Date: Fri, 5 Oct 2018 09:54:40 -0700
Subject: [PATCH 467/570] - Don't set tpu optimizer parameter variable during
 weight initialization if the optimizer isn't set, e.g. loading weights and
 then predict. - Add load_weights for `KerasTpuModel`.

PiperOrigin-RevId: 215920993
---
 tensorflow/contrib/tpu/python/tpu/keras_support.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index a3a7fd8bb0..af183b3232 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -1998,6 +1998,9 @@ class KerasTPUModel(models.Model):
 
     logging.info('Setting weights on TPU model.')
     cloned_model.set_weights(weights)
+    if self._tpu_model.optimizer is None:
+      # tpu_model may not be compiled, e.g., loading weights and then predict.
+      return
     for k, v in six.iteritems(cpu_optimizer_config):
       opt_var = getattr(self._tpu_model.optimizer, k)
       if isinstance(opt_var, variables.Variable):
@@ -2052,6 +2055,10 @@ class KerasTPUModel(models.Model):
     self._cpu_model.set_weights(weights)
     self._tpu_weights_initialized = False
 
+  def load_weights(self, filepath, by_name=False):
+    self._cpu_model.load_weights(filepath, by_name)
+    self._tpu_weights_initialized = False
+
 
 # pylint: disable=bad-continuation
 def _validate_shapes(model):
-- 
GitLab


From d493a7f2fdbbc29a292741135f4c1598352e876b Mon Sep 17 00:00:00 2001
From: Mingsheng Hong <hongm@google.com>
Date: Fri, 5 Oct 2018 10:31:23 -0700
Subject: [PATCH 468/570] When running a native/builtin op via eager C API,
 automatically fill in default attr values that are not overridden e.g.
 transpose_a in the matmul op).

This is required for backward compatibility (a binary built via an older version
of TF should still run on a newer version of TF, where some ops may have added
attrs).

For non-eager graph building, the default attr values of graph ops are added by
tensorflow::AddDefaultsToNodeDef().

We ran into this issue when running the same S4TF test cases via eager APIs --
some tests failed due to "missing attrs", but are fixed by this patch.

PiperOrigin-RevId: 215927271
---
 tensorflow/c/eager/c_api_test_util.cc            |  2 --
 .../core/common_runtime/eager/attr_builder.cc    | 16 ++++++++++++++++
 .../core/common_runtime/eager/attr_builder.h     |  6 ++++++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc
index 5607c9dcb0..008f088c2d 100644
--- a/tensorflow/c/eager/c_api_test_util.cc
+++ b/tensorflow/c/eager/c_api_test_util.cc
@@ -99,8 +99,6 @@ TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) {
   TFE_OpAddInput(op, b, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
-  TFE_OpSetAttrBool(op, "transpose_a", 0);
-  TFE_OpSetAttrBool(op, "transpose_b", 0);
   TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(a));
 
   return op;
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc
index cf1cd4134e..5c8369de87 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder.cc
@@ -136,6 +136,22 @@ void AttrBuilder::FillAttrValueMap(AttrValueMap* m,
       m->insert(*it);
     }
   }
+  // For any attr-value pairs that exist in the op def (from op registry) but
+  // not `m`, fill them into `m`, so that we can run a TFE_Op without having to
+  // specify all the default attr values (e.g. for matmul, the `transpose_a`
+  // attr defaults to false).
+  const OpDef* op_def = nullptr;
+  Status s = OpDefForOp(op_name_.c_str(), &op_def);
+  // This is expected, if this op is a custom function, and is therefore not
+  // present in the op registry.
+  if (!s.ok()) return;
+
+  DCHECK(op_def);
+  for (const auto& attr_def : op_def->attr()) {
+    if (attr_def.has_default_value() && !m->count(attr_def.name())) {
+      SetInAttrValueMap(m, attr_def.name(), attr_def.default_value());
+    }
+  }
 }
 
 const NodeDef& AttrBuilder::BuildNodeDef() {
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index cbe6a1cb50..c114ea4ba0 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -110,6 +110,12 @@ class AttrBuilder {
   using AttrVec = tensorflow::gtl::InlinedVector<std::pair<StringPiece, T>, 2>;
 
   void MayBeInitializeNodeDef();
+  // Fill `m` with the attr-value pairs set via AttrBuilder::Set() so far, as
+  // well as any default attr-value pairs from the associated op_def, if there
+  // is one.
+  //
+  // If `include_those_in_node_def` is true, also include any attr-value pairs
+  // from `node_def_`.
   void FillAttrValueMap(AttrValueMap* m, bool include_those_in_node_def) const;
 
   template <class T>
-- 
GitLab


From e2f80439c5bfee56581875219ea83cc5307854f5 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Fri, 5 Oct 2018 10:37:16 -0700
Subject: [PATCH 469/570] Refactoring TFLite export code. Unify OperatorCode
 generation logic.

PiperOrigin-RevId: 215928419
---
 tensorflow/contrib/lite/toco/tflite/export.cc | 176 ++++++++++--------
 tensorflow/contrib/lite/toco/tflite/export.h  |  19 +-
 .../contrib/lite/toco/tflite/export_test.cc   |  77 +++++---
 3 files changed, 163 insertions(+), 109 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index 45ca7f7f0c..f6f76e48a4 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -63,21 +63,21 @@ bool IsControlFlowOp(const string& tensorflow_op) {
   return false;
 }
 
-details::OperatorKey GetOperatorKey(
-    const ::toco::Operator& op,
-    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
-    bool allow_flex_ops) {
-  string custom_code;
-  if (op.type == OperatorType::kUnsupported) {
-    const TensorFlowUnsupportedOperator& unsupported_op =
-        static_cast<const TensorFlowUnsupportedOperator&>(op);
-    custom_code = unsupported_op.tensorflow_op;
-  }
-  int version = 1;
-  if (ops_by_type.count(op.type) != 0) {
-    version = ops_by_type.at(op.type)->GetVersion(op);
+// Map from operator name to TF Lite enum value, for all builtins.
+const std::map<string, BuiltinOperator>& GetBuiltinOpsMap() {
+  static std::map<string, BuiltinOperator>* builtin_ops = nullptr;
+  if (builtin_ops == nullptr) {
+    builtin_ops = new std::map<string, BuiltinOperator>();
+
+    for (int i = BuiltinOperator_MIN; i <= BuiltinOperator_MAX; ++i) {
+      BuiltinOperator op = static_cast<BuiltinOperator>(i);
+      string name = EnumNameBuiltinOperator(op);
+      if (op != BuiltinOperator_CUSTOM && !name.empty()) {
+        (*builtin_ops)[name] = op;
+      }
+    }
   }
-  return details::OperatorKey(op.type, custom_code, version, allow_flex_ops);
+  return *builtin_ops;
 }
 
 void WriteModelToString(const flatbuffers::FlatBufferBuilder& builder,
@@ -91,27 +91,59 @@ void WriteModelToString(const flatbuffers::FlatBufferBuilder& builder,
 
 namespace details {
 
-OperatorKey::OperatorKey(OperatorType type, const std::string& custom_code,
-                         int version, bool allow_flex_ops) {
-  this->type = type;
-  this->custom_code = custom_code;
-  this->version = version;
-
-  if (type == OperatorType::kUnsupported) {
-    // TODO(b/113715895): When `allow_flex_ops` is on, for now there's no way
-    // to populate a regular custom op. We need to find a way to fix this.
-    if (allow_flex_ops) {
-      // Memorize the original TensorFlow op name.
-      this->flex_tensorflow_op = custom_code;
-      // Prefix the custom code of the flex op.
-      this->custom_code = string(::tflite::kFlexCustomCodePrefix) + custom_code;
-      this->is_flex_op = true;
-
-      if (IsControlFlowOp(this->flex_tensorflow_op)) {
-        is_unsupported_flex_op = true;
+OperatorKey GetOperatorKey(
+    const ::toco::Operator& op,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
+    bool allow_flex_ops) {
+  string name = HelpfulOperatorTypeName(op);
+  const auto& builtin_ops = GetBuiltinOpsMap();
+
+  bool is_builtin = false;
+  OperatorKey key;
+  if (ops_by_type.count(op.type) != 0) {
+    key.version = ops_by_type.at(op.type)->GetVersion(op);
+    name = ops_by_type.at(op.type)->name();
+    is_builtin = (builtin_ops.count(name) > 0);
+  }
+
+  if (is_builtin) {
+    // For TFLite supported builtin ops, find out its BuiltinOperator enum used
+    // in FlatBuffer.
+    key.type = builtin_ops.at(name);
+  } else {
+    key.type = BuiltinOperator_CUSTOM;
+
+    key.is_custom_op = true;
+    if (op.type == OperatorType::kUnsupported) {
+      const TensorFlowUnsupportedOperator& unsupported_op =
+          static_cast<const TensorFlowUnsupportedOperator&>(op);
+      const auto tensorflow_op = unsupported_op.tensorflow_op;
+
+      // TODO(b/113715895): When `allow_flex_ops` is on, for now there's no way
+      // to populate a regular custom op. We need to find a way to fix this.
+      if (allow_flex_ops) {
+        // Memorize the original TensorFlow op name.
+        key.flex_tensorflow_op = tensorflow_op;
+        // Prefix the custom code of the flex op.
+        key.custom_code =
+            string(::tflite::kFlexCustomCodePrefix) + tensorflow_op;
+        key.is_flex_op = true;
+
+        if (IsControlFlowOp(tensorflow_op)) {
+          key.is_unsupported_flex_op = true;
+        }
+      } else {
+        key.custom_code = tensorflow_op;
       }
+    } else {
+      // For Toco-supported/TFLite-unsupported ops, currently we produce a
+      // custom op. This gives developers a chance to implement custom ops.
+      // TODO(b/116800229): Also produce Toco-supported/TFLite-unsupported ops
+      // as Flex ops when Flex mode is enabled.
+      key.custom_code = name;
     }
   }
+  return key;
 }
 
 void LoadTensorsMap(const Model& model, TensorsMap* tensors_map) {
@@ -145,6 +177,7 @@ void LoadOperatorsMap(
     ++index;
   }
 }
+
 }  // namespace details
 
 Offset<Vector<Offset<Tensor>>> ExportTensors(
@@ -230,7 +263,7 @@ Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
     const Model& model,
     const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
     const details::OperatorsMap& operators_map, FlatBufferBuilder* builder,
-    std::set<string>* unsupported_ops, const ExportParams& params) {
+    const ExportParams& params) {
   // Map from operator name to TF Lite enum value, for all builtins.
   std::map<string, BuiltinOperator> builtin_ops;
   for (int i = BuiltinOperator_MIN; i <= BuiltinOperator_MAX; ++i) {
@@ -247,37 +280,16 @@ Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
 
   for (const auto& op : model.operators) {
     const details::OperatorKey operator_key =
-        GetOperatorKey(*op, ops_by_type, params.allow_flex_ops);
+        details::GetOperatorKey(*op, ops_by_type, params.allow_flex_ops);
     int op_index = operators_map.at(operator_key);
-    int op_version = operator_key.version;
 
-    string name = HelpfulOperatorTypeName(*op);
-    bool is_builtin = false;
-    if (ops_by_type.count(op->type) != 0) {
-      name = ops_by_type.at(op->type)->name();
-      is_builtin = (builtin_ops.count(name) > 0);
+    flatbuffers::Offset<flatbuffers::String> custom_code = 0;
+    if (!operator_key.custom_code.empty()) {
+      custom_code = builder->CreateString(operator_key.custom_code);
     }
 
-    if (is_builtin) {
-      ordered_opcodes[op_index] =
-          CreateOperatorCode(*builder, builtin_ops[name], 0, op_version);
-    } else {
-      // This could be a kUnsupported, in which case we should be
-      // able to retrieve the original Tensorflow name from the OperatorKey, or
-      // this could be a proper TOCO operator that is completely unknown to TF
-      // Lite.
-      if (!operator_key.custom_code.empty()) {
-        name = operator_key.custom_code;
-      }
-      // Either way, this is an operator that is not supported by TF Lite,
-      // so we output it as a custom op and add it to the error summary.
-      if (unsupported_ops) {
-        unsupported_ops->insert(name);
-      }
-      ordered_opcodes[op_index] =
-          CreateOperatorCode(*builder, BuiltinOperator_CUSTOM,
-                             builder->CreateString(name), op_version);
-    }
+    ordered_opcodes[op_index] = CreateOperatorCode(
+        *builder, operator_key.type, custom_code, operator_key.version);
   }
 
   std::vector<Offset<OperatorCode>> opcode_vector;
@@ -312,7 +324,7 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
     }
 
     int op_index = operators_map.at(
-        GetOperatorKey(*op, ops_by_type, params.allow_flex_ops));
+        details::GetOperatorKey(*op, ops_by_type, params.allow_flex_ops));
 
     auto tflite_op_it = ops_by_type.find(op->type);
     BaseOperator* tflite_op = tflite_op_it == ops_by_type.end()
@@ -386,9 +398,8 @@ void Export(
   Array empty_array;
   buffers_to_write.push_back(&empty_array);
 
-  std::set<string> unsupported_ops;
-  auto op_codes = ExportOperatorCodes(model, ops_by_type, operators_map,
-                                      &builder, &unsupported_ops, params);
+  auto op_codes =
+      ExportOperatorCodes(model, ops_by_type, operators_map, &builder, params);
 
   for (const auto& op : model.operators) {
     if (op->type == OperatorType::kFakeQuant) {
@@ -398,7 +409,20 @@ void Export(
                       "for --std_values and --mean_values.";
     }
   }
-  if (!unsupported_ops.empty()) {
+
+  std::set<string> custom_ops;
+  std::set<string> unsupported_flex_ops;
+  for (const auto& it : operators_map) {
+    const details::OperatorKey& key = it.first;
+    if (key.is_custom_op) {
+      custom_ops.insert(key.custom_code);
+    }
+    if (key.is_unsupported_flex_op) {
+      unsupported_flex_ops.insert(key.flex_tensorflow_op);
+    }
+  }
+
+  if (!custom_ops.empty()) {
     if (!params.allow_custom_ops) {
       // Remove ExpandDims and ReorderAxes from unimplemented list unless they
       // compose the list. Both ops are removed during graph transformations.
@@ -406,14 +430,14 @@ void Export(
       // transformation is unable to run because the output shape is not
       // defined. This causes unnecessary confusion during model conversion
       // time.
-      std::set<string> unsupported_ops_final;
-      for (const auto& op_type : unsupported_ops) {
+      std::set<string> custom_ops_final;
+      for (const auto& op_type : custom_ops) {
         if (op_type != "ReorderAxes" && op_type != "ExpandDims") {
-          unsupported_ops_final.insert(op_type);
+          custom_ops_final.insert(op_type);
         }
       }
-      if (unsupported_ops_final.empty()) {
-        unsupported_ops_final = unsupported_ops;
+      if (custom_ops_final.empty()) {
+        custom_ops_final = custom_ops;
       }
 
       LOG(QFATAL)
@@ -423,13 +447,13 @@ void Export(
              "--allow_custom_ops, or by setting allow_custom_ops=True "
              "when calling tf.contrib.lite.TFLiteConverter(). Here is a list "
              "of operators for which  you will need custom implementations: "
-          << absl::StrJoin(unsupported_ops_final, ", ") << ".";
+          << absl::StrJoin(custom_ops_final, ", ") << ".";
     }
 
     std::set<string> unsupported_control_flow_ops;
     // Check if unsupported ops contains control flow ops. It's impossible
     // to implement these ops as custom ops at the moment.
-    for (const auto& op : unsupported_ops) {
+    for (const auto& op : custom_ops) {
       if (IsControlFlowOp(op)) {
         unsupported_control_flow_ops.insert(op);
       }
@@ -441,14 +465,6 @@ void Export(
     }
   }
 
-  std::set<string> unsupported_flex_ops;
-  for (const auto& it : operators_map) {
-    const details::OperatorKey& key = it.first;
-    if (key.is_unsupported_flex_op) {
-      unsupported_flex_ops.insert(key.custom_code);
-    }
-  }
-
   if (!unsupported_flex_ops.empty()) {
     LOG(QFATAL) << "Some of the operators in the model are not supported by "
                    "TensorFlow Flex runtime: "
diff --git a/tensorflow/contrib/lite/toco/tflite/export.h b/tensorflow/contrib/lite/toco/tflite/export.h
index 9efb282c6c..c627f48086 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.h
+++ b/tensorflow/contrib/lite/toco/tflite/export.h
@@ -81,16 +81,20 @@ using TensorsMap = std::unordered_map<string, int>;
 // Only when `type` is `kUnsupported`, `custom_code` is filled to
 // identify which operation is used.
 struct OperatorKey {
-  OperatorKey(OperatorType type, const std::string& custom_code, int version,
-              bool allow_flex_ops = false);
+  OperatorKey() {}
+  OperatorKey(::tflite::BuiltinOperator type, const std::string& custom_code,
+              int version)
+      : type(type), custom_code(custom_code), version(version) {}
 
   // Only `type`, `custom_code` and `version` is used to compute hash and
   // identity.
-  OperatorType type;
+  ::tflite::BuiltinOperator type = ::tflite::BuiltinOperator_CUSTOM;
   std::string custom_code;
-  int version;
+  int version = 1;
 
-  // THe fields below are not used to compute hash and identity.
+  // The fields below are not used to compute hash and identity.
+  // TODO(ycling): Consider to change these fields to accessor functions.
+  bool is_custom_op = false;
   bool is_flex_op = false;
   bool is_unsupported_flex_op = false;
   // The original TensorFlow op name for the flex op. Filled only when
@@ -124,6 +128,11 @@ struct OperatorKey {
   };
 };
 
+OperatorKey GetOperatorKey(
+    const ::toco::Operator& op,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
+    bool allow_flex_ops);
+
 // A maps from operator type to its final position in the TF Lite buffer.
 using OperatorsMap = std::unordered_map<OperatorKey, int, OperatorKey::Hash>;
 
diff --git a/tensorflow/contrib/lite/toco/tflite/export_test.cc b/tensorflow/contrib/lite/toco/tflite/export_test.cc
index a71a64d56f..d48ab78285 100644
--- a/tensorflow/contrib/lite/toco/tflite/export_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export_test.cc
@@ -105,13 +105,15 @@ TEST_F(ExportTest, LoadOperatorsMap) {
 
   details::OperatorsMap operators;
   const auto ops_by_type = BuildOperatorByTypeMap();
-  // TODO(ycling): Add a test for allow_flex_ops.
   details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
-  EXPECT_EQ(0, operators[details::OperatorKey(OperatorType::kAdd, "", 1)]);
-  EXPECT_EQ(1, operators[details::OperatorKey(OperatorType::kConv, "", 1)]);
-  EXPECT_EQ(2, operators[details::OperatorKey(OperatorType::kSub, "", 1)]);
-  EXPECT_EQ(3, operators[details::OperatorKey(OperatorType::kUnsupported,
+  EXPECT_EQ(
+      0, operators[details::OperatorKey(::tflite::BuiltinOperator_ADD, "", 1)]);
+  EXPECT_EQ(1, operators[details::OperatorKey(::tflite::BuiltinOperator_CONV_2D,
+                                              "", 1)]);
+  EXPECT_EQ(2, operators[details::OperatorKey(::tflite::BuiltinOperator_CUSTOM,
                                               "MyCrazyOp", 1)]);
+  EXPECT_EQ(
+      3, operators[details::OperatorKey(::tflite::BuiltinOperator_SUB, "", 1)]);
 }
 
 TEST_F(ExportTest, Export) {
@@ -133,7 +135,7 @@ TEST_F(ExportTest, Export) {
   }
 
   EXPECT_THAT(names, ElementsAre("builtin:ADD", "builtin:CONV_2D",
-                                 "builtin:SUB", "custom:MyCrazyOp"));
+                                 "custom:MyCrazyOp", "builtin:SUB"));
 
   std::vector<uint32_t> indices;
   auto operators = (*model->subgraphs())[0]->operators();
@@ -142,7 +144,7 @@ TEST_F(ExportTest, Export) {
     indices.push_back(op->opcode_index());
   }
 
-  EXPECT_THAT(indices, ElementsAre(1, 0, 3, 2));
+  EXPECT_THAT(indices, ElementsAre(1, 0, 2, 3));
 }
 
 TEST_F(ExportTest, QuantizeWeights) {
@@ -257,7 +259,8 @@ TEST_F(VersionedOpExportTest, LoadOperatorsMapWithOpV1) {
   details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
 
   EXPECT_EQ(1, operators.size());
-  EXPECT_EQ(0, operators.at(details::OperatorKey(OperatorType::kConv, "", 1)));
+  EXPECT_EQ(0, operators.at(details::OperatorKey(
+                   ::tflite::BuiltinOperator_CONV_2D, "", 1)));
 }
 
 TEST_F(VersionedOpExportTest, LoadOperatorsMapWithOpV2) {
@@ -268,7 +271,8 @@ TEST_F(VersionedOpExportTest, LoadOperatorsMapWithOpV2) {
   details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
 
   EXPECT_EQ(1, operators.size());
-  EXPECT_EQ(0, operators.at(details::OperatorKey(OperatorType::kConv, "", 2)));
+  EXPECT_EQ(0, operators.at(details::OperatorKey(
+                   ::tflite::BuiltinOperator_CONV_2D, "", 2)));
 }
 
 TEST_F(VersionedOpExportTest, LoadOperatorsMapWithBothVersions) {
@@ -280,8 +284,10 @@ TEST_F(VersionedOpExportTest, LoadOperatorsMapWithBothVersions) {
   details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
 
   EXPECT_EQ(2, operators.size());
-  EXPECT_EQ(0, operators.at(details::OperatorKey(OperatorType::kConv, "", 1)));
-  EXPECT_EQ(1, operators.at(details::OperatorKey(OperatorType::kConv, "", 2)));
+  EXPECT_EQ(0, operators.at(details::OperatorKey(
+                   ::tflite::BuiltinOperator_CONV_2D, "", 1)));
+  EXPECT_EQ(1, operators.at(details::OperatorKey(
+                   ::tflite::BuiltinOperator_CONV_2D, "", 2)));
 }
 
 TEST_F(VersionedOpExportTest, Export) {
@@ -314,38 +320,61 @@ TEST_F(VersionedOpExportTest, Export) {
 }
 
 TEST(OperatorKeyTest, TestBuiltinOp) {
-  details::OperatorKey key(OperatorType::kConv, "", 2);
-  EXPECT_EQ(key.type, OperatorType::kConv);
+  auto op = absl::make_unique<ConvOperator>();
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
+  const auto key = details::GetOperatorKey(*op, ops_by_type, false);
+
+  EXPECT_EQ(key.type, ::tflite::BuiltinOperator_CONV_2D);
   EXPECT_EQ(key.custom_code, "");
-  EXPECT_EQ(key.version, 2);
+  EXPECT_EQ(key.version, 1);
+}
+
+TEST(OperatorKeyTest, TestCustomOp) {
+  auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
+  op->tensorflow_op = "MyCrazyCustomOp";
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
+  const auto key = details::GetOperatorKey(*op, ops_by_type, false);
+
+  EXPECT_EQ(key.type, ::tflite::BuiltinOperator_CUSTOM);
+  EXPECT_EQ(key.custom_code, "MyCrazyCustomOp");
+  EXPECT_EQ(key.version, 1);
 }
 
 TEST(OperatorKeyTest, TestFlexOp) {
+  auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
+  op->tensorflow_op = "BatchMatMul";
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
   {
-    details::OperatorKey key(OperatorType::kUnsupported, "SomeUnsupportedOp", 1,
-                             false);
-    EXPECT_EQ(key.type, OperatorType::kUnsupported);
+    const auto key = details::GetOperatorKey(*op, ops_by_type, false);
     // It shouldn't be converted to Flex op if `allow_flex_op` is false.
-    EXPECT_EQ(key.custom_code, "SomeUnsupportedOp");
+    EXPECT_EQ(key.type, ::tflite::BuiltinOperator_CUSTOM);
+    EXPECT_EQ(key.custom_code, "BatchMatMul");
     EXPECT_EQ(key.version, 1);
     EXPECT_FALSE(key.is_flex_op);
   }
 
   {
-    details::OperatorKey key(OperatorType::kUnsupported, "SomeUnsupportedOp", 1,
-                             true);
-    EXPECT_EQ(key.type, OperatorType::kUnsupported);
     // Verify that the custom op name is prefixed by "Flex" and `is_flex_op`
     // is true.
-    EXPECT_EQ(key.custom_code, "FlexSomeUnsupportedOp");
+    const auto key = details::GetOperatorKey(*op, ops_by_type, true);
+    EXPECT_EQ(key.type, ::tflite::BuiltinOperator_CUSTOM);
+    EXPECT_EQ(key.custom_code, "FlexBatchMatMul");
     EXPECT_EQ(key.version, 1);
     EXPECT_TRUE(key.is_flex_op);
   }
 }
 
 TEST(OperatorKeyTest, TestFlexWithControlFlowOp) {
-  details::OperatorKey key(OperatorType::kUnsupported, "Merge", 1, true);
-  EXPECT_EQ(key.type, OperatorType::kUnsupported);
+  auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
+  op->tensorflow_op = "Merge";
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
+  const auto key = details::GetOperatorKey(*op, ops_by_type, true);
+
+  EXPECT_EQ(key.type, ::tflite::BuiltinOperator_CUSTOM);
   EXPECT_EQ(key.custom_code, "FlexMerge");
   EXPECT_EQ(key.version, 1);
   EXPECT_TRUE(key.is_flex_op);
-- 
GitLab


From dd8afaad37fdb284dce3518a9be22aca1c25e475 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 10:48:51 -0700
Subject: [PATCH 470/570] Fix documentation.

PiperOrigin-RevId: 215930596
---
 tensorflow/python/framework/importer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index c6595918ae..c9ac27e788 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -370,7 +370,8 @@ def import_graph_def(graph_def,
 
   Returns:
     A list of `Operation` and/or `Tensor` objects from the imported graph,
-    corresponding to the names in `return_elements`.
+    corresponding to the names in `return_elements`,
+    and None if `returns_elements` is None.
 
   Raises:
     TypeError: If `graph_def` is not a `GraphDef` proto,
-- 
GitLab


From f410ffc1699e864e84857089183db0d952ada7fe Mon Sep 17 00:00:00 2001
From: Andreas Madsen <amwebdk@gmail.com>
Date: Thu, 26 Jul 2018 15:44:39 +0200
Subject: [PATCH 471/570] make sparsemax nan and infinity safe

logits that are -inf will be given 0 probability and logits that are
inf will result in a nan output. Likewise if all logits are -inf the
output will also be nan.

This is done by using where operators, mostly because 0 * inf = nan
and x/0 = sign(x) inf following the IEEE 754 standard. However these
results are not mathematically correct in the context of the sparsemax
algorithm.

Fixes: https://github.com/tensorflow/tensorflow/issues/15564
---
 .../kernel_tests/sparsemax_loss_test.py       | 64 +++++++++++++++++++
 .../python/kernel_tests/sparsemax_test.py     | 63 +++++++++++++++++-
 .../contrib/sparsemax/python/ops/sparsemax.py | 30 ++++++++-
 .../sparsemax/python/ops/sparsemax_loss.py    | 32 ++++++++--
 4 files changed, 178 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
index 360e7dbe75..2db76a6d56 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
@@ -109,6 +109,66 @@ class SparsemaxLossTest(test.TestCase):
         np_loss, tf_loss_out, half_atol=1e-2, half_rtol=5e-3)
     self.assertShapeEqual(np_loss, tf_loss_op)
 
+  def _test_sparsemax_loss_of_nan(self, dtype, random, use_gpu):
+    """check sparsemax-loss transfers nan"""
+    q = np.asarray([
+        [0, 0, 1],
+        [0, 0, 1],
+        [0, 0, 1]
+    ])
+    z_nan = np.asarray([
+        [0, np.nan, 0],
+        [0, np.nan, np.nan],
+        [np.nan, np.nan, np.nan]
+    ]).astype(dtype)
+
+    _, tf_loss_nan = self._tf_sparsemax_loss(z_nan, q, dtype, use_gpu)
+    self.assertAllCloseAccordingToType(
+        [np.nan, np.nan, np.nan],
+        tf_loss_nan)
+
+  def _test_sparsemax_loss_of_inf(self, dtype, random, use_gpu):
+    """check sparsemax-loss is infinity safe"""
+    q = np.asarray([
+        [0, 0, 1],
+        [0, 0, 1],
+        [0, 0, 1],
+        [0, 0, 1]
+    ])
+    z_neg = np.asarray([
+        [0, -np.inf, 0],
+        [0, -np.inf, -np.inf],
+        [-np.inf, -np.inf, 0],
+        [-np.inf, -np.inf, -np.inf],
+    ]).astype(dtype)
+    z_pos = np.asarray([
+        [0, np.inf, 0],
+        [0, np.inf, np.inf],
+        [np.inf, np.inf, 0],
+        [np.inf, np.inf, np.inf]
+    ]).astype(dtype)
+    z_mix = np.asarray([
+        [0, np.inf, 0],
+        [0, np.inf, -np.inf],
+        [-np.inf, np.inf, 0],
+        [-np.inf, np.inf, -np.inf]
+    ]).astype(dtype)
+
+    _, tf_loss_neg = self._tf_sparsemax_loss(z_neg, q, dtype, use_gpu)
+    self.assertAllCloseAccordingToType(
+        [0.25, np.inf, 0, np.nan],
+        tf_loss_neg)
+
+    _, tf_loss_pos = self._tf_sparsemax_loss(z_pos, q, dtype, use_gpu)
+    self.assertAllCloseAccordingToType(
+        [np.nan, np.nan, np.nan, np.nan],
+        tf_loss_pos)
+
+    _, tf_loss_mix = self._tf_sparsemax_loss(z_mix, q, dtype, use_gpu)
+    self.assertAllCloseAccordingToType(
+        [np.nan, np.nan, np.nan, np.nan],
+        tf_loss_mix)
+
   def _test_constant_add(self, dtype, random, use_gpu):
     """check sparsemax-loss proposition 3"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10))
@@ -198,6 +258,10 @@ class SparsemaxLossTest(test.TestCase):
 
     self._test_sparsemax_loss_against_numpy(dtype, random, use_gpu=False)
 
+    self._test_sparsemax_loss_of_nan(dtype, random, use_gpu=False)
+
+    self._test_sparsemax_loss_of_inf(dtype, random, use_gpu=False)
+
     self._test_constant_add(dtype, random, use_gpu=False)
 
     self._test_sparsemax_loss_positive(dtype, random, use_gpu=False)
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
index 259e62bd86..38c6dd15db 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
@@ -87,6 +87,61 @@ class SparsemaxTest(test.TestCase):
         p_sparemax, tf_sparsemax_out, half_atol=5e-3)
     self.assertShapeEqual(p_sparemax, tf_sparsemax_op)
 
+  def _test_sparsemax_of_nan(self, dtype, random, use_gpu):
+    """check sparsemax transfers nan"""
+    z_nan = np.asarray([
+        [0, np.nan, 0],
+        [0, np.nan, np.nan],
+        [np.nan, np.nan, np.nan],
+    ]).astype(dtype)
+
+    _, tf_sparsemax_nan = self._tf_sparsemax(z_nan, dtype, use_gpu)
+    self.assertAllCloseAccordingToType([
+        [np.nan, np.nan, np.nan],
+        [np.nan, np.nan, np.nan],
+        [np.nan, np.nan, np.nan]
+    ], tf_sparsemax_nan)
+
+  def _test_sparsemax_of_inf(self, dtype, random, use_gpu):
+    """check sparsemax is infinity safe"""
+    z_neg = np.asarray([
+        [0, -np.inf, 0],
+        [0, -np.inf, -np.inf],
+        [-np.inf, -np.inf, -np.inf],
+    ]).astype(dtype)
+    z_pos = np.asarray([
+        [0, np.inf, 0],
+        [0, np.inf, np.inf],
+        [np.inf, np.inf, np.inf]
+    ]).astype(dtype)
+    z_mix = np.asarray([
+        [0, np.inf, 0],
+        [0, np.inf, -np.inf],
+        [-np.inf, np.inf, -np.inf]
+    ]).astype(dtype)
+
+    _, tf_sparsemax_neg = self._tf_sparsemax(z_neg, dtype, use_gpu)
+    self.assertAllCloseAccordingToType([
+        [0.5, 0, 0.5],
+        [1, 0, 0],
+        [np.nan, np.nan, np.nan]
+    ], tf_sparsemax_neg)
+
+    _, tf_sparsemax_pos = self._tf_sparsemax(z_pos, dtype, use_gpu)
+    self.assertAllCloseAccordingToType([
+        [np.nan, np.nan, np.nan],
+        [np.nan, np.nan, np.nan],
+        [np.nan, np.nan, np.nan]
+    ], tf_sparsemax_pos)
+
+    _, tf_sparsemax_mix = self._tf_sparsemax(z_mix, dtype, use_gpu)
+    self.assertAllCloseAccordingToType([
+        [np.nan, np.nan, np.nan],
+        [np.nan, np.nan, np.nan],
+        [np.nan, np.nan, np.nan]
+    ], tf_sparsemax_mix)
+
+
   def _test_sparsemax_of_zero(self, dtype, random, use_gpu):
     """check sparsemax proposition 1, part 1"""
     z = np.zeros((1, 10))
@@ -97,7 +152,7 @@ class SparsemaxTest(test.TestCase):
     self.assertAllCloseAccordingToType(p_sparemax, tf_sparsemax_out)
     self.assertShapeEqual(p_sparemax, tf_sparsemax_op)
 
-  def _test_sparsemax_of_inf(self, dtype, random, use_gpu):
+  def _test_sparsemax_of_to_inf(self, dtype, random, use_gpu):
     """check sparsemax proposition 1, part 2"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10))
 
@@ -210,10 +265,14 @@ class SparsemaxTest(test.TestCase):
 
     self._test_sparsemax_against_numpy(dtype, random, use_gpu=False)
 
-    self._test_sparsemax_of_zero(dtype, random, use_gpu=False)
+    self._test_sparsemax_of_nan(dtype, random, use_gpu=False)
 
     self._test_sparsemax_of_inf(dtype, random, use_gpu=False)
 
+    self._test_sparsemax_of_zero(dtype, random, use_gpu=False)
+
+    self._test_sparsemax_of_to_inf(dtype, random, use_gpu=False)
+
     self._test_constant_add(dtype, random, use_gpu=False)
 
     self._test_permutation(dtype, random, use_gpu=False)
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
index e617af2ff1..f903b629c7 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
@@ -49,7 +49,14 @@ def sparsemax(logits, name=None):
     obs = array_ops.shape(logits)[0]
     dims = array_ops.shape(logits)[1]
 
-    z = logits - math_ops.reduce_mean(logits, axis=1)[:, array_ops.newaxis]
+    # In the paper, they call the logits z.
+    # The mean(logits) can be substracted from logits to make the algorithm
+    # more numerically stable. the instability in this algorithm comes mostly
+    # from the z_cumsum. Substacting the mean will cause z_cumsum to be close
+    # to zero. However, in practise the numerical instability issues are very
+    # minor and substacting the mean causes extra issues with inf and nan
+    # input.
+    z = logits
 
     # sort z
     z_sorted, _ = nn.top_k(z, k=dims)
@@ -64,10 +71,27 @@ def sparsemax(logits, name=None):
     k_z = math_ops.reduce_sum(math_ops.cast(z_check, dtypes.int32), axis=1)
 
     # calculate tau(z)
-    indices = array_ops.stack([math_ops.range(0, obs), k_z - 1], axis=1)
+    # If there are inf values or all values are -inf, the k_z will be zero,
+    # this is mathematically invalid and will also cause the gather_nd to fail.
+    # Prevent this issue for now by setting k_z = 1 if k_z = 0, this is then
+    # fixed later (see p_safe) by returning p = nan. This results in the same
+    # behavior as softmax.
+    k_z_safe = math_ops.maximum(k_z, 1)
+    indices = array_ops.stack([math_ops.range(0, obs), k_z_safe - 1], axis=1)
     tau_sum = array_ops.gather_nd(z_cumsum, indices)
     tau_z = (tau_sum - 1) / math_ops.cast(k_z, logits.dtype)
 
     # calculate p
-    return math_ops.maximum(
+    p = math_ops.maximum(
         math_ops.cast(0, logits.dtype), z - tau_z[:, array_ops.newaxis])
+    # If k_z = 0 or if z = nan, then the input is invalid
+    p_safe = array_ops.where(
+        math_ops.logical_or(
+            math_ops.equal(k_z, 0),
+            math_ops.is_nan(z_cumsum[:, -1])
+        ),
+        array_ops.fill([obs, dims], math_ops.cast(float('nan'), logits.dtype)),
+        p
+    )
+
+    return p_safe
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
index 582d1e6136..9095cfe267 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
@@ -47,14 +47,34 @@ def sparsemax_loss(logits, sparsemax, labels, name=None):
     sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax")
     labels = ops.convert_to_tensor(labels, name="labels")
 
-    shifted_logits = logits - \
-        math_ops.reduce_mean(logits, axis=1)[:, array_ops.newaxis]
+    # In the paper, they call the logits z.
+    # A constant can be substracted from logits to make the algorithm
+    # more numerically stable in theory. However, there are really no major
+    # source numerical instability in this algorithm.
+    z = logits
 
     # sum over support
-    support = math_ops.cast(sparsemax > 0, sparsemax.dtype)
-    sum_s = support * sparsemax * (shifted_logits - 0.5 * sparsemax)
+    # Use a conditional where instead of a multiplication to support z = -inf.
+    # If z = -inf, and there is no support (sparsemax = 0), a multiplication
+    # would cause 0 * -inf = nan, which is not correct in this case.
+    sum_s = array_ops.where(
+        math_ops.logical_or(sparsemax > 0, math_ops.is_nan(sparsemax)),
+        sparsemax * (z - 0.5 * sparsemax),
+        array_ops.zeros_like(sparsemax)
+    )
 
     # - z_k + ||q||^2
-    q_part = labels * (0.5 * labels - shifted_logits)
+    q_part = labels * (0.5 * labels - z)
+    # Fix the case where labels = 0 and z = -inf, where q_part would
+    # otherwise be 0 * -inf = nan. But since the lables = 0, no cost for
+    # z = -inf should be consideredself.
+    # The code below also coveres the case where z = inf. Howeverm in this
+    # caose the sparsemax will be nan, which means the sum_s will also be nan,
+    # therefor this case doesn't need addtional special treatment.
+    q_part_safe = array_ops.where(
+        math_ops.logical_and(math_ops.equal(labels, 0), math_ops.is_inf(z)),
+        array_ops.zeros_like(z),
+        q_part
+    )
 
-    return math_ops.reduce_sum(sum_s + q_part, axis=1)
+    return math_ops.reduce_sum(sum_s + q_part_safe, axis=1)
-- 
GitLab


From b1325838aaf902e52fae4b085c6396848c445062 Mon Sep 17 00:00:00 2001
From: Tong Shen <endlessroad@google.com>
Date: Fri, 5 Oct 2018 11:13:53 -0700
Subject: [PATCH 472/570] Declare that stateless random ops are not
 differentiable in C++ code.

PiperOrigin-RevId: 215935319
---
 tensorflow/core/BUILD                        |  1 +
 tensorflow/core/ops/stateless_random_grad.cc | 23 ++++++++++++++++++++
 2 files changed, 24 insertions(+)
 create mode 100644 tensorflow/core/ops/stateless_random_grad.cc

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 6a3ee3c1cb..900a0e11c4 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1242,6 +1242,7 @@ cc_library(
     srcs = [
         "ops/math_grad.cc",
         "ops/random_grad.cc",
+        "ops/stateless_random_grad.cc",
     ],
     linkstatic = 1,  # Needed since alwayslink is broken in bazel b/27630669
     visibility = ["//visibility:public"],
diff --git a/tensorflow/core/ops/stateless_random_grad.cc b/tensorflow/core/ops/stateless_random_grad.cc
new file mode 100644
index 0000000000..331e1d0152
--- /dev/null
+++ b/tensorflow/core/ops/stateless_random_grad.cc
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/function.h"
+
+namespace tensorflow {
+REGISTER_OP_NO_GRADIENT("StatelessRandomUniform");
+REGISTER_OP_NO_GRADIENT("StatelessRandomNormal");
+REGISTER_OP_NO_GRADIENT("StatelessTruncatedNormal");
+REGISTER_OP_NO_GRADIENT("StatelessMultinomial");
+}  // end namespace tensorflow
-- 
GitLab


From 1e446b37620dcdca73e855c83efcc0d14bb68a8c Mon Sep 17 00:00:00 2001
From: Igor Ganichev <iga@google.com>
Date: Fri, 5 Oct 2018 11:27:03 -0700
Subject: [PATCH 473/570] Make gradient tape stack thread local

PiperOrigin-RevId: 215937618
---
 tensorflow/python/eager/pywrap_tfe_src.cc | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 6193f40ce8..6d3ef9a37b 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1228,8 +1228,9 @@ static PyTypeObject TFE_Py_Tape_Type = {
 // GIL, which is always held when any TFE_Py_* methods are called. We should
 // revisit this if/when decide to not hold the GIL while manipulating the tape
 // stack.
-static tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>* tape_set = nullptr;
 tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>* GetTapeSet() {
+  thread_local tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>* tape_set{
+      nullptr};
   if (tape_set == nullptr) {
     tape_set = new tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>;
   }
@@ -1264,27 +1265,10 @@ class SafeTapeSet {
   tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*> tape_set_;
 };
 
-// xcode 7 doesn't define thread_local, so for compatibility we implement our
-// own. TODO(apassos) remove once we can deprecate xcode 7.
-#ifndef __APPLE__
 bool* ThreadTapeIsStopped() {
   thread_local bool thread_tape_is_stopped{false};
   return &thread_tape_is_stopped;
 }
-#else
-static std::unordered_map<std::thread::id, bool>* tape_is_stopped = nullptr;
-bool* ThreadTapeIsStopped() {
-  if (tape_is_stopped == nullptr) {
-    tape_is_stopped = new std::unordered_map<std::thread::id, bool>;
-  }
-  auto it = tape_is_stopped->find(std::this_thread::get_id());
-  if (it != tape_is_stopped->end()) {
-    return &(it->second);
-  }
-  return &(tape_is_stopped->emplace(std::this_thread::get_id(), false)
-               .first->second);
-}
-#endif
 
 void TFE_Py_TapeSetStopOnThread() { *ThreadTapeIsStopped() = true; }
 
-- 
GitLab


From 496bc1589831da2f00e6d49b12c68b97301730d4 Mon Sep 17 00:00:00 2001
From: Smit Hinsu <hinsu@google.com>
Date: Fri, 5 Oct 2018 11:38:34 -0700
Subject: [PATCH 474/570] Disable micro/examples/micro_speech:micro_speech_test
 test under msan

PiperOrigin-RevId: 215939542
---
 .../lite/experimental/micro/examples/micro_speech/BUILD        | 3 +++
 .../contrib/lite/experimental/micro/testing/micro_test.bzl     | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/BUILD b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/BUILD
index 447c584387..dad58b6c1c 100644
--- a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/BUILD
+++ b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/BUILD
@@ -17,6 +17,9 @@ tflite_micro_cc_test(
         "tiny_conv_model_data.cc",
         "tiny_conv_model_data.h",
     ],
+    tags = [
+        "nomsan",
+    ],
     deps = [
         "//tensorflow/contrib/lite:schema_fbs_version",
         "//tensorflow/contrib/lite/experimental/micro:micro_framework",
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/micro_test.bzl b/tensorflow/contrib/lite/experimental/micro/testing/micro_test.bzl
index 91e349cb24..916e3eeac3 100644
--- a/tensorflow/contrib/lite/experimental/micro/testing/micro_test.bzl
+++ b/tensorflow/contrib/lite/experimental/micro/testing/micro_test.bzl
@@ -10,6 +10,7 @@ def tflite_micro_cc_test(
         nocopts = "",
         linkopts = [],
         deps = [],
+        tags = [],
         visibility = None):
     """Tests a C/C++ binary without testing framework  dependencies`.
 
@@ -43,6 +44,7 @@ def tflite_micro_cc_test(
         nocopts = nocopts,
         linkopts = linkopts,
         deps = deps,
+        tags = tags,
         visibility = visibility,
     )
     native.sh_test(
@@ -61,4 +63,5 @@ def tflite_micro_cc_test(
         ],
         deps = [
         ],
+        tags = tags,
     )
-- 
GitLab


From 03b4161326897453fa6b2803b873954607f7623b Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Fri, 5 Oct 2018 11:49:19 -0700
Subject: [PATCH 475/570] [XLA] Extend the HLO verifier to check that
 non-layout-changing instructions preserve operand layouts.

Add an std::function member to the HloVerifier for a backend to specify the
function object used to determine whether an instruction can change layouts.
Use the function object to find out the non-layout-changing instructions and
check that such instructions should produce results with the same layouts as
its operands.

Add test cases.

PiperOrigin-RevId: 215941282
---
 tensorflow/compiler/xla/service/BUILD         |  1 +
 .../compiler/xla/service/cpu/cpu_compiler.cc  |  9 ++-
 .../xla/service/gpu/nvptx_compiler.cc         | 21 ++++--
 .../compiler/xla/service/hlo_verifier.cc      | 34 +++++++++-
 .../compiler/xla/service/hlo_verifier.h       | 14 +++-
 .../compiler/xla/service/hlo_verifier_test.cc | 67 +++++++++++++++++++
 .../compiler/xla/tests/hlo_test_base.cc       | 14 ++--
 tensorflow/compiler/xla/tests/hlo_test_base.h |  8 ++-
 8 files changed, 149 insertions(+), 19 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 4797cf3330..2b292ed053 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -2450,6 +2450,7 @@ tf_cc_test(
         ":hlo",
         ":hlo_parser",
         ":hlo_verifier",
+        ":layout_assignment",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 5834f67285..68c715a086 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -327,8 +327,13 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn(
   {
     auto& pass = pipeline.AddPass<HloPassFix<HloPassPipeline>>(
         "simplification after layout assignement");
-    pass.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/true,
-                                          /*allow_mixed_precision=*/false);
+    // TODO(b/117156505): When the bug is fixed, the CPU backend should not
+    // produce layout changing elementwise operations. We will then pass
+    // LayoutAssignment::InstructionCanChangeLayout to the HLO verifier to
+    // enable stricter verification.
+    pass.AddInvariantChecker<HloVerifier>(
+        /*layout_sensitive=*/true,
+        /*allow_mixed_precision=*/false);
     pass.AddPass<HloPassFix<AlgebraicSimplifier>>(
         /*is_layout_sensitive=*/true,
         [](const Shape&, const Shape&) { return true; },
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index 50e47542c4..ac6c2c5565 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -239,8 +239,10 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
 
   {
     HloPassPipeline pipeline("post-layout_assignment");
-    pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/true,
-                                              /*allow_mixed_precision=*/false);
+    pipeline.AddInvariantChecker<HloVerifier>(
+        /*layout_sensitive=*/true,
+        /*allow_mixed_precision=*/false,
+        LayoutAssignment::InstructionCanChangeLayout);
 
     // The LayoutAssignment pass may leave behind kCopy instructions which are
     // duplicate or NOPs, so remove them with algebraic simplification and CSE.
@@ -286,8 +288,10 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
 
   {
     HloPassFix<HloPassPipeline> fusion("fusion");
-    fusion.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/true,
-                                            /*allow_mixed_precision=*/false);
+    fusion.AddInvariantChecker<HloVerifier>(
+        /*layout_sensitive=*/true,
+        /*allow_mixed_precision=*/false,
+        LayoutAssignment::InstructionCanChangeLayout);
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/false);
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/true);
     fusion.AddPass<FusionMerger>();
@@ -299,7 +303,8 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
 
     HloPassPipeline reduce_pipeline("reduce-precision");
     reduce_pipeline.AddInvariantChecker<HloVerifier>(
-        /*is_layout_sensitive=*/true, /*allow_mixed_precision=*/false);
+        /*is_layout_sensitive=*/true, /*allow_mixed_precision=*/false,
+        LayoutAssignment::InstructionCanChangeLayout);
     ReducePrecisionInsertion::AddPasses(
         &reduce_pipeline, hlo_module->config().debug_options(),
         ReducePrecisionInsertion::PassTiming::AFTER_FUSION);
@@ -325,8 +330,10 @@ Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
   // (b/27180329). Therefore, in that case, we set the output to be a copy of
   // the parameter.
   HloPassPipeline pipeline("GPU-ir-emit-prepare");
-  pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/true,
-                                            /*allow_mixed_precision=*/false);
+  pipeline.AddInvariantChecker<HloVerifier>(
+      /*layout_sensitive=*/true,
+      /*allow_mixed_precision=*/false,
+      LayoutAssignment::InstructionCanChangeLayout);
 
   // Copy insertion should be performed immediately before IR emission to avoid
   // inserting unnecessary copies (later pass adds an instruction which
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index c22ee03388..fad3b14ec2 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -1042,7 +1042,10 @@ Status CheckElementwiseInstruction(HloInstruction* instruction) {
 // not check result shape as that is checked in the ShapeVerifier.
 class InstructionVerifier : public DfsHloVisitorWithDefault {
  public:
-  InstructionVerifier() {}
+  explicit InstructionVerifier(std::function<bool(const HloInstruction*)>
+                                   instruction_can_change_layout_func)
+      : instruction_can_change_layout_func_(
+            instruction_can_change_layout_func) {}
 
   Status DefaultAction(HloInstruction*) override { return Status::OK(); }
 
@@ -1143,8 +1146,34 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status Postprocess(HloInstruction* instruction) override {
+    if (instruction_can_change_layout_func_ &&
+        LayoutUtil::IsDenseArray(instruction->shape()) &&
+        !instruction_can_change_layout_func_(instruction)) {
+      const Shape& result_shape = instruction->shape();
+      const Layout& result_layout = result_shape.layout();
+      for (HloInstruction* operand : instruction->operands()) {
+        const Shape& operand_shape = operand->shape();
+        if (LayoutUtil::IsDenseArray(operand_shape) &&
+            ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(result_shape)) {
+          const Layout& operand_layout = operand_shape.layout();
+          TF_RET_CHECK(LayoutUtil::Equal(result_layout, operand_layout))
+              << "Instruction shouldn't change layouts "
+              << instruction->ToString() << " From "
+              << ShapeUtil::HumanString(result_shape) << " To "
+              << ShapeUtil::HumanString(operand_shape);
+        }
+      }
+    }
+
+    return Status::OK();
+  }
+
  private:
   absl::flat_hash_map<string, const HloInstruction*> instructions_by_name_;
+  // Determines whether an instruction can change layouts.
+  std::function<bool(const HloInstruction*)>
+      instruction_can_change_layout_func_;
 };
 
 }  // namespace
@@ -1158,7 +1187,8 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
     std::unique_ptr<ShapeVerifier> shape_verifier = shape_verifier_factory_();
     TF_RETURN_IF_ERROR(computation->Accept(shape_verifier.get()));
 
-    InstructionVerifier instruction_verifier;
+    InstructionVerifier instruction_verifier(
+        instruction_can_change_layout_func_);
     TF_RETURN_IF_ERROR(computation->Accept(&instruction_verifier));
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 6d16586c2c..cb49cb95ba 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -155,11 +155,17 @@ class HloVerifier : public HloModulePass {
  public:
   using ShapeVerifierFactory = std::function<std::unique_ptr<ShapeVerifier>()>;
 
-  explicit HloVerifier(bool layout_sensitive, bool allow_mixed_precision)
+  explicit HloVerifier(bool layout_sensitive, bool allow_mixed_precision,
+                       std::function<bool(const HloInstruction*)>
+                           instruction_can_change_layout_func = {})
       : shape_verifier_factory_([layout_sensitive, allow_mixed_precision] {
           return absl::make_unique<ShapeVerifier>(layout_sensitive,
                                                   allow_mixed_precision);
-        }) {}
+        }),
+        instruction_can_change_layout_func_(
+            std::move(instruction_can_change_layout_func)) {
+    CHECK(instruction_can_change_layout_func_ == nullptr || layout_sensitive);
+  }
 
   // Uses custom shape verification.
   explicit HloVerifier(ShapeVerifierFactory shape_verifier_factory)
@@ -177,6 +183,10 @@ class HloVerifier : public HloModulePass {
   // being a DfsHloVisitor, is stateful. We want a clean object
   // for each run of the verifier.
   ShapeVerifierFactory shape_verifier_factory_;
+
+  // Determines whether an instruction can change layouts.
+  std::function<bool(const HloInstruction*)>
+      instruction_can_change_layout_func_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index 8f0423bb1c..afe01e5487 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/layout_assignment.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
@@ -50,6 +51,14 @@ class HloVerifierTestAllowMixedPrecision : public HloTestBase {
                     /*allow_mixed_precision_in_hlo_verifier=*/true) {}
 };
 
+class HloVerifierTestLayoutSensitive : public HloTestBase {
+ public:
+  HloVerifierTestLayoutSensitive()
+      : HloTestBase(/*verifier_layout_sensitive=*/true,
+                    /*allow_mixed_precision_in_hlo_verifier=*/false,
+                    LayoutAssignment::InstructionCanChangeLayout) {}
+};
+
 TEST_F(HloVerifierTest, NullInstructionParent) {
   HloComputation::Builder builder(TestName());
   const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
@@ -358,5 +367,63 @@ TEST_F(HloVerifierTest, ConvNegativeBaseDilationNotAllowed) {
               HasSubstr("non-positive base area dilation factor"));
 }
 
+static const char* const kAddWithLayoutChangeHlo = R"(
+   HloModule AddWithLayoutChange
+    ENTRY AddWithLayoutChange {
+      par0 = f32[3,4]{1,0} parameter(0)
+      par1 = f32[3,4]{0,1} parameter(1)
+      ROOT add0 = f32[3,4]{1,0} add(par0,par1)
+    }
+  )";
+
+TEST_F(HloVerifierTest, AddWithLayoutChange) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kAddWithLayoutChangeHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+TEST_F(HloVerifierTestLayoutSensitive, AddWithLayoutChangeNotAllowed) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kAddWithLayoutChangeHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Instruction shouldn't change layouts"));
+}
+
+TEST_F(HloVerifierTestLayoutSensitive, SliceWithLayoutChangeNotAllowed) {
+  const char* const kSliceWithLayoutChangeHlo = R"(
+   HloModule SliceWithLayoutChange
+    ENTRY SliceWithLayoutChange {
+      par0 = f32[4,5]{0,1} parameter(0)
+      par1 = s32[2] parameter(1)
+      ROOT dslice0 = f32[3,4]{1,0} dynamic-slice(par0, par1),
+        dynamic_slice_sizes={3,4}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kSliceWithLayoutChangeHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Instruction shouldn't change layouts"));
+}
+
+TEST_F(HloVerifierTestLayoutSensitive, ConcatWithLayoutChangeNotAllowed) {
+  const char* const kConcatWithLayoutChangeHlo = R"(
+   HloModule ConcatWithLayoutChange
+   ENTRY ConcatWithLayoutChange {
+      par0 = f32[3,5]{0,1} parameter(0)
+      par1 = f32[3,3]{1,0} parameter(1)
+      ROOT concat0 = f32[3,8]{1,0} concatenate(f32[3,5] par0, f32[3,3] par1),
+        dimensions={1}
+   }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kConcatWithLayoutChangeHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Instruction shouldn't change layouts"));
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index bdd4fd7e3d..7ab2ecda58 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -86,19 +86,25 @@ ProgramShape GetProgramShapeWithLayout(const HloModule& module) {
 }  // namespace
 
 HloTestBase::HloTestBase(bool verifier_layout_sensitive,
-                         bool allow_mixed_precision_in_hlo_verifier)
+                         bool allow_mixed_precision_in_hlo_verifier,
+                         std::function<bool(const HloInstruction*)>
+                             instruction_can_change_layout_func)
     : HloTestBase(GetTestPlatform(), GetReferencePlatform(),
                   verifier_layout_sensitive,
-                  allow_mixed_precision_in_hlo_verifier) {}
+                  allow_mixed_precision_in_hlo_verifier,
+                  instruction_can_change_layout_func) {}
 
 HloTestBase::HloTestBase(se::Platform* test_platform,
                          se::Platform* reference_platform,
                          bool verifier_layout_sensitive,
-                         bool allow_mixed_precision_in_hlo_verifier)
+                         bool allow_mixed_precision_in_hlo_verifier,
+                         std::function<bool(const HloInstruction*)>
+                             instruction_can_change_layout_func)
     : test_runner_(test_platform), reference_runner_(reference_platform) {
   hlo_verifier_ = absl::make_unique<HloVerifier>(
       /*layout_sensitive=*/verifier_layout_sensitive,
-      /*allow_mixed_precision=*/allow_mixed_precision_in_hlo_verifier);
+      /*allow_mixed_precision=*/allow_mixed_precision_in_hlo_verifier,
+      instruction_can_change_layout_func);
 }
 
 std::unique_ptr<HloModule> HloTestBase::CreateNewModule(const string& name) {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 0ae4bdc104..217428befa 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -88,14 +88,18 @@ class HloTestBase : public ::testing::Test {
   // interpreter is the only supported backend, it will be both the test backend
   // and the reference backend.
   HloTestBase(bool verifier_layout_sensitive = false,
-              bool allow_mixed_precision_in_hlo_verifier = true);
+              bool allow_mixed_precision_in_hlo_verifier = true,
+              std::function<bool(const HloInstruction*)>
+                  instruction_can_change_layout_func = {});
 
   // If your test doesn't use interpreter as the reference backend, you can use
   // this constructor. Note that your test target is responsible for linking in
   // both needed backends.
   HloTestBase(se::Platform* test_platform, se::Platform* reference_platform,
               bool verifier_layout_sensitive = false,
-              bool allow_mixed_precision_in_hlo_verifier = true);
+              bool allow_mixed_precision_in_hlo_verifier = true,
+              std::function<bool(const HloInstruction*)>
+                  instruction_can_change_layout_func = {});
 
   ~HloTestBase() override {}
 
-- 
GitLab


From 0541a277d5c74cf8e99c9f5a7a015926d1a05214 Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Fri, 5 Oct 2018 12:09:01 -0700
Subject: [PATCH 476/570] Do 2 warmup runs in
 assert_no_new_pyobjects_executing_eagerly.

PiperOrigin-RevId: 215944829
---
 tensorflow/python/framework/test_util.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 4ec4b41b5e..95925bb471 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -506,9 +506,9 @@ def disable_control_flow_v2(unused_msg):
 def assert_no_new_pyobjects_executing_eagerly(f):
   """Decorator for asserting that no new Python objects persist after a test.
 
-  Runs the test multiple times executing eagerly, first as a warmup and then
-  several times to let objects accumulate. The warmup helps ignore caches which
-  do not grow as the test is run repeatedly.
+  Runs the test multiple times executing eagerly, first as a warmup and then to
+  let objects accumulate. The warmup helps ignore caches which do not grow as
+  the test is run repeatedly.
 
   Useful for checking that there are no missing Py_DECREFs in the C exercised by
   a bit of Python.
@@ -518,7 +518,14 @@ def assert_no_new_pyobjects_executing_eagerly(f):
     """Warms up, gets an object count, runs the test, checks for new objects."""
     with context.eager_mode():
       gc.disable()
-      f(self, **kwargs)
+      # Run the test 2 times as warmup, in an attempt to fill up caches, which
+      # should not grow as the test is run repeatedly below.
+      #
+      # TODO(b/117156879): Running warmup twice is black magic; we have seen
+      # tests that fail with 1 warmup run, and pass with 2, on various versions
+      # of python2.7.x.
+      for _ in range(2):
+        f(self, **kwargs)
       gc.collect()
       previous_count = len(gc.get_objects())
       if ops.has_default_graph():
-- 
GitLab


From d016650ca7636c96c6664bed2cf3a2fa8a3c674b Mon Sep 17 00:00:00 2001
From: Tong Shen <endlessroad@google.com>
Date: Fri, 5 Oct 2018 12:17:31 -0700
Subject: [PATCH 477/570] Revert constant folding to previous state.

PiperOrigin-RevId: 215946205
---
 .../tf2xla/functionalize_control_flow.cc      | 64 +++----------------
 .../core/common_runtime/constant_folding.cc   | 35 +++-------
 .../core/common_runtime/constant_folding.h    |  4 --
 .../core/common_runtime/graph_optimizer.cc    |  5 +-
 .../core/common_runtime/graph_optimizer.h     |  5 +-
 5 files changed, 20 insertions(+), 93 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 28e09d7b79..0362682bd6 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -94,8 +94,9 @@ Status FunctionalizeControlFlowForFunction(
     }
   });
   const FunctionBody* body = flr->GetFunctionBody(handle);
+  Graph* g = body->graph;
 
-  // Check if the graph has Switch or Merge node before optimizing the graph.
+  // Check if the graph has Switch or Merge node.
   bool has_switch_or_merge = false;
   for (Node* n : body->graph->nodes()) {
     if (n->type_string() == "Switch" || n->type_string() == "Merge") {
@@ -108,58 +109,13 @@ Status FunctionalizeControlFlowForFunction(
   // in function body. We still need to rewrite those functions and modify
   // corresponding nodes.
 
-  // Call graph optimizer. The most important optimization we need is constant
-  // folding, which will replace ops like Shape/BroadcastGradientArgs with
-  // constant shape input. Without this optimization, those ops might become
-  // dynamic input for then/else body function and XLA will complain that input
-  // is not compile time constant. We enable function inlining as well, because
-  // otherwise we won't be able to infer shape for any node depending on
-  // function call nodes.
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("functionalize_control_flow_before_opt_", func_name),
-        *body->graph, fld);
-  }
-  // Optimizer accepts std::unique_ptr<Graph>* as input and might change
-  // underlying pointer, thus we create a new Graph and copy from body->graph.
-  std::unique_ptr<Graph> optimized_graph(new Graph(fld));
-  CopyGraph(*body->graph, optimized_graph.get());
-  OptimizerOptions opts;
-  opts.set_opt_level(OptimizerOptions::L0);
-  opts.set_do_function_inlining(true);
-  opts.set_do_constant_folding(true);
-  GraphOptimizer optimizer(opts);
-  auto cf_consider_fn = [](const Node* n) {
-    // Skip SymbolicGradient op when doing constant folding.
-    // Enabling SymbolicGradient op in constant folding requires
-    // flr->device() to be non-null, and here we have not constructed
-    // proper Device object yet (it will be constructed in XlaCompiler).
-    return n->type_string() != FunctionLibraryDefinition::kGradientOp;
-  };
-  optimizer.Optimize(flr, flr->env(),
-                     /*device=*/nullptr, &optimized_graph,
-                     /*shape_map=*/nullptr, /*cse_consider_fn=*/nullptr,
-                     cf_consider_fn);
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("functionalize_control_flow_after_opt_", func_name),
-        *optimized_graph, fld);
-  }
-  // Some inlined functions might have Switch/Merge nodes.
-  for (Node* n : optimized_graph->nodes()) {
-    if (n->type_string() == "Switch" || n->type_string() == "Merge") {
-      has_switch_or_merge = true;
-      break;
-    }
-  }
-
   // If any node has associated functions, functionalize them first.
   // Gather nodes with associated functions first, because rewriting those nodes
   // might involve node deletion/addition. Avoid modifying nodes while iterating
   // it.
   std::vector<std::pair<Node*, std::vector<AssociatedFunctionInfo>>>
       nodes_to_associated_functions;
-  for (auto* n : optimized_graph->nodes()) {
+  for (auto* n : g->nodes()) {
     auto associated_functions = GetAssociatedFunctions(*n, flr);
     if (!associated_functions.empty()) {
       nodes_to_associated_functions.push_back({n, associated_functions});
@@ -215,7 +171,7 @@ Status FunctionalizeControlFlowForFunction(
         // pointer. That's fine because in that case, associated_functions will
         // only have one member and the loop will only run once.
         TF_RETURN_IF_ERROR(RewriteAssociatedFunction(
-            optimized_graph.get(), n, fld, associated_function, new_name));
+            g, n, fld, associated_function, new_name));
       }
     }
   }
@@ -227,21 +183,21 @@ Status FunctionalizeControlFlowForFunction(
     if (VLOG_IS_ON(4)) {
       dump_graph::DumpGraphToFile(
           absl::StrCat("functionalize_control_flow_before_fdef_", func_name),
-          *optimized_graph, fld);
+          *g, fld);
     }
-    TF_RETURN_IF_ERROR(FunctionalizeControlFlow(optimized_graph.get(), fld));
+    TF_RETURN_IF_ERROR(FunctionalizeControlFlow(g, fld));
     if (VLOG_IS_ON(4)) {
       dump_graph::DumpGraphToFile(
-          absl::StrCat("functionalize_control_flow_after_fdef_", func_name),
-          *optimized_graph, fld);
+          absl::StrCat("functionalize_control_flow_after_fdef_", func_name), *g,
+          fld);
     }
   }
 
   if (*modified) {
     // Add rewritten FunctionDef into library.
     FunctionDef functionalized_fdef;
-    TF_RETURN_IF_ERROR(GraphToFunctionDef(*optimized_graph, new_func_name,
-                                          &functionalized_fdef));
+    TF_RETURN_IF_ERROR(
+        GraphToFunctionDef(*g, new_func_name, &functionalized_fdef));
     if (func_name == new_func_name) {
       VLOG(2) << "Replacing function " << func_name;
       TF_RETURN_IF_ERROR(
diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc
index db137f1a19..e81e61b633 100644
--- a/tensorflow/core/common_runtime/constant_folding.cc
+++ b/tensorflow/core/common_runtime/constant_folding.cc
@@ -466,23 +466,23 @@ Graph* GetConstantGraph(
 bool ReplaceTensorWithConstant(
     Graph* graph, Device* partition_device, NodeAndOutput tensor,
     const Tensor& constant, const gtl::FlatSet<Node*>& control_deps,
-    int64 max_constant_size_in_bytes, bool disable_memory_output_type_check,
+    int64 max_constant_size_in_bytes,
     const ConstantFoldNameGenerator& generate_new_name) {
   // Be conservative when replacing a tensor with a constant, when not
   // running on CPU.
   // 1) Do not replace another constant.
   // 2) If the destination tensor is not an int32 tensor, and has HOST_MEMORY
   // constraint, do not replace it.
-  // 3) If the size of the constant in bytes is too large (>
+  // 3) If the destination tensor is an int32 tensor, and has DEVICE_MEMORY
+  // constraint, do not replace it.
+  // 4) If the size of the constant in bytes is too large (>
   // max_constant_in_bytes), do not replace it. This prevents the size of the
   // Graph from growing too large.
-  // 4) If the constant op created does not have a kernel implementation
+  // 5) If the constant op created does not have a kernel implementation
   // for the device, do not use it.
   // TODO(keveman): Consider adding a new constant op that has a kernel
   // implementation for all types, but with HostMemory constraint on it's
   // output.
-  // 5) If the constant op for the device has different output memory type
-  // from the original op output memory type, do not replace it.
   if (tensor.first->IsConstant()) {
     return false;
   }
@@ -497,7 +497,8 @@ bool ReplaceTensorWithConstant(
       return false;
     }
     bool is_int32 = tensor.first->output_type(tensor.second) == DT_INT32;
-    if (memory_type == HOST_MEMORY && !is_int32) {
+    if ((memory_type == HOST_MEMORY && !is_int32) ||
+        (memory_type == DEVICE_MEMORY && is_int32)) {
       return false;
     }
   }
@@ -535,25 +536,6 @@ bool ReplaceTensorWithConstant(
   if (!NodeBuilder(builder).Finalize(graph, &constant_node).ok()) {
     return false;
   }
-  if (!disable_memory_output_type_check) {
-    if (partition_device && device_type != DEVICE_CPU) {
-      MemoryType original_output_memory_type;
-      if (!MemoryTypeForOutput(device_type, graph, tensor.first, tensor.second,
-                               &original_output_memory_type)
-               .ok()) {
-        return false;
-      }
-      MemoryType const_output_memory_type;
-      if (!MemoryTypeForOutput(device_type, graph, constant_node, 0,
-                               &const_output_memory_type)
-               .ok()) {
-        return false;
-      }
-      if (original_output_memory_type != const_output_memory_type) {
-        return false;
-      }
-    }
-  }
   for (auto edge : edges_to_remove) {
     graph->AddEdge(constant_node, 0, edge->dst(), edge->dst_input());
     graph->RemoveEdge(edge);
@@ -660,8 +642,7 @@ Status ConstantFold(const ConstantFoldingOptions& opts,
         constant_control_deps[tensors_to_replace[c].first];
     if (ReplaceTensorWithConstant(
             graph, partition_device, tensors_to_replace[c], outputs[c],
-            control_deps, opts.max_constant_size_in_bytes,
-            opts.disable_memory_output_type_check, generate_new_name)) {
+            control_deps, opts.max_constant_size_in_bytes, generate_new_name)) {
       ++num_nodes_replaced;
     }
   }
diff --git a/tensorflow/core/common_runtime/constant_folding.h b/tensorflow/core/common_runtime/constant_folding.h
index 4c71b7bd27..a9a84f761b 100644
--- a/tensorflow/core/common_runtime/constant_folding.h
+++ b/tensorflow/core/common_runtime/constant_folding.h
@@ -45,10 +45,6 @@ struct ConstantFoldingOptions {
   // optimization.
   int64 max_constant_size_in_bytes = 10 * 1024 * 1024;
 
-  // If disable_memory_output_type_check is true, we will disable output memory
-  // type check for constant node replacement.
-  bool disable_memory_output_type_check = false;
-
   // A generator for the name suffix of constant folded nodes. A
   // default id generator that monotonically increases is used if nullptr is
   // passed.
diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc
index 91194bc86f..37a979a8f1 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.cc
+++ b/tensorflow/core/common_runtime/graph_optimizer.cc
@@ -39,8 +39,7 @@ void GraphOptimizer::Optimize(
     const std::unordered_map<string, std::vector<PartialTensorShape>>*
         shape_map,
     const std::function<bool(const Node*)>& cse_consider_fn,
-    const std::function<bool(const Node*)>& cf_consider_fn,
-    bool cf_disable_memory_output_type_check) {
+    const std::function<bool(const Node*)>& cf_consider_fn) {
   Graph* g = graph->get();
   DumpGraph("Initial", g);
 
@@ -65,8 +64,6 @@ void GraphOptimizer::Optimize(
       ConstantFoldingOptions cf_opts;
       cf_opts.shape_map = shape_map;
       cf_opts.consider = cf_consider_fn;
-      cf_opts.disable_memory_output_type_check =
-          cf_disable_memory_output_type_check;
       if (opts_.max_folded_constant_in_bytes() > 0) {
         cf_opts.max_constant_size_in_bytes =
             opts_.max_folded_constant_in_bytes();
diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h
index 8954e9612d..789cc56942 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.h
+++ b/tensorflow/core/common_runtime/graph_optimizer.h
@@ -47,16 +47,13 @@ class GraphOptimizer {
   // returns true will be considered for CSE.
   // If cf_consider_fn is not null then only nodes for which cf_consider_fn
   // returns true will be considered for CF.
-  // If cf_disable_memory_output_type_check is true, CF will discard output
-  // memory type check for constant node replacement.
   void Optimize(
       FunctionLibraryRuntime* runtime, Env* env, Device* device,
       std::unique_ptr<Graph>* graph,
       const std::unordered_map<string, std::vector<PartialTensorShape>>*
           shape_map,
       const std::function<bool(const Node*)>& cse_consider_fn = nullptr,
-      const std::function<bool(const Node*)>& cf_consider_fn = nullptr,
-      bool cf_disable_memory_output_type_check = false);
+      const std::function<bool(const Node*)>& cf_consider_fn = nullptr);
 
   const OptimizerOptions& options() { return opts_; }
 
-- 
GitLab


From 58845f229be9b5ba2e1e36150bff5ba7a85920d8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 12:25:22 -0700
Subject: [PATCH 478/570] Profiler collects the number of replicas and num
 cores per replica used in the model.

PiperOrigin-RevId: 215947354
---
 tensorflow/contrib/tpu/profiler/tf_op_stats.proto | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
index f88dc51636..1e66801efd 100644
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
@@ -168,6 +168,12 @@ message RunEnvironmentResult {
   optional HostIndependentJobInfoResult host_independent_job_info = 5;
   // Host-dependent job information.
   repeated HostDependentJobInfoResult host_dependent_job_info = 6;
+  // The number of replicas, corresponds to input parallelism.
+  // If there is no model parallelism, replica_count = tpu_core_count
+  optional int32 replica_count = 7;
+  // The number of cores used for a single replica, e.g. model parallelism.
+  // If there is no model parallelism, then num_cores_per_replica = 1
+  optional int32 num_cores_per_replica = 8;
 }
 
 // The types of host operations that are tracked.
-- 
GitLab


From 6919ab5787e6384d709adf051dc1ce99236b76bc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 12:33:55 -0700
Subject: [PATCH 479/570] Convert TensorFlow's jpeg dependency to new third
 party import method.

PiperOrigin-RevId: 215948571
---
 tensorflow/workspace.bzl                        | 14 ++------------
 third_party/jpeg/BUILD                          |  2 +-
 third_party/jpeg/{jpeg.BUILD => BUILD.bazel}    | 11 ++++++-----
 .../jpeg.BUILD => jpeg/BUILD.system}            |  0
 third_party/jpeg/jpeg_helpers.BUILD.bazel       |  1 +
 third_party/jpeg/workspace.bzl                  | 17 +++++++++++++++++
 6 files changed, 27 insertions(+), 18 deletions(-)
 rename third_party/jpeg/{jpeg.BUILD => BUILD.bazel} (99%)
 rename third_party/{systemlibs/jpeg.BUILD => jpeg/BUILD.system} (100%)
 create mode 100644 third_party/jpeg/jpeg_helpers.BUILD.bazel
 create mode 100644 third_party/jpeg/workspace.bzl

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 8df41f96b8..b9ced1bd6c 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -22,10 +22,12 @@ load(
 )
 load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
 load("//third_party/icu:workspace.bzl", icu = "repo")
+load("//third_party/jpeg:workspace.bzl", jpeg = "repo")
 
 def initialize_third_party():
     flatbuffers()
     icu()
+    jpeg()
 
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
@@ -246,18 +248,6 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         ],
     )
 
-    tf_http_archive(
-        name = "jpeg",
-        build_file = clean_dep("//third_party/jpeg:jpeg.BUILD"),
-        sha256 = "f892fff427ab3adffc289363eac26d197ce3ccacefe5f5822377348a8166069b",
-        strip_prefix = "libjpeg-turbo-2.0.0",
-        system_build_file = clean_dep("//third_party/systemlibs:jpeg.BUILD"),
-        urls = [
-            "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz",
-            "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz",
-        ],
-    )
-
     tf_http_archive(
         name = "png_archive",
         build_file = clean_dep("//third_party:png.BUILD"),
diff --git a/third_party/jpeg/BUILD b/third_party/jpeg/BUILD
index 5b01f6e3e4..e3aec1fce9 100644
--- a/third_party/jpeg/BUILD
+++ b/third_party/jpeg/BUILD
@@ -1 +1 @@
-licenses(["notice"])
+# Needed to make this a package.
diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/BUILD.bazel
similarity index 99%
rename from third_party/jpeg/jpeg.BUILD
rename to third_party/jpeg/BUILD.bazel
index 1b9b9bf2f5..5243e995a3 100644
--- a/third_party/jpeg/jpeg.BUILD
+++ b/third_party/jpeg/BUILD.bazel
@@ -162,9 +162,9 @@ cc_library(
     hdrs = [
         "simd/powerpc/jccolext-altivec.c",
         "simd/powerpc/jcgryext-altivec.c",
+        "simd/powerpc/jcsample.h",
         "simd/powerpc/jdcolext-altivec.c",
         "simd/powerpc/jdmrgext-altivec.c",
-        "simd/powerpc/jcsample.h",
         "simd/powerpc/jsimd_altivec.h",
     ],
     copts = libjpegturbo_copts,
@@ -186,7 +186,6 @@ cc_library(
         "jsimd.h",
         "jsimddct.h",
         "simd/jsimd.h",
-        "simd/x86_64/jsimd.c",
         "simd/x86_64/jccolor-avx2.o",
         "simd/x86_64/jccolor-sse2.o",
         "simd/x86_64/jcgray-avx2.o",
@@ -213,6 +212,7 @@ cc_library(
         "simd/x86_64/jquantf-sse2.o",
         "simd/x86_64/jquanti-avx2.o",
         "simd/x86_64/jquanti-sse2.o",
+        "simd/x86_64/jsimd.c",
         "simd/x86_64/jsimdcpu.o",
     ],
     copts = libjpegturbo_copts,
@@ -322,9 +322,9 @@ cc_library(
         "jpeglib.h",
         "jsimd.h",
         "jsimddct.h",
-        "simd/jsimd.h",
         "simd/arm/jsimd.c",
         "simd/arm/jsimd_neon.S",
+        "simd/jsimd.h",
     ],
     copts = libjpegturbo_copts,
     nocopts = libjpegturbo_nocopts,
@@ -343,9 +343,9 @@ cc_library(
         "jpeglib.h",
         "jsimd.h",
         "jsimddct.h",
-        "simd/jsimd.h",
         "simd/arm64/jsimd.c",
         "simd/arm64/jsimd_neon.S",
+        "simd/jsimd.h",
     ],
     copts = libjpegturbo_copts,
     nocopts = libjpegturbo_nocopts,
@@ -366,7 +366,6 @@ cc_library(
         "jsimd.h",
         "jsimddct.h",
         "simd/jsimd.h",
-        "simd/x86_64/jsimd.c",
         "simd/x86_64/jccolor-avx2.obj",
         "simd/x86_64/jccolor-sse2.obj",
         "simd/x86_64/jcgray-avx2.obj",
@@ -393,6 +392,7 @@ cc_library(
         "simd/x86_64/jquantf-sse2.obj",
         "simd/x86_64/jquanti-avx2.obj",
         "simd/x86_64/jquanti-sse2.obj",
+        "simd/x86_64/jsimd.c",
         "simd/x86_64/jsimdcpu.obj",
     ],
     copts = libjpegturbo_copts,
@@ -603,6 +603,7 @@ JCONFIGINT_WIN_SUBSTITUTIONS = {
 }
 
 JCONFIGINT_NOWIN_SUBSTITUTIONS.update(JCONFIGINT_COMMON_SUBSTITUTIONS)
+
 JCONFIGINT_WIN_SUBSTITUTIONS.update(JCONFIGINT_COMMON_SUBSTITUTIONS)
 
 template_rule(
diff --git a/third_party/systemlibs/jpeg.BUILD b/third_party/jpeg/BUILD.system
similarity index 100%
rename from third_party/systemlibs/jpeg.BUILD
rename to third_party/jpeg/BUILD.system
diff --git a/third_party/jpeg/jpeg_helpers.BUILD.bazel b/third_party/jpeg/jpeg_helpers.BUILD.bazel
new file mode 100644
index 0000000000..5b01f6e3e4
--- /dev/null
+++ b/third_party/jpeg/jpeg_helpers.BUILD.bazel
@@ -0,0 +1 @@
+licenses(["notice"])
diff --git a/third_party/jpeg/workspace.bzl b/third_party/jpeg/workspace.bzl
new file mode 100644
index 0000000000..4b517240ec
--- /dev/null
+++ b/third_party/jpeg/workspace.bzl
@@ -0,0 +1,17 @@
+"""loads the jpeg library, used by TF."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "jpeg",
+        urls = [
+            "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz",
+            "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz",
+        ],
+        sha256 = "f892fff427ab3adffc289363eac26d197ce3ccacefe5f5822377348a8166069b",
+        strip_prefix = "libjpeg-turbo-2.0.0",
+        build_file = "//third_party/jpeg:BUILD.bazel",
+        # build_file = clean_dep("//third_party/jpeg:jpeg.BUILD"),
+        system_build_file = "//third_party/jpeg:BUILD.system",
+    )
-- 
GitLab


From ef838969b95de39353a3ba495c335cbb14a0c9b5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 12:44:45 -0700
Subject: [PATCH 480/570] Brings V2 Optimizers into Keras w/ Keras signatures

PiperOrigin-RevId: 215950207
---
 .../contrib/distribute/python/combinations.py |   16 +-
 .../distribute/python/minimize_loss_test.py   |    5 -
 tensorflow/contrib/optimizer_v2/BUILD         |   11 +-
 tensorflow/contrib/optimizer_v2/adadelta.py   |   75 +-
 tensorflow/contrib/optimizer_v2/adagrad.py    |   79 +-
 .../contrib/optimizer_v2/adagrad_test.py      |    3 -
 tensorflow/contrib/optimizer_v2/adam.py       |  129 +-
 .../optimizer_v2/checkpointable_utils_test.py |   68 +-
 .../contrib/optimizer_v2/gradient_descent.py  |   40 +-
 tensorflow/contrib/optimizer_v2/momentum.py   |   69 +-
 .../contrib/optimizer_v2/optimizer_v2.py      | 1205 +--------------
 tensorflow/contrib/optimizer_v2/rmsprop.py    |  154 +-
 tensorflow/python/keras/BUILD                 |  155 ++
 .../python/keras/optimizer_v2/adadelta.py     |  116 ++
 .../keras/optimizer_v2/adadelta_test.py       |  166 ++
 .../python/keras/optimizer_v2/adagrad.py      |  119 ++
 .../python/keras/optimizer_v2/adagrad_test.py |  276 ++++
 tensorflow/python/keras/optimizer_v2/adam.py  |  203 +++
 .../python/keras/optimizer_v2/adam_test.py    |  333 ++++
 .../optimizer_v2/checkpointable_utils_test.py |  761 ++++++++++
 .../python/keras/optimizer_v2/optimizer_v2.py | 1349 +++++++++++++++++
 .../keras/optimizer_v2/optimizer_v2_test.py   |  277 ++++
 .../python/keras/optimizer_v2/rmsprop.py      |  239 +++
 .../python/keras/optimizer_v2/rmsprop_test.py |  444 ++++++
 tensorflow/python/keras/optimizer_v2/sgd.py   |  170 +++
 .../python/keras/optimizer_v2/sgd_test.py     |  759 ++++++++++
 26 files changed, 5487 insertions(+), 1734 deletions(-)
 create mode 100644 tensorflow/python/keras/optimizer_v2/adadelta.py
 create mode 100644 tensorflow/python/keras/optimizer_v2/adadelta_test.py
 create mode 100644 tensorflow/python/keras/optimizer_v2/adagrad.py
 create mode 100644 tensorflow/python/keras/optimizer_v2/adagrad_test.py
 create mode 100644 tensorflow/python/keras/optimizer_v2/adam.py
 create mode 100644 tensorflow/python/keras/optimizer_v2/adam_test.py
 create mode 100644 tensorflow/python/keras/optimizer_v2/checkpointable_utils_test.py
 create mode 100644 tensorflow/python/keras/optimizer_v2/optimizer_v2.py
 create mode 100644 tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
 create mode 100644 tensorflow/python/keras/optimizer_v2/rmsprop.py
 create mode 100644 tensorflow/python/keras/optimizer_v2/rmsprop_test.py
 create mode 100644 tensorflow/python/keras/optimizer_v2/sgd.py
 create mode 100644 tensorflow/python/keras/optimizer_v2/sgd_test.py

diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index cff4b0a463..63a163e76c 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -349,26 +349,26 @@ mirrored_strategy_with_two_gpus = NamedDistribution(
     required_gpus=2)
 
 
-adam_optimizer_v1_fn = NamedObject(
-    "AdamV1", lambda: adam.AdamOptimizer(0.001, epsilon=1))
 gradient_descent_optimizer_v1_fn = NamedObject(
     "GradientDescentV1", lambda: gradient_descent.GradientDescentOptimizer(0.2))
 adagrad_optimizer_v1_fn = NamedObject(
     "AdagradV1", lambda: adagrad.AdagradOptimizer(0.001))
+adam_optimizer_v1_fn = NamedObject("AdamV1",
+                                   lambda: adam.AdamOptimizer(0.001, epsilon=1))
 rmsprop_optimizer_v1_fn = NamedObject(
     "RmsPropV1", lambda: rmsprop.RMSPropOptimizer(0.001))
-optimizers_v1 = [adam_optimizer_v1_fn, gradient_descent_optimizer_v1_fn,
-                 adagrad_optimizer_v1_fn]
 
-adam_optimizer_v2_fn = NamedObject(
-    "AdamV2", lambda: adam_v2.AdamOptimizer(0.001, epsilon=1))
+optimizers_v1 = [gradient_descent_optimizer_v1_fn, adagrad_optimizer_v1_fn]
+
 gradient_descent_optimizer_v2_fn = NamedObject(
     "GradientDescentV2",
     lambda: gradient_descent_v2.GradientDescentOptimizer(0.2))
 adagrad_optimizer_v2_fn = NamedObject(
     "AdagradV2", lambda: adagrad_v2.AdagradOptimizer(0.001))
-optimizers_v2 = [adam_optimizer_v2_fn, gradient_descent_optimizer_v2_fn,
-                 adagrad_optimizer_v2_fn]
+adam_optimizer_v2_fn = NamedObject(
+    "AdamV2", lambda: adam_v2.AdamOptimizer(0.001, epsilon=1))
+
+optimizers_v2 = [gradient_descent_optimizer_v2_fn, adagrad_optimizer_v2_fn]
 
 graph_and_eager_modes = ["graph", "eager"]
 
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index ba147e7824..60e134055f 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -179,11 +179,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       def get_expected_variables(optimizer_fn, num_parameter_devices):
         variables_map = {
             "GradientDescent": ["dense/kernel", "dense/bias"],
-            "Adam": [
-                "dense/kernel", "dense/bias", "beta1_power", "beta2_power",
-                "dense/kernel/Adam", "dense/kernel/Adam_1", "dense/bias/Adam",
-                "dense/bias/Adam_1"
-            ],
             "Adagrad": [
                 "dense/kernel/Adagrad", "dense/kernel",
                 "dense/bias/Adagrad", "dense/bias"
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
index 3ba3ee29ec..2cf445a85e 100644
--- a/tensorflow/contrib/optimizer_v2/BUILD
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -47,15 +47,8 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:distribute",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
+        "//tensorflow/python:util",
+        "//tensorflow/python/keras:optimizer_v2",
     ],
 )
 
diff --git a/tensorflow/contrib/optimizer_v2/adadelta.py b/tensorflow/contrib/optimizer_v2/adadelta.py
index b206f9f61b..9d73bddd1c 100644
--- a/tensorflow/contrib/optimizer_v2/adadelta.py
+++ b/tensorflow/contrib/optimizer_v2/adadelta.py
@@ -18,17 +18,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.optimizer_v2 import optimizer_v2
-from tensorflow.python.training import training_ops
+from tensorflow.python.keras.optimizer_v2 import adadelta
+from tensorflow.python.util import deprecation
 
 
-class AdadeltaOptimizer(optimizer_v2.OptimizerV2):
+class AdadeltaOptimizer(adadelta.Adadelta):
   """Optimizer that implements the Adadelta algorithm.
 
   See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
   ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
   """
 
+  @deprecation.deprecated_args(
+      "2018-10-01",
+      "`use_locking = True` is no longer supported and will be ignored.",
+      ("use_locking", [False]))
   def __init__(self, learning_rate=0.001, rho=0.95, epsilon=1e-8,
                use_locking=False, name="Adadelta"):
     """Construct a new Adadelta optimizer.
@@ -48,66 +52,5 @@ class AdadeltaOptimizer(optimizer_v2.OptimizerV2):
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to "Adadelta".
     """
-    super(AdadeltaOptimizer, self).__init__(use_locking, name)
-    self._set_hyper("learning_rate", learning_rate)
-    self._set_hyper("rho", rho)
-    self._set_hyper("epsilon", epsilon)
-
-  def _create_vars(self, var_list, state):
-    for v in var_list:
-      state.zeros_slot(v, "accum")
-      state.zeros_slot(v, "accum_update")
-
-  def _apply_dense(self, grad, var, state):
-    accum = state.get_slot(var, "accum")
-    accum_update = state.get_slot(var, "accum_update")
-    return training_ops.apply_adadelta(
-        var,
-        accum,
-        accum_update,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        state.get_hyper("rho", var.dtype.base_dtype),
-        state.get_hyper("epsilon", var.dtype.base_dtype),
-        grad,
-        use_locking=self._use_locking)
-
-  def _resource_apply_dense(self, grad, var, state):
-    accum = state.get_slot(var, "accum")
-    accum_update = state.get_slot(var, "accum_update")
-    return training_ops.resource_apply_adadelta(
-        var.handle,
-        accum.handle,
-        accum_update.handle,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        state.get_hyper("rho", var.dtype.base_dtype),
-        state.get_hyper("epsilon", var.dtype.base_dtype),
-        grad,
-        use_locking=self._use_locking)
-
-  def _apply_sparse(self, grad, var, state):
-    accum = state.get_slot(var, "accum")
-    accum_update = state.get_slot(var, "accum_update")
-    return training_ops.sparse_apply_adadelta(
-        var,
-        accum,
-        accum_update,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        state.get_hyper("rho", var.dtype.base_dtype),
-        state.get_hyper("epsilon", var.dtype.base_dtype),
-        grad.values,
-        grad.indices,
-        use_locking=self._use_locking)
-
-  def _resource_apply_sparse(self, grad, var, indices, state):
-    accum = state.get_slot(var, "accum")
-    accum_update = state.get_slot(var, "accum_update")
-    return training_ops.resource_sparse_apply_adadelta(
-        var.handle,
-        accum.handle,
-        accum_update.handle,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        state.get_hyper("rho", var.dtype.base_dtype),
-        state.get_hyper("epsilon", var.dtype.base_dtype),
-        grad,
-        indices,
-        use_locking=self._use_locking)
+    super(AdadeltaOptimizer, self).__init__(
+        learning_rate=learning_rate, rho=rho, epsilon=epsilon, name=name)
diff --git a/tensorflow/contrib/optimizer_v2/adagrad.py b/tensorflow/contrib/optimizer_v2/adagrad.py
index dab1e02716..716361e29c 100644
--- a/tensorflow/contrib/optimizer_v2/adagrad.py
+++ b/tensorflow/contrib/optimizer_v2/adagrad.py
@@ -18,15 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.optimizer_v2 import optimizer_v2
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.training import training_ops
+from tensorflow.python.keras.optimizer_v2 import adagrad
+from tensorflow.python.util import deprecation
 
 
-class AdagradOptimizer(optimizer_v2.OptimizerV2):
+class AdagradOptimizer(adagrad.Adagrad):
   """Optimizer that implements the Adagrad algorithm.
 
   See this [paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
@@ -34,6 +30,10 @@ class AdagradOptimizer(optimizer_v2.OptimizerV2):
   [intro](https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf).
   """
 
+  @deprecation.deprecated_args(
+      "2018-10-01",
+      "`use_locking = True` is no longer supported and will be ignored.",
+      ("use_locking", [False]))
   def __init__(self, learning_rate, initial_accumulator_value=0.1,
                use_locking=False, name="Adagrad"):
     """Construct a new Adagrad optimizer.
@@ -54,64 +54,7 @@ class AdagradOptimizer(optimizer_v2.OptimizerV2):
     Raises:
       ValueError: If the `initial_accumulator_value` is invalid.
     """
-    if initial_accumulator_value <= 0.0:
-      raise ValueError("initial_accumulator_value must be positive: %s" %
-                       initial_accumulator_value)
-    super(AdagradOptimizer, self).__init__(use_locking, name)
-    self._set_hyper("learning_rate", learning_rate)
-
-    self._initial_accumulator_value = initial_accumulator_value
-
-  def _create_vars(self, var_list, state):
-    for v in var_list:
-      dtype = v.dtype.base_dtype
-      if v.get_shape().is_fully_defined():
-        init = init_ops.constant_initializer(self._initial_accumulator_value,
-                                             dtype=dtype)
-      else:
-        def init(v=v, dtype=dtype):
-          # Use a Tensor instead of initializer if variable does not have
-          # static shape.
-          init_constant = gen_array_ops.fill(array_ops.shape(v),
-                                             self._initial_accumulator_value)
-          return math_ops.cast(init_constant, dtype)
-      state.create_slot_with_initializer(v, init, v.get_shape(), dtype,
-                                         "accumulator")
-
-  def _apply_dense(self, grad, var, state):
-    acc = state.get_slot(var, "accumulator")
-    return training_ops.apply_adagrad(
-        var,
-        acc,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        grad,
-        use_locking=self._use_locking)
-
-  def _resource_apply_dense(self, grad, var, state):
-    acc = state.get_slot(var, "accumulator")
-    return training_ops.resource_apply_adagrad(
-        var.handle,
-        acc.handle,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        grad,
-        use_locking=self._use_locking)
-
-  def _apply_sparse(self, grad, var, state):
-    acc = state.get_slot(var, "accumulator")
-    return training_ops.sparse_apply_adagrad(
-        var,
-        acc,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        grad.values,
-        grad.indices,
-        use_locking=self._use_locking)
-
-  def _resource_apply_sparse(self, grad, var, indices, state):
-    acc = state.get_slot(var, "accumulator")
-    return training_ops.resource_sparse_apply_adagrad(
-        var.handle,
-        acc.handle,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        grad,
-        indices,
-        use_locking=self._use_locking)
+    super(AdagradOptimizer, self).__init__(
+        learning_rate=learning_rate,
+        initial_accumulator_value=initial_accumulator_value,
+        name=name)
diff --git a/tensorflow/contrib/optimizer_v2/adagrad_test.py b/tensorflow/contrib/optimizer_v2/adagrad_test.py
index debaaaeeba..320e41567f 100644
--- a/tensorflow/contrib/optimizer_v2/adagrad_test.py
+++ b/tensorflow/contrib/optimizer_v2/adagrad_test.py
@@ -68,9 +68,6 @@ class AdagradOptimizerTest(test.TestCase):
   def testBasicResource(self):
     self.doTestBasic(use_locking=False, use_resource=True)
 
-  def testBasicLocked(self):
-    self.doTestBasic(use_locking=True)
-
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index 04b1552b61..363e020757 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -18,22 +18,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.optimizer_v2 import optimizer_v2
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.training import training_ops
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.util import deprecation
 
 
-class AdamOptimizer(optimizer_v2.OptimizerV2):
+class AdamOptimizer(adam.Adam):
   """Optimizer that implements the Adam algorithm.
 
   See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
   ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
   """
 
+  @deprecation.deprecated_args(
+      "2018-10-01",
+      "`use_locking = True` is no longer supported and will be ignored.",
+      ("use_locking", [False]))
   def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                use_locking=False, name="Adam"):
     """Construct a new Adam optimizer.
@@ -87,111 +86,9 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adam".
     """
-    super(AdamOptimizer, self).__init__(use_locking, name)
-
-    self._set_hyper("learning_rate", learning_rate)
-    self._set_hyper("beta1", beta1)
-    self._set_hyper("beta2", beta2)
-    self._set_hyper("epsilon", epsilon)
-
-  def _get_beta_accumulators(self, state=None):
-    if state is None:
-      state = self._get_per_graph_state()
-    return (state.get_non_slot("beta1_power"),
-            state.get_non_slot("beta2_power"))
-
-  def _create_vars(self, var_list, state):
-    # Non-slot variables end up on the same device(s).
-    state.create_non_slot(initial_value=lambda: state.get_hyper("beta1"),
-                          name="beta1_power")
-    state.create_non_slot(initial_value=lambda: state.get_hyper("beta2"),
-                          name="beta2_power")
-
-    # Create slots for the first and second moments.
-    for v in var_list:
-      state.zeros_slot(v, "m")
-      state.zeros_slot(v, "v")
-
-  def _apply_dense(self, grad, var, state):
-    m = state.get_slot(var, "m")
-    v = state.get_slot(var, "v")
-    beta1_power, beta2_power = self._get_beta_accumulators(state)
-    return training_ops.apply_adam(
-        var, m, v,
-        math_ops.cast(beta1_power, var.dtype.base_dtype),
-        math_ops.cast(beta2_power, var.dtype.base_dtype),
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        state.get_hyper("beta1", var.dtype.base_dtype),
-        state.get_hyper("beta2", var.dtype.base_dtype),
-        state.get_hyper("epsilon", var.dtype.base_dtype),
-        grad, use_locking=self._use_locking).op
-
-  def _resource_apply_dense(self, grad, var, state):
-    m = state.get_slot(var, "m")
-    v = state.get_slot(var, "v")
-    beta1_power, beta2_power = self._get_beta_accumulators(state)
-    return training_ops.resource_apply_adam(
-        var.handle, m.handle, v.handle,
-        math_ops.cast(beta1_power, grad.dtype.base_dtype),
-        math_ops.cast(beta2_power, grad.dtype.base_dtype),
-        state.get_hyper("learning_rate", grad.dtype.base_dtype),
-        state.get_hyper("beta1", grad.dtype.base_dtype),
-        state.get_hyper("beta2", grad.dtype.base_dtype),
-        state.get_hyper("epsilon", grad.dtype.base_dtype),
-        grad, use_locking=self._use_locking)
-
-  def _apply_sparse_shared(self, grad, var, indices, scatter_add, state):
-    beta1_power, beta2_power = self._get_beta_accumulators(state)
-    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
-    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
-    lr_t = state.get_hyper("learning_rate", var.dtype.base_dtype)
-    beta1_t = state.get_hyper("beta1", var.dtype.base_dtype)
-    beta2_t = state.get_hyper("beta2", var.dtype.base_dtype)
-    epsilon_t = state.get_hyper("epsilon", var.dtype.base_dtype)
-    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
-    # m_t = beta1 * m + (1 - beta1) * g_t
-    m = state.get_slot(var, "m")
-    m_scaled_g_values = grad * (1 - beta1_t)
-    m_t = state_ops.assign(m, m * beta1_t,
-                           use_locking=self._use_locking)
-    with ops.control_dependencies([m_t]):
-      m_t = scatter_add(m, indices, m_scaled_g_values)
-    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
-    v = state.get_slot(var, "v")
-    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
-    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
-    with ops.control_dependencies([v_t]):
-      v_t = scatter_add(v, indices, v_scaled_g_values)
-    v_sqrt = math_ops.sqrt(v_t)
-    var_update = state_ops.assign_sub(var,
-                                      lr * m_t / (v_sqrt + epsilon_t),
-                                      use_locking=self._use_locking)
-    return control_flow_ops.group(*[var_update, m_t, v_t])
-
-  def _apply_sparse(self, grad, var, state):
-    return self._apply_sparse_shared(
-        grad.values, var, grad.indices,
-        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
-            x, i, v, use_locking=self._use_locking),
-        state)
-
-  def _resource_scatter_add(self, x, i, v):
-    with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_add(
-            x.handle, i, v)]):
-      return x.value()
-
-  def _resource_apply_sparse(self, grad, var, indices, state):
-    return self._apply_sparse_shared(
-        grad, var, indices, self._resource_scatter_add, state)
-
-  def _finish(self, state):
-    # Update the power accumulators.
-    beta1_power, beta2_power = self._get_beta_accumulators(state)
-    update_beta1 = beta1_power.assign(
-        beta1_power * state.get_hyper("beta1"),
-        use_locking=self._use_locking)
-    update_beta2 = beta2_power.assign(
-        beta2_power * state.get_hyper("beta2"),
-        use_locking=self._use_locking)
-    return control_flow_ops.group(update_beta1, update_beta2)
+    super(AdamOptimizer, self).__init__(
+        learning_rate=learning_rate,
+        beta_1=beta1,
+        beta_2=beta2,
+        epsilon=epsilon,
+        name=name)
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index e13b82d1d2..3c68ef995a 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -130,8 +130,8 @@ class CheckpointingTests(test.TestCase):
         # non-Layer dependency of the model
         "model/_non_layer/a_variable",
         # The optimizer creates two non-slot variables
-        "optimizer/beta1_power",
-        "optimizer/beta2_power",
+        "optimizer/beta_1_power",
+        "optimizer/beta_2_power",
         # Slot variables
         "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
         "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
@@ -161,21 +161,20 @@ class CheckpointingTests(test.TestCase):
         "my_model/dense/kernel",
         named_variables["model/_named_dense/kernel" + suffix].full_name)
     self.assertEqual(
-        "beta1_power",
-        named_variables["optimizer/beta1_power" + suffix].full_name)
+        "beta_1_power",
+        named_variables["optimizer/beta_1_power" + suffix].full_name)
     self.assertEqual(
-        "beta2_power",
-        named_variables["optimizer/beta2_power" + suffix].full_name)
+        "beta_2_power",
+        named_variables["optimizer/beta_2_power" + suffix].full_name)
     # Spot check the generated protocol buffers.
     self.assertEqual("optimizer",
                      serialized_graph.nodes[0].children[1].local_name)
     optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
         1].node_id]
-    self.assertEqual("beta1_power",
-                     optimizer_node.children[0].local_name)
-    self.assertEqual("beta1_power",
-                     serialized_graph.nodes[optimizer_node.children[0].node_id]
-                     .attributes[0].full_name)
+    self.assertEqual("beta_1_power", optimizer_node.children[0].local_name)
+    self.assertEqual(
+        "beta_1_power", serialized_graph.nodes[
+            optimizer_node.children[0].node_id].attributes[0].full_name)
     self.assertEqual(
         "my_model/dense/kernel",
         serialized_graph.nodes[optimizer_node.slot_variables[0]
@@ -241,9 +240,10 @@ class CheckpointingTests(test.TestCase):
     on_create_model = MyModel()
     on_create_optimizer = adam.AdamOptimizer(
         0.001,
-        # Preserve beta1_power and beta2_power when appying gradients so we can
-        # test that they've been restored correctly.
-        beta1=1.0, beta2=1.0)
+        # Preserve beta_1_power and beta_2_power when appying gradients
+        # so we can test that they've been restored correctly.
+        beta1=1.0,
+        beta2=1.0)
     on_create_root = util.Checkpoint(
         optimizer=on_create_optimizer, model=on_create_model)
     # Deferred restoration
@@ -263,9 +263,9 @@ class CheckpointingTests(test.TestCase):
     dummy_var = resource_variable_ops.ResourceVariable([1.])
     on_create_optimizer.minimize(loss=dummy_var.read_value)
     status.assert_consumed()
-    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
-    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
-    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+    beta_1_power, beta_2_power = on_create_optimizer._get_beta_accumulators()
+    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta_1_power))
+    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta_2_power))
 
   # TODO(allenl): Debug garbage created by this test in python3.
   def testDeferredRestorationUsageEager(self):
@@ -477,7 +477,7 @@ class CheckpointingTests(test.TestCase):
     no_slot_status.run_restore_ops()
     self.assertEqual(12., self.evaluate(new_root.var))
     new_root.optimizer = adam.AdamOptimizer(0.1)
-    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
+    with self.assertRaisesRegexp(AssertionError, "beta_1_power"):
       slot_status.assert_consumed()
     self.assertEqual(12., self.evaluate(new_root.var))
     if context.executing_eagerly():
@@ -556,8 +556,8 @@ class CheckpointingTests(test.TestCase):
         self.evaluate(first_variable.assign([1.]))
         self.evaluate(optimizer.get_slot(
             var=first_variable, name="m").assign([2.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.evaluate(beta1_power.assign(3.))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta_1_power.assign(3.))
 
       # Save and load in a second graph
       second_graph = ops.Graph()
@@ -571,29 +571,29 @@ class CheckpointingTests(test.TestCase):
         self.evaluate(second_variable.assign([4.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([5.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.evaluate(beta1_power.assign(6.))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta_1_power.assign(6.))
         save_path = second_root_checkpointable.save(checkpoint_prefix)
         self.evaluate(second_variable.assign([7.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([8.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(6., self.evaluate(beta1_power))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta_1_power))
         status = second_root_checkpointable.restore(save_path)
         status.assert_consumed().run_restore_ops()
         self.assertAllEqual([4.], self.evaluate(second_variable))
         self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
             var=second_variable, name="m")))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(6., self.evaluate(beta1_power))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta_1_power))
 
       # Check that the first graph is unmolested
       with first_graph.as_default(), first_session.as_default():
         self.assertAllEqual([1.], self.evaluate(first_variable))
         self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
             var=first_variable, name="m")))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(3., self.evaluate(beta1_power))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(3., self.evaluate(beta_1_power))
 
 
 class TemplateTests(test.TestCase):
@@ -659,8 +659,8 @@ class CheckpointCompatibilityTests(test.TestCase):
     self.evaluate(model._named_dense.bias.assign([1.]))
     self.evaluate(optimizer.get_slot(
         var=model._named_dense.bias, name="m").assign([2.]))
-    beta1_power, _ = optimizer._get_beta_accumulators()
-    self.evaluate(beta1_power.assign(3.))
+    beta_1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta_1_power.assign(3.))
     return root_checkpointable
 
   def _set_sentinels(self, root_checkpointable):
@@ -669,8 +669,8 @@ class CheckpointCompatibilityTests(test.TestCase):
         root_checkpointable.optimizer.get_slot(
             var=root_checkpointable.model._named_dense.bias, name="m")
         .assign([102.]))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
-    self.evaluate(beta1_power.assign(103.))
+    beta_1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta_1_power.assign(103.))
 
   def _check_sentinels(self, root_checkpointable):
     self.assertAllEqual(
@@ -678,8 +678,8 @@ class CheckpointCompatibilityTests(test.TestCase):
     self.assertAllEqual([2.], self.evaluate(
         root_checkpointable.optimizer.get_slot(
             var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
-    self.assertAllEqual(3., self.evaluate(beta1_power))
+    beta_1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta_1_power))
 
   def _write_name_based_checkpoint(self):
     checkpoint_directory = self.get_temp_dir()
diff --git a/tensorflow/contrib/optimizer_v2/gradient_descent.py b/tensorflow/contrib/optimizer_v2/gradient_descent.py
index 945c8de559..8bdf408217 100644
--- a/tensorflow/contrib/optimizer_v2/gradient_descent.py
+++ b/tensorflow/contrib/optimizer_v2/gradient_descent.py
@@ -18,15 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.optimizer_v2 import optimizer_v2
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training import training_ops
+from tensorflow.python.keras.optimizer_v2 import sgd
+from tensorflow.python.util import deprecation
 
 
-class GradientDescentOptimizer(optimizer_v2.OptimizerV2):
+class GradientDescentOptimizer(sgd.SGD):
   """Optimizer that implements the gradient descent algorithm."""
 
+  @deprecation.deprecated_args(
+      "2018-10-01",
+      "`use_locking = True` is no longer supported and will be ignored.",
+      ("use_locking", [False]))
   def __init__(self, learning_rate, use_locking=False, name="GradientDescent"):
     """Construct a new gradient descent optimizer.
 
@@ -41,29 +43,5 @@ class GradientDescentOptimizer(optimizer_v2.OptimizerV2):
       name: Optional name prefix for the operations created when applying
         gradients. Defaults to "GradientDescent".
     """
-    super(GradientDescentOptimizer, self).__init__(use_locking, name)
-    self._set_hyper("learning_rate", learning_rate)
-
-  def _apply_dense(self, grad, var, state):
-    return training_ops.apply_gradient_descent(
-        var,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        grad,
-        use_locking=self._use_locking).op
-
-  def _resource_apply_dense(self, grad, handle, state):
-    lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
-    return training_ops.resource_apply_gradient_descent(
-        handle.handle, lr, grad, use_locking=self._use_locking)
-
-  def _resource_apply_sparse_duplicate_indices(
-      self, grad, handle, indices, state):
-    lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
-    return resource_variable_ops.resource_scatter_add(
-        handle.handle, indices, -grad * lr)
-
-  def _apply_sparse_duplicate_indices(self, grad, var, state):
-    delta = ops.IndexedSlices(
-        grad.values * state.get_hyper("learning_rate", var.dtype.base_dtype),
-        grad.indices, grad.dense_shape)
-    return var.scatter_sub(delta, use_locking=self._use_locking)
+    super(GradientDescentOptimizer, self).__init__(
+        learning_rate=learning_rate, name=name)
diff --git a/tensorflow/contrib/optimizer_v2/momentum.py b/tensorflow/contrib/optimizer_v2/momentum.py
index 0a5aadc2d1..0636f7e356 100644
--- a/tensorflow/contrib/optimizer_v2/momentum.py
+++ b/tensorflow/contrib/optimizer_v2/momentum.py
@@ -18,11 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.optimizer_v2 import optimizer_v2
-from tensorflow.python.training import training_ops
+from tensorflow.python.keras.optimizer_v2 import sgd
+from tensorflow.python.util import deprecation
 
 
-class MomentumOptimizer(optimizer_v2.OptimizerV2):
+class MomentumOptimizer(sgd.SGD):
   """Optimizer that implements the Momentum algorithm.
 
   Computes (if `use_nesterov = False`):
@@ -39,6 +39,10 @@ class MomentumOptimizer(optimizer_v2.OptimizerV2):
   when that part of the variable was used in the forward pass.
   """
 
+  @deprecation.deprecated_args(
+      "2018-10-01",
+      "`use_locking = True` is no longer supported and will be ignored.",
+      ("use_locking", [False]))
   def __init__(self, learning_rate, momentum,
                use_locking=False, name="Momentum", use_nesterov=False):
     """Construct a new Momentum optimizer.
@@ -68,57 +72,8 @@ class MomentumOptimizer(optimizer_v2.OptimizerV2):
     optimizer functions.
     @end_compatibility
     """
-    super(MomentumOptimizer, self).__init__(use_locking, name)
-    self._set_hyper("learning_rate", learning_rate)
-    self._set_hyper("momentum", momentum)
-    self._use_nesterov = use_nesterov
-
-  def _create_vars(self, var_list, state):
-    for v in var_list:
-      state.zeros_slot(v, "momentum")
-
-  def _apply_dense(self, grad, var, state):
-    mom = state.get_slot(var, "momentum")
-    return training_ops.apply_momentum(
-        var,
-        mom,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        grad,
-        state.get_hyper("momentum", var.dtype.base_dtype),
-        use_locking=self._use_locking,
-        use_nesterov=self._use_nesterov).op
-
-  def _resource_apply_dense(self, grad, var, state):
-    mom = state.get_slot(var, "momentum")
-    return training_ops.resource_apply_momentum(
-        var.handle,
-        mom.handle,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        grad,
-        state.get_hyper("momentum", var.dtype.base_dtype),
-        use_locking=self._use_locking,
-        use_nesterov=self._use_nesterov)
-
-  def _apply_sparse(self, grad, var, state):
-    mom = state.get_slot(var, "momentum")
-    return training_ops.sparse_apply_momentum(
-        var,
-        mom,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        grad.values,
-        grad.indices,
-        state.get_hyper("momentum", var.dtype.base_dtype),
-        use_locking=self._use_locking,
-        use_nesterov=self._use_nesterov).op
-
-  def _resource_apply_sparse(self, grad, var, indices, state):
-    mom = state.get_slot(var, "momentum")
-    return training_ops.resource_sparse_apply_momentum(
-        var.handle,
-        mom.handle,
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        grad,
-        indices,
-        state.get_hyper("momentum", var.dtype.base_dtype),
-        use_locking=self._use_locking,
-        use_nesterov=self._use_nesterov)
+    super(MomentumOptimizer, self).__init__(
+        learning_rate=learning_rate,
+        momentum=momentum,
+        name=name,
+        nesterov=use_nesterov)
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 53e27c08c4..9c98dd93b4 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -20,462 +20,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import abc
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.util import deprecation
 
-from tensorflow.python.eager import backprop
-from tensorflow.python.eager import context
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gradients
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.training import distribution_strategy_context
-from tensorflow.python.training import optimizer as optimizer_v1
-from tensorflow.python.training import slot_creator
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.util import nest
 
-
-class _OptimizableVariable(object):
-  """Interface for abstracting over variables in the optimizers."""
-
-  @abc.abstractmethod
-  def target(self):
-    """Returns the optimization target for this variable."""
-    raise NotImplementedError("Calling an abstract method.")
-
-  @abc.abstractmethod
-  def update_op(self, optimizer, g, *args):
-    """Returns the update ops for updating the variable."""
-    raise NotImplementedError("Calling an abstract method.")
-
-
-class _RefVariableProcessor(_OptimizableVariable):
-  """Processor for Variable."""
-
-  def __init__(self, v):
-    self._v = v
-
-  def target(self):
-    return self._v._ref()  # pylint: disable=protected-access
-
-  def update_op(self, optimizer, g, *args):
-    if isinstance(g, ops.Tensor):
-      update_op = optimizer._apply_dense(g, self._v, *args)  # pylint: disable=protected-access
-      if self._v.constraint is not None:
-        with ops.control_dependencies([update_op]):
-          return self._v.assign(self._v.constraint(self._v))
-      else:
-        return update_op
-    else:
-      assert isinstance(g, ops.IndexedSlices), ("Gradient ", g, " is neither a "
-                                                "tensor nor IndexedSlices.")
-      if self._v.constraint is not None:
-        raise RuntimeError(
-            "Cannot use a constraint function on a sparse variable.")
-      # pylint: disable=protected-access
-      return optimizer._apply_sparse_duplicate_indices(g, self._v, *args)
-
-
-class _DenseReadResourceVariableProcessor(_OptimizableVariable):
-  """Processor for dense ResourceVariables."""
-
-  def __init__(self, v):
-    self._v = v
-
-  def target(self):
-    return self._v
-
-  def update_op(self, optimizer, g, *args):
-    # pylint: disable=protected-access
-    update_op = optimizer._resource_apply_dense(g, self._v.op.inputs[0], *args)
-    if self._v.constraint is not None:
-      with ops.control_dependencies([update_op]):
-        return self._v.assign(self._v.constraint(self._v))
-    else:
-      return update_op
-
-
-class _DenseResourceVariableProcessor(_OptimizableVariable):
-  """Processor for dense ResourceVariables."""
-
-  def __init__(self, v):
-    self._v = v
-
-  def target(self):
-    return self._v
-
-  def update_op(self, optimizer, g, *args):
-    # pylint: disable=protected-access
-    if isinstance(g, ops.IndexedSlices):
-      if self._v.constraint is not None:
-        raise RuntimeError(
-            "Cannot use a constraint function on a sparse variable.")
-      return optimizer._resource_apply_sparse_duplicate_indices(
-          g.values, self._v, g.indices, *args)
-    update_op = optimizer._resource_apply_dense(g, self._v, *args)
-    if self._v.constraint is not None:
-      with ops.control_dependencies([update_op]):
-        return self._v.assign(self._v.constraint(self._v))
-    else:
-      return update_op
-
-
-class _TensorProcessor(_OptimizableVariable):
-  """Processor for ordinary Tensors.
-
-  Even though a Tensor can't really be updated, sometimes it is useful to
-  compute the gradients with respect to a Tensor using the optimizer. Updating
-  the Tensor is, of course, unsupported.
-  """
-
-  def __init__(self, v):
-    self._v = v
-
-  def target(self):
-    return self._v
-
-  def update_op(self, optimizer, g, *args):
-    raise NotImplementedError("Trying to update a Tensor ", self._v)
-
-
-def _get_processor(v):
-  """The processor of v."""
-  if context.executing_eagerly():
-    if isinstance(v, ops.Tensor):
-      return _TensorProcessor(v)
-    else:
-      return _DenseResourceVariableProcessor(v)
-  if v.op.type == "VarHandleOp":
-    return _DenseResourceVariableProcessor(v)
-  if isinstance(v, variables.Variable):
-    return _RefVariableProcessor(v)
-  if isinstance(v, ops.Tensor):
-    return _TensorProcessor(v)
-  raise NotImplementedError("Trying to optimize unsupported type ", v)
-
-
-def _var_key_v2(var):
-  """Key for representing a primary variable, for looking up slots."""
-  # pylint: disable=protected-access
-  if hasattr(var, "_distributed_container"):
-    distributed_container = var._distributed_container()
-    assert distributed_container is not None
-    if context.executing_eagerly():
-      return distributed_container._unique_id
-    return distributed_container._shared_name
-  if context.executing_eagerly():
-    return var._unique_id
-  return var.op.name
-
-
-def _resolve(value, name):
-  if callable(value):
-    value = value()
-  return ops.convert_to_tensor(value, name=name)
-
-
-def _is_dynamic(value):
-  """Returns true if __init__ arg `value` should be re-evaluated each step."""
-  if callable(value): return True
-  # Don't need to do anything special in graph mode, since dynamic values
-  # will propagate correctly automatically.
-  # TODO(josh11b): Add per-device caching across steps using variables for
-  # truly static values once we add distributed support.
-  if context.executing_eagerly() and isinstance(
-      value, resource_variable_ops.ResourceVariable):
-    return True
-  return False
-
-
-class _OptimizerV2State(object):
-  """Holds per-graph and per-step optimizer state.
-
-  Use _init_with_static_hyper() to create the state for a graph, and then
-  _copy_with_dynamic_hyper() to convert that to state for a particular step.
-  The difference between the two is that the former only has hyper
-  parameter values that are static and the latter also has values that
-  can change every step (according to _is_dynamic()).
-  """
-
-  def __init__(self, op_name):
-    self._op_name = op_name
-
-  def _init_with_static_hyper(self, hyper):
-    """Initialize a fresh state object from hyper dict."""
-    # self._hyper contains a dict from name to a dict with the Tensor values.
-    # This dict starts with a single item with key "None" with the hyper
-    # parameter value converted to a Tensor. Other items have dtype keys
-    # with that Tensor cast to that dtype.
-    with ops.init_scope():
-      self._hyper = {name: {None: ops.convert_to_tensor(value, name=name)}
-                     for name, (dynamic, value) in sorted(hyper.items())
-                     if not dynamic}
-    self._slots = {}
-    self._non_slot_dict = {}
-    # Extra state to help Optimizers implement Checkpointable. Holds information
-    # about variables which will be restored as soon as they're created.
-    self._deferred_dependencies = {}  # Non-slot variables
-    self._deferred_slot_restorations = {}  # Slot variables
-
-  def _copy_with_dynamic_hyper(self, hyper, distribution, non_slot_devices):
-    """Create a new state object for a particular step."""
-    ret = _OptimizerV2State(self._op_name)
-    # pylint: disable=protected-access
-    ret._slots = self._slots
-    ret._non_slot_dict = self._non_slot_dict
-    ret._deferred_dependencies = self._deferred_dependencies
-    ret._deferred_slot_restorations = self._deferred_slot_restorations
-    ret._hyper = {name: {None: _resolve(value, name)}
-                  for name, (dynamic, value) in sorted(hyper.items())
-                  if dynamic}
-    ret._hyper.update(self._hyper)
-    ret._non_slot_devices = non_slot_devices
-    ret._distribution = distribution
-    return ret
-
-  def _variables(self):
-    """Returns a list of all variables held by self."""
-    optimizer_variables = list(self._non_slot_dict.values())
-    for variable_dict in self._slots.values():
-      for slot_for_variable in variable_dict.values():
-        optimizer_variables.append(slot_for_variable)
-    # Sort variables by name so that the return is deterministic.
-    return sorted(optimizer_variables, key=lambda v: v.name)
-
-  def _slot_dict(self, slot_name):
-    """Returns a dict for caching slots created under the given name.
-
-    Args:
-      slot_name: Name for the slot.
-
-    Returns:
-      A dict that maps primary `Variable` objects to the slot created
-      for that variable, under the given slot name.
-    """
-    named_slots = self._slots.get(slot_name, None)
-    if named_slots is None:
-      named_slots = {}
-      self._slots[slot_name] = named_slots
-    return named_slots
-
-  def create_slot(self, var, val, slot_name, optional_op_name=None):
-    """Find or create a slot for a variable.
-
-    Args:
-      var: A `Variable` object.
-      val: A `Tensor`.  The initial value of the slot.
-      slot_name: Name for the slot.
-      optional_op_name: Name to use when scoping the Variable that
-        needs to be created for the slot.
-
-    Returns:
-      A `Variable` object.
-    """
-    named_slots = self._slot_dict(slot_name)
-    var_key = _var_key_v2(var)
-    if var_key not in named_slots:
-      new_slot_variable = slot_creator.create_slot(
-          var, val, optional_op_name or self._op_name)
-      self._restore_slot_variable(
-          slot_name=slot_name, variable=var,
-          slot_variable=new_slot_variable)
-      named_slots[var_key] = new_slot_variable
-    return named_slots[var_key]
-
-  def create_slot_with_initializer(self, var, initializer, shape, dtype,
-                                   slot_name, optional_op_name=None):
-    """Find or create a slot for a variable, using an Initializer.
-
-    Args:
-      var: A `Variable` object.
-      initializer: An `Initializer`.  The initial value of the slot.
-      shape: Shape of the initial value of the slot.
-      dtype: Type of the value of the slot.
-      slot_name: Name for the slot.
-      optional_op_name: Name to use when scoping the Variable that
-        needs to be created for the slot.
-
-    Returns:
-      A `Variable` object.
-    """
-    named_slots = self._slot_dict(slot_name)
-    var_key = _var_key_v2(var)
-    if var_key not in named_slots:
-      new_slot_variable = slot_creator.create_slot_with_initializer(
-          var, initializer, shape, dtype, optional_op_name or self._op_name)
-      self._restore_slot_variable(
-          slot_name=slot_name, variable=var,
-          slot_variable=new_slot_variable)
-      named_slots[var_key] = new_slot_variable
-    return named_slots[var_key]
-
-  def zeros_slot(self, var, slot_name, optional_op_name=None):
-    """Find or create a slot initialized with 0.0.
-
-    Args:
-      var: A `Variable` object.
-      slot_name: Name for the slot.
-      optional_op_name: Name to use when scoping the Variable that
-        needs to be created for the slot.
-
-    Returns:
-      A `Variable` object.
-    """
-    named_slots = self._slot_dict(slot_name)
-    var_key = _var_key_v2(var)
-    if var_key not in named_slots:
-      new_slot_variable = slot_creator.create_zeros_slot(
-          var, optional_op_name or self._op_name)
-      self._restore_slot_variable(
-          slot_name=slot_name, variable=var,
-          slot_variable=new_slot_variable)
-      named_slots[var_key] = new_slot_variable
-    return named_slots[var_key]
-
-  def _create_or_restore_slot_variable(
-      self, slot_variable_position, slot_name, variable,
-      optional_op_name=None):
-    """Restore a slot variable's value, possibly creating it.
-
-    Called when a variable which has an associated slot variable is created or
-    restored. When executing eagerly, we create the slot variable with a
-    restoring initializer.
-
-    No new variables are created when graph building. Instead,
-    _restore_slot_variable catches these after normal creation and adds restore
-    ops to the graph. This method is nonetheless important when graph building
-    for the case when a slot variable has already been created but `variable`
-    has just been added to a dependency graph (causing us to realize that the
-    slot variable needs to be restored).
-
-    Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
-      slot_name: The name of this `Optimizer`'s slot to restore into.
-      variable: The variable object this slot is being created for.
-      optional_op_name: Name to use when scoping the Variable that
-        needs to be created for the slot.
-    """
-    slot_variable = self.get_slot(var=variable, name=slot_name)
-    if (slot_variable is None and context.executing_eagerly() and
-        slot_variable_position.is_simple_variable()
-        # Defer slot variable creation if there is an active variable creator
-        # scope. Generally we'd like to eagerly create/restore slot variables
-        # when possible, but this may mean that scopes intended to catch
-        # `variable` also catch its eagerly created slot variable
-        # unintentionally (specifically make_template would add a dependency on
-        # a slot variable if not for this case). Deferring is mostly harmless
-        # (aside from double initialization), and makes variable creator scopes
-        # behave the same way they do when graph building.
-        and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
-      initializer = checkpointable.CheckpointInitialValue(
-          checkpoint_position=slot_variable_position)
-      slot_variable = self.create_slot(
-          var=variable,
-          val=initializer,
-          slot_name=slot_name,
-          optional_op_name=optional_op_name)
-      # Optimizers do not have unconditional dependencies on their slot
-      # variables (nor do any other objects). They are only saved if the
-      # variables they were created for are also saved.
-    if slot_variable is not None:
-      # If we've either made this slot variable, or if we've pulled out an
-      # existing slot variable, we should restore it.
-      slot_variable_position.restore(slot_variable)
-    else:
-      # We didn't make the slot variable. Defer restoring until it gets created
-      # normally. We keep a list rather than the one with the highest restore
-      # UID in case slot variables have their own dependencies, in which case
-      # those could differ between restores.
-      variable_key = _var_key_v2(variable)
-      self._deferred_slot_restorations.setdefault(
-          slot_name, {}).setdefault(variable_key, []).append(
-              slot_variable_position)
-
-  def get_slot(self, var, name):
-    """Return a slot named `name` created for `var` by the Optimizer.
-
-    Some `Optimizer` subclasses use additional variables.  For example
-    `Momentum` and `Adagrad` use variables to accumulate updates.  This method
-    gives access to these `Variable` objects if for some reason you need them.
-
-    Use `get_slot_names()` to get the list of slot names created by the
-    `Optimizer`.
-
-    Args:
-      var: A variable passed to `minimize()` or `apply_gradients()`.
-      name: A string.
-
-    Returns:
-      The `Variable` for the slot if it was created, `None` otherwise.
-    """
-    named_slots = self._slots.get(name, None)
-    if not named_slots:
-      return None
-    return named_slots.get(_var_key_v2(var), None)
-
-  def get_slot_names(self):
-    """Return a list of the names of slots created by the `Optimizer`.
-
-    See `get_slot()`.
-
-    Returns:
-      A list of strings.
-    """
-    return sorted(self._slots.keys())
-
-  def create_non_slot(self, initial_value, name, colocate_with=None):
-    """Add an extra variable, not associated with a slot."""
-    v = self._non_slot_dict.get(name, None)
-    if v is None:
-      if colocate_with is None: colocate_with = self._non_slot_devices
-      with self._distribution.colocate_vars_with(colocate_with):
-        # TODO(josh11b): Use get_variable() except for the legacy Adam use case.
-        v = variable_scope.variable(initial_value, name=name, trainable=False)
-      self._non_slot_dict[name] = v
-      deferred_dependencies_list = self._deferred_dependencies.pop(name, ())
-      for checkpoint_position in sorted(
-          deferred_dependencies_list,
-          key=lambda restore: restore.checkpoint.restore_uid,
-          reverse=True):
-        checkpoint_position.restore(v)
-    return v
-
-  def _restore_slot_variable(self, slot_name, variable, slot_variable):
-    """Restore a newly created slot variable's value."""
-    variable_key = _var_key_v2(variable)
-    deferred_restorations = self._deferred_slot_restorations.get(
-        slot_name, {}).pop(variable_key, [])
-    # Iterate over restores, highest restore UID first to minimize the number
-    # of assignments.
-    deferred_restorations.sort(key=lambda position: position.restore_uid,
-                               reverse=True)
-    for checkpoint_position in deferred_restorations:
-      checkpoint_position.restore(slot_variable)
-
-  def get_non_slot(self, name):
-    """Returns the non-slot variable identified by `name`."""
-    return self._non_slot_dict.get(name, None)
-
-  def get_hyper(self, name, dtype=None):
-    """Returns the `name` hyper parameter, optionally cast to `dtype`."""
-    dtype_dict = self._hyper[name]
-    # Do we have the value cast to dtype already cached? This should always
-    # succeed when dtype is None.
-    if dtype in dtype_dict:
-      return dtype_dict[dtype]
-    # Not cached, cast to dtype and save the result in the cache.
-    result = math_ops.cast(dtype_dict[None], dtype)
-    dtype_dict[dtype] = result
-    return result
-
-
-class OptimizerV2(optimizer_v1.Optimizer):
+class OptimizerV2(optimizer_v2.OptimizerV2):
   """Updated base class for optimizers.
 
   This class defines the API to add Ops to train a model.  You never use this
@@ -586,6 +135,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
   GATE_OP = 1
   GATE_GRAPH = 2
 
+  @deprecation.deprecated_args(
+      "2018-10-01",
+      "`use_locking = True` is no longer supported and will be ignored.",
+      ("use_locking", [False]))
   def __init__(self, use_locking, name):
     """Create a new Optimizer.
 
@@ -606,746 +159,4 @@ class OptimizerV2(optimizer_v1.Optimizer):
       RuntimeError: If _create_slots has been overridden instead of
           _create_vars.
     """
-    # Note: We intentionally don't call parent __init__.
-
-    # Optimizer._create_slots was replaced by _create_vars in OptimizerV2.
-    if (self.__class__._create_slots.__code__ is not  # pylint: disable=protected-access
-        OptimizerV2._create_slots.__code__):
-      raise RuntimeError("Override _create_vars instead of _create_slots when "
-                         "descending from OptimizerV2 (class %s)" %
-                         self.__class__.__name__)
-    if not name:
-      raise ValueError("Must specify the optimizer name")
-
-    self._use_locking = use_locking
-    self._name = name
-    # Map from graph_key to state for that graph. We use the graph_key
-    # since it works in both eager and graph mode, and gives the outer
-    # graph inside functions.
-    tower_context = distribution_strategy_context.get_tower_context()
-    if tower_context is None:
-      # In a cross-tower context for a DistributionStrategy, which means
-      # only one Optimizer will be created, not one per tower.
-      self._per_graph_state = {}
-    else:
-      # We use get_tower_context().merge_call() to get a single dict
-      # shared across all model replicas when running with a
-      # DistributionStrategy.
-      self._per_graph_state = tower_context.merge_call(lambda _: {})
-
-    # Hyper parameters, and whether they should be re-evaluated every step.
-    self._hyper = {}
-
-  def _set_hyper(self, name, value):
-    self._hyper[name] = (_is_dynamic(value), value)
-
-  def minimize(self, loss, global_step=None, var_list=None,
-               gate_gradients=GATE_OP, aggregation_method=None,
-               colocate_gradients_with_ops=False, name=None,
-               grad_loss=None, stop_gradients=None,
-               scale_loss_by_num_towers=None):
-    """Add operations to minimize `loss` by updating `var_list`.
-
-    This method simply combines calls `compute_gradients()` and
-    `apply_gradients()`. If you want to process the gradient before applying
-    them call `compute_gradients()` and `apply_gradients()` explicitly instead
-    of using this function.
-
-    Args:
-      loss: A `Tensor` containing the value to minimize.
-      global_step: Optional `Variable` to increment by one after the
-        variables have been updated.
-      var_list: Optional list or tuple of `Variable` objects to update to
-        minimize `loss`.  Defaults to the list of variables collected in
-        the graph under the key `GraphKeys.TRAINABLE_VARIABLES`.
-      gate_gradients: How to gate the computation of gradients.  Can be
-        `GATE_NONE`, `GATE_OP`, or  `GATE_GRAPH`.
-      aggregation_method: Specifies the method used to combine gradient terms.
-        Valid values are defined in the class `AggregationMethod`.
-      colocate_gradients_with_ops: If True, try colocating gradients with
-        the corresponding op.
-      name: Optional name for the returned operation.
-      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
-      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
-        through.
-      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
-        down by the number of towers. By default, auto-detects whether this
-        is needed.
-
-    Returns:
-      An Operation that updates the variables in `var_list`.  If `global_step`
-      was not `None`, that operation also increments `global_step`.
-
-    Raises:
-      ValueError: If some of the variables are not `Variable` objects.
-
-    @compatibility(eager)
-    When eager execution is enabled, `loss` should be a Python function that
-    takes elements of `var_list` as arguments and computes the value to be
-    minimized. If `var_list` is None, `loss` should take no arguments.
-    Minimization (and gradient computation) is done with respect to the
-    elements of `var_list` if not None, else with respect to any trainable
-    variables created during the execution of the `loss` function.
-    `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and
-    `grad_loss` are ignored when eager execution is enabled.
-    @end_compatibility
-    """
-    grads_and_vars = self.compute_gradients(
-        loss, var_list=var_list, gate_gradients=gate_gradients,
-        aggregation_method=aggregation_method,
-        colocate_gradients_with_ops=colocate_gradients_with_ops,
-        grad_loss=grad_loss, stop_gradients=stop_gradients,
-        scale_loss_by_num_towers=scale_loss_by_num_towers)
-
-    vars_with_grad = [v for g, v in grads_and_vars if g is not None]
-    if not vars_with_grad:
-      raise ValueError(
-          "No gradients provided for any variable, check your graph for ops"
-          " that do not support gradients, between variables %s and loss %s." %
-          ([str(v) for _, v in grads_and_vars], loss))
-
-    return self.apply_gradients(grads_and_vars, global_step=global_step,
-                                name=name)
-
-  def compute_gradients(self, loss, var_list=None,
-                        gate_gradients=GATE_OP,
-                        aggregation_method=None,
-                        colocate_gradients_with_ops=False,
-                        grad_loss=None, stop_gradients=None,
-                        scale_loss_by_num_towers=None):
-    """Compute gradients of `loss` for the variables in `var_list`.
-
-    This is the first part of `minimize()`.  It returns a list
-    of (gradient, variable) pairs where "gradient" is the gradient
-    for "variable".  Note that "gradient" can be a `Tensor`, an
-    `IndexedSlices`, or `None` if there is no gradient for the
-    given variable.
-
-    Args:
-      loss: A Tensor containing the value to minimize or a callable taking
-        no arguments which returns the value to minimize. When eager execution
-        is enabled it must be a callable.
-      var_list: Optional list or tuple of `tf.Variable` to update to minimize
-        `loss`.  Defaults to the list of variables collected in the graph
-        under the key `GraphKeys.TRAINABLE_VARIABLES`.
-      gate_gradients: How to gate the computation of gradients.  Can be
-        `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
-      aggregation_method: Specifies the method used to combine gradient terms.
-        Valid values are defined in the class `AggregationMethod`.
-      colocate_gradients_with_ops: If True, try colocating gradients with
-        the corresponding op.
-      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
-      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
-        through.
-      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
-        down by the number of towers. By default, auto-detects whether this
-        is needed.
-
-    Returns:
-      A list of (gradient, variable) pairs. Variable is always present, but
-      gradient can be `None`.
-
-    Raises:
-      TypeError: If `var_list` contains anything else than `Variable` objects.
-      ValueError: If some arguments are invalid.
-      RuntimeError: If called with eager execution enabled and `loss` is
-        not callable.
-
-    @compatibility(eager)
-    When eager execution is enabled, `gate_gradients`, `aggregation_method`,
-    and `colocate_gradients_with_ops` are ignored.
-    @end_compatibility
-    """
-    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
-    if callable(loss):
-      with backprop.GradientTape() as tape:
-        if var_list is not None:
-          tape.watch(var_list)
-        loss_value = loss()
-
-        # Scale loss for number of towers (callable-loss case). In this case,
-        # we have to be careful to call distribute_lib.get_loss_reduction()
-        # *after* loss() is evaluated, so we know what loss reduction it uses.
-        if scale_loss_by_num_towers is None:
-          scale_loss_by_num_towers = (
-              distribute_lib.get_loss_reduction() ==
-              variable_scope.VariableAggregation.MEAN)
-        if scale_loss_by_num_towers:
-          num_towers = distribution_strategy_context.get_distribution_strategy(
-          ).num_towers
-          if num_towers > 1:
-            loss_value *= 1. / num_towers
-
-      if var_list is None:
-        var_list = tape.watched_variables()
-      grads = tape.gradient(loss_value, var_list, grad_loss)
-      return list(zip(grads, var_list))
-    if context.executing_eagerly():
-      raise RuntimeError(
-          "`loss` passed to Optimizer.compute_gradients should "
-          "be a function when eager execution is enabled.")
-
-    # Scale loss for number of towers (non-callable-loss case).
-    if scale_loss_by_num_towers is None:
-      scale_loss_by_num_towers = (
-          distribute_lib.get_loss_reduction() ==
-          variable_scope.VariableAggregation.MEAN)
-    if scale_loss_by_num_towers:
-      num_towers = distribution_strategy_context.get_distribution_strategy(
-      ).num_towers
-      if num_towers > 1:
-        loss *= 1. / num_towers
-
-    if gate_gradients not in [optimizer_v1.Optimizer.GATE_NONE,
-                              optimizer_v1.Optimizer.GATE_OP,
-                              optimizer_v1.Optimizer.GATE_GRAPH]:
-      raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
-                       "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" %
-                       gate_gradients)
-    self._assert_valid_dtypes([loss])
-    if grad_loss is not None:
-      self._assert_valid_dtypes([grad_loss])
-    if var_list is None:
-      var_list = (
-          variables.trainable_variables() +
-          ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
-    else:
-      var_list = nest.flatten(var_list)
-    # pylint: disable=protected-access
-    var_list += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS)
-    # pylint: enable=protected-access
-    processors = [_get_processor(v) for v in var_list]
-    if not var_list:
-      raise ValueError("No variables to optimize.")
-    var_refs = [p.target() for p in processors]
-    grads = gradients.gradients(
-        loss, var_refs, grad_ys=grad_loss,
-        gate_gradients=(gate_gradients == optimizer_v1.Optimizer.GATE_OP),
-        aggregation_method=aggregation_method,
-        colocate_gradients_with_ops=colocate_gradients_with_ops,
-        stop_gradients=stop_gradients)
-    if gate_gradients == optimizer_v1.Optimizer.GATE_GRAPH:
-      grads = control_flow_ops.tuple(grads)
-    grads_and_vars = list(zip(grads, var_list))
-    self._assert_valid_dtypes(
-        [v for g, v in grads_and_vars
-         if g is not None and v.dtype != dtypes.resource])
-    return grads_and_vars
-
-  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-    """Apply gradients to variables.
-
-    This is the second part of `minimize()`. It returns an `Operation` that
-    applies gradients.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs as returned by
-        `compute_gradients()`.
-      global_step: Optional `Variable` to increment by one after the
-        variables have been updated.
-      name: Optional name for the returned operation.  Default to the
-        name passed to the `Optimizer` constructor.
-
-    Returns:
-      An `Operation` that applies the specified gradients. If `global_step`
-      was not None, that operation also increments `global_step`.
-
-    Raises:
-      TypeError: If `grads_and_vars` is malformed.
-      ValueError: If none of the variables have gradients.
-    """
-    # This is a default implementation of apply_gradients() that can be shared
-    # by most optimizers.  It relies on the subclass implementing the following
-    # methods: _create_vars(), _prepare(), _apply_dense(), and _apply_sparse().
-
-    # Filter out variables with gradients of `None`.
-    grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
-    if not grads_and_vars:
-      raise ValueError("No variables provided.")
-    filtered = tuple((g, v) for (g, v) in grads_and_vars if g is not None)
-    if not filtered:
-      raise ValueError("No gradients provided for any variable: %s." %
-                       ([str(v) for _, v in grads_and_vars],))
-    return distribution_strategy_context.get_tower_context().merge_call(
-        self._distributed_apply, filtered, global_step=global_step, name=name)
-
-  def _get_or_create_state(self, var_list=None):
-    """Either looks up or creates `_OptimizerV2State`.
-
-    If any variables are available, they should be passed via the `var_list`
-    argument, and these will be used to determine the graph to create/retrieve
-    state for. Otherwise the returned state is for the current default graph.
-
-    Args:
-      var_list: A list of variables to extract a graph from.
-
-    Returns:
-      An `_OptimizerV2State` object.
-    """
-    # Determine the graph_key from the current graph.
-    eager_execution = context.executing_eagerly()
-    if eager_execution or var_list is None:
-      graph = ops.get_default_graph()
-    else:
-      graph = ops._get_graph_from_inputs(var_list)  # pylint: disable=protected-access
-    assert graph is not None
-    graph_key = graph._graph_key  # pylint: disable=protected-access
-
-    # Get the per graph state by looking up the graph_key.
-    if graph_key in self._per_graph_state:
-      per_graph_state = self._per_graph_state[graph_key]
-    else:
-      per_graph_state = _OptimizerV2State(self._name)
-      per_graph_state._init_with_static_hyper(self._hyper)  # pylint: disable=protected-access
-      self._per_graph_state[graph_key] = per_graph_state
-    return per_graph_state
-
-  def _distributed_apply(self, distribution, grads_and_vars, global_step, name):
-    """`apply_gradients` for use with a `DistributionStrategy`."""
-    reduced_grads = distribution.batch_reduce(
-        variable_scope.VariableAggregation.SUM, grads_and_vars)
-    var_list = [v for _, v in grads_and_vars]
-    grads_and_vars = zip(reduced_grads, var_list)
-
-    unwrapped_var_list = [x for v in var_list for x in distribution.unwrap(v)]
-    eager_execution = context.executing_eagerly()
-    if eager_execution:
-      # Give a clear error in this case instead of "name not supported
-      # for Eager Tensors" when we compute non_slot_devices.
-      for v in unwrapped_var_list:
-        if isinstance(v, ops.Tensor):
-          raise NotImplementedError("Trying to update a Tensor ", v)
-
-    with ops.name_scope(name, self._name) as name:
-      per_graph_state = self._get_or_create_state(var_list=unwrapped_var_list)
-      # Include the current value of any dynamic hyper parameters in `state`.
-      non_slot_devices = distribution.non_slot_devices(var_list)
-      state = per_graph_state._copy_with_dynamic_hyper(  # pylint: disable=protected-access
-          self._hyper, distribution, non_slot_devices)
-
-    # Create any slot and non-slot variables we need in `state`.
-    with ops.init_scope():
-      self._create_vars(var_list, state)
-
-    with ops.name_scope(name):  # Re-enter name_scope created above
-      # Give the child class a chance to do something before we start
-      # applying gradients.
-      self._prepare(state)
-
-      def update(v, g):
-        """Update variable `v` using gradient `g`."""
-        assert v is not None
-
-        # Convert the grad to Tensor or IndexedSlices if necessary, and
-        # look up a processor for each variable's type.
-        try:
-          g = ops.convert_to_tensor_or_indexed_slices(g)
-        except TypeError:
-          raise TypeError(
-              "Gradient must be convertible to a Tensor"
-              " or IndexedSlices, or None: %s" % g)
-        if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
-          raise TypeError(
-              "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
-        processor = _get_processor(v)
-
-        # We colocate all ops created in _apply_dense or _apply_sparse
-        # on the same device as the variable.
-        # TODO(apassos): figure out how to get the variable name here.
-        scope_name = "" if eager_execution else v.op.name
-        # device_policy is set because non-mirrored tensors will be read in
-        # `update_op`.
-        # TODO(josh11b): Make different state objects for each device to
-        # avoid needing to set the device_policy.
-        with ops.name_scope("update_" + scope_name), \
-            context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-          return processor.update_op(self, g, state)
-
-      # Use the processors to update the variables.
-      update_ops = []
-      for grad, var in grads_and_vars:
-        update_ops.extend(distribution.update(var, update, grad, grouped=False))
-
-      # Give the child class a chance to do something after applying
-      # gradients
-      def finish():
-        # TODO(josh11b): Make different state objects for each device to
-        # avoid needing to set the device_policy.
-        with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-          return self._finish(state)
-
-      update_ops = control_flow_ops.group(update_ops)
-      with ops.control_dependencies([update_ops]):
-        finish_updates = distribution.update_non_slot(
-            non_slot_devices, finish, grouped=False)
-      # We said grouped=False, which means finish_updates is always a list.
-      # It will be [None] when finish() returns None.
-      if finish_updates == [None]:
-        finish_updates = [update_ops]
-
-      # Update `global_step` (if any).
-      if global_step is None:
-        apply_updates = distribution.group(finish_updates, name=name)
-      else:
-        with ops.control_dependencies(finish_updates):
-
-          def update_global_step(global_step, name):
-            return global_step.assign_add(1, read_value=False, name=name)
-
-          apply_updates = distribution.update(
-              global_step, update_global_step, name)
-
-      # Add the training op to the TRAIN_OP graph collection in graph mode.
-      if not eager_execution:
-        if isinstance(apply_updates, ops.Tensor):
-          apply_updates = apply_updates.op
-        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
-        if apply_updates not in train_op:
-          train_op.append(apply_updates)
-
-      return apply_updates
-
-  def get_slot(self, var, name):
-    """Return a slot named `name` created for `var` by the Optimizer.
-
-    Some `Optimizer` subclasses use additional variables.  For example
-    `Momentum` and `Adagrad` use variables to accumulate updates.  This method
-    gives access to these `Variable` objects if for some reason you need them.
-
-    Use `get_slot_names()` to get the list of slot names created by the
-    `Optimizer`.
-
-    Args:
-      var: A variable passed to `minimize()` or `apply_gradients()`.
-      name: A string.
-
-    Returns:
-      The `Variable` for the slot if it was created, `None` otherwise.
-    """
-    state = self._get_state_for_var(var)
-    return state.get_slot(var, name) if state is not None else None
-
-  def get_slot_names(self):
-    """Return a list of the names of slots created by the `Optimizer`.
-
-    See `get_slot()`.
-
-    Returns:
-      A list of strings.
-    """
-    state = self._get_per_graph_state()
-    return state.get_slot_names() if state is not None else []
-
-  def variables(self):
-    """A list of variables which encode the current state of `Optimizer`.
-
-    Includes slot variables and additional global variables created by the
-    optimizer in the current default graph.
-
-    Returns:
-      A list of variables.
-    """
-    state = self._get_per_graph_state()
-    return state._variables() if state is not None else []  # pylint: disable=protected-access
-
-  # --------------
-  # Methods to be implemented by subclasses if they want to use the
-  # inherited implementation of apply_gradients() or compute_gradients().
-  # --------------
-  def _create_vars(self, var_list, state):
-    """Create all slots needed by the variables and any non-slot variables.
-
-    Args:
-      var_list: A list of `Variable` objects.
-      state: An object with these methods:
-        `create_slot(var, val, slot_name, optional_op_name)`,
-        `create_slot_with_initializer(`
-            `var, initializer, shape, dtype, slot_name, optional_op_name)`,
-        `zeros_slot(var, slot_name, optional_op_name)`,
-        `create_non_slot_variable(initial_value, name, colocate_with)`,
-        `get_hyper(name)`
-    """
-    # No slots needed by default
-    pass
-
-  def _prepare(self, state):
-    """Code to execute before applying gradients.
-
-    Note that most uses of _prepare() in Optimizer have been subsumed
-    by explicit support for hyper parameters in OptimizerV2
-
-    Args:
-      state: An object with a `get_hyper(name)` method.
-
-    Returns:
-      Return value will be ignored.
-    """
-    pass
-
-  def _apply_dense(self, grad, var, state):
-    """Add ops to apply dense gradients to `var`.
-
-    Args:
-      grad: A `Tensor`.
-      var: A `Variable` object.
-      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
-        and `get_hyper(name)` methods.
-
-    Returns:
-      An `Operation`.
-    """
-    raise NotImplementedError()
-
-  def _resource_apply_dense(self, grad, handle, state):
-    """Add ops to apply dense gradients to the variable `handle`.
-
-    Args:
-      grad: a `Tensor` representing the gradient.
-      handle: a `Tensor` of dtype `resource` which points to the variable
-       to be updated.
-      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
-        and `get_hyper(name)` methods.
-
-    Returns:
-      An `Operation` which updates the value of the variable.
-    """
-    raise NotImplementedError()
-
-  def _resource_apply_sparse_duplicate_indices(
-      self, grad, handle, indices, state):
-    """Add ops to apply sparse gradients to `handle`, with repeated indices.
-
-    Optimizers which override this method must deal with repeated indices. See
-    the docstring of `_apply_sparse_duplicate_indices` for details. By default
-    the correct behavior, to sum non-unique indices and their associated
-    gradients, is enforced by first pre-processing `grad` and `indices` and
-    passing them on to `_resource_apply_sparse`. Optimizers which deal correctly
-    with duplicate indices may instead override this method to avoid the
-    overhead of summing.
-
-    Args:
-      grad: a `Tensor` representing the gradient for the affected indices.
-      handle: a `Tensor` of dtype `resource` which points to the variable
-       to be updated.
-      indices: a `Tensor` of integral type representing the indices for
-       which the gradient is nonzero. Indices may be repeated.
-      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
-        and `get_hyper(name)` methods.
-
-    Returns:
-      An `Operation` which updates the value of the variable.
-    """
-    # pylint: disable=protected-access
-    summed_grad, unique_indices = optimizer_v1._deduplicate_indexed_slices(
-        values=grad, indices=indices)
-    # pylint: enable=protected-access
-    return self._resource_apply_sparse(
-        summed_grad, handle, unique_indices, state)
-
-  def _resource_apply_sparse(self, grad, handle, indices, state):
-    """Add ops to apply sparse gradients to the variable `handle`.
-
-    Similar to `_apply_sparse`, the `indices` argument to this method has been
-    de-duplicated. Optimizers which deal correctly with non-unique indices may
-    instead override `_resource_apply_sparse_duplicate_indices` to avoid this
-    overhead.
-
-    Args:
-      grad: a `Tensor` representing the gradient for the affected indices.
-      handle: a `Tensor` of dtype `resource` which points to the variable
-       to be updated.
-      indices: a `Tensor` of integral type representing the indices for
-       which the gradient is nonzero. Indices are unique.
-      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
-        and `get_hyper(name)` methods.
-
-    Returns:
-      An `Operation` which updates the value of the variable.
-    """
-    raise NotImplementedError()
-
-  def _apply_sparse_duplicate_indices(self, grad, var, state):
-    """Add ops to apply sparse gradients to `var`, with repeated sparse indices.
-
-    Optimizers which override this method must deal with IndexedSlices objects
-    such as the following:
-
-      IndexedSlicesValue(values=[1, 1], indices=[0, 0], dense_shape=[1])
-
-    The correct interpretation is:
-
-      IndexedSlicesValue(values=[2], indices=[0], dense_shape=[1])
-
-    Many optimizers deal incorrectly with repeated indices when updating based
-    on sparse gradients (e.g. summing squares rather than squaring the sum, or
-    applying momentum terms multiple times). Adding first is always the correct
-    behavior, so this is enforced here by reconstructing the IndexedSlices to
-    have only unique indices, then calling _apply_sparse.
-
-    Optimizers which deal correctly with repeated indices may instead override
-    this method to avoid the overhead of summing indices.
-
-    Args:
-      grad: `IndexedSlices`.
-      var: A `Variable` object.
-      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
-        and `get_hyper(name)` methods.
-
-    Returns:
-      An `Operation`.
-    """
-    # pylint: disable=protected-access
-    summed_values, unique_indices = optimizer_v1._deduplicate_indexed_slices(
-        values=grad.values, indices=grad.indices)
-    # pylint: enable=protected-access
-    gradient_no_duplicate_indices = ops.IndexedSlices(
-        indices=unique_indices,
-        values=summed_values,
-        dense_shape=grad.dense_shape)
-    return self._apply_sparse(gradient_no_duplicate_indices, var, state)
-
-  def _apply_sparse(self, grad, var, state):
-    """Add ops to apply sparse gradients to `var`.
-
-    The IndexedSlices object passed to `grad` in this function is by default
-    pre-processed in `_apply_sparse_duplicate_indices` to remove duplicate
-    indices (see its docstring for details). Optimizers which can tolerate or
-    have correct special cases for duplicate sparse indices may override
-    `_apply_sparse_duplicate_indices` instead of this function, avoiding that
-    overhead.
-
-    Args:
-      grad: `IndexedSlices`, with no repeated indices.
-      var: A `Variable` object.
-      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
-        and `get_hyper(name)` methods.
-
-    Returns:
-      An `Operation`.
-    """
-    raise NotImplementedError()
-
-  def _finish(self, state):
-    """Do what is needed to finish the update.
-
-    This is called inside a scope colocated with any non-slot variables.
-
-    Args:
-      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
-        and `get_hyper(name)` methods.
-
-    Returns:
-      The operation to apply updates, or None if no updates.
-    """
-    return None
-
-  # --------------
-  # Utility methods for subclasses.
-  # --------------
-  def _get_per_graph_state(self):
-    # pylint: disable=protected-access
-    return self._per_graph_state.get(ops.get_default_graph()._graph_key, None)
-
-  def _get_state_for_var(self, var):
-    # pylint: disable=protected-access
-    return self._per_graph_state.get(var._graph_key, None)
-
-  # --------------
-  # Overridden methods from Checkpointable.
-  # --------------
-
-  def _track_checkpointable(self, *args, **kwargs):
-    """Optimizers may not track dependencies. Raises an error."""
-    raise NotImplementedError(
-        "Optimizers may not have dependencies. File a feature request if this "
-        "limitation bothers you.")
-
-  @property
-  def _checkpoint_dependencies(self):
-    """From Checkpointable. Gather graph-specific non-slot variables to save."""
-    current_graph_non_slot_variables = []
-    state = self._get_per_graph_state()
-    if state is not None:
-      for name, variable_object in sorted(
-          state._non_slot_dict.items(),  # pylint: disable=protected-access
-          # Avoid comparing variables
-          key=lambda item: item[0]):
-        current_graph_non_slot_variables.append(
-            checkpointable.CheckpointableReference(
-                name=name, ref=variable_object))
-    # Note: ignores super(); Optimizers may not have any dependencies outside of
-    # state objects.
-    return current_graph_non_slot_variables
-
-  def _lookup_dependency(self, name):
-    """From Checkpointable. Find a non-slot variable in the current graph."""
-    state = self._get_per_graph_state()
-    if state is None:
-      return None
-    else:
-      return state.get_non_slot(name)
-
-  @property
-  def _deferred_dependencies(self):
-    """Lets Checkpointable know where non-slot variables are created.
-
-    If necessary, creates a new state object for the current default graph.
-    Checkpointable will then add entries to that state's deferred dependency
-    dictionary. The state object will check that dictionary when creating
-    non-slot variables, restoring their value if an entry is found.
-
-    Returns:
-      A dictionary which holds deferred dependencies for the current default
-      graph.
-    """
-    state = self._get_or_create_state()
-    return state._deferred_dependencies  # pylint: disable=protected-access
-
-  def _create_or_restore_slot_variable(
-      self, slot_variable_position, slot_name, variable):
-    """Checkpointable: Restore a slot variable's value, possibly creating it.
-
-    Called when a variable which has an associated slot variable is created or
-    restored.
-
-    Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
-      slot_name: The name of this `Optimizer`'s slot to restore into.
-      variable: The variable object this slot is being created for.
-    """
-    state = self._get_or_create_state(var_list=[variable])
-    state._create_or_restore_slot_variable(  # pylint: disable=protected-access
-        slot_variable_position=slot_variable_position,
-        slot_name=slot_name,
-        variable=variable,
-        optional_op_name=self._name)
-
-  # --------------
-  # Unsupported parent methods
-  # --------------
-  def _slot_dict(self, slot_name):
-    raise NotImplementedError(
-        "_slot_dict() method unsupported in OptimizerV2")
-
-  def _get_or_make_slot(self, var, val, slot_name, op_name):
-    raise NotImplementedError(
-        "_get_or_make_slot() method unsupported in OptimizerV2")
-
-  def _get_or_make_slot_with_initializer(self, var, initializer, shape, dtype,
-                                         slot_name, op_name):
-    raise NotImplementedError(
-        "_get_or_make_slot_with_initializer() method unsupported in "
-        "OptimizerV2")
-
-  def _create_non_slot_variable(self, initial_value, name, colocate_with):
-    raise NotImplementedError(
-        "_create_non_slot_variable() method unsupported in OptimizerV2")
-
-  def _get_non_slot_variable(self, name, graph=None):
-    raise NotImplementedError(
-        "_get_non_slot_variable() method unsupported in OptimizerV2")
-
-  def _non_slot_variables(self):
-    raise NotImplementedError(
-        "_non_slot_variables() method unsupported in OptimizerV2")
+    super(OptimizerV2, self).__init__(name)
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop.py b/tensorflow/contrib/optimizer_v2/rmsprop.py
index 3de53405ec..090e257ddc 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop.py
@@ -41,19 +41,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.optimizer_v2 import optimizer_v2
-from tensorflow.python.ops import array_ops
+from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.util import deprecation
 
-from tensorflow.python.training import training_ops
 
-
-class RMSPropOptimizer(optimizer_v2.OptimizerV2):
+class RMSPropOptimizer(rmsprop.RMSProp):
   """Optimizer that implements the RMSProp algorithm.
 
   See the
   [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
   """
 
+  @deprecation.deprecated_args(
+      "2018-10-01",
+      "`use_locking = True` is no longer supported and will be ignored.",
+      ("use_locking", [False]))
   def __init__(self,
                learning_rate,
                decay=0.9,
@@ -96,138 +98,10 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
       name: Optional name prefix for the operations created when applying
         gradients. Defaults to "RMSProp".
     """
-    super(RMSPropOptimizer, self).__init__(use_locking, name)
-    self._set_hyper("learning_rate", learning_rate)
-    self._set_hyper("decay", decay)
-    self._set_hyper("momentum", momentum)
-    self._set_hyper("epsilon", epsilon)
-
-    self._centered = centered
-
-  def _create_vars(self, var_list, state):
-    for v in var_list:
-      init_rms = state.get_hyper(
-          "epsilon", v.dtype.base_dtype) * array_ops.ones_like(v)
-      state.create_slot_with_initializer(v, init_rms, v.get_shape(),
-                                         v.dtype.base_dtype, "rms")
-      if self._centered:
-        state.zeros_slot(v, "mg")
-      state.zeros_slot(v, "momentum")
-
-  def _apply_dense(self, grad, var, state):
-    rms = state.get_slot(var, "rms")
-    mom = state.get_slot(var, "momentum")
-    if self._centered:
-      mg = state.get_slot(var, "mg")
-      return training_ops.apply_centered_rms_prop(
-          var,
-          mg,
-          rms,
-          mom,
-          state.get_hyper("learning_rate", var.dtype.base_dtype),
-          state.get_hyper("decay", var.dtype.base_dtype),
-          state.get_hyper("momentum", var.dtype.base_dtype),
-          # epsilon is now the rms initial value and is not added to the
-          # denominator anymore, hence calling the kernel op with epsilon=0.
-          0,
-          grad,
-          use_locking=self._use_locking).op
-    else:
-      return training_ops.apply_rms_prop(
-          var,
-          rms,
-          mom,
-          state.get_hyper("learning_rate", var.dtype.base_dtype),
-          state.get_hyper("decay", var.dtype.base_dtype),
-          state.get_hyper("momentum", var.dtype.base_dtype),
-          0,
-          grad,
-          use_locking=self._use_locking).op
-
-  def _resource_apply_dense(self, grad, var, state):
-    rms = state.get_slot(var, "rms")
-    mom = state.get_slot(var, "momentum")
-    if self._centered:
-      mg = state.get_slot(var, "mg")
-      return training_ops.resource_apply_centered_rms_prop(
-          var.handle,
-          mg.handle,
-          rms.handle,
-          mom.handle,
-          state.get_hyper("learning_rate", var.dtype.base_dtype),
-          state.get_hyper("decay", var.dtype.base_dtype),
-          state.get_hyper("momentum", var.dtype.base_dtype),
-          0,
-          grad,
-          use_locking=self._use_locking)
-    else:
-      return training_ops.resource_apply_rms_prop(
-          var.handle,
-          rms.handle,
-          mom.handle,
-          state.get_hyper("learning_rate", var.dtype.base_dtype),
-          state.get_hyper("decay", var.dtype.base_dtype),
-          state.get_hyper("momentum", var.dtype.base_dtype),
-          0,
-          grad,
-          use_locking=self._use_locking)
-
-  def _apply_sparse(self, grad, var, state):
-    rms = state.get_slot(var, "rms")
-    mom = state.get_slot(var, "momentum")
-    if self._centered:
-      mg = state.get_slot(var, "mg")
-      return training_ops.sparse_apply_centered_rms_prop(
-          var,
-          mg,
-          rms,
-          mom,
-          state.get_hyper("learning_rate", var.dtype.base_dtype),
-          state.get_hyper("decay", var.dtype.base_dtype),
-          state.get_hyper("momentum", var.dtype.base_dtype),
-          0,
-          grad.values,
-          grad.indices,
-          use_locking=self._use_locking)
-    else:
-      return training_ops.sparse_apply_rms_prop(
-          var,
-          rms,
-          mom,
-          state.get_hyper("learning_rate", var.dtype.base_dtype),
-          state.get_hyper("decay", var.dtype.base_dtype),
-          state.get_hyper("momentum", var.dtype.base_dtype),
-          0,
-          grad.values,
-          grad.indices,
-          use_locking=self._use_locking)
-
-  def _resource_apply_sparse(self, grad, var, indices, state):
-    rms = state.get_slot(var, "rms")
-    mom = state.get_slot(var, "momentum")
-    if self._centered:
-      mg = self.get_slot(var, "mg")
-      return training_ops.resource_sparse_apply_centered_rms_prop(
-          var.handle,
-          mg.handle,
-          rms.handle,
-          mom.handle,
-          state.get_hyper("learning_rate", var.dtype.base_dtype),
-          state.get_hyper("decay", var.dtype.base_dtype),
-          state.get_hyper("momentum", var.dtype.base_dtype),
-          0,
-          grad,
-          indices,
-          use_locking=self._use_locking)
-    else:
-      return training_ops.resource_sparse_apply_rms_prop(
-          var.handle,
-          rms.handle,
-          mom.handle,
-          state.get_hyper("learning_rate", var.dtype.base_dtype),
-          state.get_hyper("decay", var.dtype.base_dtype),
-          state.get_hyper("momentum", var.dtype.base_dtype),
-          0,
-          grad,
-          indices,
-          use_locking=self._use_locking)
+    super(RMSPropOptimizer, self).__init__(
+        learning_rate=learning_rate,
+        rho=decay,
+        momentum=momentum,
+        epsilon=epsilon,
+        centered=centered,
+        name=name)
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 4a72c4b3f3..c4d23f117f 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -62,6 +62,7 @@ py_library(
         ":backend",
         ":engine",
         ":layers",
+        ":optimizer_v2",
         "//tensorflow/python/saved_model",
         "//tensorflow/python:training",
     ],
@@ -189,6 +190,30 @@ py_library(
     ],
 )
 
+py_library(
+    name = "optimizer_v2",
+    srcs = [
+        "optimizer_v2/adadelta.py",
+        "optimizer_v2/adagrad.py",
+        "optimizer_v2/adam.py",
+        "optimizer_v2/optimizer_v2.py",
+        "optimizer_v2/rmsprop.py",
+        "optimizer_v2/sgd.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
 py_test(
     name = "integration_test",
     size = "medium",
@@ -827,3 +852,133 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
+
+cuda_py_test(
+    name = "adadelta_test",
+    size = "medium",
+    srcs = ["optimizer_v2/adadelta_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "adagrad_test",
+    size = "small",
+    srcs = ["optimizer_v2/adagrad_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "adam_test",
+    size = "small",
+    srcs = ["optimizer_v2/adam_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "checkpointable_utils_test",
+    srcs = ["optimizer_v2/checkpointable_utils_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "@six_archive//:six",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:layers_base",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras",
+    ],
+    tags = ["notsan"],
+)
+
+cuda_py_test(
+    name = "sgd_test",
+    size = "medium",
+    srcs = ["optimizer_v2/sgd_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+cuda_py_test(
+    name = "optimizer_v2_test",
+    size = "medium",
+    srcs = ["optimizer_v2/optimizer_v2_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "rmsprop_test",
+    size = "small",
+    srcs = ["optimizer_v2/rmsprop_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+    tags = ["optonly"],
+)
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta.py b/tensorflow/python/keras/optimizer_v2/adadelta.py
new file mode 100644
index 0000000000..d3b3c9c12e
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adadelta.py
@@ -0,0 +1,116 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adadelta for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.training import training_ops
+
+
+class Adadelta(optimizer_v2.OptimizerV2):
+  """Adadelta optimizer.
+
+  It is recommended to leave the parameters of this optimizer at their default
+  values.
+
+  See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
+  ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
+
+  Some of the args below are hyperparameters, where a hyperparameter is
+  defined as a scalar Tensor, a regular Python value, or a callable (which
+  will be evaluated when `apply_gradients` is called) returning a scalar
+  Tensor or a Python value.
+
+  Arguments:
+      learning_rate: float hyperparameter >= 0. Learning rate. It is recommended
+        to leave it at the default value.
+      rho: float hyperparameter >= 0. The decay rate.
+      epsilon: float hyperparameter >= 0. Fuzz factor. A constant epsilon used
+        to better condition the grad update.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to 'Adadelta'.
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               rho=0.95,
+               epsilon=1e-8,
+               name="Adadelta"):
+    super(Adadelta, self).__init__(name)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("rho", rho)
+    self._set_hyper("epsilon", epsilon)
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      state.zeros_slot(v, "accum")
+      state.zeros_slot(v, "accum_update")
+
+  def _apply_dense(self, grad, var, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.apply_adadelta(
+        var,
+        accum,
+        accum_update,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_dense(self, grad, var, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.resource_apply_adadelta(
+        var.handle,
+        accum.handle,
+        accum_update.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.sparse_apply_adadelta(
+        var,
+        accum,
+        accum_update,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.resource_sparse_apply_adadelta(
+        var.handle,
+        accum.handle,
+        accum_update.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking)
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta_test.py b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
new file mode 100644
index 0000000000..6e48f92e4f
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
@@ -0,0 +1,166 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adadelta Optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras.optimizer_v2 import adadelta
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class AdadeltaOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_resource=False):
+    num_updates = 4  # number of ADADELTA steps to perform
+    for dtype in [dtypes.half, dtypes.float32]:
+      for grad in [0.2, 0.1, 0.01]:
+        for lr in [1.0, 0.5, 0.1]:
+          with self.cached_session():
+            var0_init = [1.0, 2.0]
+            var1_init = [3.0, 4.0]
+            if use_resource:
+              var0 = resource_variable_ops.ResourceVariable(
+                  var0_init, dtype=dtype)
+              var1 = resource_variable_ops.ResourceVariable(
+                  var1_init, dtype=dtype)
+            else:
+              var0 = variables.Variable(var0_init, dtype=dtype)
+              var1 = variables.Variable(var1_init, dtype=dtype)
+
+            grads = constant_op.constant([grad, grad], dtype=dtype)
+
+            accum = 0.0
+            accum_update = 0.0
+
+            # ADADELTA gradient optimizer
+            rho = 0.95
+            epsilon = 1e-8
+            adadelta_opt = adadelta.Adadelta(lr, rho, epsilon)
+            adadelta_update = adadelta_opt.apply_gradients(
+                zip([grads, grads], [var0, var1]))
+
+            opt_vars = adadelta_opt.variables()
+            self.assertStartsWith(opt_vars[0].name, var0._shared_name)
+            self.assertStartsWith(opt_vars[1].name, var0._shared_name)
+            self.assertStartsWith(opt_vars[2].name, var1._shared_name)
+            self.assertStartsWith(opt_vars[3].name, var1._shared_name)
+            self.assertEqual(4, len(opt_vars))
+
+            variables.global_variables_initializer().run()
+
+            # Assign slots
+            slot = [None] * 2
+            slot_update = [None] * 2
+            self.assertEqual(["accum", "accum_update"],
+                             adadelta_opt.get_slot_names())
+            slot[0] = adadelta_opt.get_slot(var0, "accum")
+            self.assertEquals(slot[0].get_shape(), var0.get_shape())
+            self.assertFalse(slot[0] in variables.trainable_variables())
+
+            slot_update[0] = adadelta_opt.get_slot(var0, "accum_update")
+            self.assertEquals(slot_update[0].get_shape(), var0.get_shape())
+            self.assertFalse(slot_update[0] in variables.trainable_variables())
+
+            slot[1] = adadelta_opt.get_slot(var1, "accum")
+            self.assertEquals(slot[1].get_shape(), var1.get_shape())
+            self.assertFalse(slot[1] in variables.trainable_variables())
+
+            slot_update[1] = adadelta_opt.get_slot(var1, "accum_update")
+            self.assertEquals(slot_update[1].get_shape(), var1.get_shape())
+            self.assertFalse(slot_update[1] in variables.trainable_variables())
+
+            # Fetch params to validate initial values
+            self.assertAllClose(var0_init, var0.eval())
+            self.assertAllClose(var1_init, var1.eval())
+
+            update = [None] * num_updates
+            tot_update = 0
+            for step in range(num_updates):
+              # Run adadelta update for comparison
+              adadelta_update.run()
+
+              # Perform initial update without previous accum values
+              accum = accum * rho + (grad**2) * (1 - rho)
+              update[step] = (np.sqrt(accum_update + epsilon) *
+                              (1. / np.sqrt(accum + epsilon)) * grad)
+              accum_update = (accum_update * rho + (update[step]**2) *
+                              (1.0 - rho))
+              tot_update += update[step] * lr
+
+              # Check that the accumulators have been updated
+              for slot_idx in range(2):
+                self.assertAllCloseAccordingToType(
+                    np.array([accum, accum], dtype=dtype.as_numpy_dtype()),
+                    slot[slot_idx].eval(),
+                    rtol=1e-5)
+
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [accum_update, accum_update],
+                        dtype=dtype.as_numpy_dtype()),
+                    slot_update[slot_idx].eval(),
+                    rtol=1e-5)
+
+              # Check that the parameters have been updated
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var0_init[0] - tot_update, var0_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  var0.eval(),
+                  rtol=1e-5)
+
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var1_init[0] - tot_update, var1_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  var1.eval(),
+                  rtol=1e-5)
+
+  def testBasic(self):
+    self.doTestBasic(use_resource=False)
+
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adadelta.Adadelta(1.0, 1.0, 1.0).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[-111, -138]], var0.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad.py b/tensorflow/python/keras/optimizer_v2/adagrad.py
new file mode 100644
index 0000000000..2d8cec2300
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adagrad.py
@@ -0,0 +1,119 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adagrad optimizer for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import training_ops
+
+
+class Adagrad(optimizer_v2.OptimizerV2):
+  """Adagrad optimizer.
+
+  It is recommended to leave the parameters of this optimizer at their default
+  values.
+
+  See this [paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+  or this
+  [intro](https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf).
+
+  The learning_rate arg below is a hyperparameter, where a hyperparameter is
+  defined as a scalar Tensor, a regular Python value, or a callable (which
+  will be evaluated when `apply_gradients` is called) returning a scalar
+  Tensor or a Python value.
+
+  Arguments:
+      learning_rate: float hyperparameter >= 0. Learning rate.
+      initial_accumulator_value: A floating point value. Starting value for the
+        accumulators, must be positive.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to 'Adagrad'.
+
+  Raises:
+    ValueError: If the `initial_accumulator_value` is invalid.
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               initial_accumulator_value=0.1,
+               name="Adagrad"):
+    if initial_accumulator_value <= 0.0:
+      raise ValueError("initial_accumulator_value must be positive: %s" %
+                       initial_accumulator_value)
+    super(Adagrad, self).__init__(name)
+    self._set_hyper("learning_rate", learning_rate)
+
+    self._initial_accumulator_value = initial_accumulator_value
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      dtype = v.dtype.base_dtype
+      if v.get_shape().is_fully_defined():
+        init = init_ops.constant_initializer(self._initial_accumulator_value,
+                                             dtype=dtype)
+      else:
+        def init(v=v, dtype=dtype):
+          # Use a Tensor instead of initializer if variable does not have
+          # static shape.
+          init_constant = gen_array_ops.fill(array_ops.shape(v),
+                                             self._initial_accumulator_value)
+          return math_ops.cast(init_constant, dtype)
+      state.create_slot_with_initializer(v, init, v.get_shape(), dtype,
+                                         "accumulator")
+
+  def _apply_dense(self, grad, var, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.apply_adagrad(
+        var,
+        acc,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_dense(self, grad, var, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.resource_apply_adagrad(
+        var.handle,
+        acc.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.sparse_apply_adagrad(
+        var,
+        acc,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.resource_sparse_apply_adagrad(
+        var.handle,
+        acc.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking)
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad_test.py b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
new file mode 100644
index 0000000000..fc4ef5c399
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
@@ -0,0 +1,276 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for aggregate operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import adagrad
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class AdagradOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+          var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        else:
+          var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+          var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = adagrad.Adagrad(3.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testBasic(self):
+    self.doTestBasic()
+
+  def testBasicResource(self):
+    self.doTestBasic(use_resource=True)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable(
+            [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adagrad.Adagrad(1.0).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType(
+            [[1.0, 2.0], [3.0, 4.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[0, 1], [3, 4]], var0.eval(), atol=0.01)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = adagrad.Adagrad(
+            constant_op.constant(3.0), initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]),
+            constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        ada_opt = adagrad.Adagrad(3.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([[1.0], [2.0]], var0.eval())
+        self.assertAllClose([[3.0], [4.0]], var1.eval())
+        # Run 3 step of sgd
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([[-1.6026098728179932], [2.0]]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([[3.0], [3.715679168701172]]), var1.eval())
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adagrad.Adagrad(3.0).apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adagrad.Adagrad(3.0).apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def testSparseRepeatedIndicesResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var_repeated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_repeated = math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_repeated, [0, 0]))
+        var_aggregated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_aggregated = 2 * math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_aggregated, [0]))
+        update_op_repeated = adagrad.Adagrad(2.0).minimize(loss_repeated)
+        update_op_aggregated = adagrad.Adagrad(2.0).minimize(loss_aggregated)
+        variables.global_variables_initializer().run()
+        self.assertAllCloseAccordingToType(
+            var_repeated.eval(), var_aggregated.eval())
+        for _ in range(3):
+          update_op_repeated.run()
+          update_op_aggregated.run()
+          self.assertAllCloseAccordingToType(
+              var_repeated.eval(), var_aggregated.eval())
+
+  def testSparseStability(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        shape = [1, 6]
+        var0 = variables.Variable(
+            [[
+                0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257,
+                -0.0105945
+            ]],
+            dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [[
+                    -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05,
+                    -8.4877e-05, -9.48906e-05
+                ]],
+                shape=shape,
+                dtype=dtype),
+            constant_op.constant([0]),
+            constant_op.constant(shape))
+        ada_opt = adagrad.Adagrad(1.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        init = variables.global_variables_initializer()
+        for _ in range(100):
+          init.run()
+          ada_update.run()
+          self.assertAllCloseAccordingToType(
+              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval())
+          self.assertAllCloseAccordingToType(
+              np.array([[
+                  0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
+                  -0.01029443
+              ]]), var0.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = adagrad.Adagrad(3.0)
+        # Apply the optimizer twice.  Both applications will use
+        # the same accums.
+        ada_update1 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        ada_update2 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values.
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Mix the first and the second adagrad for 3 steps.
+        ada_update1.run()
+        ada_update2.run()
+        ada_update1.run()
+        # Validate updated params (the same as with only 1 Adagrad).
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testDynamicShapeVariable_Ok(self):
+    with self.cached_session():
+      v = variable_scope.get_variable("v", initializer=constant_op.constant(1.),
+                                      validate_shape=False)
+      self.assertFalse(v.shape.is_fully_defined())
+      # Creating optimizer should cause no exception.
+      adagrad.Adagrad(3.0, initial_accumulator_value=0.1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
new file mode 100644
index 0000000000..8367228d7a
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -0,0 +1,203 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adam optimizer for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training_ops
+
+
+class Adam(optimizer_v2.OptimizerV2):
+  r"""Adam Optimizer.
+
+  Default parameters follow those provided in the original paper.
+
+  See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+
+  Some of the args below are hyperparameters where a hyperparameter is
+  defined as a scalar Tensor, a regular Python value, or a callable (which
+  will be evaluated when `apply_gradients` is called) returning a scalar
+  Tensor or a Python value.
+
+  Initialization:
+
+  $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+  $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+  $$t := 0 \text{(Initialize timestep)}$$
+  The update rule for `variable` with gradient `g` uses an optimization
+  described at the end of section2 of the paper:
+
+  $$t := t + 1$$
+  $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+
+  $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+  $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+  $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+
+  The default value of 1e-8 for epsilon might not be a good default in
+  general. For example, when training an Inception network on ImageNet a
+  current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+  formulation just before Section 2.1 of the Kingma and Ba paper rather than
+  the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
+  hat" in the paper.
+
+  The sparse implementation of this algorithm (used when the gradient is an
+  IndexedSlices object, typically because of `tf.gather` or an embedding
+  lookup in the forward pass) does apply momentum to variable slices even if
+  they were not used in the forward pass (meaning they have a gradient equal
+  to zero). Momentum decay (beta1) is also applied to the entire momentum
+  accumulator. This means that the sparse behavior is equivalent to the dense
+  behavior (in contrast to some momentum implementations which ignore momentum
+  unless a variable slice was actually used).
+
+  Arguments:
+      learning_rate: float hyperparameter >= 0. Learning rate.
+      beta_1: float hyperparameter, 0 < beta_1 < 1. Generally close to 1. The
+        exponential decay rate for the 1st moment estimates.
+      beta_2: float hyperparameter, 0 < beta_2 < 1. Generally close to 1. The
+        exponential decay rate for the 2nd moment estimates.
+      epsilon: float hyperparameter >= 0. Fuzz factor. This epsilon is "epsilon
+        hat" in the Kingma and Ba paper (in the formula just before Section
+        2.1), not the epsilon in Algorithm 1 of the paper.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adam".
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-8,
+               name="Adam"):
+    super(Adam, self).__init__(name)
+
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("beta_1", beta_1)
+    self._set_hyper("beta_2", beta_2)
+    self._set_hyper("epsilon", epsilon)
+
+  def _get_beta_accumulators(self, state=None):
+    if state is None:
+      state = self._get_per_graph_state()
+    return (state.get_non_slot("beta_1_power"),
+            state.get_non_slot("beta_2_power"))
+
+  def _create_vars(self, var_list, state):
+    # Non-slot variables end up on the same device(s).
+    state.create_non_slot(
+        initial_value=lambda: state.get_hyper("beta_1"), name="beta_1_power")
+    state.create_non_slot(
+        initial_value=lambda: state.get_hyper("beta_2"), name="beta_2_power")
+
+    # Create slots for the first and second moments.
+    for v in var_list:
+      state.zeros_slot(v, "m")
+      state.zeros_slot(v, "v")
+
+  def _apply_dense(self, grad, var, state):
+    m = state.get_slot(var, "m")
+    v = state.get_slot(var, "v")
+    beta_1_power, beta_2_power = self._get_beta_accumulators(state)
+    return training_ops.apply_adam(
+        var,
+        m,
+        v,
+        math_ops.cast(beta_1_power, var.dtype.base_dtype),
+        math_ops.cast(beta_2_power, var.dtype.base_dtype),
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("beta_1", var.dtype.base_dtype),
+        state.get_hyper("beta_2", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var, state):
+    m = state.get_slot(var, "m")
+    v = state.get_slot(var, "v")
+    beta_1_power, beta_2_power = self._get_beta_accumulators(state)
+    return training_ops.resource_apply_adam(
+        var.handle,
+        m.handle,
+        v.handle,
+        math_ops.cast(beta_1_power, grad.dtype.base_dtype),
+        math_ops.cast(beta_2_power, grad.dtype.base_dtype),
+        state.get_hyper("learning_rate", grad.dtype.base_dtype),
+        state.get_hyper("beta_1", grad.dtype.base_dtype),
+        state.get_hyper("beta_2", grad.dtype.base_dtype),
+        state.get_hyper("epsilon", grad.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse_shared(self, grad, var, indices, scatter_add, state):
+    beta_1_power, beta_2_power = self._get_beta_accumulators(state)
+    beta_1_power = math_ops.cast(beta_1_power, var.dtype.base_dtype)
+    beta_2_power = math_ops.cast(beta_2_power, var.dtype.base_dtype)
+    lr_t = state.get_hyper("learning_rate", var.dtype.base_dtype)
+    beta_1_t = state.get_hyper("beta_1", var.dtype.base_dtype)
+    beta_2_t = state.get_hyper("beta_2", var.dtype.base_dtype)
+    epsilon_t = state.get_hyper("epsilon", var.dtype.base_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))
+    # m_t = beta_1 * m + (1 - beta_1) * g_t
+    m = state.get_slot(var, "m")
+    m_scaled_g_values = grad * (1 - beta_1_t)
+    m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = scatter_add(m, indices, m_scaled_g_values)
+    # v_t = beta_2 * v + (1 - beta_2) * (g_t * g_t)
+    v = state.get_slot(var, "v")
+    v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
+    v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = scatter_add(v, indices, v_scaled_g_values)
+    v_sqrt = math_ops.sqrt(v_t)
+    var_update = state_ops.assign_sub(var,
+                                      lr * m_t / (v_sqrt + epsilon_t),
+                                      use_locking=self._use_locking)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _apply_sparse(self, grad, var, state):
+    return self._apply_sparse_shared(
+        grad.values, var, grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking),
+        state)
+
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(
+            x.handle, i, v)]):
+      return x.value()
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    return self._apply_sparse_shared(
+        grad, var, indices, self._resource_scatter_add, state)
+
+  def _finish(self, state):
+    # Update the power accumulators.
+    beta_1_power, beta_2_power = self._get_beta_accumulators(state)
+    update_beta_1 = beta_1_power.assign(
+        beta_1_power * state.get_hyper("beta_1"), use_locking=self._use_locking)
+    update_beta_2 = beta_2_power.assign(
+        beta_2_power * state.get_hyper("beta_2"), use_locking=self._use_locking)
+    return control_flow_ops.group(update_beta_1, update_beta_2)
diff --git a/tensorflow/python/keras/optimizer_v2/adam_test.py b/tensorflow/python/keras/optimizer_v2/adam_test.py
new file mode 100644
index 0000000000..77796317a1
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adam_test.py
@@ -0,0 +1,333 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adam optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      alpha=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class AdamOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = adam.Adam()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adam.Adam(3.0)
+        minimize_op = optimizer.minimize(gathered_sum)
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adam.Adam().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adam.Adam().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def doTestBasic(self, use_resource=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(
+              var0_np, name="var0_%d" % i)
+          var1 = resource_variable_ops.ResourceVariable(
+              var1_np, name="var1_%d" % i)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adam.Adam()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        opt_variables = opt.variables()
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+        self.assertTrue(beta1_power is not None)
+        self.assertTrue(beta2_power is not None)
+        self.assertIn(beta1_power, opt_variables)
+        self.assertIn(beta2_power, opt_variables)
+
+        with ops.Graph().as_default():
+          # Shouldn't return non-slot variables from other graphs.
+          self.assertEqual(0, len(opt.variables()))
+
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          elif t > 1:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta2_power))
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/Adam:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
+
+  def testBasic(self):
+    with self.cached_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.Adam(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.Adam()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testTwoSessions(self):
+    optimizer = adam.Adam()
+    g = ops.Graph()
+    with g.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+        optimizer.apply_gradients([(grads0, var0)])
+
+    gg = ops.Graph()
+    with gg.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+
+        # If the optimizer saves any state not keyed by graph the following line
+        # fails.
+        optimizer.apply_gradients([(grads0, var0)])
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adam.Adam(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be two non-slot variables, and two unique slot variables
+      # for v1 and v2 respectively.
+      self.assertEqual(6, len(set(opt.variables())))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/checkpointable_utils_test.py b/tensorflow/python/keras/optimizer_v2/checkpointable_utils_test.py
new file mode 100644
index 0000000000..338c04148b
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/checkpointable_utils_test.py
@@ -0,0 +1,761 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# TODO(josh11b): Forked from contrib/eager/python to test OptimizerV2 the same way
+# OptimizerV1 is tested. This file should be removed once the fork is resolved.
+
+import functools
+import os
+
+import six
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import template
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training import saver as core_saver
+from tensorflow.python.training import training_util
+from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.checkpointable import util
+
+
+class NonLayerCheckpointable(tracking.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = util.add_variable(
+        self, name="a_variable", shape=[])
+
+
+# pylint: disable=not-callable
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class _MirroringSaveable(
+    core_saver.BaseSaverBuilder.ResourceVariableSaveable):
+
+  def __init__(self, primary_variable, mirrored_variable, name):
+    self._primary_variable = primary_variable
+    self._mirrored_variable = mirrored_variable
+    super(_MirroringSaveable, self).__init__(
+        self._primary_variable, "", name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group(
+        self._primary_variable.assign(tensor),
+        self._mirrored_variable.assign(tensor))
+
+
+class CheckpointingTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNamingWithOptimizer(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    # A nuisance Model using the same optimizer. Its slot variables should not
+    # go in the checkpoint, since it is never depended on.
+    other_model = MyModel()
+    optimizer = adam.Adam(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = util.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value),
+          global_step=optimizer_step)
+      optimizer.minimize(
+          lambda: other_model(input_value),
+          global_step=optimizer_step)
+    else:
+      train_op = optimizer.minimize(
+          model(input_value), global_step=optimizer_step)
+      optimizer.minimize(
+          other_model(input_value),
+          global_step=optimizer_step)
+      self.evaluate(util.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    named_variables, serialized_graph, _ = (
+        util._serialize_object_graph(
+            root_checkpointable, saveables_cache=None))
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "optimizer_step",
+        "model/_second/kernel",
+        "model/_named_dense/kernel",
+        "model/_named_dense/bias",
+        # non-Layer dependency of the model
+        "model/_non_layer/a_variable",
+        # The optimizer creates two non-slot variables
+        "optimizer/beta_1_power",
+        "optimizer/beta_2_power",
+        # Slot variables
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
+    )
+    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
+    expected_checkpoint_names = [
+        name + suffix for name in expected_checkpoint_names]
+    # The Dense layers also save get_config() JSON
+    expected_checkpoint_names.extend(
+        ["model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON",
+         "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON"])
+    named_variables = {v.name: v for v in named_variables}
+    six.assertCountEqual(self, expected_checkpoint_names,
+                         named_variables.keys())
+    # Check that we've mapped to the right variable objects (not exhaustive)
+    self.assertEqual(
+        "global_step",
+        named_variables["optimizer_step" + suffix].full_name)
+    self.assertEqual(
+        "my_model/dense_1/kernel",
+        named_variables["model/_second/kernel" + suffix].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel",
+        named_variables["model/_named_dense/kernel" + suffix].full_name)
+    self.assertEqual(
+        "beta_1_power",
+        named_variables["optimizer/beta_1_power" + suffix].full_name)
+    self.assertEqual(
+        "beta_2_power",
+        named_variables["optimizer/beta_2_power" + suffix].full_name)
+    # Spot check the generated protocol buffers.
+    self.assertEqual("optimizer",
+                     serialized_graph.nodes[0].children[1].local_name)
+    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
+        1].node_id]
+    self.assertEqual("beta_1_power", optimizer_node.children[0].local_name)
+    self.assertEqual(
+        "beta_1_power", serialized_graph.nodes[
+            optimizer_node.children[0].node_id].attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .original_variable_node_id]
+        .attributes[0].full_name)
+    # We strip off the :0 suffix, as variable.name-based saving does.
+    self.assertEqual(
+        "my_model/dense/kernel/Adam",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .slot_variable_node_id]
+        .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel/Adam:0",
+        optimizer.get_slot(
+            var=model._named_dense.kernel,
+            name="m").name)
+    self.assertEqual(
+        "model/_named_dense/kernel" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .original_variable_node_id].attributes[0].checkpoint_key)
+    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
+    self.assertEqual(
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .slot_variable_node_id].attributes[0].checkpoint_key)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testSaveRestore(self):
+    model = MyModel()
+    optimizer = adam.Adam(0.001)
+    root_checkpointable = util.Checkpoint(
+        optimizer=optimizer, model=model)
+    input_value = constant_op.constant([[3.]])
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value))
+    else:
+      train_op = optimizer.minimize(model(input_value))
+      # TODO(allenl): Make initialization more pleasant when graph building.
+      root_checkpointable.save_counter  # pylint: disable=pointless-statement
+      self.evaluate(util.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
+    m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
+    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
+    save_path = root_checkpointable.save(file_prefix=prefix)
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
+    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    optimizer_variables = self.evaluate(optimizer.variables())
+    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
+    # Immediate restoration
+    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
+    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
+    if not context.executing_eagerly():
+      return  # Restore-on-create is only supported when executing eagerly
+    on_create_model = MyModel()
+    on_create_optimizer = adam.Adam(
+        0.001,
+        # Preserve beta_1_power and beta_2_power when appying gradients
+        # so we can test that they've been restored correctly.
+        beta_1=1.0,
+        beta_2=1.0)
+    on_create_root = util.Checkpoint(
+        optimizer=on_create_optimizer, model=on_create_model)
+    # Deferred restoration
+    status = on_create_root.restore(save_path=save_path)
+    on_create_model(constant_op.constant([[3.]]))  # create variables
+    self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
+    self.assertAllEqual([42.],
+                        self.evaluate(
+                            on_create_model._named_dense.variables[1]))
+    on_create_m_bias_slot = on_create_optimizer.get_slot(
+        on_create_model._named_dense.variables[1], "m")
+    # Optimizer slot variables are created when the original variable is
+    # restored.
+    self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
+    self.assertAllEqual(optimizer_variables[2:],
+                        self.evaluate(on_create_optimizer.variables()))
+    dummy_var = resource_variable_ops.ResourceVariable([1.])
+    on_create_optimizer.minimize(loss=dummy_var.read_value)
+    status.assert_consumed()
+    beta_1_power, beta_2_power = on_create_optimizer._get_beta_accumulators()
+    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta_1_power))
+    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta_2_power))
+
+  # TODO(allenl): Debug garbage created by this test in python3.
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      model = MyModel()
+      optimizer = adam.Adam(0.001)
+      root = util.Checkpoint(
+          optimizer=optimizer, model=model,
+          optimizer_step=training_util.get_or_create_global_step())
+      root.restore(checkpoint_management.latest_checkpoint(
+          checkpoint_directory))
+      for _ in range(num_training_steps):
+        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
+        input_value = constant_op.constant([[3.]])
+        optimizer.minimize(
+            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
+            global_step=root.optimizer_step)
+      root.save(file_prefix=checkpoint_prefix)
+      self.assertEqual((training_continuation + 1) * num_training_steps,
+                       root.optimizer_step.numpy())
+
+  def testUsageGraph(self):
+    """Expected usage when graph building."""
+    with context.graph_mode():
+      num_training_steps = 10
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      for training_continuation in range(3):
+        with ops.Graph().as_default():
+          model = MyModel()
+          optimizer = adam.Adam(0.001)
+          root = util.Checkpoint(
+              optimizer=optimizer, model=model,
+              global_step=training_util.get_or_create_global_step())
+          input_value = constant_op.constant([[3.]])
+          train_op = optimizer.minimize(
+              model(input_value),
+              global_step=root.global_step)
+          checkpoint_path = checkpoint_management.latest_checkpoint(
+              checkpoint_directory)
+          with self.session(graph=ops.get_default_graph()) as session:
+            status = root.restore(save_path=checkpoint_path)
+            status.initialize_or_restore(session=session)
+            if checkpoint_path is None:
+              self.assertEqual(0, training_continuation)
+              with self.assertRaises(AssertionError):
+                status.assert_consumed()
+            else:
+              status.assert_consumed()
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            root.save(file_prefix=checkpoint_prefix, session=session)
+            self.assertEqual((training_continuation + 1) * num_training_steps,
+                             session.run(root.global_step))
+            self.assertEqual(training_continuation + 1,
+                             session.run(root.save_counter))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAgnosticUsage(self):
+    """Graph/eager agnostic usage."""
+    # Does create garbage when executing eagerly due to ops.Graph() creation.
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        optimizer = adam.Adam(0.001)
+        root = util.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = checkpoint_management.latest_checkpoint(
+            checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        input_value = constant_op.constant([[3.]])
+        train_fn = functools.partial(
+            optimizer.minimize,
+            functools.partial(model, input_value),
+            global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+
+  # pylint: disable=cell-var-from-loop
+  @test_util.run_in_graph_and_eager_modes
+  def testWithDefun(self):
+    num_training_steps = 2
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        # Don't actually train so we can test variable values
+        optimizer = adam.Adam(0.)
+        root = util.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = checkpoint_management.latest_checkpoint(
+            checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        def train_fn():
+          @function.defun
+          def _call_model(x):
+            return model(x)
+          with backprop.GradientTape() as tape:
+            loss = _call_model(constant_op.constant([[3.]]))
+          gradients = tape.gradient(loss, model.variables)
+          return optimizer.apply_gradients(zip(gradients, model.variables),
+                                           global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(
+              self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        if training_continuation > 0:
+          status.assert_consumed()
+          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
+        else:
+          self.evaluate(model.variables[0].assign([[42.]]))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+  # pylint: enable=cell-var-from-loop
+
+  def testAnonymousVarsInInit(self):
+
+    class Model(training.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.w = resource_variable_ops.ResourceVariable(0.0)
+        self.b = resource_variable_ops.ResourceVariable(0.0)
+        self.vars = [self.w, self.b]
+
+      def call(self, x):
+        return x * self.w + self.b
+
+    with context.eager_mode():
+      model = Model()
+      optimizer = adam.Adam(learning_rate=0.05)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      checkpoint = util.Checkpoint(
+          model=model, optimizer=optimizer)
+      for _ in range(2):
+        checkpoint.save(checkpoint_prefix)
+        with backprop.GradientTape() as tape:
+          loss = (constant_op.constant(1.)
+                  - model(constant_op.constant(1.))) ** 2
+        grad = tape.gradient(loss, model.vars)
+        optimizer.apply_gradients(
+            [(g, v) for g, v in zip(grad, model.vars)])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeferredSlotRestoration(self):
+    checkpoint_directory = self.get_temp_dir()
+
+    root = tracking.Checkpointable()
+    root.var = util.add_variable(
+        root, name="var", initializer=0.)
+    optimizer = adam.Adam(0.1)
+    if context.executing_eagerly():
+      optimizer.minimize(root.var.read_value)
+    else:
+      train_op = optimizer.minimize(root.var)
+      # Note that `optimizer` has not been added as a dependency of
+      # `root`. Create a one-off grouping so that slot variables for `root.var`
+      # get initialized too.
+      self.evaluate(util.gather_initializers(
+          util.Checkpoint(root=root, optimizer=optimizer)))
+      self.evaluate(train_op)
+    self.evaluate(state_ops.assign(root.var, 12.))
+    no_slots_path = util.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "no_slots"))
+    root.optimizer = optimizer
+    self.evaluate(state_ops.assign(root.var, 13.))
+    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
+                                   14.))
+    slots_path = util.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "with_slots"))
+    new_root = tracking.Checkpointable()
+    # Load the slot-containing checkpoint (deferred), then immediately overwrite
+    # the non-slot variable (also deferred).
+    slot_status = util.CheckpointableSaver(
+        new_root).restore(slots_path)
+    no_slot_status = util.CheckpointableSaver(
+        new_root).restore(no_slots_path)
+    with self.assertRaises(AssertionError):
+      no_slot_status.assert_consumed()
+    new_root.var = util.add_variable(
+        new_root, name="var", shape=[])
+    no_slot_status.assert_consumed()
+    no_slot_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    new_root.optimizer = adam.Adam(0.1)
+    with self.assertRaisesRegexp(AssertionError, "beta_1_power"):
+      slot_status.assert_consumed()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    if context.executing_eagerly():
+      # Slot variables are only created with restoring initializers when
+      # executing eagerly.
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+    else:
+      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
+                    None)
+    if context.executing_eagerly():
+      new_root.optimizer.minimize(new_root.var.read_value)
+    else:
+      train_op = new_root.optimizer.minimize(new_root.var)
+      # The slot variable now exists; restore() didn't create it, but we should
+      # now have a restore op for it.
+      slot_status.run_restore_ops()
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+      self.evaluate(train_op)
+    slot_status.assert_consumed()
+
+  def testManySavesGraph(self):
+    """Saves after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = tracking.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.Adam(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(util.gather_initializers(obj))
+        saver = util.CheckpointableSaver(obj)
+        saver.save(checkpoint_prefix)
+        before_ops = graph.get_operations()
+        saver.save(checkpoint_prefix)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testManyRestoresGraph(self):
+    """Restores after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = tracking.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.Adam(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(util.gather_initializers(obj))
+        saver = util.CheckpointableSaver(obj)
+        save_path = saver.save(checkpoint_prefix)
+        saver.restore(save_path)
+        before_ops = graph.get_operations()
+        saver.restore(save_path)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testMultipleGraphsNonSlotVariables(self):
+    with context.graph_mode():
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      optimizer = adam.Adam(0.001)
+      # Construct a model in one graph
+      first_graph = ops.Graph()
+      first_session = session_lib.Session(graph=first_graph)
+      with first_graph.as_default(), first_session.as_default():
+        first_variable = resource_variable_ops.ResourceVariable([1.])
+        first_root_checkpointable = util.Checkpoint(
+            optimizer=optimizer, variable=first_variable)
+        train_op = optimizer.minimize(first_variable.read_value)
+        self.evaluate(util.gather_initializers(
+            first_root_checkpointable))
+        self.evaluate(train_op)
+        self.evaluate(first_variable.assign([1.]))
+        self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m").assign([2.]))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta_1_power.assign(3.))
+
+      # Save and load in a second graph
+      second_graph = ops.Graph()
+      with second_graph.as_default(), session_lib.Session(graph=second_graph):
+        second_variable = resource_variable_ops.ResourceVariable([1.])
+        second_root_checkpointable = util.Checkpoint(
+            optimizer=optimizer, variable=second_variable)
+        train_op = optimizer.minimize(second_variable.read_value)
+        second_root_checkpointable.restore(None).initialize_or_restore()
+        self.evaluate(train_op)
+        self.evaluate(second_variable.assign([4.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([5.]))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta_1_power.assign(6.))
+        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        self.evaluate(second_variable.assign([7.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([8.]))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta_1_power))
+        status = second_root_checkpointable.restore(save_path)
+        status.assert_consumed().run_restore_ops()
+        self.assertAllEqual([4.], self.evaluate(second_variable))
+        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m")))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta_1_power))
+
+      # Check that the first graph is unmolested
+      with first_graph.as_default(), first_session.as_default():
+        self.assertAllEqual([1.], self.evaluate(first_variable))
+        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m")))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(3., self.evaluate(beta_1_power))
+
+
+class TemplateTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_checkpointable_save_restore(self):
+
+    def _templated():
+      v = variable_scope.get_variable(
+          "v", shape=[1], initializer=init_ops.zeros_initializer(),
+          use_resource=True)
+      v2 = variable_scope.get_variable(
+          "v2", shape=[1], initializer=init_ops.zeros_initializer(),
+          use_resource=True)
+      return v, v + 1., v2
+
+    save_template = template.make_template("s1", _templated)
+    v1_save, _, v2_save = save_template()
+    optimizer = adam.Adam(0.0)
+    save_root = util.Checkpoint(
+        my_template=save_template, optimizer=optimizer)
+    optimizer.minimize(v1_save.read_value)
+    self.evaluate([v.initializer for v in optimizer.variables()])
+    self.evaluate(v1_save.assign([12.]))
+    self.evaluate(v2_save.assign([14.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_root.save(checkpoint_prefix)
+
+    load_template = template.make_template("s2", _templated)
+    load_optimizer = adam.Adam(0.0)
+    load_root = util.Checkpoint(
+        my_template=load_template, optimizer=load_optimizer)
+    status = load_root.restore(save_path)
+    var, var_plus_one, var2 = load_template()
+    load_optimizer.minimize(var.read_value)
+    self.assertEqual(2, len(load_template._checkpoint_dependencies))
+    self.assertEqual("v", load_template._checkpoint_dependencies[0].name)
+    self.assertEqual("v2", load_template._checkpoint_dependencies[1].name)
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual([12.], self.evaluate(var))
+    self.assertAllEqual([13.], self.evaluate(var_plus_one))
+    self.assertAllEqual([14.], self.evaluate(var2))
+
+
+class CheckpointCompatibilityTests(test.TestCase):
+
+  def _initialized_model(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    optimizer = adam.Adam(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = util.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    train_op = optimizer.minimize(
+        functools.partial(model, input_value),
+        global_step=optimizer_step)
+    self.evaluate(util.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
+    # A regular variable, a slot variable, and a non-slot Optimizer variable
+    # with known values to check when loading.
+    self.evaluate(model._named_dense.bias.assign([1.]))
+    self.evaluate(optimizer.get_slot(
+        var=model._named_dense.bias, name="m").assign([2.]))
+    beta_1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta_1_power.assign(3.))
+    return root_checkpointable
+
+  def _set_sentinels(self, root_checkpointable):
+    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+    self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")
+        .assign([102.]))
+    beta_1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta_1_power.assign(103.))
+
+  def _check_sentinels(self, root_checkpointable):
+    self.assertAllEqual(
+        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+    self.assertAllEqual([2.], self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")))
+    beta_1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta_1_power))
+
+  def _write_name_based_checkpoint(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        name_saver = core_saver.Saver()
+        return name_saver.save(
+            sess=session, save_path=checkpoint_prefix,
+            global_step=root.optimizer_step)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testLoadFromNameBasedSaver(self):
+    """Save a name-based checkpoint, load it using the object-based API."""
+    with test_util.device(use_gpu=True):
+      save_path = self._write_name_based_checkpoint()
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      with self.assertRaises(AssertionError):
+        self._check_sentinels(root)
+      object_saver = util.CheckpointableSaver(root)
+      self._set_sentinels(root)
+      status = object_saver.restore(save_path)
+      if context.executing_eagerly():
+        self._check_sentinels(root)
+      if context.executing_eagerly():
+        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
+          status.assert_consumed()
+      else:
+        # When graph building, we haven't read any keys, so we don't know
+        # whether the restore will be complete.
+        with self.assertRaisesRegexp(AssertionError, "not restored"):
+          status.assert_consumed()
+      status.run_restore_ops()
+      self._check_sentinels(root)
+      self._set_sentinels(root)
+      status = object_saver.restore(save_path)
+      status.initialize_or_restore()
+      self._check_sentinels(root)
+
+  # TODO(allenl): Test for the core name-based saver loading object-based
+  # checkpoints once object-based checkpointing is in core.
+
+  def testSaveGraphLoadEager(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        save_path = root.save(
+            session=session, file_prefix=checkpoint_prefix)
+    with context.eager_mode():
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      root.restore(save_path).assert_consumed()
+      self._check_sentinels(root)
+
+  def testSaveEagerLoadGraph(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.eager_mode():
+      root = self._initialized_model()
+      save_path = root.save(file_prefix=checkpoint_prefix)
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph):
+        root = self._initialized_model()
+        self._set_sentinels(root)
+        root.restore(save_path).assert_consumed().run_restore_ops()
+        self._check_sentinels(root)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
new file mode 100644
index 0000000000..bd5557f4fd
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -0,0 +1,1349 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Version 2 of class Optimizer."""
+# pylint: disable=g-bad-name
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
+from tensorflow.python.training import optimizer as optimizer_v1
+from tensorflow.python.training import slot_creator
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.util import nest
+
+
+class _OptimizableVariable(object):
+  """Interface for abstracting over variables in the optimizers."""
+
+  @abc.abstractmethod
+  def target(self):
+    """Returns the optimization target for this variable."""
+    raise NotImplementedError("Calling an abstract method.")
+
+  @abc.abstractmethod
+  def update_op(self, optimizer, g, *args):
+    """Returns the update ops for updating the variable."""
+    raise NotImplementedError("Calling an abstract method.")
+
+
+class _RefVariableProcessor(_OptimizableVariable):
+  """Processor for Variable."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v._ref()  # pylint: disable=protected-access
+
+  def update_op(self, optimizer, g, *args):
+    if isinstance(g, ops.Tensor):
+      update_op = optimizer._apply_dense(g, self._v, *args)  # pylint: disable=protected-access
+      if self._v.constraint is not None:
+        with ops.control_dependencies([update_op]):
+          return self._v.assign(self._v.constraint(self._v))
+      else:
+        return update_op
+    else:
+      assert isinstance(g, ops.IndexedSlices), ("Gradient ", g, " is neither a "
+                                                "tensor nor IndexedSlices.")
+      if self._v.constraint is not None:
+        raise RuntimeError(
+            "Cannot use a constraint function on a sparse variable.")
+      # pylint: disable=protected-access
+      return optimizer._apply_sparse_duplicate_indices(g, self._v, *args)
+
+
+class _DenseReadResourceVariableProcessor(_OptimizableVariable):
+  """Processor for dense ResourceVariables."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    # pylint: disable=protected-access
+    update_op = optimizer._resource_apply_dense(g, self._v.op.inputs[0], *args)
+    if self._v.constraint is not None:
+      with ops.control_dependencies([update_op]):
+        return self._v.assign(self._v.constraint(self._v))
+    else:
+      return update_op
+
+
+class _DenseResourceVariableProcessor(_OptimizableVariable):
+  """Processor for dense ResourceVariables."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    # pylint: disable=protected-access
+    if isinstance(g, ops.IndexedSlices):
+      if self._v.constraint is not None:
+        raise RuntimeError(
+            "Cannot use a constraint function on a sparse variable.")
+      return optimizer._resource_apply_sparse_duplicate_indices(
+          g.values, self._v, g.indices, *args)
+    update_op = optimizer._resource_apply_dense(g, self._v, *args)
+    if self._v.constraint is not None:
+      with ops.control_dependencies([update_op]):
+        return self._v.assign(self._v.constraint(self._v))
+    else:
+      return update_op
+
+
+class _TensorProcessor(_OptimizableVariable):
+  """Processor for ordinary Tensors.
+
+  Even though a Tensor can't really be updated, sometimes it is useful to
+  compute the gradients with respect to a Tensor using the optimizer. Updating
+  the Tensor is, of course, unsupported.
+  """
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    raise NotImplementedError("Trying to update a Tensor ", self._v)
+
+
+def _get_processor(v):
+  """The processor of v."""
+  if context.executing_eagerly():
+    if isinstance(v, ops.Tensor):
+      return _TensorProcessor(v)
+    else:
+      return _DenseResourceVariableProcessor(v)
+  if v.op.type == "VarHandleOp":
+    return _DenseResourceVariableProcessor(v)
+  if isinstance(v, variables.Variable):
+    return _RefVariableProcessor(v)
+  if isinstance(v, ops.Tensor):
+    return _TensorProcessor(v)
+  raise NotImplementedError("Trying to optimize unsupported type ", v)
+
+
+def _var_key_v2(var):
+  """Key for representing a primary variable, for looking up slots."""
+  # pylint: disable=protected-access
+  if hasattr(var, "_distributed_container"):
+    distributed_container = var._distributed_container()
+    assert distributed_container is not None
+    if context.executing_eagerly():
+      return distributed_container._unique_id
+    return distributed_container._shared_name
+  if context.executing_eagerly():
+    return var._unique_id
+  return var.op.name
+
+
+def _resolve(value, name):
+  if callable(value):
+    value = value()
+  return ops.convert_to_tensor(value, name=name)
+
+
+def _is_dynamic(value):
+  """Returns true if __init__ arg `value` should be re-evaluated each step."""
+  if callable(value): return True
+  # Don't need to do anything special in graph mode, since dynamic values
+  # will propagate correctly automatically.
+  # TODO(josh11b): Add per-device caching across steps using variables for
+  # truly static values once we add distributed support.
+  if context.executing_eagerly() and isinstance(
+      value, resource_variable_ops.ResourceVariable):
+    return True
+  return False
+
+
+class _OptimizerV2State(object):
+  """Holds per-graph and per-step optimizer state.
+
+  Use _init_with_static_hyper() to create the state for a graph, and then
+  _copy_with_dynamic_hyper() to convert that to state for a particular step.
+  The difference between the two is that the former only has hyper
+  parameter values that are static and the latter also has values that
+  can change every step (according to _is_dynamic()).
+  """
+
+  def __init__(self, op_name):
+    self._op_name = op_name
+
+  def _init_with_static_hyper(self, hyper):
+    """Initialize a fresh state object from hyper dict."""
+    # self._hyper contains a dict from name to a dict with the Tensor values.
+    # This dict starts with a single item with key "None" with the hyper
+    # parameter value converted to a Tensor. Other items have dtype keys
+    # with that Tensor cast to that dtype.
+    with ops.init_scope():
+      self._hyper = {name: {None: ops.convert_to_tensor(value, name=name)}
+                     for name, (dynamic, value) in sorted(hyper.items())
+                     if not dynamic}
+    self._slots = {}
+    self._non_slot_dict = {}
+    # Extra state to help Optimizers implement Checkpointable. Holds information
+    # about variables which will be restored as soon as they're created.
+    self._deferred_dependencies = {}  # Non-slot variables
+    self._deferred_slot_restorations = {}  # Slot variables
+
+  def _copy_with_dynamic_hyper(self, hyper, distribution, non_slot_devices):
+    """Create a new state object for a particular step."""
+    ret = _OptimizerV2State(self._op_name)
+    # pylint: disable=protected-access
+    ret._slots = self._slots
+    ret._non_slot_dict = self._non_slot_dict
+    ret._deferred_dependencies = self._deferred_dependencies
+    ret._deferred_slot_restorations = self._deferred_slot_restorations
+    ret._hyper = {name: {None: _resolve(value, name)}
+                  for name, (dynamic, value) in sorted(hyper.items())
+                  if dynamic}
+    ret._hyper.update(self._hyper)
+    ret._non_slot_devices = non_slot_devices
+    ret._distribution = distribution
+    return ret
+
+  def _variables(self):
+    """Returns a list of all variables held by self."""
+    optimizer_variables = list(self._non_slot_dict.values())
+    for variable_dict in self._slots.values():
+      for slot_for_variable in variable_dict.values():
+        optimizer_variables.append(slot_for_variable)
+    # Sort variables by name so that the return is deterministic.
+    return sorted(optimizer_variables, key=lambda v: v.name)
+
+  def _slot_dict(self, slot_name):
+    """Returns a dict for caching slots created under the given name.
+
+    Args:
+      slot_name: Name for the slot.
+
+    Returns:
+      A dict that maps primary `Variable` objects to the slot created
+      for that variable, under the given slot name.
+    """
+    named_slots = self._slots.get(slot_name, None)
+    if named_slots is None:
+      named_slots = {}
+      self._slots[slot_name] = named_slots
+    return named_slots
+
+  def create_slot(self, var, val, slot_name, optional_op_name=None):
+    """Find or create a slot for a variable.
+
+    Args:
+      var: A `Variable` object.
+      val: A `Tensor`.  The initial value of the slot.
+      slot_name: Name for the slot.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A `Variable` object.
+    """
+    named_slots = self._slot_dict(slot_name)
+    var_key = _var_key_v2(var)
+    if var_key not in named_slots:
+      new_slot_variable = slot_creator.create_slot(
+          var, val, optional_op_name or self._op_name)
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=new_slot_variable)
+      named_slots[var_key] = new_slot_variable
+    return named_slots[var_key]
+
+  def create_slot_with_initializer(self, var, initializer, shape, dtype,
+                                   slot_name, optional_op_name=None):
+    """Find or create a slot for a variable, using an Initializer.
+
+    Args:
+      var: A `Variable` object.
+      initializer: An `Initializer`.  The initial value of the slot.
+      shape: Shape of the initial value of the slot.
+      dtype: Type of the value of the slot.
+      slot_name: Name for the slot.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A `Variable` object.
+    """
+    named_slots = self._slot_dict(slot_name)
+    var_key = _var_key_v2(var)
+    if var_key not in named_slots:
+      new_slot_variable = slot_creator.create_slot_with_initializer(
+          var, initializer, shape, dtype, optional_op_name or self._op_name)
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=new_slot_variable)
+      named_slots[var_key] = new_slot_variable
+    return named_slots[var_key]
+
+  def zeros_slot(self, var, slot_name, optional_op_name=None):
+    """Find or create a slot initialized with 0.0.
+
+    Args:
+      var: A `Variable` object.
+      slot_name: Name for the slot.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A `Variable` object.
+    """
+    named_slots = self._slot_dict(slot_name)
+    var_key = _var_key_v2(var)
+    if var_key not in named_slots:
+      new_slot_variable = slot_creator.create_zeros_slot(
+          var, optional_op_name or self._op_name)
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=new_slot_variable)
+      named_slots[var_key] = new_slot_variable
+    return named_slots[var_key]
+
+  def _create_or_restore_slot_variable(
+      self, slot_variable_position, slot_name, variable,
+      optional_op_name=None):
+    """Restore a slot variable's value, possibly creating it.
+
+    Called when a variable which has an associated slot variable is created or
+    restored. When executing eagerly, we create the slot variable with a
+    restoring initializer.
+
+    No new variables are created when graph building. Instead,
+    _restore_slot_variable catches these after normal creation and adds restore
+    ops to the graph. This method is nonetheless important when graph building
+    for the case when a slot variable has already been created but `variable`
+    has just been added to a dependency graph (causing us to realize that the
+    slot variable needs to be restored).
+
+    Args:
+      slot_variable_position: A `checkpointable._CheckpointPosition` object
+        indicating the slot variable `Checkpointable` object to be restored.
+      slot_name: The name of this `Optimizer`'s slot to restore into.
+      variable: The variable object this slot is being created for.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+    """
+    slot_variable = self.get_slot(var=variable, name=slot_name)
+    if (slot_variable is None and context.executing_eagerly() and
+        slot_variable_position.is_simple_variable()
+        # Defer slot variable creation if there is an active variable creator
+        # scope. Generally we'd like to eagerly create/restore slot variables
+        # when possible, but this may mean that scopes intended to catch
+        # `variable` also catch its eagerly created slot variable
+        # unintentionally (specifically make_template would add a dependency on
+        # a slot variable if not for this case). Deferring is mostly harmless
+        # (aside from double initialization), and makes variable creator scopes
+        # behave the same way they do when graph building.
+        and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
+      initializer = checkpointable.CheckpointInitialValue(
+          checkpoint_position=slot_variable_position)
+      slot_variable = self.create_slot(
+          var=variable,
+          val=initializer,
+          slot_name=slot_name,
+          optional_op_name=optional_op_name)
+      # Optimizers do not have unconditional dependencies on their slot
+      # variables (nor do any other objects). They are only saved if the
+      # variables they were created for are also saved.
+    if slot_variable is not None:
+      # If we've either made this slot variable, or if we've pulled out an
+      # existing slot variable, we should restore it.
+      slot_variable_position.restore(slot_variable)
+    else:
+      # We didn't make the slot variable. Defer restoring until it gets created
+      # normally. We keep a list rather than the one with the highest restore
+      # UID in case slot variables have their own dependencies, in which case
+      # those could differ between restores.
+      variable_key = _var_key_v2(variable)
+      self._deferred_slot_restorations.setdefault(
+          slot_name, {}).setdefault(variable_key, []).append(
+              slot_variable_position)
+
+  def get_slot(self, var, name):
+    """Return a slot named `name` created for `var` by the Optimizer.
+
+    Some `Optimizer` subclasses use additional variables.  For example
+    `Momentum` and `Adagrad` use variables to accumulate updates.  This method
+    gives access to these `Variable` objects if for some reason you need them.
+
+    Use `get_slot_names()` to get the list of slot names created by the
+    `Optimizer`.
+
+    Args:
+      var: A variable passed to `minimize()` or `apply_gradients()`.
+      name: A string.
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    named_slots = self._slots.get(name, None)
+    if not named_slots:
+      return None
+    return named_slots.get(_var_key_v2(var), None)
+
+  def get_slot_names(self):
+    """Return a list of the names of slots created by the `Optimizer`.
+
+    See `get_slot()`.
+
+    Returns:
+      A list of strings.
+    """
+    return sorted(self._slots.keys())
+
+  def create_non_slot(self, initial_value, name, colocate_with=None):
+    """Add an extra variable, not associated with a slot."""
+    v = self._non_slot_dict.get(name, None)
+    if v is None:
+      if colocate_with is None: colocate_with = self._non_slot_devices
+      with self._distribution.colocate_vars_with(colocate_with):
+        # TODO(josh11b): Use get_variable() except for the legacy Adam use case.
+        v = variable_scope.variable(initial_value, name=name, trainable=False)
+      self._non_slot_dict[name] = v
+      deferred_dependencies_list = self._deferred_dependencies.pop(name, ())
+      for checkpoint_position in sorted(
+          deferred_dependencies_list,
+          key=lambda restore: restore.checkpoint.restore_uid,
+          reverse=True):
+        checkpoint_position.restore(v)
+    return v
+
+  def _restore_slot_variable(self, slot_name, variable, slot_variable):
+    """Restore a newly created slot variable's value."""
+    variable_key = _var_key_v2(variable)
+    deferred_restorations = self._deferred_slot_restorations.get(
+        slot_name, {}).pop(variable_key, [])
+    # Iterate over restores, highest restore UID first to minimize the number
+    # of assignments.
+    deferred_restorations.sort(key=lambda position: position.restore_uid,
+                               reverse=True)
+    for checkpoint_position in deferred_restorations:
+      checkpoint_position.restore(slot_variable)
+
+  def get_non_slot(self, name):
+    """Returns the non-slot variable identified by `name`."""
+    return self._non_slot_dict.get(name, None)
+
+  def get_hyper(self, name, dtype=None):
+    """Returns the `name` hyper parameter, optionally cast to `dtype`."""
+    dtype_dict = self._hyper[name]
+    # Do we have the value cast to dtype already cached? This should always
+    # succeed when dtype is None.
+    if dtype in dtype_dict:
+      return dtype_dict[dtype]
+    # Not cached, cast to dtype and save the result in the cache.
+    result = math_ops.cast(dtype_dict[None], dtype)
+    dtype_dict[dtype] = result
+    return result
+
+
+class OptimizerV2(optimizer_v1.Optimizer):
+  """Updated base class for optimizers.
+
+  This class defines the API to add Ops to train a model.  You never use this
+  class directly, but instead instantiate one of its subclasses such as
+  `GradientDescentOptimizer`, `AdagradOptimizer`, or `MomentumOptimizer`.
+
+  ### Usage
+
+  ```python
+  # Create an optimizer with the desired parameters.
+  opt = GradientDescentOptimizer(learning_rate=0.1)
+  # Add Ops to the graph to minimize a cost by updating a list of variables.
+  # "cost" is a Tensor, and the list of variables contains tf.Variable
+  # objects.
+  opt_op = opt.minimize(cost, var_list=<list of variables>)
+  ```
+
+  In the training program you will just have to run the returned Op.
+
+  ```python
+  # Execute opt_op to do one step of training:
+  opt_op.run()
+  ```
+
+  ### Processing gradients before applying them.
+
+  Calling `minimize()` takes care of both computing the gradients and
+  applying them to the variables.  If you want to process the gradients
+  before applying them you can instead use the optimizer in three steps:
+
+  1.  Compute the gradients with `compute_gradients()`.
+  2.  Process the gradients as you wish.
+  3.  Apply the processed gradients with `apply_gradients()`.
+
+  Example:
+
+  ```python
+  # Create an optimizer.
+  opt = GradientDescentOptimizer(learning_rate=0.1)
+
+  # Compute the gradients for a list of variables.
+  grads_and_vars = opt.compute_gradients(loss, <list of variables>)
+
+  # grads_and_vars is a list of tuples (gradient, variable).  Do whatever you
+  # need to the 'gradient' part, for example cap them, etc.
+  capped_grads_and_vars = [(MyCapper(gv[0]), gv[1]) for gv in grads_and_vars]
+
+  # Ask the optimizer to apply the capped gradients.
+  opt.apply_gradients(capped_grads_and_vars)
+  ```
+
+  ### Gating Gradients
+
+  Both `minimize()` and `compute_gradients()` accept a `gate_gradients`
+  argument that controls the degree of parallelism during the application of
+  the gradients.
+
+  The possible values are: `GATE_NONE`, `GATE_OP`, and `GATE_GRAPH`.
+
+  <b>`GATE_NONE`</b>: Compute and apply gradients in parallel.  This provides
+  the maximum parallelism in execution, at the cost of some non-reproducibility
+  in the results.  For example the two gradients of `matmul` depend on the input
+  values: With `GATE_NONE` one of the gradients could be applied to one of the
+  inputs _before_ the other gradient is computed resulting in non-reproducible
+  results.
+
+  <b>`GATE_OP`</b>: For each Op, make sure all gradients are computed before
+  they are used.  This prevents race conditions for Ops that generate gradients
+  for multiple inputs where the gradients depend on the inputs.
+
+  <b>`GATE_GRAPH`</b>: Make sure all gradients for all variables are computed
+  before any one of them is used.  This provides the least parallelism but can
+  be useful if you want to process all gradients before applying any of them.
+
+  ### Slots
+
+  Some optimizer subclasses, such as `MomentumOptimizer` and `AdagradOptimizer`
+  allocate and manage additional variables associated with the variables to
+  train.  These are called <i>Slots</i>.  Slots have names and you can ask the
+  optimizer for the names of the slots that it uses.  Once you have a slot name
+  you can ask the optimizer for the variable it created to hold the slot value.
+
+  This can be useful if you want to log debug a training algorithm, report stats
+  about the slots, etc.
+
+  ### Non-slot variables
+
+  Some optimizer subclasses, such as `AdamOptimizer` have variables that
+  are not associated with the variables to train, just the step itself.
+
+  ### Hyper parameters
+
+  These are arguments passed to the optimizer subclass constructor
+  (the `__init__` method), and then passed to `self._set_hyper()`.
+  They can be either regular Python values (like 1.0), tensors, or
+  callables. If they are callable, the callable will be called during
+  `apply_gradients()` to get the value for the hyper parameter.
+
+  ### State
+
+  Internal methods are passed a `state` argument with the correct
+  values to use for the slot and non-slot variables, and the hyper
+  parameters.
+  """
+
+  # Values for gate_gradients.
+  GATE_NONE = 0
+  GATE_OP = 1
+  GATE_GRAPH = 2
+
+  def __init__(self, name):
+    """Create a new Optimizer.
+
+    This must be called by the constructors of subclasses.
+    Note that Optimizer instances should not bind to a single graph,
+    and so shouldn't keep Tensors as member variables. Generally
+    you should be able to use the _set_hyper()/state.get_hyper()
+    facility instead.
+
+    Args:
+      name: A non-empty string.  The name to use for accumulators created
+        for the optimizer.
+
+    Raises:
+      ValueError: If name is malformed.
+      RuntimeError: If _create_slots has been overridden instead of
+          _create_vars.
+    """
+    # Note: We intentionally don't call parent __init__.
+
+    # Optimizer._create_slots was replaced by _create_vars in OptimizerV2.
+    if (self.__class__._create_slots.__code__ is not  # pylint: disable=protected-access
+        OptimizerV2._create_slots.__code__):
+      raise RuntimeError("Override _create_vars instead of _create_slots when "
+                         "descending from OptimizerV2 (class %s)" %
+                         self.__class__.__name__)
+    if not name:
+      raise ValueError("Must specify the optimizer name")
+
+    self._use_locking = False
+    self._name = name
+    # Map from graph_key to state for that graph. We use the graph_key
+    # since it works in both eager and graph mode, and gives the outer
+    # graph inside functions.
+    tower_context = distribution_strategy_context.get_tower_context()
+    if tower_context is None:
+      # In a cross-tower context for a DistributionStrategy, which means
+      # only one Optimizer will be created, not one per tower.
+      self._per_graph_state = {}
+    else:
+      # We use get_tower_context().merge_call() to get a single dict
+      # shared across all model replicas when running with a
+      # DistributionStrategy.
+      self._per_graph_state = tower_context.merge_call(lambda _: {})
+
+    # Hyper parameters, and whether they should be re-evaluated every step.
+    self._hyper = {}
+
+  def _set_hyper(self, name, value):
+    self._hyper[name] = (_is_dynamic(value), value)
+
+  def minimize(self, loss, global_step=None, var_list=None,
+               gate_gradients=GATE_OP, aggregation_method=None,
+               colocate_gradients_with_ops=False, name=None,
+               grad_loss=None, stop_gradients=None,
+               scale_loss_by_num_towers=None):
+    """Add operations to minimize `loss` by updating `var_list`.
+
+    This method simply combines calls `compute_gradients()` and
+    `apply_gradients()`. If you want to process the gradient before applying
+    them call `compute_gradients()` and `apply_gradients()` explicitly instead
+    of using this function.
+
+    Args:
+      loss: A `Tensor` containing the value to minimize.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      var_list: Optional list or tuple of `Variable` objects to update to
+        minimize `loss`.  Defaults to the list of variables collected in
+        the graph under the key `GraphKeys.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or  `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with
+        the corresponding op.
+      name: Optional name for the returned operation.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
+        through.
+      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
+        down by the number of towers. By default, auto-detects whether this
+        is needed.
+
+    Returns:
+      An Operation that updates the variables in `var_list`.  If `global_step`
+      was not `None`, that operation also increments `global_step`.
+
+    Raises:
+      ValueError: If some of the variables are not `Variable` objects.
+
+    @compatibility(eager)
+    When eager execution is enabled, `loss` should be a Python function that
+    takes elements of `var_list` as arguments and computes the value to be
+    minimized. If `var_list` is None, `loss` should take no arguments.
+    Minimization (and gradient computation) is done with respect to the
+    elements of `var_list` if not None, else with respect to any trainable
+    variables created during the execution of the `loss` function.
+    `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and
+    `grad_loss` are ignored when eager execution is enabled.
+    @end_compatibility
+    """
+    grads_and_vars = self.compute_gradients(
+        loss, var_list=var_list, gate_gradients=gate_gradients,
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        grad_loss=grad_loss, stop_gradients=stop_gradients,
+        scale_loss_by_num_towers=scale_loss_by_num_towers)
+
+    vars_with_grad = [v for g, v in grads_and_vars if g is not None]
+    if not vars_with_grad:
+      raise ValueError(
+          "No gradients provided for any variable, check your graph for ops"
+          " that do not support gradients, between variables %s and loss %s." %
+          ([str(v) for _, v in grads_and_vars], loss))
+
+    return self.apply_gradients(grads_and_vars, global_step=global_step,
+                                name=name)
+
+  def compute_gradients(self, loss, var_list=None,
+                        gate_gradients=GATE_OP,
+                        aggregation_method=None,
+                        colocate_gradients_with_ops=False,
+                        grad_loss=None, stop_gradients=None,
+                        scale_loss_by_num_towers=None):
+    """Compute gradients of `loss` for the variables in `var_list`.
+
+    This is the first part of `minimize()`.  It returns a list
+    of (gradient, variable) pairs where "gradient" is the gradient
+    for "variable".  Note that "gradient" can be a `Tensor`, an
+    `IndexedSlices`, or `None` if there is no gradient for the
+    given variable.
+
+    Args:
+      loss: A Tensor containing the value to minimize or a callable taking
+        no arguments which returns the value to minimize. When eager execution
+        is enabled it must be a callable.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph
+        under the key `GraphKeys.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with
+        the corresponding op.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
+        through.
+      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
+        down by the number of towers. By default, auto-detects whether this
+        is needed.
+
+    Returns:
+      A list of (gradient, variable) pairs. Variable is always present, but
+      gradient can be `None`.
+
+    Raises:
+      TypeError: If `var_list` contains anything else than `Variable` objects.
+      ValueError: If some arguments are invalid.
+      RuntimeError: If called with eager execution enabled and `loss` is
+        not callable.
+
+    @compatibility(eager)
+    When eager execution is enabled, `gate_gradients`, `aggregation_method`,
+    and `colocate_gradients_with_ops` are ignored.
+    @end_compatibility
+    """
+    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
+    if callable(loss):
+      with backprop.GradientTape() as tape:
+        if var_list is not None:
+          tape.watch(var_list)
+        loss_value = loss()
+
+        # Scale loss for number of towers (callable-loss case). In this case,
+        # we have to be careful to call distribute_lib.get_loss_reduction()
+        # *after* loss() is evaluated, so we know what loss reduction it uses.
+        if scale_loss_by_num_towers is None:
+          scale_loss_by_num_towers = (
+              distribute_lib.get_loss_reduction() ==
+              variable_scope.VariableAggregation.MEAN)
+        if scale_loss_by_num_towers:
+          num_towers = distribution_strategy_context.get_distribution_strategy(
+          ).num_towers
+          if num_towers > 1:
+            loss_value *= 1. / num_towers
+
+      if var_list is None:
+        var_list = tape.watched_variables()
+      grads = tape.gradient(loss_value, var_list, grad_loss)
+      return list(zip(grads, var_list))
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "`loss` passed to Optimizer.compute_gradients should "
+          "be a function when eager execution is enabled.")
+
+    # Scale loss for number of towers (non-callable-loss case).
+    if scale_loss_by_num_towers is None:
+      scale_loss_by_num_towers = (
+          distribute_lib.get_loss_reduction() ==
+          variable_scope.VariableAggregation.MEAN)
+    if scale_loss_by_num_towers:
+      num_towers = distribution_strategy_context.get_distribution_strategy(
+      ).num_towers
+      if num_towers > 1:
+        loss *= 1. / num_towers
+
+    if gate_gradients not in [optimizer_v1.Optimizer.GATE_NONE,
+                              optimizer_v1.Optimizer.GATE_OP,
+                              optimizer_v1.Optimizer.GATE_GRAPH]:
+      raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
+                       "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" %
+                       gate_gradients)
+    self._assert_valid_dtypes([loss])
+    if grad_loss is not None:
+      self._assert_valid_dtypes([grad_loss])
+    if var_list is None:
+      var_list = (
+          variables.trainable_variables() +
+          ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+    else:
+      var_list = nest.flatten(var_list)
+    # pylint: disable=protected-access
+    var_list += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS)
+    # pylint: enable=protected-access
+    processors = [_get_processor(v) for v in var_list]
+    if not var_list:
+      raise ValueError("No variables to optimize.")
+    var_refs = [p.target() for p in processors]
+    grads = gradients.gradients(
+        loss, var_refs, grad_ys=grad_loss,
+        gate_gradients=(gate_gradients == optimizer_v1.Optimizer.GATE_OP),
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        stop_gradients=stop_gradients)
+    if gate_gradients == optimizer_v1.Optimizer.GATE_GRAPH:
+      grads = control_flow_ops.tuple(grads)
+    grads_and_vars = list(zip(grads, var_list))
+    self._assert_valid_dtypes(
+        [v for g, v in grads_and_vars
+         if g is not None and v.dtype != dtypes.resource])
+    return grads_and_vars
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to variables.
+
+    This is the second part of `minimize()`. It returns an `Operation` that
+    applies gradients.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients. If `global_step`
+      was not None, that operation also increments `global_step`.
+
+    Raises:
+      TypeError: If `grads_and_vars` is malformed.
+      ValueError: If none of the variables have gradients.
+    """
+    # This is a default implementation of apply_gradients() that can be shared
+    # by most optimizers.  It relies on the subclass implementing the following
+    # methods: _create_vars(), _prepare(), _apply_dense(), and _apply_sparse().
+
+    # Filter out variables with gradients of `None`.
+    grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
+    if not grads_and_vars:
+      raise ValueError("No variables provided.")
+    filtered = tuple((g, v) for (g, v) in grads_and_vars if g is not None)
+    if not filtered:
+      raise ValueError("No gradients provided for any variable: %s." %
+                       ([str(v) for _, v in grads_and_vars],))
+    return distribution_strategy_context.get_tower_context().merge_call(
+        self._distributed_apply, filtered, global_step=global_step, name=name)
+
+  def _get_or_create_state(self, var_list=None):
+    """Either looks up or creates `_OptimizerV2State`.
+
+    If any variables are available, they should be passed via the `var_list`
+    argument, and these will be used to determine the graph to create/retrieve
+    state for. Otherwise the returned state is for the current default graph.
+
+    Args:
+      var_list: A list of variables to extract a graph from.
+
+    Returns:
+      An `_OptimizerV2State` object.
+    """
+    # Determine the graph_key from the current graph.
+    eager_execution = context.executing_eagerly()
+    if eager_execution or var_list is None:
+      graph = ops.get_default_graph()
+    else:
+      graph = ops._get_graph_from_inputs(var_list)  # pylint: disable=protected-access
+    assert graph is not None
+    graph_key = graph._graph_key  # pylint: disable=protected-access
+
+    # Get the per graph state by looking up the graph_key.
+    if graph_key in self._per_graph_state:
+      per_graph_state = self._per_graph_state[graph_key]
+    else:
+      per_graph_state = _OptimizerV2State(self._name)
+      per_graph_state._init_with_static_hyper(self._hyper)  # pylint: disable=protected-access
+      self._per_graph_state[graph_key] = per_graph_state
+    return per_graph_state
+
+  def _distributed_apply(self, distribution, grads_and_vars, global_step, name):
+    """`apply_gradients` for use with a `DistributionStrategy`."""
+    reduced_grads = distribution.batch_reduce(
+        variable_scope.VariableAggregation.SUM, grads_and_vars)
+    var_list = [v for _, v in grads_and_vars]
+    grads_and_vars = zip(reduced_grads, var_list)
+
+    unwrapped_var_list = [x for v in var_list for x in distribution.unwrap(v)]
+    eager_execution = context.executing_eagerly()
+    if eager_execution:
+      # Give a clear error in this case instead of "name not supported
+      # for Eager Tensors" when we compute non_slot_devices.
+      for v in unwrapped_var_list:
+        if isinstance(v, ops.Tensor):
+          raise NotImplementedError("Trying to update a Tensor ", v)
+
+    with ops.name_scope(name, self._name) as name:
+      per_graph_state = self._get_or_create_state(var_list=unwrapped_var_list)
+      # Include the current value of any dynamic hyper parameters in `state`.
+      non_slot_devices = distribution.non_slot_devices(var_list)
+      state = per_graph_state._copy_with_dynamic_hyper(  # pylint: disable=protected-access
+          self._hyper, distribution, non_slot_devices)
+
+    # Create any slot and non-slot variables we need in `state`.
+    with ops.init_scope():
+      self._create_vars(var_list, state)
+
+    with ops.name_scope(name):  # Re-enter name_scope created above
+      # Give the child class a chance to do something before we start
+      # applying gradients.
+      self._prepare(state)
+
+      def update(v, g):
+        """Update variable `v` using gradient `g`."""
+        assert v is not None
+
+        # Convert the grad to Tensor or IndexedSlices if necessary, and
+        # look up a processor for each variable's type.
+        try:
+          g = ops.convert_to_tensor_or_indexed_slices(g)
+        except TypeError:
+          raise TypeError(
+              "Gradient must be convertible to a Tensor"
+              " or IndexedSlices, or None: %s" % g)
+        if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
+          raise TypeError(
+              "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
+        processor = _get_processor(v)
+
+        # We colocate all ops created in _apply_dense or _apply_sparse
+        # on the same device as the variable.
+        # TODO(apassos): figure out how to get the variable name here.
+        scope_name = "" if eager_execution else v.op.name
+        # device_policy is set because non-mirrored tensors will be read in
+        # `update_op`.
+        # TODO(josh11b): Make different state objects for each device to
+        # avoid needing to set the device_policy.
+        with ops.name_scope("update_" + scope_name), \
+            context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+          return processor.update_op(self, g, state)
+
+      # Use the processors to update the variables.
+      update_ops = []
+      for grad, var in grads_and_vars:
+        update_ops.extend(distribution.update(var, update, grad, grouped=False))
+
+      # Give the child class a chance to do something after applying
+      # gradients
+      def finish():
+        # TODO(josh11b): Make different state objects for each device to
+        # avoid needing to set the device_policy.
+        with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+          return self._finish(state)
+
+      update_ops = control_flow_ops.group(update_ops)
+      with ops.control_dependencies([update_ops]):
+        finish_updates = distribution.update_non_slot(
+            non_slot_devices, finish, grouped=False)
+      # We said grouped=False, which means finish_updates is always a list.
+      # It will be [None] when finish() returns None.
+      if finish_updates == [None]:
+        finish_updates = [update_ops]
+
+      # Update `global_step` (if any).
+      if global_step is None:
+        apply_updates = distribution.group(finish_updates, name=name)
+      else:
+        with ops.control_dependencies(finish_updates):
+
+          def update_global_step(global_step, name):
+            return global_step.assign_add(1, read_value=False, name=name)
+
+          apply_updates = distribution.update(global_step, update_global_step,
+                                              name)
+
+      # Add the training op to the TRAIN_OP graph collection in graph mode.
+      if not eager_execution:
+        if isinstance(apply_updates, ops.Tensor):
+          apply_updates = apply_updates.op
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        if apply_updates not in train_op:
+          train_op.append(apply_updates)
+
+      return apply_updates
+
+  def get_slot(self, var, name):
+    """Return a slot named `name` created for `var` by the Optimizer.
+
+    Some `Optimizer` subclasses use additional variables.  For example
+    `Momentum` and `Adagrad` use variables to accumulate updates.  This method
+    gives access to these `Variable` objects if for some reason you need them.
+
+    Use `get_slot_names()` to get the list of slot names created by the
+    `Optimizer`.
+
+    Args:
+      var: A variable passed to `minimize()` or `apply_gradients()`.
+      name: A string.
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    state = self._get_state_for_var(var)
+    return state.get_slot(var, name) if state is not None else None
+
+  def get_slot_names(self):
+    """Return a list of the names of slots created by the `Optimizer`.
+
+    See `get_slot()`.
+
+    Returns:
+      A list of strings.
+    """
+    state = self._get_per_graph_state()
+    return state.get_slot_names() if state is not None else []
+
+  def variables(self):
+    """A list of variables which encode the current state of `Optimizer`.
+
+    Includes slot variables and additional global variables created by the
+    optimizer in the current default graph.
+
+    Returns:
+      A list of variables.
+    """
+    state = self._get_per_graph_state()
+    return state._variables() if state is not None else []  # pylint: disable=protected-access
+
+  # --------------
+  # Methods to be implemented by subclasses if they want to use the
+  # inherited implementation of apply_gradients() or compute_gradients().
+  # --------------
+  def _create_vars(self, var_list, state):
+    """Create all slots needed by the variables and any non-slot variables.
+
+    Args:
+      var_list: A list of `Variable` objects.
+      state: An object with these methods:
+        `create_slot(var, val, slot_name, optional_op_name)`,
+        `create_slot_with_initializer(`
+            `var, initializer, shape, dtype, slot_name, optional_op_name)`,
+        `zeros_slot(var, slot_name, optional_op_name)`,
+        `create_non_slot_variable(initial_value, name, colocate_with)`,
+        `get_hyper(name)`
+    """
+    # No slots needed by default
+    pass
+
+  def _prepare(self, state):
+    """Code to execute before applying gradients.
+
+    Note that most uses of _prepare() in Optimizer have been subsumed
+    by explicit support for hyper parameters in OptimizerV2
+
+    Args:
+      state: An object with a `get_hyper(name)` method.
+
+    Returns:
+      Return value will be ignored.
+    """
+    pass
+
+  def _apply_dense(self, grad, var, state):
+    """Add ops to apply dense gradients to `var`.
+
+    Args:
+      grad: A `Tensor`.
+      var: A `Variable` object.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation`.
+    """
+    raise NotImplementedError()
+
+  def _resource_apply_dense(self, grad, handle, state):
+    """Add ops to apply dense gradients to the variable `handle`.
+
+    Args:
+      grad: a `Tensor` representing the gradient.
+      handle: a `Tensor` of dtype `resource` which points to the variable
+       to be updated.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    raise NotImplementedError()
+
+  def _resource_apply_sparse_duplicate_indices(
+      self, grad, handle, indices, state):
+    """Add ops to apply sparse gradients to `handle`, with repeated indices.
+
+    Optimizers which override this method must deal with repeated indices. See
+    the docstring of `_apply_sparse_duplicate_indices` for details. By default
+    the correct behavior, to sum non-unique indices and their associated
+    gradients, is enforced by first pre-processing `grad` and `indices` and
+    passing them on to `_resource_apply_sparse`. Optimizers which deal correctly
+    with duplicate indices may instead override this method to avoid the
+    overhead of summing.
+
+    Args:
+      grad: a `Tensor` representing the gradient for the affected indices.
+      handle: a `Tensor` of dtype `resource` which points to the variable
+       to be updated.
+      indices: a `Tensor` of integral type representing the indices for
+       which the gradient is nonzero. Indices may be repeated.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    # pylint: disable=protected-access
+    summed_grad, unique_indices = optimizer_v1._deduplicate_indexed_slices(
+        values=grad, indices=indices)
+    # pylint: enable=protected-access
+    return self._resource_apply_sparse(
+        summed_grad, handle, unique_indices, state)
+
+  def _resource_apply_sparse(self, grad, handle, indices, state):
+    """Add ops to apply sparse gradients to the variable `handle`.
+
+    Similar to `_apply_sparse`, the `indices` argument to this method has been
+    de-duplicated. Optimizers which deal correctly with non-unique indices may
+    instead override `_resource_apply_sparse_duplicate_indices` to avoid this
+    overhead.
+
+    Args:
+      grad: a `Tensor` representing the gradient for the affected indices.
+      handle: a `Tensor` of dtype `resource` which points to the variable
+       to be updated.
+      indices: a `Tensor` of integral type representing the indices for
+       which the gradient is nonzero. Indices are unique.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    raise NotImplementedError()
+
+  def _apply_sparse_duplicate_indices(self, grad, var, state):
+    """Add ops to apply sparse gradients to `var`, with repeated sparse indices.
+
+    Optimizers which override this method must deal with IndexedSlices objects
+    such as the following:
+
+      IndexedSlicesValue(values=[1, 1], indices=[0, 0], dense_shape=[1])
+
+    The correct interpretation is:
+
+      IndexedSlicesValue(values=[2], indices=[0], dense_shape=[1])
+
+    Many optimizers deal incorrectly with repeated indices when updating based
+    on sparse gradients (e.g. summing squares rather than squaring the sum, or
+    applying momentum terms multiple times). Adding first is always the correct
+    behavior, so this is enforced here by reconstructing the IndexedSlices to
+    have only unique indices, then calling _apply_sparse.
+
+    Optimizers which deal correctly with repeated indices may instead override
+    this method to avoid the overhead of summing indices.
+
+    Args:
+      grad: `IndexedSlices`.
+      var: A `Variable` object.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation`.
+    """
+    # pylint: disable=protected-access
+    summed_values, unique_indices = optimizer_v1._deduplicate_indexed_slices(
+        values=grad.values, indices=grad.indices)
+    # pylint: enable=protected-access
+    gradient_no_duplicate_indices = ops.IndexedSlices(
+        indices=unique_indices,
+        values=summed_values,
+        dense_shape=grad.dense_shape)
+    return self._apply_sparse(gradient_no_duplicate_indices, var, state)
+
+  def _apply_sparse(self, grad, var, state):
+    """Add ops to apply sparse gradients to `var`.
+
+    The IndexedSlices object passed to `grad` in this function is by default
+    pre-processed in `_apply_sparse_duplicate_indices` to remove duplicate
+    indices (see its docstring for details). Optimizers which can tolerate or
+    have correct special cases for duplicate sparse indices may override
+    `_apply_sparse_duplicate_indices` instead of this function, avoiding that
+    overhead.
+
+    Args:
+      grad: `IndexedSlices`, with no repeated indices.
+      var: A `Variable` object.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation`.
+    """
+    raise NotImplementedError()
+
+  def _finish(self, state):
+    """Do what is needed to finish the update.
+
+    This is called inside a scope colocated with any non-slot variables.
+
+    Args:
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      The operation to apply updates, or None if no updates.
+    """
+    return None
+
+  # --------------
+  # Utility methods for subclasses.
+  # --------------
+  def _get_per_graph_state(self):
+    # pylint: disable=protected-access
+    return self._per_graph_state.get(ops.get_default_graph()._graph_key, None)
+
+  def _get_state_for_var(self, var):
+    # pylint: disable=protected-access
+    return self._per_graph_state.get(var._graph_key, None)
+
+  # --------------
+  # Overridden methods from Checkpointable.
+  # --------------
+
+  def _track_checkpointable(self, *args, **kwargs):
+    """Optimizers may not track dependencies. Raises an error."""
+    raise NotImplementedError(
+        "Optimizers may not have dependencies. File a feature request if this "
+        "limitation bothers you.")
+
+  @property
+  def _checkpoint_dependencies(self):
+    """From Checkpointable. Gather graph-specific non-slot variables to save."""
+    current_graph_non_slot_variables = []
+    state = self._get_per_graph_state()
+    if state is not None:
+      for name, variable_object in sorted(
+          state._non_slot_dict.items(),  # pylint: disable=protected-access
+          # Avoid comparing variables
+          key=lambda item: item[0]):
+        current_graph_non_slot_variables.append(
+            checkpointable.CheckpointableReference(
+                name=name, ref=variable_object))
+    # Note: ignores super(); Optimizers may not have any dependencies outside of
+    # state objects.
+    return current_graph_non_slot_variables
+
+  def _lookup_dependency(self, name):
+    """From Checkpointable. Find a non-slot variable in the current graph."""
+    state = self._get_per_graph_state()
+    if state is None:
+      return None
+    else:
+      return state.get_non_slot(name)
+
+  @property
+  def _deferred_dependencies(self):
+    """Lets Checkpointable know where non-slot variables are created.
+
+    If necessary, creates a new state object for the current default graph.
+    Checkpointable will then add entries to that state's deferred dependency
+    dictionary. The state object will check that dictionary when creating
+    non-slot variables, restoring their value if an entry is found.
+
+    Returns:
+      A dictionary which holds deferred dependencies for the current default
+      graph.
+    """
+    state = self._get_or_create_state()
+    return state._deferred_dependencies  # pylint: disable=protected-access
+
+  def _create_or_restore_slot_variable(
+      self, slot_variable_position, slot_name, variable):
+    """Checkpointable: Restore a slot variable's value, possibly creating it.
+
+    Called when a variable which has an associated slot variable is created or
+    restored.
+
+    Args:
+      slot_variable_position: A `checkpointable._CheckpointPosition` object
+        indicating the slot variable `Checkpointable` object to be restored.
+      slot_name: The name of this `Optimizer`'s slot to restore into.
+      variable: The variable object this slot is being created for.
+    """
+    state = self._get_or_create_state(var_list=[variable])
+    state._create_or_restore_slot_variable(  # pylint: disable=protected-access
+        slot_variable_position=slot_variable_position,
+        slot_name=slot_name,
+        variable=variable,
+        optional_op_name=self._name)
+
+  # --------------
+  # Unsupported parent methods
+  # --------------
+  def _slot_dict(self, slot_name):
+    raise NotImplementedError(
+        "_slot_dict() method unsupported in OptimizerV2")
+
+  def _get_or_make_slot(self, var, val, slot_name, op_name):
+    raise NotImplementedError(
+        "_get_or_make_slot() method unsupported in OptimizerV2")
+
+  def _get_or_make_slot_with_initializer(self, var, initializer, shape, dtype,
+                                         slot_name, op_name):
+    raise NotImplementedError(
+        "_get_or_make_slot_with_initializer() method unsupported in "
+        "OptimizerV2")
+
+  def _create_non_slot_variable(self, initial_value, name, colocate_with):
+    raise NotImplementedError(
+        "_create_non_slot_variable() method unsupported in OptimizerV2")
+
+  def _get_non_slot_variable(self, name, graph=None):
+    raise NotImplementedError(
+        "_get_non_slot_variable() method unsupported in OptimizerV2")
+
+  def _non_slot_variables(self):
+    raise NotImplementedError(
+        "_non_slot_variables() method unsupported in OptimizerV2")
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
new file mode 100644
index 0000000000..a6c939393e
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
@@ -0,0 +1,277 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional test for OptimizerV2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.keras.optimizer_v2 import sgd
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class OptimizerTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBasic(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+      # Note that for eager execution, minimize expects a function instead of a
+      # Tensor.
+      global_step = resource_variable_ops.ResourceVariable(
+          array_ops.zeros([], dtypes.int64), name='global_step_%d' % i)
+      sgd_op = sgd.SGD(3.0)
+
+      self.evaluate(variables.global_variables_initializer())
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+      # Run 1 step of sgd through optimizer
+      opt_op = sgd_op.minimize(loss, global_step, [var0, var1])
+      self.evaluate(opt_op)
+      # Validate updated params
+      self.assertAllClose([-14., -13.], self.evaluate(var0))
+      self.assertAllClose([-6., -5.], self.evaluate(var1))
+
+  def testAggregationMethod(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        cost = 5 * var0 + 3 * var1
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name='global_step')
+        sgd_op = sgd.SGD(3.0)
+        opt_op = sgd_op.minimize(
+            cost,
+            global_step, [var0, var1],
+            aggregation_method=gradients_impl.AggregationMethod.
+            EXPERIMENTAL_ACCUMULATE_N)
+
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd through optimizer
+        opt_op.run()
+        # Validate updated params
+        self.assertAllClose([-14., -13.], var0.eval())
+        self.assertAllClose([-6., -5.], var1.eval())
+
+  def testPrecomputedGradient(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        cost = 5 * var0 + 3 * var1
+        grad_loss = constant_op.constant([42, -42], dtype=dtype)
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name='global_step')
+        sgd_op = sgd.SGD(3.0)
+        opt_op = sgd_op.minimize(
+            cost, global_step, [var0, var1], grad_loss=grad_loss)
+
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd through optimizer
+        opt_op.run()
+        # Validate updated params
+        self.assertAllClose([1.0 - 3 * 5 * 42.0, 2.0 - 3 * 5 * (-42.0)],
+                            var0.eval())
+        self.assertAllClose([3.0 - 3 * 3 * 42.0, 4.0 - 3 * 3 * (-42.0)],
+                            var1.eval())
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNoVariables(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        var0 = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype, trainable=False, name='a')
+        var1 = resource_variable_ops.ResourceVariable(
+            [3.0, 4.0], dtype=dtype, trainable=False, name='b')
+        return 5 * var0 + var1
+      # pylint: enable=cell-var-from-loop
+      sgd_op = sgd.SGD(3.0)
+      with self.assertRaisesRegexp(ValueError, 'No.*variables'):
+        sgd_op.minimize(loss)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNoGradients(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        return 5 * var0
+      # pylint: enable=cell-var-from-loop
+      sgd_op = sgd.SGD(3.0)
+      with self.assertRaisesRegexp(ValueError, 'No gradients'):
+        # var1 has no gradient
+        sgd_op.minimize(loss, var_list=[var1])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNoGradientsForAnyVariables_Minimize(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+      def loss():
+        return constant_op.constant(5.0)
+
+      sgd_op = sgd.SGD(3.0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'No gradients provided for any variable'):
+        sgd_op.minimize(loss, var_list=[var0, var1])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNoGradientsForAnyVariables_ApplyGradients(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+      sgd_op = sgd.SGD(3.0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'No gradients provided for any variable'):
+        sgd_op.apply_gradients([(None, var0), (None, var1)])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testGradientsAsVariables(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+
+      sgd_op = sgd.SGD(3.0)
+      grads_and_vars = sgd_op.compute_gradients(loss, [var0, var1])
+      # Convert gradients to tf.Variables
+      converted_grads = [
+          resource_variable_ops.ResourceVariable(array_ops.zeros([2], dtype),
+                                                 name='c_%d_%d' % (i, j))
+          for j, gv in enumerate(grads_and_vars)
+      ]
+      convert_ops = [
+          state_ops.assign(converted_grads[j], gv[0])
+          for j, gv in enumerate(grads_and_vars)
+      ]
+
+      self.evaluate(variables.global_variables_initializer())
+      # Run convert_ops to achieve the gradietns converting
+      self.evaluate(convert_ops)
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+      # Run 1 step of sgd through optimizer
+      converted_grads_and_vars = list(zip(converted_grads, [var0, var1]))
+      opt_op = sgd_op.apply_gradients(converted_grads_and_vars)
+      self.evaluate(opt_op)
+
+      # Validate updated params
+      self.assertAllClose([-14., -13.], self.evaluate(var0))
+      self.assertAllClose([-6., -5.], self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testComputeGradientsWithTensors(self):
+    x = ops.convert_to_tensor(1.0)
+    def f():
+      return x * x
+
+    sgd_op = sgd.SGD(3.0)
+    grads_and_vars = sgd_op.compute_gradients(f, [x])
+    self.assertEqual(1, len(grads_and_vars))
+    grad, x_as_var = grads_and_vars[0]
+    self.assertIs(x, x_as_var)
+    self.assertEqual(2.0, self.evaluate(grad))
+
+    with self.assertRaises(NotImplementedError):
+      sgd_op.apply_gradients(grads_and_vars)
+
+  def testTrainOp(self):
+    with self.cached_session():
+      var0 = variables.Variable([1.0, 2.0])
+      var1 = variables.Variable([3.0, 4.0])
+      cost = 5 * var0 + 3 * var1
+      global_step = variables.Variable(
+          array_ops.zeros([], dtypes.int64), name='global_step')
+      sgd_op = sgd.SGD(3.0)
+      opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
+      self.assertTrue(opt_op in ops.get_collection(ops.GraphKeys.TRAIN_OP))
+
+  def testConstraint(self):
+    constraint_01 = lambda x: clip_ops.clip_by_value(x, -0.1, 0.)
+    constraint_0 = lambda x: clip_ops.clip_by_value(x, 0., 1.)
+    with self.cached_session():
+      var0 = variables.Variable([1.0, 2.0],
+                                constraint=constraint_01)
+      var1 = variables.Variable([3.0, 4.0],
+                                constraint=constraint_0)
+      cost = 5 * var0 + 3 * var1
+      global_step = variables.Variable(
+          array_ops.zeros([], dtypes.int64), name='global_step')
+      sgd_op = sgd.SGD(3.0)
+      opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
+
+      variables.global_variables_initializer().run()
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+      # Run 1 step of sgd through optimizer
+      opt_op.run()
+      # Validate updated params
+      self.assertAllClose([-0.1, -0.1], var0.eval())
+      self.assertAllClose([0., 0.], var1.eval())
+
+  def testStopGradients(self):
+    with self.cached_session():
+      var0 = variables.Variable([1.0, 2.0], name='var0')
+      var1 = variables.Variable([3.0, 4.0], name='var1')
+      var0_id = array_ops.identity(var0)
+      cost = 5 * var0_id + 3 * var1
+      sgd_op = sgd.SGD(3.0)
+      grads_and_vars = sgd_op.compute_gradients(cost, [var0, var1],
+                                                stop_gradients=[var0_id])
+      grad_dict = {var.op.name: grad for grad, var in grads_and_vars}
+      self.assertIsNone(grad_dict['var0'])
+      self.assertIsNotNone(grad_dict['var1'])
+
+  def testDoNotOverrideCreateSlots(self):
+    class ShouldNotOverrideCreateSlots(optimizer_v2.OptimizerV2):
+
+      def _create_slots(self, var_list):
+        """In OptimizerV2 _create_slots was renamed _create_vars."""
+        return var_list
+
+    with self.assertRaises(RuntimeError):
+      ShouldNotOverrideCreateSlots('name')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop.py b/tensorflow/python/keras/optimizer_v2/rmsprop.py
new file mode 100644
index 0000000000..2748d8eff7
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop.py
@@ -0,0 +1,239 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RMSprop optimizer for Tensorflow.
+
+rmsprop algorithm [tieleman2012rmsprop]
+
+A detailed description of rmsprop.
+
+- maintain a moving (discounted) average of the square of gradients
+- divide gradient by the root of this average
+
+mean_square = rho * mean_square{t-1} + (1-rho) * gradient ** 2
+mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square)
+delta = - mom
+
+This implementation of RMSProp uses plain momentum, not Nesterov momentum.
+
+The centered version additionally maintains a moving (discounted) average of the
+gradients, and uses that average to estimate the variance:
+
+mean_grad = rho * mean_square{t-1} + (1-rho) * gradient
+mean_square = rho * mean_square{t-1} + (1-rho) * gradient ** 2
+mom = momentum * mom{t-1} + learning_rate * g_t /
+    sqrt(mean_square - mean_grad**2)
+delta = - mom
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+
+from tensorflow.python.training import training_ops
+
+
+class RMSProp(optimizer_v2.OptimizerV2):
+  """RMSProp optimizer.
+
+  It is recommended to leave the parameters of this optimizer at their default
+  values (except the learning rate, which can be freely tuned).
+
+  This optimizer is usually a good choice for recurrent neural networks.
+
+  Some of the args below are hyperparameters, where a hyperparameter is
+  defined as a scalar Tensor, a regular Python value, or a callable (which
+  will be evaluated when `apply_gradients` is called) returning a scalar
+  Tensor or a Python value.
+
+  Note that in the dense implementation of this algorithm, variables and their
+  corresponding accumulators (momentum, gradient moving average, square
+  gradient moving average) will be updated even if the gradient is zero
+  (i.e. accumulators will decay, momentum will be applied). The sparse
+  implementation (used when the gradient is an `IndexedSlices` object,
+  typically because of `tf.gather` or an embedding lookup in the forward pass)
+  will not update variable slices or their accumulators unless those slices
+  were used in the forward pass (nor is there an "eventual" correction to
+  account for these omitted updates). This leads to more efficient updates for
+  large embedding lookup tables (where most of the slices are not accessed in
+  a particular graph execution), but differs from the published algorithm.
+
+  Arguments:
+      learning_rate: A float hyperparameter >= 0. The learning rate.
+      rho: A float hyperparameter >= 0. Discounting factor for the
+        history/coming gradient.
+      momentum: A float hyperparameter >= 0.
+      epsilon: A float hyperparameter >= 0 . Small value to initialize the
+        average square gradient variable and avoid zero denominator.
+      centered: If True, gradients are normalized by the estimated variance of
+        the gradient; if False, by the uncentered second moment. Setting this to
+        True may help with training, but is slightly more expensive in terms of
+        computation and memory. Defaults to False.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "RMSProp".
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               rho=0.9,
+               momentum=None,
+               epsilon=1e-10,
+               centered=False,
+               name="RMSProp"):
+    super(RMSProp, self).__init__(name)
+    # Momentum default is `None` for consistency with SGD
+    # but underlying implementation uses `momentum` hyperparameter here
+    # regardless unlike SGD. Since extneral Keras RMSProp does not have
+    # a `momentum` weight, for compatibility with external Keras h5 files,
+    # when  `momentum` was set as `None` we should ignore the `momentum`
+    # variable in `get_weights` and not require it in `set_weights`.
+    if momentum is None:
+      momentum = 0.0
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("rho", rho)
+    self._set_hyper("momentum", momentum)
+    self._set_hyper("epsilon", epsilon)
+
+    self._centered = centered
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      init_rms = state.get_hyper(
+          "epsilon", v.dtype.base_dtype) * array_ops.ones_like(v)
+      state.create_slot_with_initializer(v, init_rms, v.get_shape(),
+                                         v.dtype.base_dtype, "rms")
+      if self._centered:
+        state.zeros_slot(v, "mg")
+      state.zeros_slot(v, "momentum")
+
+  def _apply_dense(self, grad, var, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = state.get_slot(var, "mg")
+      return training_ops.apply_centered_rms_prop(
+          var,
+          mg,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("rho", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          # epsilon is now the rms initial value and is not added to the
+          # denominator anymore, hence calling the kernel op with epsilon=0.
+          0,
+          grad,
+          use_locking=self._use_locking).op
+    else:
+      return training_ops.apply_rms_prop(
+          var,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("rho", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          0,
+          grad,
+          use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = state.get_slot(var, "mg")
+      return training_ops.resource_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("rho", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          0,
+          grad,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("rho", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          0,
+          grad,
+          use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = state.get_slot(var, "mg")
+      return training_ops.sparse_apply_centered_rms_prop(
+          var,
+          mg,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("rho", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          0,
+          grad.values,
+          grad.indices,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.sparse_apply_rms_prop(
+          var,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("rho", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          0,
+          grad.values,
+          grad.indices,
+          use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = self.get_slot(var, "mg")
+      return training_ops.resource_sparse_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("rho", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          0,
+          grad,
+          indices,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_sparse_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("rho", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          0,
+          grad,
+          indices,
+          use_locking=self._use_locking)
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
new file mode 100644
index 0000000000..2c5eccdc5b
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
@@ -0,0 +1,444 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for rmsprop optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import math
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+_DATA_TYPES = [dtypes.half, dtypes.float32]
+
+_TEST_PARAM_VALUES = [
+    # learning_rate, rho, momentum, epsilon, centered, use_resource
+    [0.5, 0.9, 0.0, 1.0, True, False],
+    [0.5, 0.9, 0.0, 1.0, False, False],
+    [0.5, 0.9, 0.0, 1.0, True, True],
+    [0.5, 0.9, 0.0, 1.0, False, True],
+    [0.1, 0.9, 0.0, 1.0, True, False],
+    [0.5, 0.95, 0.0, 1.0, False, False],
+    [0.5, 0.8, 0.0, 1e-3, True, False],
+    [0.5, 0.8, 0.9, 1e-3, True, False],
+]
+
+
+class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
+
+  def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, rho, momentum,
+                            centered):
+    rms_t = rms * rho + (1 - rho) * g * g
+    if centered:
+      mg_t = mg * rho + (1 - rho) * g
+      denom_t = rms_t - mg_t * mg_t
+    else:
+      mg_t = mg
+      denom_t = rms_t
+    mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
+    var_t = var - mom_t
+    return var_t, mg_t, rms_t, mom_t
+
+  def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
+                                   lr, rho, momentum, centered):
+    mg_t = copy.deepcopy(mg)
+    rms_t = copy.deepcopy(rms)
+    mom_t = copy.deepcopy(mom)
+    var_t = copy.deepcopy(var)
+    for i in range(len(gindexs)):
+      gindex = gindexs[i]
+      gvalue = gvalues[i]
+      rms_t[gindex] = rms[gindex] * rho + (1 - rho) * gvalue * gvalue
+      denom_t = rms_t[gindex]
+      if centered:
+        mg_t[gindex] = mg_t[gindex] * rho + (1 - rho) * gvalue
+        denom_t -= mg_t[gindex] * mg_t[gindex]
+      mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t)
+      var_t[gindex] = var[gindex] - mom_t[gindex]
+    return var_t, mg_t, rms_t, mom_t
+
+  @parameterized.named_parameters(
+      *test_util.generate_combinations_with_testcase_name(
+          dtype=_DATA_TYPES, param_value=_TEST_PARAM_VALUES))
+  def testDense(self, dtype, param_value):
+    (learning_rate, rho, momentum, epsilon, centered,
+     use_resource) = tuple(param_value)
+    with self.test_session(use_gpu=True):
+      # Initialize variables for numpy implementation.
+      var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+      grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
+      var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+      grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype)
+
+      if use_resource:
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+      else:
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+      grads0 = constant_op.constant(grads0_np)
+      grads1 = constant_op.constant(grads1_np)
+      opt = rmsprop.RMSProp(
+          learning_rate=learning_rate,
+          rho=rho,
+          momentum=momentum,
+          epsilon=epsilon,
+          centered=centered)
+
+      update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      variables.global_variables_initializer().run()
+
+      mg0 = opt.get_slot(var0, "mg")
+      self.assertEqual(mg0 is not None, centered)
+      mg1 = opt.get_slot(var1, "mg")
+      self.assertEqual(mg1 is not None, centered)
+      rms0 = opt.get_slot(var0, "rms")
+      self.assertIsNotNone(rms0)
+      rms1 = opt.get_slot(var1, "rms")
+      self.assertIsNotNone(rms1)
+      mom0 = opt.get_slot(var0, "momentum")
+      self.assertIsNotNone(mom0)
+      mom1 = opt.get_slot(var1, "momentum")
+      self.assertIsNotNone(mom1)
+
+      mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      rms0_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
+      rms1_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
+      mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+
+      # Run 4 steps of RMSProp
+      for _ in range(4):
+        update.run()
+
+        var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+            var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate, rho,
+            momentum, centered)
+        var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+            var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate, rho,
+            momentum, centered)
+
+        # Validate updated params
+        if centered:
+          self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
+          self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
+        self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
+        self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
+        self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
+        self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
+        self.assertAllCloseAccordingToType(
+            var0_np, var0.eval(), half_rtol=0.01, half_atol=0.01)
+        self.assertAllCloseAccordingToType(
+            var1_np, var1.eval(), half_rtol=0.01, half_atol=0.01)
+
+  @parameterized.parameters([dtypes.float32, dtypes.float64])
+  def testMinimizeSparseResourceVariable(self, dtype):
+    with self.cached_session():
+      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+      pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+      loss = pred * pred
+      sgd_op = rmsprop.RMSProp(
+          learning_rate=1.0, rho=0.0, momentum=0.0, epsilon=0.0,
+          centered=False).minimize(loss)
+      variables.global_variables_initializer().run()
+      # Fetch params to validate initial values
+      self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+      # Run 1 step of sgd
+      sgd_op.run()
+      # Validate updated params
+      self.assertAllCloseAccordingToType(
+          [[0., 1.]], var0.eval(), atol=0.01)
+
+  @parameterized.parameters([dtypes.float32, dtypes.float64])
+  def testMinimizeSparseResourceVariableCentered(self, dtype):
+    with self.cached_session():
+      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+      pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+      loss = pred * pred
+      sgd_op = rmsprop.RMSProp(
+          learning_rate=1.0, rho=0.1, momentum=0.0, epsilon=1.0,
+          centered=True).minimize(loss)
+      variables.global_variables_initializer().run()
+      # Fetch params to validate initial values
+      self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+      # Run 1 step of sgd
+      sgd_op.run()
+      # Validate updated params
+      self.assertAllCloseAccordingToType(
+          [[-7/3.0, -4/3.0]], var0.eval(), atol=0.01)
+
+  @parameterized.named_parameters(
+      *test_util.generate_combinations_with_testcase_name(
+          dtype=_DATA_TYPES, param_value=_TEST_PARAM_VALUES))
+  def testSparse(self, dtype, param_value):
+    (learning_rate, rho, momentum, epsilon, centered, _) = tuple(param_value)
+    with self.test_session(use_gpu=True):
+      # Initialize variables for numpy implementation.
+      var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+      grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
+      var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+      grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype)
+
+      var0 = variables.Variable(var0_np)
+      var1 = variables.Variable(var1_np)
+      grads0_np_indices = np.array([0], dtype=np.int32)
+      grads0 = ops.IndexedSlices(
+          constant_op.constant(grads0_np),
+          constant_op.constant(grads0_np_indices), constant_op.constant([1]))
+      grads1_np_indices = np.array([1], dtype=np.int32)
+      grads1 = ops.IndexedSlices(
+          constant_op.constant(grads1_np),
+          constant_op.constant(grads1_np_indices), constant_op.constant([1]))
+      opt = rmsprop.RMSProp(
+          learning_rate=learning_rate,
+          rho=rho,
+          momentum=momentum,
+          epsilon=epsilon,
+          centered=centered)
+      update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      variables.global_variables_initializer().run()
+
+      mg0 = opt.get_slot(var0, "mg")
+      self.assertEqual(mg0 is not None, centered)
+      mg1 = opt.get_slot(var1, "mg")
+      self.assertEqual(mg1 is not None, centered)
+      rms0 = opt.get_slot(var0, "rms")
+      self.assertIsNotNone(rms0)
+      rms1 = opt.get_slot(var1, "rms")
+      self.assertIsNotNone(rms1)
+      mom0 = opt.get_slot(var0, "momentum")
+      self.assertIsNotNone(mom0)
+      mom1 = opt.get_slot(var1, "momentum")
+      self.assertIsNotNone(mom1)
+
+      mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      rms0_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
+      rms1_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
+      mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+
+      # Run 4 steps of RMSProp
+      for _ in range(4):
+        update.run()
+
+        var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
+            var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
+            learning_rate, rho, momentum, centered)
+        var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
+            var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
+            learning_rate, rho, momentum, centered)
+
+        # Validate updated params
+        if centered:
+          self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
+          self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
+        self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
+        self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
+        self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
+        self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
+        self.assertAllCloseAccordingToType(var0_np, var0.eval())
+        self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @parameterized.parameters(_DATA_TYPES)
+  def testWithoutMomentum(self, dtype):
+    with self.test_session(use_gpu=True):
+      var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+      var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+      opt = rmsprop.RMSProp(
+          learning_rate=2.0, rho=0.9, momentum=0.0, epsilon=1.0)
+      update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      variables.global_variables_initializer().run()
+
+      rms0 = opt.get_slot(var0, "rms")
+      self.assertIsNotNone(rms0)
+      rms1 = opt.get_slot(var1, "rms")
+      self.assertIsNotNone(rms1)
+      mom0 = opt.get_slot(var0, "momentum")
+      self.assertIsNotNone(mom0)
+      mom1 = opt.get_slot(var1, "momentum")
+      self.assertIsNotNone(mom1)
+
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+      # Step 1: the rms accumulators where 1. So we should see a normal
+      # update: v -= grad * learning_rate
+      update.run()
+      # Check the root mean square accumulators.
+      self.assertAllCloseAccordingToType(
+          np.array([0.901, 0.901]), rms0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([0.90001, 0.90001]), rms1.eval())
+      # Check the parameters.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901))
+          ]), var0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001))
+          ]), var1.eval())
+      # Step 2: the root mean square accumulators contain the previous update.
+      update.run()
+      # Check the rms accumulators.
+      self.assertAllCloseAccordingToType(
+          np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+      # Check the parameters.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))
+          ]), var0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))
+          ]), var1.eval())
+
+  @parameterized.parameters(_DATA_TYPES)
+  def testWithMomentum(self, dtype):
+    with self.test_session(use_gpu=True):
+      var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+      var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+
+      opt = rmsprop.RMSProp(
+          learning_rate=2.0, rho=0.9, momentum=0.5, epsilon=1.0)
+      update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      variables.global_variables_initializer().run()
+
+      rms0 = opt.get_slot(var0, "rms")
+      self.assertIsNotNone(rms0)
+      rms1 = opt.get_slot(var1, "rms")
+      self.assertIsNotNone(rms1)
+      mom0 = opt.get_slot(var0, "momentum")
+      self.assertIsNotNone(mom0)
+      mom1 = opt.get_slot(var1, "momentum")
+      self.assertIsNotNone(mom1)
+
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+      # Step 1: rms = 1, mom = 0. So we should see a normal
+      # update: v -= grad * learning_rate
+      update.run()
+      # Check the root mean square accumulators.
+      self.assertAllCloseAccordingToType(
+          np.array([0.901, 0.901]), rms0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([0.90001, 0.90001]), rms1.eval())
+      # Check the momentum accumulators
+      self.assertAllCloseAccordingToType(
+          np.array([(0.1 * 2.0 / math.sqrt(0.901)),
+                    (0.1 * 2.0 / math.sqrt(0.901))]), mom0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([(0.01 * 2.0 / math.sqrt(0.90001)),
+                    (0.01 * 2.0 / math.sqrt(0.90001))]), mom1.eval())
+
+      # Check that the parameters.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901))
+          ]), var0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001))
+          ]), var1.eval())
+
+      # Step 2: the root mean square accumulators contain the previous update.
+      update.run()
+      # Check the rms accumulators.
+      self.assertAllCloseAccordingToType(
+          np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([
+              0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)),
+              0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))
+          ]), mom0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([
+              0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)),
+              0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))
+          ]), mom1.eval())
+
+      # Check the parameters.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)))
+          ]), var0.eval())
+
+      self.assertAllCloseAccordingToType(
+          np.array([
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)))
+          ]), var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/sgd.py b/tensorflow/python/keras/optimizer_v2/sgd.py
new file mode 100644
index 0000000000..f5583691f7
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/sgd.py
@@ -0,0 +1,170 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Momentum for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import training_ops
+
+
+class SGD(optimizer_v2.OptimizerV2):
+  """Stochastic gradient descent optimizer.
+
+  Includes support for momentum and Nesterov momentum.
+
+  Computes (if `nesterov = False`):
+
+  ```
+  accumulation = momentum * accumulation + gradient
+  variable -= learning_rate * accumulation
+  ```
+
+  Some of the args below are hyperparameters, where a hyperparameter is
+  defined as a scalar Tensor, a regular Python value, or a callable (which
+  will be evaluated when `apply_gradients` is called) returning a scalar
+  Tensor or a Python value.
+
+  Note that in the dense version of this algorithm, `accumulation` is updated
+  and applied regardless of a gradient's value, whereas the sparse version (when
+  the gradient is an `IndexedSlices`, typically because of `tf.gather` or an
+  embedding) only updates variable slices and corresponding `accumulation` terms
+  when that part of the variable was used in the forward pass.
+
+  @compatibility(eager)
+  When eager execution is enabled, learning_rate and momentum can each be a
+  callable that takes no arguments and returns the actual value to use. This
+  can be useful for changing these values across different invocations of
+  optimizer functions.
+  @end_compatibility
+
+  Arguments:
+      learning_rate: float hyperparameter >= 0. Learning rate.
+      momentum: float hyperparameter >= 0 or None. Parameter that accelerates
+        SGD in the relevant direction and dampens oscillations.
+      nesterov: boolean. Whether to apply Nesterov momentum. See [Sutskever et
+        al., 2013](http://jmlr.org/proceedings/papers/v28/sutskever13.pdf). This
+          implementation always computes gradients at the value of the
+          variable(s) passed to the optimizer. Using Nesterov Momentum makes the
+          variable(s) track the values called `theta_t + mu*v_t` in the paper.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to 'SGD'.
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               momentum=None,
+               nesterov=False,
+               name="SGD"):
+    super(SGD, self).__init__(name)
+    self._set_hyper("learning_rate", learning_rate)
+    # Only create momentum variables and use momentum ops if needed.
+    if momentum is not None:
+      self._set_hyper("momentum", momentum)
+      self._use_nesterov = nesterov
+      self._use_momentum = True
+    else:
+      self._use_momentum = False
+
+  def _create_vars(self, var_list, state):
+    if self._use_momentum:
+      for v in var_list:
+        state.zeros_slot(v, "momentum")
+
+  def _apply_dense(self, grad, var, state):
+    if self._use_momentum:
+      mom = state.get_slot(var, "momentum")
+      return training_ops.apply_momentum(
+          var,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          grad,
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          use_locking=self._use_locking,
+          use_nesterov=self._use_nesterov).op
+    else:
+      return training_ops.apply_gradient_descent(
+          var,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var, state):
+    if self._use_momentum:
+      mom = state.get_slot(var, "momentum")
+      return training_ops.resource_apply_momentum(
+          var.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          grad,
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          use_locking=self._use_locking,
+          use_nesterov=self._use_nesterov)
+    else:
+      lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
+      return training_ops.resource_apply_gradient_descent(
+          var.handle, lr, grad, use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    if self._use_momentum:
+      mom = state.get_slot(var, "momentum")
+      return training_ops.sparse_apply_momentum(
+          var,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          grad.values,
+          grad.indices,
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          use_locking=self._use_locking,
+          use_nesterov=self._use_nesterov).op
+    else:
+      return super(SGD, self)._apply_sparse(grad, var, state)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    if self._use_momentum:
+      mom = state.get_slot(var, "momentum")
+      return training_ops.resource_sparse_apply_momentum(
+          var.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          grad,
+          indices,
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          use_locking=self._use_locking,
+          use_nesterov=self._use_nesterov)
+    else:
+      return super(SGD, self)._resource_apply_sparse(grad, var, indices, state)
+
+  def _resource_apply_sparse_duplicate_indices(self, grad, var, indices, state):
+    if self._use_momentum:
+      return super(SGD, self)._resource_apply_sparse_duplicate_indices(
+          grad, var, indices, state)
+    else:
+      lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
+      return resource_variable_ops.resource_scatter_add(var.handle, indices,
+                                                        -grad * lr)
+
+  def _apply_sparse_duplicate_indices(self, grad, var, state):
+    if self._use_momentum:
+      return super(SGD, self)._apply_sparse_duplicate_indices(grad, var, state)
+    else:
+      delta = ops.IndexedSlices(
+          grad.values * state.get_hyper("learning_rate", var.dtype.base_dtype),
+          grad.indices, grad.dense_shape)
+      return var.scatter_sub(delta, use_locking=self._use_locking)
diff --git a/tensorflow/python/keras/optimizer_v2/sgd_test.py b/tensorflow/python/keras/optimizer_v2/sgd_test.py
new file mode 100644
index 0000000000..eb39aac283
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/sgd_test.py
@@ -0,0 +1,759 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Momentum."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import sgd
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class GradientDescentOptimizerTest(test.TestCase):
+
+  def testBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        optimizer = sgd.SGD(3.0)
+        sgd_op = optimizer.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+        self.assertEqual(0, len(optimizer.variables()))
+
+  def testBasicResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        sgd_op = sgd.SGD(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        resources.initialize_resources([var0, var1]).run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+
+  def testMinimizeResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(var0, x) + var1
+        loss = pred * pred
+        sgd_op = sgd.SGD(1.0).minimize(loss)
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        resources.initialize_resources([var0, var1]).run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
+        np_grad = 2 * np_pred
+        self.assertAllCloseAccordingToType(
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        pred += var1
+        loss = pred * pred
+        sgd_op = sgd.SGD(1.0).minimize(loss)
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
+        np_grad = 2 * np_pred
+        self.assertAllCloseAccordingToType(
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        lrate = constant_op.constant(3.0)
+        sgd_op = sgd.SGD(lrate).apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+
+  def testGradWrtRef(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        opt = sgd.SGD(3.0)
+        values = [1.0, 3.0]
+        vars_ = [variables.Variable([v], dtype=dtype) for v in values]
+        grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
+        variables.global_variables_initializer().run()
+        for grad, _ in grads_and_vars:
+          self.assertAllCloseAccordingToType([1.0], grad.eval())
+
+  def testWithGlobalStep(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        global_step = variables.Variable(0, trainable=False)
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        sgd_op = sgd.SGD(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]), global_step=global_step)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params and global_step
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+        self.assertAllCloseAccordingToType(1, global_step.eval())
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+        sgd_op = sgd.SGD(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0], [2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
+                                           var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
+
+
+class MomentumOptimizerTest(test.TestCase):
+
+  def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum):
+    var = var + accum * lr * momentum
+    accum = accum * momentum + g
+    var = var - lr * accum
+    var = var - accum * lr * momentum
+    return var, accum
+
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      if use_resource:
+        var0 = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            [3.0, 4.0], dtype=dtype, name="var1_%d" % i)
+      else:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+      learning_rate = lambda: 2.0
+      momentum = lambda: 0.9
+      if not use_callable_params:
+        learning_rate = learning_rate()
+        momentum = momentum()
+      mom_opt = sgd.SGD(learning_rate=learning_rate, momentum=momentum)
+      mom_update = mom_opt.apply_gradients(
+          zip([grads0, grads1], [var0, var1]))
+
+      if not context.executing_eagerly():
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+      # Check we have slots
+      self.assertEqual(["momentum"], mom_opt.get_slot_names())
+      slot0 = mom_opt.get_slot(var0, "momentum")
+      self.assertEquals(slot0.get_shape(), var0.get_shape())
+      slot1 = mom_opt.get_slot(var1, "momentum")
+      self.assertEquals(slot1.get_shape(), var1.get_shape())
+      if not context.executing_eagerly():
+        self.assertFalse(slot0 in variables.trainable_variables())
+        self.assertFalse(slot1 in variables.trainable_variables())
+
+      # Step 1: the momentum accumulators where 0. So we should see a normal
+      # update: v -= grad * learning_rate
+      if not context.executing_eagerly():
+        self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(np.array([0.1, 0.1]),
+                                         self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(np.array([0.01, 0.01]),
+                                         self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+          self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+          self.evaluate(var1))
+      # Step 2: the momentum accumulators contain the previous update.
+      if context.executing_eagerly():
+        mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      else:
+        self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+          self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+          self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+              2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+          ]), self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([
+              2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
+                  (0.9 * 0.01 + 0.01) * 2.0)
+          ]), self.evaluate(var1))
+
+  def testBasic(self):
+    with self.cached_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
+  def testVariablesAcrossGraphs(self):
+    optimizer = sgd.SGD(0.01, 0.5)
+    with ops.Graph().as_default():
+      var0 = resource_variable_ops.ResourceVariable(
+          [1.0, 2.0], dtype=dtypes.float32, name="var0")
+      var1 = resource_variable_ops.ResourceVariable(
+          [3.0, 4.0], dtype=dtypes.float32, name="var1")
+      loss = math_ops.reduce_sum(var0 + var1)
+      optimizer.minimize(loss)
+      optimizer_variables = optimizer.variables()
+      self.assertStartsWith(optimizer_variables[0].name, "var0")
+      self.assertStartsWith(optimizer_variables[1].name, "var1")
+      self.assertEquals(2, len(optimizer_variables))
+
+    with ops.Graph().as_default():
+      var2 = resource_variable_ops.ResourceVariable(
+          [1.0, 2.0], dtype=dtypes.float32, name="var2")
+      var3 = resource_variable_ops.ResourceVariable(
+          [3.0, 4.0], dtype=dtypes.float32, name="var3")
+      loss = math_ops.reduce_sum(var2 + var3)
+      optimizer.minimize(loss)
+      optimizer_variables = optimizer.variables()
+      self.assertStartsWith(optimizer_variables[0].name, "var2")
+      self.assertStartsWith(optimizer_variables[1].name, "var3")
+      self.assertEquals(2, len(optimizer_variables))
+
+  def testNesterovMomentum(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        cost = 5 * var0 * var0 + 3 * var1
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name="global_step")
+        mom_op = sgd.SGD(learning_rate=2.0, momentum=0.9, nesterov=True)
+        opt_op = mom_op.minimize(cost, global_step, [var0, var1])
+        variables.global_variables_initializer().run()
+        for t in range(1, 5):
+          opt_op.run()
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+                                                                    accum1_np,
+                                                                    3, 2.0, 0.9)
+          self.assertAllClose(var0_np, var0.eval())
+          self.assertAllClose(var1_np, var1.eval())
+
+  def testSparseNesterovMomentum(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        grads = []
+        for t in range(1, 5):
+          grads.append(var0_np * 10)
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+                                                                    accum1_np,
+                                                                    3, 2.0, 0.9)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        loss = 5 * var0 * var0 + 3 * var1
+        mom_op = sgd.SGD(learning_rate=2.0, momentum=0.9, nesterov=True)
+        x_feed = array_ops.placeholder(dtype)
+        y_feed = ops.IndexedSlices(
+            x_feed, constant_op.constant([0, 1]), constant_op.constant([2]))
+        grads_and_vars = [(y_feed, var0), (constant_op.constant(
+            [3.0, 3.0], dtype=dtype), var1)]
+        opt_update = mom_op.apply_gradients(grads_and_vars)
+        variables.global_variables_initializer().run()
+        for t in range(1, 5):
+          opt_update.run(feed_dict={x_feed: grads[t - 1]})
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+                                                                    accum1_np,
+                                                                    3, 2.0, 0.9)
+          self.assertAllClose(var0_np, var0.eval())
+          self.assertAllClose(var1_np, var1.eval())
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      # This test invokes the ResourceSparseApplyMomentum operation, which
+      # did not have a registered GPU kernel as of April 2018. With graph
+      # execution, the placement algorithm notices this and automatically
+      # places the variable in CPU (host) memory. With eager execution,
+      # the variable would be placed in GPU memory if available, which
+      # would then conflict with the future invocation of the
+      # ResourceSparseApplyMomentum operation.
+      # To work around this discrepancy, for now we force the variable
+      # to be placed on CPU.
+      with ops.device("/cpu:0"):
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        return pred * pred
+      # pylint: enable=cell-var-from-loop
+
+      opt = sgd.SGD(learning_rate=1.0, momentum=0.0)
+      sgd_op = opt.minimize(loss)
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
+    # This test invokes the ResourceSparseApplyMomentum operation, which
+    # did not have a registered GPU kernel as of April 2018. With graph
+    # execution, the placement algorithm notices this and automatically
+    # places the variable in CPU (host) memory. With eager execution,
+    # the variable would be placed in GPU memory if available, which
+    # would then conflict with the future invocation of the
+    # ResourceSparseApplyMomentum operation.
+    # To work around this discrepancy, for now we force the variable
+    # to be placed on CPU.
+    with ops.device("/cpu:0"):
+      var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+
+    def loss():
+      return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
+
+    opt = sgd.SGD(learning_rate=1.0, momentum=0.0)
+    sgd_op = opt.minimize(loss)
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(sgd_op)
+    self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
+
+  def testTensorLearningRateAndMomentum(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        mom_opt = sgd.SGD(
+            learning_rate=constant_op.constant(2.0),
+            momentum=constant_op.constant(0.9))
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Check we have slots
+        self.assertEqual(["momentum"], mom_opt.get_slot_names())
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertFalse(slot0 in variables.trainable_variables())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertFalse(slot1 in variables.trainable_variables())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
+        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+        # Step 2: the momentum accumulators contain the previous update.
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
+                    (0.9 * 0.01 + 0.01) * 2.0)
+            ]), var1.eval())
+
+  def _dbParamsMom01(self):
+    """Return dist-belief momentum values.
+
+    Return values been generated from the dist-belief momentum unittest,
+    running with a learning rate of 0.1 and a momentum of 0.1.
+
+    These values record how a parameter vector of size 10, initialized with 0.0,
+    gets updated with 10 consecutive momentum steps.  It uses random gradients.
+
+    Returns:
+      db_grad: The gradients to apply
+      db_out: The parameters after the momentum update.
+    """
+    db_grad = [[]] * 10
+    db_out = [[]] * 10
+    # pylint: disable=line-too-long
+    db_grad[0] = [
+        0.00096264342, 0.17914793, 0.93945462, 0.41396621, 0.53037018,
+        0.93197989, 0.78648776, 0.50036013, 0.55345792, 0.96722615
+    ]
+    db_out[0] = [
+        -9.6264346e-05, -0.017914793, -0.093945466, -0.041396622, -0.053037018,
+        -0.093197994, -0.078648776, -0.050036013, -0.055345792, -0.096722618
+    ]
+    db_grad[1] = [
+        0.17075552, 0.88821375, 0.20873757, 0.25236958, 0.57578111, 0.15312378,
+        0.5513742, 0.94687688, 0.16012503, 0.22159521
+    ]
+    db_out[1] = [
+        -0.017181443, -0.10852765, -0.12421377, -0.070773244, -0.11591884,
+        -0.11783017, -0.14165108, -0.14972731, -0.076892875, -0.1285544
+    ]
+    db_grad[2] = [
+        0.35077485, 0.47304362, 0.44412705, 0.44368884, 0.078527533, 0.81223965,
+        0.31168157, 0.43203235, 0.16792089, 0.24644311
+    ]
+    db_out[2] = [
+        -0.053967446, -0.1648933, -0.1716533, -0.1180798, -0.13005978,
+        -0.20151734, -0.17911947, -0.20289968, -0.095839672, -0.15638189
+    ]
+    db_grad[3] = [
+        0.9694621, 0.75035888, 0.28171822, 0.83813518, 0.53807181, 0.3728098,
+        0.81454384, 0.03848977, 0.89759839, 0.93665648
+    ]
+    db_out[3] = [
+        -0.15459226, -0.24556576, -0.20456907, -0.20662397, -0.18528105,
+        -0.24716705, -0.2643207, -0.21206589, -0.18749419, -0.2528303
+    ]
+    db_grad[4] = [
+        0.38578293, 0.8536852, 0.88722926, 0.66276771, 0.13678469, 0.94036359,
+        0.69107032, 0.81897682, 0.5433259, 0.67860287
+    ]
+    db_out[4] = [
+        -0.20323303, -0.33900154, -0.29658359, -0.28175515, -0.20448165,
+        -0.34576839, -0.34194785, -0.29488021, -0.25099224, -0.33033544
+    ]
+    db_grad[5] = [
+        0.27885768, 0.76100707, 0.24625534, 0.81354135, 0.18959245, 0.48038563,
+        0.84163809, 0.41172323, 0.83259648, 0.44941229
+    ]
+    db_out[5] = [
+        -0.23598288, -0.42444581, -0.33041057, -0.3706224, -0.22536094,
+        -0.40366709, -0.43387437, -0.34433398, -0.34060168, -0.38302717
+    ]
+    db_grad[6] = [
+        0.27233034, 0.056316052, 0.5039115, 0.24105175, 0.35697976, 0.75913221,
+        0.73577434, 0.16014607, 0.57500273, 0.071136251
+    ]
+    db_out[6] = [
+        -0.26649091, -0.43862185, -0.38418442, -0.40361428, -0.26314685,
+        -0.48537019, -0.51664448, -0.36529395, -0.40706289, -0.39540997
+    ]
+    db_grad[7] = [
+        0.58697265, 0.2494842, 0.08106143, 0.39954534, 0.15892942, 0.12683646,
+        0.74053431, 0.16033, 0.66625422, 0.73515922
+    ]
+    db_out[7] = [
+        -0.32823896, -0.46498787, -0.39766794, -0.446868, -0.28281838,
+        -0.50622416, -0.59897494, -0.38342294, -0.48033443, -0.47016418
+    ]
+    db_grad[8] = [
+        0.8215279, 0.41994119, 0.95172721, 0.68000203, 0.79439718, 0.43384039,
+        0.55561525, 0.22567581, 0.93331909, 0.29438227
+    ]
+    db_out[8] = [
+        -0.41656655, -0.50961858, -0.49418902, -0.51919359, -0.36422527,
+        -0.55169362, -0.6627695, -0.40780342, -0.58099347, -0.50707781
+    ]
+    db_grad[9] = [
+        0.68297005, 0.67758518, 0.1748755, 0.13266537, 0.70697063, 0.055731893,
+        0.68593478, 0.50580865, 0.12602448, 0.093537711
+    ]
+    db_out[9] = [
+        -0.49369633, -0.58184016, -0.52132869, -0.5396927, -0.44306302,
+        -0.56181377, -0.73774242, -0.46082234, -0.60366184, -0.52012295
+    ]
+    # pylint: enable=line-too-long
+    return db_grad, db_out
+
+  def testLikeDistBeliefMom01(self):
+    with self.cached_session():
+      db_grad, db_out = self._dbParamsMom01()
+      num_samples = len(db_grad)
+      var0 = variables.Variable([0.0] * num_samples)
+      grads0 = constant_op.constant([0.0] * num_samples)
+      mom_opt = sgd.SGD(learning_rate=0.1, momentum=0.1)
+      mom_update = mom_opt.apply_gradients(zip([grads0], [var0]))
+      variables.global_variables_initializer().run()
+      for i in xrange(num_samples):
+        mom_update.run(feed_dict={grads0: db_grad[i]})
+        self.assertAllClose(np.array(db_out[i]), var0.eval())
+
+  def testSparse(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable(array_ops.zeros([4, 2], dtype=dtype))
+        var1 = variables.Variable(constant_op.constant(1.0, dtype, [4, 2]))
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [[.1, .1]], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([4, 2]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(
+                [[.01, .01], [.01, .01]], dtype=dtype),
+            constant_op.constant([2, 3]),
+            constant_op.constant([4, 2]))
+        mom_opt = sgd.SGD(learning_rate=2.0, momentum=0.9)
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Check we have slots
+        self.assertEqual(["momentum"], mom_opt.get_slot_names())
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([0, 0], var0.eval()[0])
+        self.assertAllClose([0, 0], var0.eval()[1])
+        self.assertAllClose([1, 1], var1.eval()[2])
+
+        # Step 1: the momentum accumulators are 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(np.array([0, 0]), slot0.eval()[0])
+        self.assertAllCloseAccordingToType(np.array([.1, .1]), slot0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([.01, .01]), slot1.eval()[2])
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(np.array([0, 0]), var0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]), var0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]), var1.eval()[2])
+        # Step 2: the momentum accumulators contain the previous update.
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllClose(np.array([0, 0]), slot0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            slot1.eval()[2])
+        # Check that the parameters have been updated.
+        self.assertAllClose(np.array([0, 0]), var0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), -(0.1 * 2.0) - (
+                    (0.9 * 0.1 + 0.1) * 2.0)
+            ]), var0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0), 0.98 - (
+                    (0.9 * 0.01 + 0.01) * 2.0)
+            ]), var1.eval()[2])
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        mom_opt = sgd.SGD(learning_rate=2.0, momentum=0.9)
+        mom_update1 = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        mom_update2 = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        self.assertEqual(["momentum"], mom_opt.get_slot_names())
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update1.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
+        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+        # Step 2: the second momentum accumulators contain the previous update.
+        mom_update2.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
+                    (0.9 * 0.01 + 0.01) * 2.0)
+            ]), var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
-- 
GitLab


From 3f54f1f60413cbd3e9a5a4126f8ae04bc4e06abc Mon Sep 17 00:00:00 2001
From: Jeremy Lau <lauj@google.com>
Date: Fri, 5 Oct 2018 12:45:56 -0700
Subject: [PATCH 481/570] Workaround build errors in Android NDK r14b.

PiperOrigin-RevId: 215950376
---
 tensorflow/tools/ci_build/Dockerfile.android | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/tools/ci_build/Dockerfile.android b/tensorflow/tools/ci_build/Dockerfile.android
index dcf077791a..7e72eb0cbf 100644
--- a/tensorflow/tools/ci_build/Dockerfile.android
+++ b/tensorflow/tools/ci_build/Dockerfile.android
@@ -45,9 +45,14 @@ ENV ANDROID_NDK_FILENAME android-ndk-r14b-linux-x86_64.zip
 ENV ANDROID_NDK_URL https://dl.google.com/android/repository/${ANDROID_NDK_FILENAME}
 ENV ANDROID_NDK_HOME ${ANDROID_DEV_HOME}/ndk
 ENV PATH ${PATH}:${ANDROID_NDK_HOME}
+# Workaround for b/117156972: inject missing #include into NDK versions of
+# futex.h.
 RUN cd ${ANDROID_DEV_HOME} && \
     wget -q ${ANDROID_NDK_URL} && \
     unzip ${ANDROID_NDK_FILENAME} -d ${ANDROID_DEV_HOME} && \
+    sed -i 15i"#include <linux/compiler.h>" ${ANDROID_DEV_HOME}/android-ndk-r14b/platforms/android-14/arch-arm/usr/include/linux/futex.h && \
+    sed -i 15i"#include <linux/compiler.h>" ${ANDROID_DEV_HOME}/android-ndk-r14b/platforms/android-14/arch-mips/usr/include/linux/futex.h && \
+    sed -i 15i"#include <linux/compiler.h>" ${ANDROID_DEV_HOME}/android-ndk-r14b/platforms/android-14/arch-x86/usr/include/linux/futex.h && \
     rm ${ANDROID_NDK_FILENAME} && \
     bash -c "ln -s ${ANDROID_DEV_HOME}/android-ndk-* ${ANDROID_NDK_HOME}"
 
-- 
GitLab


From 3427a3c638fb92a172d390266ed62403f9140f7d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 12:52:22 -0700
Subject: [PATCH 482/570] Internal change.

PiperOrigin-RevId: 215951354
---
 tensorflow/contrib/lite/kernels/BUILD        | 1 +
 tensorflow/contrib/lite/kernels/lstm_eval.cc | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 68636fb070..d2d8073abd 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -259,6 +259,7 @@ cc_library(
     srcs = ["lstm_eval.cc"],
     hdrs = ["lstm_eval.h"],
     deps = [
+        ":op_macros",
         "//tensorflow/contrib/lite/c:c_api_internal",
         "//tensorflow/contrib/lite/kernels/internal:kernel_utils",
         "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
diff --git a/tensorflow/contrib/lite/kernels/lstm_eval.cc b/tensorflow/contrib/lite/kernels/lstm_eval.cc
index c6c21eb085..20a4e30009 100644
--- a/tensorflow/contrib/lite/kernels/lstm_eval.cc
+++ b/tensorflow/contrib/lite/kernels/lstm_eval.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -599,6 +600,7 @@ TfLiteStatus EvalFloat(
     const TfLiteLSTMParams* params, bool forward_sequence, int output_offset,
     TfLiteTensor* scratch_buffer, TfLiteTensor* activation_state,
     TfLiteTensor* cell_state, TfLiteTensor* output) {
+  TF_LITE_ASSERT(input->dims->size >= 2 && input->dims->size <= 3);
   const int max_time = (input->dims->size == 2) ? 1 : input->dims->data[0];
   const int n_batch = input->dims->data[input->dims->size - 2];
   const int n_input = input->dims->data[input->dims->size - 1];
@@ -716,6 +718,7 @@ TfLiteStatus EvalHybrid(
     TfLiteTensor* output_state_quantized, TfLiteTensor* cell_state_quantized,
     TfLiteTensor* output_state, TfLiteTensor* cell_state,
     TfLiteTensor* output) {
+  TF_LITE_ASSERT(input->dims->size >= 2 && input->dims->size <= 3);
   const int max_time = (input->dims->size == 2) ? 1 : input->dims->data[0];
   const int n_batch = input->dims->data[input->dims->size - 2];
   const int n_input = input->dims->data[input->dims->size - 1];
-- 
GitLab


From ec451f5ab43467d7cb4ae7736f2de16331441e0b Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Fri, 5 Oct 2018 12:53:50 -0700
Subject: [PATCH 483/570] Break up build --define <option_name>=true into two
 steps: 1) define bazel config    build:<bazel_config_name> --define
 <option_name>s=true 2) set the config    build --config=<bazel_config_name>

PiperOrigin-RevId: 215951614
---
 configure.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/configure.py b/configure.py
index 65b4622995..89dc79b6b6 100644
--- a/configure.py
+++ b/configure.py
@@ -383,7 +383,9 @@ def set_build_var(environ_cp,
   var = str(int(get_var(environ_cp, var_name, query_item, enabled_by_default)))
   environ_cp[var_name] = var
   if var == '1':
-    write_to_bazelrc('build --define %s=true' % option_name)
+    write_to_bazelrc(
+        'build:%s --define %s=true' % (bazel_config_name, option_name))
+    write_to_bazelrc('build --config=%s' % bazel_config_name)
   elif bazel_config_name is not None:
     # TODO(mikecase): Migrate all users of configure.py to use --config Bazel
     # options and not to set build configs through environment variables.
-- 
GitLab


From f14287eabf69c57a2d2e044c311f2db1413cb6a5 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Fri, 5 Oct 2018 13:24:34 -0700
Subject: [PATCH 484/570] Copy device from If op to the lowered ops. Enable GPU
 tests for cond_v2.

PiperOrigin-RevId: 215956220
---
 tensorflow/core/common_runtime/lower_if_op.cc |  9 +++-
 tensorflow/python/kernel_tests/BUILD          |  3 +-
 .../python/kernel_tests/cond_v2_test.py       | 49 +++++++++----------
 .../kernel_tests/control_flow_ops_py_test.py  |  5 --
 4 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/tensorflow/core/common_runtime/lower_if_op.cc b/tensorflow/core/common_runtime/lower_if_op.cc
index a02084f223..9306386117 100644
--- a/tensorflow/core/common_runtime/lower_if_op.cc
+++ b/tensorflow/core/common_runtime/lower_if_op.cc
@@ -107,6 +107,8 @@ CondBuilder::CondBuilder(Node* if_op, const string& then_fn_name,
       then_call_builder_(NewName("then"), then_fn_name, graph->op_registry()),
       else_call_builder_(NewName("else"), else_fn_name, graph->op_registry()) {
   TF_CHECK_OK(if_op_->input_node(0, &pred_));
+  then_call_builder_.Device(if_op_->requested_device());
+  else_call_builder_.Device(if_op_->requested_device());
 }
 
 Status CondBuilder::CreatePivotNodes() {
@@ -117,15 +119,18 @@ Status CondBuilder::CreatePivotNodes() {
       NodeBuilder(NewName("switch_pred"), "Switch", graph_->op_registry())
           .Input(NodeOut(pred_, 0))
           .Input(NodeOut(pred_, 0))
+          .Device(if_op_->requested_device())
           .Finalize(graph_, &switch_pred));
   control_predecessor_ = switch_pred;
   TF_RETURN_IF_ERROR(
       NodeBuilder(NewName("pivot_f"), "Identity", graph_->op_registry())
           .Input(switch_pred, kElseBranch)
+          .Device(if_op_->requested_device())
           .Finalize(graph_, &pivot_f_));
   TF_RETURN_IF_ERROR(
       NodeBuilder(NewName("pivot_t"), "Identity", graph_->op_registry())
           .Input(switch_pred, kThenBranch)
+          .Device(if_op_->requested_device())
           .Finalize(graph_, &pivot_t_));
   return Status::OK();
 }
@@ -140,6 +145,7 @@ Status CondBuilder::AddInput(Node* src, int src_output) {
       NodeBuilder(NewName(src->name()), "Switch", graph_->op_registry())
           .Input(src, src_output)
           .Input(pred_, 0)
+          .Device(if_op_->requested_device())
           .Finalize(graph_, &input));
   then_call_builder_.Input(input, kThenBranch);
   else_call_builder_.Input(input, kElseBranch);
@@ -178,6 +184,7 @@ Status CondBuilder::AddOutputs() {
     TF_RETURN_IF_ERROR(
         NodeBuilder(graph_->NewName("merge"), "Merge", graph_->op_registry())
             .Input({NodeOut(then_call_node_, i), NodeOut(else_call_node_, i)})
+            .Device(if_op_->requested_device())
             .Finalize(graph_, &merges[i]));
     outputs_[i] = NodeOut(merges[i], 0);
   }
@@ -218,7 +225,7 @@ Status InlineCallInGraph(Node* n, const FunctionLibraryDefinition& flib,
 Status CondBuilder::BuildLoweredIfOutput() {
   // Build the identity node output.
   NodeBuilder ib(name_, "IdentityN");
-  ib.Input(outputs_);
+  ib.Input(outputs_).Device(if_op_->requested_device());
   return ib.Finalize(graph_, &lowered_if_output_);
 }
 
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index e055ef1c1b..4e8639dfc8 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -3255,7 +3255,7 @@ tf_py_test(
     tags = ["no_pip"],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "cond_v2_test",
     size = "medium",
     srcs = ["cond_v2_test.py"],
@@ -3272,7 +3272,6 @@ tf_py_test(
         "//tensorflow/python:training",
     ],
     grpc_enabled = True,
-    tags = ["no_gpu"],  # TODO(b/111656070)
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index 377c041675..ec875aae59 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -172,7 +172,7 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [y])
 
   def testNestedDefunInCond(self):
-    self.skipTest("b/110550782")
+    self.skipTest("b/117284369")
 
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
@@ -198,7 +198,7 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [y])
 
   def testDoubleNestedDefunInCond(self):
-    self.skipTest("b/110550782")
+    self.skipTest("b/117284369")
 
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
@@ -468,7 +468,6 @@ class CondV2Test(test.TestCase):
             }), [5., 0.])
 
   def testBuildCondAndGradientInsideDefun(self):
-    self.skipTest("b/110550782")
 
     def build_graph():
       pred_outer = array_ops.placeholder(dtypes.bool, name="pred_outer")
@@ -502,29 +501,29 @@ class CondV2Test(test.TestCase):
 
       return grads, pred_outer, pred_inner
 
-    with ops.Graph().as_default():
+    with ops.Graph().as_default(), self.session(
+        graph=ops.get_default_graph()) as sess:
       grads, pred_outer, pred_inner = build_graph()
-      with self.session(graph=ops.get_default_graph()) as sess:
-        self.assertSequenceEqual(
-            sess.run(grads, {
-                pred_outer: True,
-                pred_inner: True
-            }), [0., 0.])
-        self.assertSequenceEqual(
-            sess.run(grads, {
-                pred_outer: True,
-                pred_inner: False
-            }), [0., 0.])
-        self.assertSequenceEqual(
-            sess.run(grads, {
-                pred_outer: False,
-                pred_inner: True
-            }), [4., 2.])
-        self.assertSequenceEqual(
-            sess.run(grads, {
-                pred_outer: False,
-                pred_inner: False
-            }), [5., 0.])
+      self.assertSequenceEqual(
+          sess.run(grads, {
+              pred_outer: True,
+              pred_inner: True
+          }), [0., 0.])
+      self.assertSequenceEqual(
+          sess.run(grads, {
+              pred_outer: True,
+              pred_inner: False
+          }), [0., 0.])
+      self.assertSequenceEqual(
+          sess.run(grads, {
+              pred_outer: False,
+              pred_inner: True
+          }), [4., 2.])
+      self.assertSequenceEqual(
+          sess.run(grads, {
+              pred_outer: False,
+              pred_inner: False
+          }), [5., 0.])
 
   def testSecondDerivative(self):
     with self.cached_session() as sess:
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index c7e89dd5f9..7fae5249aa 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -23,7 +23,6 @@ from __future__ import print_function
 import collections
 import math
 import time
-import unittest
 
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -661,7 +660,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
       sess.run(r)
 
-  @test_util.disable_control_flow_v2("b/113346829 (gpu failure)")
   def testCondGrad_1(self):
     graph = ops.Graph()
     with graph.as_default():
@@ -3424,9 +3422,6 @@ class EagerTest(test.TestCase):
 
   # TODO(b/117279927): Re-enable once msan failure is fixed.
   def DISABLED_testCondInDefun(self):
-    if "GPU" in [d.device_type for d in device_lib.list_local_devices()]:
-      return unittest.skip("b/113346829 (gpu failure)")
-
     with context.eager_mode():
 
       @eager_function.defun
-- 
GitLab


From 0c37dcc02f54395d2bde3cc5850574c8f98f1b46 Mon Sep 17 00:00:00 2001
From: Blake Hechtman <blakehechtman@google.com>
Date: Fri, 5 Oct 2018 13:32:24 -0700
Subject: [PATCH 485/570] [XLA] Use the highest possible precision for large
 Iota inputs.

PiperOrigin-RevId: 215957327
---
 tensorflow/compiler/xla/tests/convolution_test.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 070b092d18..b851db14ec 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -91,7 +91,14 @@ class ForwardPassConvolution_3x3x256_256_OutputZ_Iota : public ConvolutionTest {
     XlaBuilder builder(TestName());
     auto lhs = ConstantR4FromArray4D<T>(&builder, *alhs);
     auto rhs = ConstantR4FromArray4D<T>(&builder, *arhs);
-    Conv(lhs, rhs, {1, 1}, Padding::kValid);
+    PrecisionConfig precision;
+    // The left hand side of the convolution is numbers between 0 and 2304 which
+    // requires at least 11 mantissa bits and the DEFAULT precision config is
+    // allowed to round to bfloat16 which only has 7 mantissa bits.
+    precision.add_operand_precision(PrecisionConfig::HIGHEST);
+    precision.add_operand_precision(PrecisionConfig::DEFAULT);
+    Conv(lhs, rhs, {1, 1}, Padding::kValid, /*feature_group_count=*/1,
+         &precision);
 
     ComputeAndCompare(&builder, {}, error_spec_);
   }
-- 
GitLab


From 4d69a79b1ebd0c2180959c1047fbc9db106701e1 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Fri, 5 Oct 2018 13:33:38 -0700
Subject: [PATCH 486/570] Handle Range & BatchMatMul in partial Flex mode

PiperOrigin-RevId: 215957535
---
 .../contrib/lite/toco/import_tensorflow.cc    | 37 ++++++++-
 tensorflow/contrib/lite/toco/model.h          |  9 +-
 tensorflow/contrib/lite/toco/tflite/export.cc | 83 +++++++++++--------
 .../contrib/lite/toco/tflite/export_test.cc   | 34 ++++++++
 .../contrib/lite/toco/tflite/operator.cc      | 32 ++++---
 .../contrib/lite/toco/tflite/operator.h       |  6 ++
 6 files changed, 155 insertions(+), 46 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 5eaf6e27fc..133ef79a34 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -477,6 +477,30 @@ string CreateConstArray(Model* model, string const& name,
   return array_name;
 }
 
+// Retain TensorFlow NodeDef in Toco Operator.
+//
+// If an op is supported by Toco but not supported by TFLite, TFLite exporter
+// will use the retained NodeDef to populate a Flex op when Flex mode is
+// enabled.
+//
+// This can't be easily applied to all operations, because a TensorFlow node
+// may become multiple Toco operators. Thus we need to call this function in
+// operator conversion functions one by one whenever feasible.
+//
+// This may cause problems if a graph transformation rule changes parameters
+// of the node. When calling this function, please check if any existing
+// graph transformation rule will change an existing operator with the same
+// type.
+//
+// This provides a route to handle Toco-supported & TFLite-unsupported ops
+// in Flex mode. However it's not a solid solution. Eventually we should
+// get rid of this.
+// TODO(b/117327937): Implement all Toco-supported ops in TFLite, and remove
+// this function.
+void RetainTensorFlowNodeDef(const NodeDef& node, Operator* op) {
+  node.SerializeToString(&op->tensorflow_node_def);
+}
+
 tensorflow::Status ConvertConstOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -990,6 +1014,10 @@ tensorflow::Status ConvertBatchMatMulOperator(
   auto* batch_matmul = new BatchMatMulOperator;
   batch_matmul->inputs = {node.input(0), node.input(1)};
   batch_matmul->outputs = {node.name()};
+
+  // For Flex mode. Please read the comments of the function.
+  RetainTensorFlowNodeDef(node, batch_matmul);
+
   model->operators.emplace_back(batch_matmul);
   return tensorflow::Status::OK();
 }
@@ -1081,7 +1109,10 @@ tensorflow::Status ConvertUnsupportedOperator(
 
   auto* op = new TensorFlowUnsupportedOperator;
   op->tensorflow_op = node.op();
-  node.SerializeToString(&op->tensorflow_node_def);
+
+  // For Flex mode. Please read the comments of the function.
+  RetainTensorFlowNodeDef(node, op);
+
   model->operators.emplace_back(op);
 
   // Parse inputs.
@@ -1605,6 +1636,10 @@ tensorflow::Status ConvertRangeOperator(
   op->inputs.push_back(node.input(1));
   op->inputs.push_back(node.input(2));
   op->outputs.push_back(node.name());
+
+  // For Flex mode. Please read the comments of the function.
+  RetainTensorFlowNodeDef(node, op);
+
   model->operators.emplace_back(op);
   return tensorflow::Status::OK();
 }
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 6e207fdf54..61f1f095e9 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -376,6 +376,13 @@ struct Operator {
   // looks unused.
   bool unresolved_outputs = false;
 
+  // A serialized tensorflow::NodeDef string.
+  // The field is filled only when importing from TensorFlow.
+  // It's guaranteed to be filled for `TensorFlowUnsupportedOperator`.
+  // It's not guaranteed to be filled for other ops. Ops created by graph
+  // transformations won't have TensorFlow NodeDef.
+  string tensorflow_node_def;
+
  protected:
   // Constructor used by subclasses for specific OperatorType's.
   explicit Operator(OperatorType t)
@@ -1535,8 +1542,6 @@ struct TensorFlowUnsupportedOperator : Operator {
 
   // The original TF operation type. Used for diagnostic purposes.
   string tensorflow_op;
-  // A serialized tensorflow::NodeDef string.
-  string tensorflow_node_def;
   // A boolean indicating if the unsupported op should be treated as quantized.
   bool quantized = false;
   // A boolean indicating if the unsupported op output should allow float values
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index f6f76e48a4..3b34cd6285 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -95,11 +95,13 @@ OperatorKey GetOperatorKey(
     const ::toco::Operator& op,
     const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
     bool allow_flex_ops) {
+  // Get the op name (by Toco definition).
   string name = HelpfulOperatorTypeName(op);
-  const auto& builtin_ops = GetBuiltinOpsMap();
 
   bool is_builtin = false;
   OperatorKey key;
+
+  const auto& builtin_ops = GetBuiltinOpsMap();
   if (ops_by_type.count(op.type) != 0) {
     key.version = ops_by_type.at(op.type)->GetVersion(op);
     name = ops_by_type.at(op.type)->name();
@@ -110,37 +112,46 @@ OperatorKey GetOperatorKey(
     // For TFLite supported builtin ops, find out its BuiltinOperator enum used
     // in FlatBuffer.
     key.type = builtin_ops.at(name);
-  } else {
-    key.type = BuiltinOperator_CUSTOM;
-
-    key.is_custom_op = true;
-    if (op.type == OperatorType::kUnsupported) {
-      const TensorFlowUnsupportedOperator& unsupported_op =
-          static_cast<const TensorFlowUnsupportedOperator&>(op);
-      const auto tensorflow_op = unsupported_op.tensorflow_op;
-
-      // TODO(b/113715895): When `allow_flex_ops` is on, for now there's no way
-      // to populate a regular custom op. We need to find a way to fix this.
-      if (allow_flex_ops) {
-        // Memorize the original TensorFlow op name.
-        key.flex_tensorflow_op = tensorflow_op;
-        // Prefix the custom code of the flex op.
-        key.custom_code =
-            string(::tflite::kFlexCustomCodePrefix) + tensorflow_op;
-        key.is_flex_op = true;
-
-        if (IsControlFlowOp(tensorflow_op)) {
-          key.is_unsupported_flex_op = true;
-        }
-      } else {
-        key.custom_code = tensorflow_op;
-      }
+    return key;
+  }
+
+  // The logic below is all for custom ops.
+  key.is_custom_op = true;
+  key.type = BuiltinOperator_CUSTOM;
+
+  if (op.type == OperatorType::kUnsupported) {
+    const TensorFlowUnsupportedOperator& unsupported_op =
+        static_cast<const TensorFlowUnsupportedOperator&>(op);
+    const auto tensorflow_op = unsupported_op.tensorflow_op;
+
+    // TODO(b/113715895): When `allow_flex_ops` is on, for now there's no way
+    // to populate a regular custom op. We need to find a way to fix this.
+    if (allow_flex_ops) {
+      key.is_flex_op = true;
+      key.flex_tensorflow_op = tensorflow_op;
+      key.custom_code =
+          string(::tflite::kFlexCustomCodePrefix) + key.flex_tensorflow_op;
     } else {
-      // For Toco-supported/TFLite-unsupported ops, currently we produce a
-      // custom op. This gives developers a chance to implement custom ops.
-      // TODO(b/116800229): Also produce Toco-supported/TFLite-unsupported ops
-      // as Flex ops when Flex mode is enabled.
-      key.custom_code = name;
+      key.custom_code = tensorflow_op;
+    }
+  } else if (allow_flex_ops && !op.tensorflow_node_def.empty()) {
+    // For Toco-supported/TFLite-unsupported ops, if the TensorFlow NodeDef
+    // is retained in the Toco Operator, we produce a Flex op if Flex mode
+    // is enabled.
+    key.is_flex_op = true;
+    key.flex_tensorflow_op = name;
+    key.custom_code =
+        string(::tflite::kFlexCustomCodePrefix) + key.flex_tensorflow_op;
+  } else {
+    // If Flex is disabled or the original TensorFlow NodeDef isn't available,
+    // we produce a custom op. This gives developers a chance to implemenr
+    // custom ops.
+    key.custom_code = name;
+  }
+
+  if (key.is_flex_op) {
+    if (IsControlFlowOp(key.flex_tensorflow_op)) {
+      key.is_unsupported_flex_op = true;
     }
   }
   return key;
@@ -323,8 +334,9 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
       outputs.push_back(tensors_map.at(output));
     }
 
-    int op_index = operators_map.at(
-        details::GetOperatorKey(*op, ops_by_type, params.allow_flex_ops));
+    const auto key =
+        details::GetOperatorKey(*op, ops_by_type, params.allow_flex_ops);
+    int op_index = operators_map.at(key);
 
     auto tflite_op_it = ops_by_type.find(op->type);
     BaseOperator* tflite_op = tflite_op_it == ops_by_type.end()
@@ -349,6 +361,11 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
           variable_tensor_indices->insert(variable_tensor_index);
         }
       }
+    } else if (key.is_flex_op && !op->tensorflow_node_def.empty()) {
+      auto fbb = WriteFlexOpOptions(op->tensorflow_node_def);
+      if (fbb) {
+        options = Options::Custom(builder->CreateVector(fbb->GetBuffer()));
+      }
     }
     // The only supported CustomOptionFormat is FLEXBUFFERS now.
     op_vector.push_back(CreateOperator(
diff --git a/tensorflow/contrib/lite/toco/tflite/export_test.cc b/tensorflow/contrib/lite/toco/tflite/export_test.cc
index d48ab78285..eda1aa78a3 100644
--- a/tensorflow/contrib/lite/toco/tflite/export_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/toco/tflite/builtin_operator.h"
 #include "tensorflow/contrib/lite/toco/tflite/operator.h"
 #include "tensorflow/contrib/lite/toco/tflite/types.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 
 namespace toco {
 namespace tflite {
@@ -382,6 +383,39 @@ TEST(OperatorKeyTest, TestFlexWithControlFlowOp) {
   EXPECT_TRUE(key.is_unsupported_flex_op);
 }
 
+TEST(OperatorKeyTest, TestFlexWithPartiallySupportedOps) {
+  // Test Toco-supported/TFLite-unsupported operators.
+  // TODO(ycling): The test will be broken if Range is implemented in TFLite.
+  // Find a more robust way to test the fallback logic.
+  auto op = absl::make_unique<RangeOperator>();
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
+
+  {
+    // If NodeDef isn't retained in the Toco op, a regular custom op
+    // will be exported.
+    const auto key = details::GetOperatorKey(*op, ops_by_type, true);
+    EXPECT_EQ(key.type, ::tflite::BuiltinOperator_CUSTOM);
+    EXPECT_EQ(key.custom_code, "Range");
+    EXPECT_EQ(key.version, 1);
+    EXPECT_FALSE(key.is_flex_op);
+  }
+
+  ::tensorflow::NodeDef node_def;
+  node_def.set_name("Range");
+  node_def.set_op("Range");
+  node_def.SerializeToString(&op->tensorflow_node_def);
+
+  {
+    // If NodeDef is retained in the Toco op, a Flex op will be exported.
+    const auto key = details::GetOperatorKey(*op, ops_by_type, true);
+    EXPECT_EQ(key.type, ::tflite::BuiltinOperator_CUSTOM);
+    EXPECT_EQ(key.custom_code, "FlexRange");
+    EXPECT_EQ(key.version, 1);
+    EXPECT_TRUE(key.is_flex_op);
+  }
+}
+
 // TODO(ahentz): tests for tensors, inputs, outputs, opcodes and operators.
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 9addbb81e7..ed37535fe0 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -1157,6 +1157,25 @@ class Unpack : public BuiltinOperator<UnpackOperator, ::tflite::UnpackOptions,
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
+    const string& tensorflow_node_def) {
+  auto fbb = absl::make_unique<flexbuffers::Builder>();
+
+  ::tensorflow::NodeDef node_def;
+  if (!node_def.ParseFromString(tensorflow_node_def)) {
+    LOG(ERROR) << "Failed to parse TensorFlow NodeDef";
+    return {};
+  }
+
+  fbb->Vector([&]() {
+    fbb->String(node_def.op());
+    fbb->String(tensorflow_node_def);
+  });
+  fbb->Finish();
+  LOG(INFO) << "Writing flex op: " << node_def.op();
+  return std::unique_ptr<flexbuffers::Builder>(fbb.release());
+}
+
 class TensorFlowUnsupported : public BaseOperator {
  public:
   TensorFlowUnsupported(const string& name, OperatorType type,
@@ -1192,6 +1211,9 @@ class TensorFlowUnsupported : public BaseOperator {
 
   std::unique_ptr<flexbuffers::Builder> WriteOptions(
       const TensorFlowUnsupportedOperator& op) const {
+    if (allow_flex_ops_) {
+      return WriteFlexOpOptions(op.tensorflow_node_def);
+    }
     auto fbb = absl::make_unique<flexbuffers::Builder>();
 
     ::tensorflow::NodeDef node_def;
@@ -1200,16 +1222,6 @@ class TensorFlowUnsupported : public BaseOperator {
       return std::unique_ptr<flexbuffers::Builder>();
     }
 
-    if (allow_flex_ops_) {
-      fbb->Vector([&]() {
-        fbb->String(node_def.op());
-        fbb->String(op.tensorflow_node_def);
-      });
-      fbb->Finish();
-      LOG(INFO) << "Writing flex op: " << node_def.op();
-      return std::unique_ptr<flexbuffers::Builder>(fbb.release());
-    }
-
     bool has_valid_attr = false;
     size_t map_start = fbb->StartMap();
     for (const auto& pair : node_def.attr()) {
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.h b/tensorflow/contrib/lite/toco/tflite/operator.h
index 13d9f6c49a..6e4e0a16d1 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/operator.h
@@ -16,6 +16,7 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
 
 #include "flatbuffers/flatbuffers.h"
+#include "flatbuffers/flexbuffers.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 
@@ -36,6 +37,11 @@ std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap(
 std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap(
     bool allow_flex_ops = false);
 
+// Write the custom option FlexBuffer with a serialized TensorFlow NodeDef
+// for a Flex op.
+std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
+    const string& tensorflow_node_def);
+
 // These are the flatbuffer types for custom and builtin options.
 using CustomOptions = flatbuffers::Vector<uint8_t>;
 using BuiltinOptions = void;
-- 
GitLab


From efcf11cd44dfe8ddc441aa58f1b21ff7c8444568 Mon Sep 17 00:00:00 2001
From: shengfuintel <sheng.fu@intel.com>
Date: Fri, 5 Oct 2018 13:47:52 -0700
Subject: [PATCH 487/570] Clean up the code under INTEL_MKL_ML_ONLY

---
 tensorflow/core/graph/mkl_layout_pass.cc      | 2177 +----------------
 tensorflow/core/graph/mkl_layout_pass_test.cc | 1865 --------------
 2 files changed, 1 insertion(+), 4041 deletions(-)

diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 7394b1cddf..42a35727db 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -45,2181 +45,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-#ifdef INTEL_MKL_ML_ONLY
-
-// This pass implements rewriting of graph to support following scenarios:
-// (A) Merging nodes in the graph
-// (B) Rewriting a node in the graph to a new node
-//     Rewrite happens under following 2 scenarios:
-//     1) Propagating Mkl layout as an additional output tensor
-//        (we will loosely call a tensor that carries Mkl layout as Mkl tensor
-//         henceforth.) from every Mkl supported NN layer.
-//     2) Context-based rewrite: This is needed in order to optimize
-//        gradient ops of Conv2D+AddBias. Gradient op of both the Conv2D and
-//        MatMul is BiasAddGrad, and we need to rewrite BiasAddGrad into
-//        Conv2D-specific BiasAddGrad, and MatMul-specific BiasAddGrad.
-//        This is context-specific optimization, where the context is the
-//        forward operator that the BiasAddGrad corresponds to.
-//
-// Example of A : Merging nodes in the graph
-// -----------------------------------------
-// Currently, we merge Conv2D+AddBias together. Consider Conv2D and BiasAdd as:
-//
-//           O = Conv2D(A, B)
-//           P = BiasAdd(O, C)
-//
-// We merge them into Conv2DWithBias as:
-//           P = _MklConv2DWithBias(A, A_m, B, B_m, C, C_m)
-//
-// The meaning of A_m, B_m and C_m is explained in B.1.
-//
-// Merge rules:
-//  - The merge for Conv2D and BiasAdd happens when the output of Conv2D _only_
-//    goes to BiasAdd.
-//  - Also, the intersection of attributes of both the nodes must have same
-//    values.
-//  - Both the nodes must have been assigned to same device (if any).
-//
-// Example of B.1 : Rewriting nodes to Mkl nodes
-// ---------------------------------------------
-// Consider a Relu node. Current definition of Relu node looks like:
-//
-//           O = Relu(A)
-//
-// Relu has 1 input (A), and 1 output (O).
-//
-// This rewrite pass will generate a new graph node for Relu (new node is
-// called MklRelu) as:
-//
-//          O, O_m = MklRelu(A, A_m)
-//
-// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m). Here input A is
-// same as input A of Relu; output O is same as output O of Relu. O_m is the
-// additional output tensor that will be set by MklRelu, and it represents
-// Mkl tensor corresponding to O -- in other words, O_m is some kind of
-// metadata for O. A_m is additional input of Relu, and it represents metadata
-// for A - as O_m is metadata for O, A_m is metadata for A. MklRelu receives
-// this metadata from previous node in the graph.
-//
-// When a previous node in the graph is an Mkl node, A_m will represent a valid
-// Mkl tensor. But when a previous node is not an Mkl node, A_m will represent
-// a dummy Mkl tensor.
-//
-// Rewriting rules:
-//  - Selection of a node for rewriting happens by registering the op type of
-//    the node with the rewriting pass. If the op type is not registered, then
-//    all nodes of this op type will not be rewritten.
-//  - Number of inputs after rewriting:
-//      Since for every input Tensorflow tensor, the rewritten node gets Mkl
-//      tensor(s), rewritten node gets 2*N inputs, where N is the number of
-//      inputs for the original node.
-//  - Number of outputs after rewriting:
-//      Since for every output Tensorflow tensor, the rewritten node generates
-//      Mkl tensor(s), the rewritten node generates 2*N outputs, where N is the
-//      number of outputs of the original node.
-//  - Ordering of Tensorflow tensors and Mkl tensors:
-//      Since every rewritten node generates twice the number of inputs and
-//      outputs, one could imagine various orderings among Tensorflow tensors
-//      and Mkl tensors. E.g., assume an op 'Conv2D' that takes (A, B) as
-//      inputs, then the new op '_MklConv2D' can take inputs A, B, A_m and B_m
-//      in A, A_m, B, B_m order or it can also take them in A, B, A_m, B_m
-//      order. Among N inputs one can get N! permutations.
-//
-//      So the question is: which order do we follow? We support 2 types of
-//      orderings: (1) interleaved, and (2) contiguous. Interleaved ordering
-//      follows an intuitive order where an Mkl tensor follows the
-//      corresponding Tensorflow tensor immediately. In the context of the
-//      above example, it will be: A, A_m, B, B_m. Note that the ordering rule
-//      applies to both the inputs and outputs. Contiguous ordering means
-//      all the Tensorflow tensors are contiguous followed by all the Mkl
-//      tensors. We use contiguous ordering as default.
-//
-// Graph rewrite algorithm:
-//      Algorithm: Graph Rewrite
-//      Input: Graph G, Names of the nodes to rewrite and their new names
-//      Output: Modified Graph G' if the nodes are modified, G otherwise.
-//      Start:
-//        N = Topological_Sort(G) // N is a set of nodes in toposort order.
-//        foreach node n in N
-//        do
-//          if (Is_MKL_Op(n))  // Can this node accept an Mkl layout as input.
-//          then
-//            E = set of <incoming edge and its src_output slot> of n
-//            E' = {}   // a new set of edges for rewritten node
-//            foreach <e,s> in E
-//            do
-//              E' U {<e,s>}  // First copy edge which generates Tensorflow
-//                            // tensor as it is
-//              m = Source node of edge e
-//              if Is_Rewritten(m)  // Did we rewrite this node in this pass?
-//              then
-//                E' U {<m,s+1>}    // If yes, then m will generate an Mkl
-//                                  // tensor as an additional output.
-//              else
-//                d = Generate_Dummy_Mkl_Tensor()  // If not, generate a dummy
-//                                                 // Mkl tensor.
-//                E' U {<d,0>}  // The dummy Mkl tensor has only 1 output slot.
-//              fi
-//            done
-//            n' = Build_New_Node(G,new_name,E')
-//            Mark_Rewritten(n')  // Mark the new node as being rewritten.
-//          fi
-//        done
-//
-//      Explanation:
-//        For graph rewrite, we visit nodes of the input graph in the
-//        topological sort order. With this ordering, we visit nodes in the
-//        top-to-bottom fashion. We need this order because while visiting a
-//        node we want that all of its input nodes are visited and rewritten if
-//        applicable. This is because if we need to rewrite a given node
-//        then all of its input nodes need to be fixed (in other words they
-//        cannot be deleted later.)
-//
-//        While visiting a node, we first check if the op type of the node is
-//        an Mkl op. If it is, then we rewrite that node after constructing
-//        new inputs to the node. If the op type of the node is not Mkl op,
-//        then we do not rewrite that node.
-//
-// Handling workspace propagation for certain ops:
-//
-//        Certain backward ops in MKL (MaxPool, LRN and BatchNorm) require
-//        passing of a workspace from their respective forward ops. Workspace
-//        tensors provide memory for storing results of intermediate operations
-//        which are helpful in backward propagation. TensorFlow does not have
-//        a notion of a workspace and as a result does not allow producing
-//        additional outputs from these forward ops. For these ops, we need
-//        to add 2 extra edges between forward ops and their corresponding
-//        backward ops - the first extra edge carries a workspace tensor and
-//        the second one carries an Mkl tensor for the workspace tensor.
-//
-//        Example:
-//
-//        Typical graph for MaxPool and its gradient looks like:
-//
-//        A = MaxPool(T)
-//        B = MaxPoolGrad(X, A, Y)
-//
-//        We will transform this graph to propagate the workspace as:
-//        (with the contiguous ordering)
-//
-//        A, W, A_m, W_m = MklMaxPool(T, T_m)
-//        B, B_m = MklMaxPoolGrad(X, A, Y, W, X_m, A_m, Y_m, W_m)
-//
-//        Here W is the workspace tensor. Transformed tensor names with the
-//        suffix _m are Mkl tensors, and this transformation has been done
-//        using the algorithm discussed earlier. The transformation for
-//        workspace propagation only adds extra outputs (W, W_m) for a forward
-//        op and connects them to the corresponding backward ops.
-//
-//        Terms:
-//
-//        Forward op name = name of the op in the forward pass
-//          where a workspace tensor originates (MaxPool in this example)
-//        Backward op name = name of the op in the backward pass that receives
-//          a workspace tensor from the forward op (MaxPoolGrad in the example)
-//        Slot = Position of the output or input slot that will be
-//               used by the workspace tensor (1 for MklMaxPool as W is the 2nd
-//               output of MaxPool (0 is 1st); 3 for MklMaxPoolGrad)
-//
-//        Question:
-//
-//        How do we associate a backward op to a forward op? There can be more
-//        than one op with the exact same name.
-//
-//        In this example, we associate MaxPoolGrad with MaxPool. But there
-//        could be more than one MaxPool ops. To solve this problem, we look
-//        for _direct_ edge between a forward op and a backward op (tensor A is
-//        flowing along this edge in the example).
-//
-//        How do we transform forward and backward ops when there is no direct
-//        edge between them? In such a case, we generate dummy tensors for
-//        workspace tensors. For the example, transformation of MaxPool will
-//        be exactly same as it would be when there is a direct edge between
-//        the forward and the backward op --- it is just that MaxPool won't
-//        generate any workspace tensor. For MaxPoolGrad, the transformation
-//        will also be same, but instead of connecting W and W_m with the
-//        outputs of MaxPool, we will produce dummy tensors for them, and we
-//        will set workspace_enabled attribute to false.
-//
-// Example of B.2 : Context-based node rewrite
-// -------------------------------------------
-// Consider BiasAddGrad op as:
-//
-//           O = _MklConv2D(A, B, C, A_m, B_m, C_m)
-//           P = BiasAddGrad(O)
-//
-// Then we rewrite it as:
-//
-//           P = Conv2DWithBiasBackpropBias(O, O_m)
-//
-// Rewrite of BiasAddGrad into Conv2DWithBiasBackpropBias takes place depending
-// on the matching 'context'. The term context is loosely related to which
-// forward op is _associated_ to BiasAddGrad. If it is _MklConv2DWithBias then
-// we consider it Conv2D context; if it is MatMul, then it is MatMul context.
-
-class MklLayoutRewritePass : public GraphOptimizationPass {
- public:
-  MklLayoutRewritePass() {
-    // NOTE: names are alphabetically sorted.
-    csinfo_.addn = "AddN";
-    csinfo_.avg_pool = "AvgPool";
-    csinfo_.avg_pool_grad = "AvgPoolGrad";
-    csinfo_.bias_add = "BiasAdd";
-    csinfo_.bias_add_grad = "BiasAddGrad";
-    csinfo_.concat = "Concat";
-    csinfo_.concatv2 = "ConcatV2";
-    csinfo_.conv2d = "Conv2D";
-    csinfo_.conv2d_grad_input = "Conv2DBackpropInput";
-    csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter";
-    csinfo_.fused_batch_norm = "FusedBatchNorm";
-    csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
-    csinfo_.identity = "Identity";
-    csinfo_.lrn = "LRN";
-    csinfo_.lrn_grad = "LRNGrad";
-    csinfo_.matmul = "MatMul";
-    csinfo_.max_pool = "MaxPool";
-    csinfo_.max_pool_grad = "MaxPoolGrad";
-    csinfo_.mkl_conv2d = "_MklConv2D";
-    csinfo_.mkl_conv2d_grad_input = "_MklConv2DBackpropInput";
-    csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
-    csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
-    csinfo_.mkl_conv2d_with_bias_backprop_bias =
-        "_MklConv2DWithBiasBackpropBias";
-    csinfo_.relu = "Relu";
-    csinfo_.relu_grad = "ReluGrad";
-    csinfo_.reshape = "Reshape";
-    csinfo_.split = "Split";
-    // Element-wise ops. Ensure you also add any new ops to IsOpElementWise
-    // in the MklUtil.h (IsMklElementWiseOp method) to ensure that the
-    // MklInputConversion op is added before it.
-    csinfo_.add = "Add";
-    csinfo_.maximum = "Maximum";
-    csinfo_.mul = "Mul";
-    csinfo_.squared_difference = "SquaredDifference";
-    csinfo_.sub = "Sub";
-    // End - element-wise ops. See note above.
-
-    // NOTE: names are alphabetically sorted.
-    rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
-                      CopyAttrsAddN, AddNRewrite, nullptr});
-    rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.avg_pool,
-                      mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
-                      CopyAttrsPooling, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.avg_pool_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.avg_pool_grad),
-                      CopyAttrsPooling, AlwaysRewrite, nullptr});
-    // BiasAddGrad gets written into Conv2DWithBiasBackpropBias depending
-    // on if context contains Conv2D.
-    rinfo_.push_back({csinfo_.bias_add_grad,
-                      csinfo_.mkl_conv2d_with_bias_backprop_bias,
-                      CopyAttrsBiasAddGrad, ContextMatchRewrite,
-                      &biasaddgrad_conv2dwithbias_context_});
-    // BiasAddGrad gets written into BiasAddGrad depending on if context
-    // contains MatMul.
-    rinfo_.push_back({csinfo_.bias_add_grad, csinfo_.matmul,
-                      CopyAttrsBiasAddGrad, ContextMatchRewrite,
-                      &biasaddgrad_matmul_context_});
-    rinfo_.push_back({csinfo_.concat,
-                      mkl_op_registry::GetMklOpName(csinfo_.concat),
-                      CopyAttrsConcat, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.concatv2,
-                      mkl_op_registry::GetMklOpName(csinfo_.concatv2),
-                      CopyAttrsConcatV2, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.conv2d,
-                      mkl_op_registry::GetMklOpName(csinfo_.conv2d),
-                      CopyAttrsConv2D, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.conv2d_grad_filter,
-                      mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
-                      CopyAttrsConv2D, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.conv2d_grad_input,
-                      mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input),
-                      CopyAttrsConv2D, AlwaysRewrite, nullptr});
-
-    rinfo_.push_back({csinfo_.fused_batch_norm,
-                      mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
-                      CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
-    rinfo_.push_back(
-        {csinfo_.fused_batch_norm_grad,
-         mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
-         CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.identity,
-                      mkl_op_registry::GetMklOpName(csinfo_.identity),
-                      CopyAttrsIdentity, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn),
-                      CopyAttrsLRN, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.lrn_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
-                      CopyAttrsLRN, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.max_pool,
-                      mkl_op_registry::GetMklOpName(csinfo_.max_pool),
-                      CopyAttrsPooling, NonDepthBatchWisePoolRewrite, nullptr});
-    rinfo_.push_back({csinfo_.max_pool_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad),
-                      CopyAttrsPooling, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.maximum,
-                      mkl_op_registry::GetMklOpName(csinfo_.maximum),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.relu_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.reshape,
-                      mkl_op_registry::GetMklOpName(csinfo_.reshape),
-                      CopyAttrsReshape, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.squared_difference,
-                      mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.sub, mkl_op_registry::GetMklOpName(csinfo_.sub),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-
-    // Add info about which ops to add workspace edge to and the slots.
-    wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
-    wsinfo_.push_back({csinfo_.max_pool, csinfo_.max_pool_grad, 0, 1, 1, 3});
-
-    // Add a rule for merging nodes
-    minfo_.push_back({csinfo_.mkl_conv2d, csinfo_.bias_add, 0,
-                      csinfo_.mkl_conv2d_with_bias});
-
-    biasaddgrad_matmul_context_ = {csinfo_.bias_add_grad, csinfo_.matmul,
-                                   IsBiasAddGradInMatMulContext};
-
-    biasaddgrad_conv2dwithbias_context_ = {
-        csinfo_.bias_add_grad, csinfo_.mkl_conv2d_with_bias,
-        IsBiasAddGradInConv2DWithBiasContext};
-
-    cinfo_.push_back(&biasaddgrad_matmul_context_);
-    cinfo_.push_back(&biasaddgrad_conv2dwithbias_context_);
-  }
-
-  // Standard interface to run pass
-  Status Run(const GraphOptimizationPassOptions& options);
-
-  // Helper function which does most of heavy lifting for rewriting
-  // Mkl nodes to propagate Mkl tensor as additional output
-  //
-  // Extracts common functionality between Run public interface and
-  // test interface.
-  //
-  // @return true, if and only if graph is mutated; false otherwise.
-  bool RunPass(std::unique_ptr<Graph>* g);
-
-  /// Structure to specify the context information used in a node rewrite rule
-  typedef struct {
-    string node;  // Name of the node to be rewritten
-    string fwd;   // Name of the node in the forward pass that this node
-                  // corresponds to
-    std::function<bool(const Node*, const Node**, void* c)> context_match_fn;
-  } ContextInfo;
-
-  /// Structure to specify the name of an original node, its new name after
-  /// rewrite, the number of inputs to the original node, the function to
-  /// be used to copy attributes for the op, and the rule (if any) which
-  /// must hold for rewriting the node
-  typedef struct {
-    string name;      // Original name of op of the node in the graph
-    string new_name;  // New name of the op of the node in the graph
-    // A function handler to copy attributes from an old node to a new node.
-    std::function<void(const Node*, NodeBuilder*)> copy_attrs;
-    // A rule under which to rewrite this node
-    std::function<bool(const Node*, const ContextInfo* c)> rewrite_rule;
-    // ContextInfo, if any, to be used for rewrite
-    ContextInfo* context;
-  } RewriteInfo;
-
-  /// Structure to specify a forward op, a backward op, and the slot numbers
-  /// in the forward and backward ops where we will add a workspace edge.
-  typedef struct {
-    string fwd_op;    // Name of a forward op in the graph
-    string bwd_op;    // Name of a backward op in the graph
-    int fwd_slot;     // Output slot in the forward op node where actual
-                      // output tensor resides
-    int bwd_slot;     // Input slot in the backward op node where actual
-                      // input tensor resides
-    int ws_fwd_slot;  // Output slot in the forward op node where workspace
-                      // edge is added
-    int ws_bwd_slot;  // Input slot in the backward op node where workspace
-                      // edge is added
-  } WorkSpaceInfo;
-
-  /// Structure to specify information used in node merge
-  typedef struct {
-    string pred;      // Predecessor node string
-    string succ;      // Successor node string
-    int op;           // The operand no the predecessor node corresponds
-                      // to the successor node
-    string new_node;  // Name of the node after merge
-  } MergeInfo;
-
-  /// Structure to store all constant strings
-  /// NOTE: names are alphabetically sorted.
-  typedef struct {
-    string addn;
-    string add;
-    string avg_pool;
-    string avg_pool_grad;
-    string bias_add;
-    string bias_add_grad;
-    string concat;
-    string concatv2;
-    string conv2d;
-    string conv2d_grad_input;
-    string conv2d_grad_filter;
-    string fused_batch_norm;
-    string fused_batch_norm_grad;
-    string identity;
-    string lrn;
-    string lrn_grad;
-    string matmul;
-    string max_pool;
-    string max_pool_grad;
-    string maximum;
-    string mkl_conv2d;
-    string mkl_conv2d_grad_input;
-    string mkl_conv2d_grad_filter;
-    string mkl_conv2d_with_bias;
-    string mkl_conv2d_with_bias_backprop_bias;
-    string mul;
-    string relu;
-    string relu_grad;
-    string reshape;
-    string split;
-    string squared_difference;
-    string sub;
-  } ConstStringsInfo;
-
- private:
-  /// Maintain info about nodes to rewrite
-  std::vector<RewriteInfo> rinfo_;
-
-  /// Maintain info about nodes to add workspace edge
-  std::vector<WorkSpaceInfo> wsinfo_;
-
-  /// Maintain info about nodes to be merged
-  std::vector<MergeInfo> minfo_;
-
-  /// Maintain info about nodes to rewrite
-  static std::vector<ContextInfo*> cinfo_;
-
-  /// Maintain structure of constant strings
-  static ConstStringsInfo csinfo_;
-
-  /// Context variables used in referencing rules
-  static ContextInfo biasaddgrad_matmul_context_;
-  static ContextInfo biasaddgrad_conv2dwithbias_context_;
-
- private:
-  // Is OpDef::ArgDef a list type? It could be N * T or list(type).
-  // Refer to opdef.proto for details of list type.
-  inline bool ArgIsList(const OpDef::ArgDef& arg) const {
-    return !arg.type_list_attr().empty() || !arg.number_attr().empty();
-  }
-
-  // Get length of a list in 'n' if 'arg' is of list type. Refer to
-  // description of ArgIsList for definition of list type.
-  inline int GetTensorListLength(const OpDef::ArgDef& arg, Node* n) {
-    CHECK_EQ(ArgIsList(arg), true);
-    int N = 0;
-    const string attr_name = !arg.type_list_attr().empty()
-                                 ? arg.type_list_attr()
-                                 : arg.number_attr();
-    if (!arg.type_list_attr().empty()) {
-      std::vector<DataType> value;
-      TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &value));
-      N = value.size();
-    } else {
-      TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &N));
-    }
-    return N;
-  }
-
-  // Can op represented by node 'n' run on DEVICE_CPU?
-  // Op can run on CPU with MKL if the runtime assigned device or the
-  // user requested device contains device CPU, or both are empty.
-  bool CanOpRunOnCPUDevice(const Node* n) {
-    bool result = true;
-    string reason;
-
-    // Substring that should be checked for in device name for CPU device.
-    const char* const kCPUDeviceSubStr = "CPU";
-
-    // If Op has been specifically assigned to a non-CPU device, then No.
-    if (!n->assigned_device_name().empty() &&
-       !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) {
-      result = false;
-      reason = "Op has been assigned a runtime device that is not CPU.";
-    }
-
-    // If user has specifically assigned this op to a non-CPU device, then No.
-    if (!n->def().device().empty() &&
-       !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) {
-      result = false;
-      reason = "User has assigned a device that is not CPU.";
-    }
-
-    if (result == false) {
-      VLOG(1) << "MklLayoutRewritePass: Skipping rewriting of the node "
-              << n->type_string() << ", reason: " << reason;
-    }
-
-    // Otherwise Yes.
-    return result;
-  }
-
-  // Return a node that can be merged with input node 'n'
-  //
-  // @return pointer to the node if we can find such a
-  // node. Otherwise, it returns nullptr.
-  Node* CheckForNodeMerge(const Node* n) const;
-
-  // Merge predecessor node with its successor.
-  // Currently, we merge Conv2D with BiasAdd only.
-  //
-  // Input nodes succ and pred may be deleted if the call to
-  // this function is successful. Attempt to use the pointers
-  // after the call to function may result in undefined behaviors.
-  //
-  // @input g - input graph, succ - successor node, pred - predecessor node
-  // @return Status::OK(), if merging is successful and supported.
-  //         Returns appropriate Status error code otherwise.
-  //         Graph is updated in case nodes are merged. Otherwise, it is
-  //         not updated.
-  Status MergeNode(std::unique_ptr<Graph>* g, Node* succ, Node* pred);
-
-  // Check if the node 'n' has any applicable rewrite rule
-  // We check for 2 scenarios for rewrite.
-  //
-  // @return RewriteInfo* for the applicable rewrite rule
-  const RewriteInfo* CheckForNodeRewrite(const Node* n) const;
-
-  // Default rewrite rule to be used in scenario 1 for rewrite.
-  // @return - true (since we want to always rewrite)
-  static bool AlwaysRewrite(const Node* n, const ContextInfo* c = nullptr) {
-    return true;
-  }
-
-  // Check if we are performing pooling on depth or batch. If it is, then we
-  // do not rewrite MaxPool node to Mkl version.
-  // @return - true (if it is not a depth/batch wise pooling case);
-  //           false otherwise.
-  static bool NonDepthBatchWisePoolRewrite(const Node* n,
-                                           const ContextInfo* c) {
-    CHECK_NOTNULL(n);
-
-    string data_format_str;
-    TensorFormat data_format;
-    std::vector<int32> ksize, strides;
-    CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
-    CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
-    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true);
-    CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
-
-    // Condition that specifies non-batch-wise and non-depth-wise pooling.
-    if (GetTensorDim(ksize, data_format, 'N') == 1 &&
-        GetTensorDim(strides, data_format, 'N') == 1 &&
-        GetTensorDim(ksize, data_format, 'C') == 1 &&
-        GetTensorDim(strides, data_format, 'C') == 1) {
-      return true;
-    }
-
-    return false;
-  }
-
-  static bool AddNRewrite(const Node* n, const ContextInfo* c) {
-    CHECK_NOTNULL(n);
-
-    int num;
-    CHECK_EQ(GetNodeAttr(n->def(), "N", &num).ok(), true);
-
-    // Condition that specifies non-batch-wise and non-depth-wise pooling.
-    if (num == 2) {
-      return true;
-    }
-
-    return false;
-  }
-  // Is BiasAddGrad node in 'n' is associated with Conv2DWithBias node
-  // specified in contextinfo 'ci'. Function updates fwd_node to point
-  // to Conv2DWithBias node if 'n' is associated with Conv2DWithBias.
-  //
-  // Association checks for one of the following graphs:
-  //
-  // Graph A:
-  //
-  // _ = Conv2DWithBias(F, I, _)
-  // ..
-  // _ = Conv2DBackpropFilter(F, _, G)
-  // _ = Conv2DBackpropInput(_, I, G)
-  // _ = BiasAddGrad(G)
-  //
-  // OR
-  //
-  // Graph B:
-  //
-  // _ = Conv2DWithBias(F, _, _)
-  // ..
-  // _ = Conv2DBackpropFilter(F, _, G)
-  // _ = BiasAddGrad(G)
-  //
-  // Here F, G, and I are graph nodes; _ represents graph nodes that we
-  // don't care here.
-  //
-  // @return - true (if BiasAddGrad is associated with Conv2DWithBias);
-  //           false otherwise.
-  static bool IsBiasAddGradInConv2DWithBiasContext(const Node* n,
-                                                   const Node** fwd_node,
-                                                   void* ci) {
-    CHECK_NOTNULL(n);
-    CHECK_NOTNULL(fwd_node);
-    CHECK_NOTNULL(ci);
-    *fwd_node = nullptr;
-
-    CHECK_EQ(n->type_string(), csinfo_.bias_add_grad);
-
-    // Get the only 1 input of BiasAddGrad.
-    CHECK_EQ(n->num_inputs(), 1);
-    const Node* bias_add_grad_inp = nullptr;
-    TF_CHECK_OK(n->input_node(0, &bias_add_grad_inp));
-    CHECK_NOTNULL(bias_add_grad_inp);
-
-    // Check if this input also goes to BackpropFilter and BackpropInput
-    // as 3rd input.
-    bool found_backprop_input = false;
-    bool found_backprop_filter = false;
-    Node* backprop_filter_node = nullptr;
-    Node* backprop_input_node = nullptr;
-
-    for (const Edge* e : bias_add_grad_inp->out_edges()) {
-      Node* third_input = nullptr;
-      if (e->dst()->type_string() == csinfo_.conv2d_grad_input ||
-          e->dst()->type_string() == csinfo_.mkl_conv2d_grad_input) {
-        // Third input (index 2) of BackpropInput
-        TF_CHECK_OK(e->dst()->input_node(2, &third_input));
-        // Third input (index 2) of BackpropInput must be same as the input
-        // of BiasAddGrad.
-        if (third_input == bias_add_grad_inp) {
-          found_backprop_input = true;
-          backprop_input_node = e->dst();
-        }
-      }
-
-      if (e->dst()->type_string() == csinfo_.conv2d_grad_filter ||
-          e->dst()->type_string() == csinfo_.mkl_conv2d_grad_filter) {
-        // Third input (index 2) of BackpropFilter
-        TF_CHECK_OK(e->dst()->input_node(2, &third_input));
-        // Third input (index 2) of BackpropFilter must be same as the input
-        // of BiasAddGrad.
-        if (third_input == bias_add_grad_inp) {
-          found_backprop_filter = true;
-          backprop_filter_node = e->dst();
-        }
-      }
-
-      // If we found both the nodes, then we can stop the search.
-      if (found_backprop_input && found_backprop_filter) {
-        break;
-      }
-    }
-
-    // If BackpropFilter node is not found, then this is not
-    // Conv2DWithBias context. For 2nd graph in the example above, only
-    // BackpropFilter would be present.
-    if (!found_backprop_filter) {
-      return false;
-    }
-
-    // Otherwise, we found the nodes.
-    CHECK_NOTNULL(backprop_filter_node);
-    if (found_backprop_input) {
-      CHECK_NOTNULL(backprop_input_node);
-    }
-
-    // Now that we confirmed that this is Conv2DWithBias context, we need to
-    // get access to the forward node (Conv2DWithBias). 2nd input of
-    // Conv2DWithBias is same as the 2nd input of Conv2DBackpropInput; 1st
-    // input of Conv2DWithBias is same as the 1st input of Conv2DBackpropFilter
-    // (This comes from definition of gradient computation for Conv2D).
-    if (found_backprop_input) {
-      // Graph A in the example.
-      Node* second_inp_of_input = nullptr;
-      Node* first_inp_of_filter = nullptr;
-      TF_CHECK_OK(backprop_input_node->input_node(1, &second_inp_of_input));
-      TF_CHECK_OK(backprop_filter_node->input_node(0, &first_inp_of_filter));
-      CHECK_NOTNULL(second_inp_of_input);
-      CHECK_NOTNULL(first_inp_of_filter);
-
-      // Now we need to find out Conv2DWithBias node from these input nodes.
-      // Conv2DWithBias node is the node that accepts both the nodes
-      // second_inp_of_input and first_inp_of_filter in 2nd and 1st input slots.
-      for (const Edge* fe : first_inp_of_filter->out_edges()) {
-        if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
-            fe->dst_input() == 0) {
-          for (const Edge* ie : second_inp_of_input->out_edges()) {
-            if (ie->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
-                ie->dst_input() == 1 && fe->dst() == ie->dst()) {
-              VLOG(1) << "MklLayoutRewritePass: found "
-                      << fe->dst()->DebugString()
-                      << " as the forward node for matching context, backward"
-                      << " node is: " << n->DebugString();
-              *fwd_node = fe->dst();
-              return true;
-            }
-          }
-        }
-      }
-    } else {
-      // We did not find BackpropInput, so we work with BackpropFilter only.
-      // Graph B in the example.
-      Node* first_inp_of_filter = nullptr;
-      TF_CHECK_OK(backprop_filter_node->input_node(0, &first_inp_of_filter));
-      CHECK_NOTNULL(first_inp_of_filter);
-
-      // Now we need to find out Conv2DWithBias node from first input of
-      // BackpropFIlter. Conv2DWithBias node is the node that accepts
-      // first_inp_of_filter in 1st input slot.
-      for (const Edge* fe : first_inp_of_filter->out_edges()) {
-        if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
-            fe->dst_input() == 0) {
-          VLOG(1) << "MklLayoutRewritePass: found " << fe->dst()->DebugString()
-                  << " as the forward node for matching context, backward"
-                  << " node is: " << n->DebugString();
-          *fwd_node = fe->dst();
-          return true;
-        }
-      }
-    }
-
-    return false;
-  }
-
-  // Is BiasAddGrad node in 'n' is associated with MatMul node
-  // specified in contextinfo 'ci'. Function does not update fwd_node.
-  //
-  // @return - true (if BiasAddGrad is associated with MatMul);
-  //           false otherwise.
-  static bool IsBiasAddGradInMatMulContext(const Node* n, const Node** fwd_node,
-                                           void* ci) {
-    return (!IsBiasAddGradInConv2DWithBiasContext(n, fwd_node, ci));
-  }
-
-  // Rewrite rule that uses context-information for matching,
-  // used in scenario 2.
-  //
-  // @input - Node 'n' for which to search for matching context
-  // @input - The context 'c' under which to rewrite
-  // @return - true if we can rewrite node under context 'c';
-  //           false otherwise.
-  static bool ContextMatchRewrite(const Node* n, const ContextInfo* c);
-
-  // Helper function that searches the matching contextinfo for the node.
-  //
-  // @input n - Node (gradient op) whose contextinfo is to be searched,
-  //        fwd_node - pointer to node from the forward pass that this node
-  //        belongs to. fwd_node cannot be NULL.
-  // @return Matching contextinfo in case a match is found; null otherwise.
-  //         Also updates *fwd_node with pointer to forward node that this
-  //         context matches.
-  static const ContextInfo* SearchMatchingContext(const Node* n,
-                                                  const Node** fwd_node);
-
-  // Rewrites input node to a new node specified by its matching rewrite info.
-  //
-  // Method first searches matching rewrite info for input node and then
-  // uses that info to rewrite.
-  //
-  // Input node may be deleted in case of rewrite. Attempt to use the node
-  // after the call can result in undefined behaviors.
-  //
-  // @input  g - input graph, n - Node to be rewritten,
-  //         ri - matching rewriteinfo
-  // @return Status::OK(), if the input node is rewritten;
-  //         Returns appropriate Status error code otherwise.
-  //         Graph is updated in case the input node is rewritten.
-  //         Otherwise, it is not updated.
-  Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const RewriteInfo* ri);
-
-  // Get nodes that will feed a list of TF tensors to the new
-  // node that we are constructing.
-  //
-  // @input g - input graph,
-  // @input inputs - inputs to old node that we are using for constructing
-  //                 new inputs,
-  // @input input_idx - the index in the 'inputs' vector pointing to the
-  //                    current input that we have processed so far
-  // @output input_idx - index will be incremented by the number of nodes
-  //                     from 'inputs' that are processed
-  // @input list_length - The expected length of list of TF tensors
-  // @output output_nodes - the list of new nodes creating TF tensors
-  //
-  // @return None
-  void GetNodesProducingTFTensorList(
-      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-      int* input_idx, int list_length,
-      std::vector<NodeBuilder::NodeOut>* output_nodes);
-
-  // Get nodes that will feed a list of Mkl tensors to the new
-  // node that we are constructing.
-  //
-  // @input g - input graph,
-  // @input orig_node - Original node that we are rewriting
-  // @input inputs - inputs to old node that we are using for constructing
-  //                 new inputs,
-  // @input input_idx - the index in the 'inputs' vector pointing to the
-  //                    current input that we have processed so far
-  // @output input_idx - index will be incremented by the number of nodes
-  //                     from 'inputs' that are processed
-  // @input list_length - The expected length of list of Mkl tensors
-  // @output output_nodes - the list of new nodes creating Mkl tensors
-  //
-  // @return None
-  void GetNodesProducingMklTensorList(
-      std::unique_ptr<Graph>* g, Node* orig_node,
-      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-      int* input_idx, int list_length,
-      std::vector<NodeBuilder::NodeOut>* output_nodes);
-
-  // Get a node that will feed an Mkl tensor to the new
-  // node that we are constructing. The output node could be (1) 'n'
-  // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
-  // if 'n' is not an Mkl layer.
-  //
-  // @input g - input graph,
-  // @input orig_node - Original node that we are rewriting,
-  // @input n - Node based on which we are creating Mkl node,
-  // @input n_output_slot - the output slot of node 'n'
-  //            which is feeding to the node that we are constructing
-  // @output mkl_node - the new node that will feed Mkl tensor
-  // @output mkl_node_output_slot - the slot number of mkl_node that
-  //                                will feed the tensor
-  // @return None
-  void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
-                                 Node* n, int n_output_slot, Node** mkl_node,
-                                 int* mkl_node_output_slot);
-
-  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
-  // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
-  // set up in contiguous fashion. 'workspace_tensors' carry graph nodes
-  // producing workspace edges if 'are_workspace_tensors_available' is true.
-  // Otherwise, 'workspace_tensors' is empty vector.
-  //
-  // For details, refer to 'Ordering of inputs after rewriting' section in the
-  // documentation above.
-  //
-  // Returns Status::OK() if setting up inputs is successful, otherwise
-  // returns appropriate status code.
-  int SetUpContiguousInputs(
-      std::unique_ptr<Graph>* g,
-      const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
-      NodeBuilder* nb, Node* old_node,
-      std::vector<NodeBuilder::NodeOut>* workspace_tensors,
-      bool are_workspace_tensors_available);
-
-  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
-  // in graph 'g'. Original node is input in 'orig_node'.
-  //
-  // For details, refer to 'Ordering of Tensorflow tensors and Mkl tensors'
-  // section in the documentation above.
-  //
-  // Returns Status::OK() if setting up inputs is successful, otherwise
-  // returns appropriate status code.
-  Status SetUpInputs(std::unique_ptr<Graph>* g,
-                     const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-                     NodeBuilder* nb, Node* orig_node);
-
-  // Add workspace edge on the input or output side of Node 'orig_node' by using
-  // NodeBuilder 'nb' for the new node provided. If 'orig_node' does not dictate
-  // adding workspace edge then do not add it. Workspace Tensorflow and Mkl
-  // tensors, if they need to be added, will be set into these tensors.
-  // If we set workspace tensors, then are_ws_tensors_added should be true.
-  void AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g, Node* orig_node,
-                                NodeBuilder* nb,
-                                std::vector<NodeBuilder::NodeOut>* ws_tensors,
-                                bool* are_ws_tensors_added);
-
-  // Functions specific to operators to copy attributes
-  // We need operator-specific function to copy attributes because the framework
-  // does not provide any generic function for it.
-  // NOTE: names are alphabetically sorted.
-  static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConv2D(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsIdentity(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb);
-
-  // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
-  // using node for original node 'orig_node' and return it in '*out'.
-  // TODO(nhasabni) We should move this to mkl_util.h
-  void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out,
-                             Node* orig_node);
-  void GetDummyWorkspaceTensorNode(std::unique_ptr<Graph>* g, Node** out,
-                                   Node* orig_node);
-};
-
-MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_;
-MklLayoutRewritePass::ContextInfo
-    MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
-MklLayoutRewritePass::ContextInfo
-    MklLayoutRewritePass::biasaddgrad_matmul_context_;
-std::vector<MklLayoutRewritePass::ContextInfo*> MklLayoutRewritePass::cinfo_;
-
-// We register Mkl rewrite pass for phase 1 in post partitioning group.
-// We register it here so that we get a complete picture of all users of Mkl
-// nodes. Do not change the ordering of the Mkl passes.
-const OptimizationPassRegistry::Grouping kMklLayoutRewritePassGroup =
-    OptimizationPassRegistry::POST_PARTITIONING;
-#ifdef ENABLE_MKL
-REGISTER_OPTIMIZATION(kMklLayoutRewritePassGroup, 1, MklLayoutRewritePass);
-#endif  // ENABLE_MKL
-
-//////////////////////////////////////////////////////////////////////////
-//           Helper functions for creating new node
-//////////////////////////////////////////////////////////////////////////
-
-static void FillInputs(const Node* n,
-                       gtl::InlinedVector<Node*, 4>* control_edges,
-                       gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
-  control_edges->clear();
-  for (const Edge* e : n->in_edges()) {
-    if (e->IsControlEdge()) {
-      control_edges->push_back(e->src());
-    } else {
-      (*in)[e->dst_input()] = std::make_pair(e->src(), e->src_output());
-    }
-  }
-  std::sort(control_edges->begin(), control_edges->end());
-  if (n->op_def().is_commutative()) {
-    // For commutative inputs, we sort the input by the input Node*
-    // to get a canonical ordering (so that add(a,b) and add(b, a) will
-    // hash to the same value if is_commutative is true for 'add').
-    std::sort(in->begin(), in->end());
-  }
-}
-
-void MklLayoutRewritePass::GetNodesProducingTFTensorList(
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
-    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
-  CHECK_LT(*input_idx, inputs.size());
-  CHECK_GT(list_length, 0);
-  CHECK_NOTNULL(output_nodes);
-  output_nodes->reserve(list_length);
-
-  while (list_length != 0) {
-    CHECK_GT(list_length, 0);
-    CHECK_LT(*input_idx, inputs.size());
-    Node* n = inputs[*input_idx].first;
-    int slot = inputs[*input_idx].second;
-    // If input node 'n' is just producing a single tensor at
-    // output slot 'slot' then we just add that single node.
-    output_nodes->push_back(NodeBuilder::NodeOut(n, slot));
-    (*input_idx)++;
-    list_length--;
-  }
-}
-
-// TODO(nhasabni) We should move this to mkl_util.h.
-void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
-                                                 Node** out, Node* orig_node) {
-  // We use a tensor of shape {8} and value 0,0,0,0,0,0,0,0 to represent
-  // dummy Mkl tensor. 8 = 2*size_t.
-  const DataType dt = DataTypeToEnum<uint8>::v();
-  TensorProto proto;
-  proto.set_dtype(dt);
-  uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
-  proto.set_tensor_content(string(reinterpret_cast<const char*>(zero), 8));
-  TensorShape dummy_shape({8});
-  dummy_shape.AsProto(proto.mutable_tensor_shape());
-  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                  .Attr("value", proto)
-                  .Attr("dtype", dt)
-                  .Device(orig_node->def().device())  // We place this node on
-                                                      // the same device as the
-                                                      // device of the original
-                                                      // node.
-                  .Finalize(&**g, out));
-  CHECK_NOTNULL(*out); // Make sure we got a valid object before using it
-
-  // If number of inputs to the original node is > 0, then we add
-  // control dependency between 1st input (index 0) of the original node and
-  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
-  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
-  // rewritten node. Adding control edge between 1st input of the original node
-  // and the dummy Mkl node ensures that the dummy node is in the same frame
-  // as the original node. Choosing 1st input is not necessary - any input of
-  // the original node is fine because all the inputs of a node are always in
-  // the same frame.
-  if (orig_node->num_inputs() > 0) {
-    Node* orig_input0 = nullptr;
-    TF_CHECK_OK(
-        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
-    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
-  }
-
-  (*out)->set_assigned_device_name(orig_node->assigned_device_name());
-}
-
-void MklLayoutRewritePass::GetNodesProducingMklTensorList(
-    std::unique_ptr<Graph>* g, Node* orig_node,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
-    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
-  CHECK_LT(*input_idx, inputs.size());
-  CHECK_GT(list_length, 0);
-  CHECK_NOTNULL(output_nodes);
-  output_nodes->reserve(list_length);
-
-  while (list_length != 0) {
-    CHECK_GT(list_length, 0);
-    CHECK_LT(*input_idx, inputs.size());
-    Node* n = inputs[*input_idx].first;
-    int slot = inputs[*input_idx].second;
-    // If 'n' is producing a single tensor, then create a single Mkl tensor
-    // node.
-    Node* mkl_node = nullptr;
-    int mkl_node_output_slot = 0;
-    GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
-                              &mkl_node_output_slot);
-    output_nodes->push_back(
-        NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot));
-    (*input_idx)++;
-    list_length--;
-  }
-}
-
-// Get an input node that will feed Mkl tensor to the new
-// node that we are constructing. An input node could be (1) 'n'
-// if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
-// if 'n' is not an Mkl layer.
-void MklLayoutRewritePass::GetNodeProducingMklTensor(
-    std::unique_ptr<Graph>* g, Node* orig_node, Node* n, int n_output_slot,
-    Node** mkl_node, int* mkl_node_output_slot) {
-  CHECK_NOTNULL(n);
-  CHECK_NOTNULL(mkl_node);
-  CHECK_NOTNULL(mkl_node_output_slot);
-
-  // If this is an MKL op, then it will create extra output for MKL layout.
-  DataType T;
-  if (GetNodeAttr(n->def(), "T", &T).ok() &&
-      mkl_op_registry::IsMklOp(n->type_string(), T)) {
-    // If this is an MKL op, then it will generate an edge that will receive
-    // Mkl tensor from a node.
-    // output slot number for Mkl tensor would be N+slot number of TensorFlow
-    // tensor, where N is total number of TensorFlow tensors.
-    *mkl_node = n;
-    *mkl_node_output_slot =
-        GetTensorMetaDataIndex(n_output_slot, n->num_outputs());
-  } else {
-    // If we have not visited the node and rewritten it, then we need
-    // to create a dummy node that will feed a dummy Mkl tensor to this node.
-    // DummyMklTensor node has no input and generates only 1 output
-    // (dummy Mkl tensor) as output slot number 0.
-    GetDummyMklTensorNode(g, mkl_node, orig_node);
-    CHECK_NOTNULL(*mkl_node);
-    *mkl_node_output_slot = 0;
-  }
-}
-
-int MklLayoutRewritePass::SetUpContiguousInputs(
-    std::unique_ptr<Graph>* g,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
-    NodeBuilder* nb, Node* old_node,
-    std::vector<NodeBuilder::NodeOut>* workspace_tensors,
-    bool are_workspace_tensors_available) {
-  CHECK_NOTNULL(workspace_tensors);
-  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-
-  // TODO(nhasabni): Temporary solution to connect filter input of
-  // BackpropInput with the converted filter from Conv2D.
-  bool do_connect_conv2d_backprop_input_filter = false;
-  Node* conv2d_node = nullptr;
-  // Filter node is 2nd input (slot index 1) of Conv2D.
-  int kConv2DFilterInputSlotIdx = 1;
-  int kConv2DBackpropInputFilterInputSlotIdx = 1;
-  int kConv2DFilterOutputSlotIdx = 1;
-  if (old_node->type_string() == csinfo_.conv2d_grad_input) {
-    // We need to find Conv2D node from Conv2DBackpropInput.
-    // For that let's first find filter node that is 2nd input (slot 1)
-    // of BackpropInput.
-    Node* filter_node = nullptr;
-    TF_CHECK_OK(old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx,
-                                     &filter_node));
-    CHECK_NOTNULL(filter_node);
-
-    // Now check which nodes receive from filter_node. Filter feeds as
-    // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias.
-    for (const Edge* e : filter_node->out_edges()) {
-      if (e->dst()->type_string() == csinfo_.mkl_conv2d &&
-          e->dst_input() == kConv2DFilterInputSlotIdx
-          /* filter is 2nd input of Conv2D and _MklConv2D. */) {
-        if (conv2d_node != nullptr) {
-          VLOG(1) << "MklLayoutRewritePass: unusual case of same filter"
-                  << " feeding multiple Conv2D nodes: "
-                  << filter_node->DebugString();
-          // We will not connect filter input of Conv2DBackpropInput
-          // to be safe here.
-          do_connect_conv2d_backprop_input_filter = false;
-          break;
-        } else {
-          conv2d_node = e->dst();
-          do_connect_conv2d_backprop_input_filter = true;
-        }
-      }
-    }
-  }
-
-  // Number of input slots to original op
-  // Input slots are represented by .Input() calls in REGISTER_OP.
-  int old_node_input_slots = old_node->op_def().input_arg_size();
-  // Actual number of inputs can be greater than or equal to number
-  // of Input slots because inputs of type list could be unfolded.
-  CHECK_GE(old_node_inputs.size(), old_node_input_slots);
-  int nn_slot_idx = 0;  // slot index for inputs of new node
-
-  // Let's copy all inputs (TF tensors) of original node to new node.
-  int iidx = 0;
-  for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
-    // An input slot could be a single tensor or a list. We need
-    // to handle this case accordingly.
-    CHECK_LT(iidx, old_node_inputs.size());
-    const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
-    if (ArgIsList(arg)) {
-      std::vector<NodeBuilder::NodeOut> new_node_inputs;
-      int N = GetTensorListLength(arg, old_node);
-      GetNodesProducingTFTensorList(old_node_inputs, &iidx, N,
-                                    &new_node_inputs);
-      nb->Input(new_node_inputs);
-      nn_slot_idx++;
-    } else {
-      // Special case for connecting filter input of Conv2DBackpropInput
-      if (do_connect_conv2d_backprop_input_filter &&
-          iidx == kConv2DBackpropInputFilterInputSlotIdx) {
-        nb->Input(conv2d_node, kConv2DFilterOutputSlotIdx);
-      } else {
-        nb->Input(old_node_inputs[iidx].first, old_node_inputs[iidx].second);
-      }
-      iidx++;
-      nn_slot_idx++;
-    }
-  }
-
-  // If workspace tensors are available for this op and we are using
-  // contiguous ordering then we need to add Tensorflow tensor for
-  // workspace here because Tensorflow tensor for workspace is the
-  // last tensor in the list of Tensorflow tensors.
-  if (are_workspace_tensors_available) {
-    CHECK_EQ(workspace_tensors->size(), 2);
-    // Tensorflow tensor
-    nb->Input((*workspace_tensors)[0].node, (*workspace_tensors)[0].index);
-    nn_slot_idx++;
-  }
-
-  // Let's now setup all Mkl inputs to new node.
-  // Number of Mkl inputs must be same as number of TF inputs.
-  iidx = 0;
-  for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
-    // An input slot could be a single tensor or a list. We need
-    // to handle this case accordingly.
-    CHECK_LT(iidx, old_node_inputs.size());
-    const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
-    if (ArgIsList(arg)) {
-      std::vector<NodeBuilder::NodeOut> new_node_inputs;
-      int N = GetTensorListLength(arg, old_node);
-      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N,
-                                     &new_node_inputs);
-      nb->Input(new_node_inputs);
-      nn_slot_idx++;
-    } else {
-      Node* mkl_node = nullptr;
-      int mkl_node_output_slot = 0;
-      // Special case for connecting filter input of Conv2DBackpropInput
-      if (do_connect_conv2d_backprop_input_filter &&
-          iidx == kConv2DBackpropInputFilterInputSlotIdx) {
-        GetNodeProducingMklTensor(g, old_node, conv2d_node,
-                                  kConv2DFilterOutputSlotIdx, &mkl_node,
-                                  &mkl_node_output_slot);
-      } else {
-        GetNodeProducingMklTensor(g, old_node, old_node_inputs[iidx].first,
-                                  old_node_inputs[iidx].second, &mkl_node,
-                                  &mkl_node_output_slot);
-      }
-      nb->Input(mkl_node, mkl_node_output_slot);
-      iidx++;
-      nn_slot_idx++;
-    }
-  }
-
-  // If workspace tensors are available for this op and we are using
-  // contiguous ordering then we need to add Mkl tensor for
-  // workspace here because Mkl tensor for workspace is the
-  // last tensor in the list of Mkl tensors.
-  if (are_workspace_tensors_available) {
-    CHECK_EQ(workspace_tensors->size(), 2);
-    // Mkl tensor
-    nb->Input((*workspace_tensors)[1].node, (*workspace_tensors)[1].index);
-    nn_slot_idx++;
-  }
-
-  return nn_slot_idx;
-}
-
-Status MklLayoutRewritePass::SetUpInputs(
-    std::unique_ptr<Graph>* g,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
-    NodeBuilder* nb, Node* old_node) {
-  // Let's check if we need to add workspace tensors for this node.
-  // We add workspace edge only for MaxPool, LRN and BatchNorm.
-  std::vector<NodeBuilder::NodeOut> workspace_tensors;
-  bool are_workspace_tensors_available = false;
-  AddWorkSpaceEdgeIfNeeded(g, old_node, nb, &workspace_tensors,
-                           &are_workspace_tensors_available);
-
-  int new_node_input_slots = 0;
-  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
-    // TODO(nhasabni): implement this function just for same of completion.
-    // We do not use interleaved ordering right now.
-    return Status(
-        error::Code::UNIMPLEMENTED,
-        "Interleaved ordering of tensors is currently not supported.");
-  } else {
-    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-    new_node_input_slots = SetUpContiguousInputs(
-        g, old_node_inputs, nb, old_node, &workspace_tensors,
-        are_workspace_tensors_available);
-  }
-
-  // Sanity check
-  int old_node_input_slots = old_node->op_def().input_arg_size();
-  if (!are_workspace_tensors_available) {
-    // If we are not adding workspace tensors for this op, then the total
-    // number of input slots to the new node _must_ be 2 times the number
-    // of input slots to the original node: N original Tensorflow tensors and
-    // N for Mkl tensors corresponding to each Tensorflow tensors.
-    CHECK_EQ(new_node_input_slots, old_node_input_slots * 2);
-  } else {
-    // If we are adding workspace tensors for this op, then the total
-    // The total number of input slots to new node _must_ be 2 times the number
-    // of input slots to the original node: N original Tensorflow tensors and
-    // N for Mkl tensors corresponding to each Tensorflow tensors plus 2
-    // (for workspace Tensorflow tensor and workspace Mkl tensor).
-    CHECK_EQ(new_node_input_slots, old_node_input_slots * 2 + 2);
-  }
-
-  return Status::OK();
-}
-
-//////////////////////////////////////////////////////////////////////////
-//           Helper functions related to workspace pass
-//////////////////////////////////////////////////////////////////////////
-
-// TODO(nhasabni) We should move this to mkl_util.h.
-void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
-    std::unique_ptr<Graph>* g, Node** out, Node* orig_node) {
-  // We use a tensor of shape {1} and value 0 to represent
-  // dummy float tensor. We need this as a dummy workspace tensor.
-  // Workspace tensor has type float.
-  const DataType dt = DataTypeToEnum<float>::v();
-  TensorProto proto;
-  proto.set_dtype(dt);
-  float zero[1] = {0};
-  proto.set_tensor_content(string(reinterpret_cast<char*>(&zero), 4));
-  TensorShape dummy_shape({1});
-  dummy_shape.AsProto(proto.mutable_tensor_shape());
-  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                  .Attr("value", proto)
-                  .Attr("dtype", dt)
-                  .Device(orig_node->def().device())  // We place this node on
-                                                      // same the device as the
-                                                      // device of the original
-                                                      // node.
-                  .Finalize(&**g, out));
-  CHECK_NOTNULL(*out); // Make sure we got a valid object before using it
-
-  // If number of inputs to the original node is > 0, then we add
-  // control dependency between 1st input (index 0) of the original node and
-  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
-  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
-  // rewritten node. Adding control edge between 1st input of the original node
-  // and the dummy Mkl node ensures that the dummy node is in the same frame
-  // as the original node. Choosing 1st input is not necessary - any input of
-  // the original node is fine because all the inputs of a node are always in
-  // the same frame.
-  if (orig_node->num_inputs() > 0) {
-    Node* orig_input0 = nullptr;
-    TF_CHECK_OK(
-        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
-    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
-  }
-
-  (*out)->set_assigned_device_name(orig_node->assigned_device_name());
-}
-
-void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
-    std::unique_ptr<Graph>* g, Node* orig_node, NodeBuilder* nb,
-    std::vector<NodeBuilder::NodeOut>* ws_tensors, bool* are_ws_tensors_added) {
-  bool workspace_edge_added = false;  // Default initializer
-  CHECK_NOTNULL(are_ws_tensors_added);
-  *are_ws_tensors_added = false;  // Default initializer
-
-  DataType T;
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  for (auto ws : wsinfo_) {
-    if (orig_node->type_string() == ws.fwd_op &&
-        mkl_op_registry::IsMklOp(
-            mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) {
-      // If this op is a fwd op, then we need to check if there is an
-      // edge from this node's fwd_slot to bwdop's bwd_slot. If there is
-      // an edge, then we just add an attribute on this node for setting
-      // workspace_passed to true. We don't add actual workspace edge
-      // in this node. Actual workspace edge gets added in the backward
-      // op for this node.
-      for (const Edge* e : orig_node->out_edges()) {
-        if (e->src_output() == ws.fwd_slot &&
-            e->dst()->type_string() == ws.bwd_op &&
-            e->dst_input() == ws.bwd_slot) {
-          nb->Attr("workspace_enabled", true);
-          VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
-                  << orig_node->type_string();
-          workspace_edge_added = true;
-          // We found the edge that we were looking for, so break.
-          break;
-        }
-      }
-
-      if (!workspace_edge_added) {
-        // If we are here, then we did not find backward operator for this
-        // node.
-        nb->Attr("workspace_enabled", false);
-      }
-    } else if (orig_node->type_string() == ws.bwd_op &&
-               mkl_op_registry::IsMklOp(
-                   mkl_op_registry::GetMklOpName(orig_node->type_string()),
-                   T)) {
-      // If this op is a bwd op, then we need to add workspace edge and
-      // it's Mkl tensor edge between its corresponding fwd op and this
-      // op. Corresponding fwd op is specified in 'fwd_op' field of
-      // workspace info. fwd_slot and bwd_slot in workspace info specify
-      // an edge between which slots connect forward and backward op.
-      // Once all these criteria match, we add a workspace edge between
-      // ws_fwd_slot and ws_bwd_slot. Its corresponding Mkl tensor is
-      // determined by interleaved/contiguous ordering. Function
-      // DataIndexToMetaDataIndex tells us the location of Mkl tensor
-      // from the location of the Tensorflow tensor.
-      for (const Edge* e : orig_node->in_edges()) {
-        if (e->src_output() == ws.fwd_slot &&
-            // We would have rewritten the forward op, so we need to use
-            // GetMklOpName call to get its Mkl name.
-            e->src()->type_string() ==
-                mkl_op_registry::GetMklOpName(ws.fwd_op) &&
-            e->dst_input() == ws.bwd_slot) {
-          nb->Attr("workspace_enabled", true);
-          CHECK_NOTNULL(ws_tensors);
-          // Add workspace edge between fwd op and bwd op.
-          ws_tensors->push_back(NodeBuilder::NodeOut(e->src(), ws.ws_fwd_slot));
-          // Add Mkl tensor edge for workspace edge between fwd op and bwd op.
-          ws_tensors->push_back(NodeBuilder::NodeOut(
-              e->src(), DataIndexToMetaDataIndex(ws.ws_fwd_slot,
-                                                 e->src()->num_outputs())));
-          *are_ws_tensors_added = true;
-          // In terms of input ordering, we add these calls to add Input
-          // here because workspace edge (and its Mkl tensor) is the last
-          // edge in the fwdop and bwdop. So all inputs before workspace
-          // tensor have been added by SetUpInputs function.
-          VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
-                  << orig_node->type_string();
-          workspace_edge_added = true;
-          // We found the edge that we were looking for, so break.
-          break;
-        }
-      }
-
-      // If we are here means we did not find fwd op that feeds to this
-      // bwd op. So in this case, we need to generate dummy tensors for
-      // workspace input and Mkl tensor for workspace, and set
-      // workspace_enabled to false.
-      if (!workspace_edge_added) {
-        nb->Attr("workspace_enabled", false);
-        Node* dmt_ws = nullptr;      // Dummy tensor for workspace
-        Node* dmt_mkl_ws = nullptr;  // Dummy Mkl tensor for workspace
-        GetDummyWorkspaceTensorNode(g, &dmt_ws, orig_node);
-        GetDummyMklTensorNode(g, &dmt_mkl_ws, orig_node);
-        CHECK_NOTNULL(dmt_ws);
-        CHECK_NOTNULL(dmt_mkl_ws);
-        CHECK_NOTNULL(ws_tensors);
-        // We add dummy tensor as workspace tensor.
-        ws_tensors->push_back(NodeBuilder::NodeOut(dmt_ws, 0));
-        // We add dummy tensor as Mkl tensor for workspace tensor.
-        ws_tensors->push_back(NodeBuilder::NodeOut(dmt_mkl_ws, 0));
-        *are_ws_tensors_added = true;
-        VLOG(1) << "MklLayoutRewritePass: dummy workspace_enabled for "
-                << orig_node->type_string();
-      }
-    } else {
-      // If this node does not match any workspace info, then we do not
-      // do anything special for workspace propagation for it.
-    }
-  }
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Op-specific functions to copy attributes from old node to new node
-//////////////////////////////////////////////////////////////////////////
-
-void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
-                                           NodeBuilder* nb) {
-  DataType T;
-  string data_format;
-  string padding;
-  std::vector<int32> strides;
-  bool use_cudnn_on_gpu;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-  TF_CHECK_OK(
-      GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("strides", strides);
-  nb->Attr("padding", padding);
-  nb->Attr("data_format", data_format);
-  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
-}
-
-void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
-                                         NodeBuilder* nb) {
-  DataType T;
-  int N;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("N", N);
-}
-
-void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
-                                                NodeBuilder* nb) {
-  DataType T;
-  string data_format;
-  std::vector<int32> strides;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("strides", strides);
-  nb->Attr("data_format", data_format);
-}
-
-void MklLayoutRewritePass::CopyAttrsIdentity(const Node* orig_node,
-                                             NodeBuilder* nb) {
-  DataType T;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  // Add attributes to new node.
-  nb->Attr("T", T);
-}
-
-void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
-                                        NodeBuilder* nb) {
-  DataType T;
-  int depth_radius;
-  float bias;
-  float alpha;
-  float beta;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "depth_radius", &depth_radius));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "bias", &bias));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "alpha", &alpha));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "beta", &beta));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("depth_radius", depth_radius);
-  nb->Attr("bias", bias);
-  nb->Attr("alpha", alpha);
-  nb->Attr("beta", beta);
-}
-
-void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
-                                            NodeBuilder* nb) {
-  DataType T;
-  string data_format;
-  string padding;
-  std::vector<int32> ksize, strides;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "ksize", &ksize));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("ksize", ksize);
-  nb->Attr("strides", strides);
-  nb->Attr("padding", padding);
-  nb->Attr("data_format", data_format);
-}
-
-void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
-                                             NodeBuilder* nb) {
-  DataType T;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-}
-
-void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
-                                            NodeBuilder* nb) {
-  DataType T;
-  DataType Tshape;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tshape", &Tshape));
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("Tshape", Tshape);
-}
-
-void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
-                                          NodeBuilder* nb) {
-  DataType T;
-  string data_format;
-  int num_split;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "num_split", &num_split));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("num_split", num_split);
-  nb->Attr("data_format", data_format);
-}
-
-void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
-                                           NodeBuilder* nb) {
-  DataType T;
-  int N;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("N", N);
-}
-
-void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
-                                             NodeBuilder* nb) {
-  DataType T;
-  int N;
-  DataType tidx;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tidx", &tidx));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("N", N);
-  nb->Attr("Tidx", tidx);
-}
-
-void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
-                                                   NodeBuilder* nb) {
-  DataType T;
-  float epsilon;
-  string data_format;
-  bool is_training;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "epsilon", &epsilon));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "is_training", &is_training));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("epsilon", epsilon);
-  nb->Attr("data_format", data_format);
-  nb->Attr("is_training", is_training);
-}
-
-//////////////////////////////////////////////////////////////////////////
-//           Helper functions related to node merge pass
-//////////////////////////////////////////////////////////////////////////
-
-Node* MklLayoutRewritePass::CheckForNodeMerge(const Node* a) const {
-  // TODO(nhasabni) Add check for type of node similar to CheckForNodeRewrite
-  // once we support BiasAddGrad as Mkl layer.
-
-  // Search for all matching mergeinfo.
-  // We allow more than one match for extensibility.
-  std::vector<const MergeInfo*> matching_mi;
-  for (auto mi = minfo_.cbegin(); mi != minfo_.cend(); ++mi) {
-    if (a->type_string() == mi->succ) {
-      matching_mi.push_back(&*mi);
-    }
-  }
-
-  for (const MergeInfo* mi : matching_mi) {
-    const int N_in = a->num_inputs();
-    if (mi->op >= N_in) {
-      continue;
-    }
-
-    // Get the control edges and input of node
-    gtl::InlinedVector<Node*, 4> a_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> a_in(N_in);
-    FillInputs(a, &a_control_edges, &a_in);
-
-    // Get operand op of the operator
-    Node* b = nullptr;
-    b = a_in[mi->op].first;
-    if (b == nullptr || (b->type_string() != mi->pred)) {
-      // NOTE: Should the first check be assert?
-      continue;
-    }
-
-    const int B_in = b->num_inputs();
-    gtl::InlinedVector<Node*, 4> b_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(B_in);
-    FillInputs(b, &b_control_edges, &b_in);
-
-    // Shouldn't merge if a and b have different control edges.
-    if (a_control_edges != b_control_edges) {
-      continue;
-    } else {
-      // We found a match.
-      return b;
-    }
-  }
-
-  return nullptr;
-}
-
-Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
-                                       Node* pred) {
-  CHECK_NOTNULL(succ);
-  CHECK_NOTNULL(pred);
-
-  if (succ->type_string() == csinfo_.bias_add &&
-      pred->type_string() == csinfo_.mkl_conv2d) {
-    // 1. Get all attributes from input nodes.
-    DataType T_pred, T_succ;
-    string padding;
-    std::vector<int32> strides;
-    string data_format_pred, data_format_succ;
-    bool use_cudnn_on_gnu;
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
-    TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
-    TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-    TF_CHECK_OK(
-        GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
-    // We check to ensure that data formats of both succ and pred are same.
-    // We expect them to be same, so we can enforce this as assert.
-    // But assert can be too strict, so we enforce this as a check.
-    // If the check fails, then we do not merge two nodes.
-    // We also do same check for devices.
-    if (data_format_pred != data_format_succ || T_pred != T_succ ||
-        pred->assigned_device_name() != succ->assigned_device_name() ||
-        pred->def().device() != succ->def().device()) {
-      return Status(error::Code::INVALID_ARGUMENT,
-                    "data_format or T attribute or devices of Conv2D and "
-                    "BiasAdd do not match. Will skip node merge optimization");
-    }
-
-    const int succ_num = succ->num_inputs();
-    gtl::InlinedVector<Node*, 4> succ_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> succ_in(succ_num);
-    FillInputs(succ, &succ_control_edges, &succ_in);
-
-    const int pred_num = pred->num_inputs();
-    gtl::InlinedVector<Node*, 4> pred_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> pred_in(pred_num);
-    FillInputs(pred, &pred_control_edges, &pred_in);
-
-    // We need to ensure that there is only 1 edge between Conv2D and AddBias.
-    // Otherwise, merging is semantically incorrect.
-    if (pred->out_edges().size() != 1) {
-      return Status(error::Code::INVALID_ARGUMENT,
-                    "Conv2D has multiple outputs."
-                    "Will skip node merge optimization");
-    }
-
-    for (const Edge* e : pred->out_edges()) {
-      if (e->dst() != succ) {
-        return Status(error::Code::INVALID_ARGUMENT,
-                      "Conv2D does not feed to BiasAdd."
-                      "Will skip node merge optimization");
-      }
-    }
-
-    // 2. Get inputs from both the nodes.
-    // Find the 2 inputs from the conv and the bias from the add Bias.
-    // Get operand 0, 1 of conv2D and their Mkl tensors.
-    CHECK_EQ(pred->in_edges().size(), 4);  // _MklConv2D must have 4 inputs.
-    // Get operand 1 of add_bias
-    // BiasAdd must have 2 inputs: Conv, bias
-    CHECK_EQ(succ->in_edges().size(), 2);
-    Node* oper3_mkl = nullptr;  // Mkl tensor corresponding to oper3
-    int oper3_mkl_slot = 0;     // For dummy MKL tensor node, output slot is 0.
-    GetDummyMklTensorNode(g, &oper3_mkl, pred);  // Get dummy Mkl tensor node
-    // as BiasAdd does not have Mkl tensor as input.
-    CHECK_NOTNULL(oper3_mkl);
-
-    // We will use the node name of BiasAdd as the name of new node
-    // Build new node. We use same name as original node, but change the op
-    // name.
-    NodeBuilder nb(succ->name(), csinfo_.mkl_conv2d_with_bias);
-    if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
-      nb.Input(pred_in[0].first, pred_in[0].second);  // In1 of Conv2D
-      // pred_in[1] will be Mkl tensor for In1 if we follow interleaved
-      // ordering, and it will be 2nd Tensorflow tensor for Conv2D if
-      // we follow contiguous ordering.
-      nb.Input(pred_in[1].first, pred_in[1].second);  // Mkl for In1
-      nb.Input(pred_in[2].first, pred_in[2].second);  // In2 of Conv2D
-      nb.Input(pred_in[3].first, pred_in[3].second);  // Mkl for In2
-      nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
-      nb.Input(oper3_mkl, oper3_mkl_slot);            // Mkl for In2 of BiasAdd
-    } else {
-      CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-      nb.Input(pred_in[0].first, pred_in[0].second);  // In1 of Conv2D
-      // pred_in[1] will be Mkl tensor for In1 if we follow interleaved
-      // ordering, and it will be 2nd Tensorflow tensor for Conv2D if
-      // we follow contiguous ordering.
-      nb.Input(pred_in[1].first, pred_in[1].second);  // In2 of Conv2D
-      nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
-      nb.Input(pred_in[2].first, pred_in[2].second);  // Mkl for In1 of Conv2D
-      nb.Input(pred_in[3].first, pred_in[3].second);  // Mkl for In2 of Conv2D
-      nb.Input(oper3_mkl, oper3_mkl_slot);            // Mkl for In2 of BiasAdd
-    }
-
-    // Copy attributes from Conv2D to Conv2DWithBias.
-    CopyAttrsConv2D(const_cast<const Node*>(pred), &nb);
-
-    // Copy the device assigned to old node to new node.
-    nb.Device(succ->def().device());
-
-    // Create node.
-    Node* new_node;
-    TF_CHECK_OK(nb.Finalize(&**g, &new_node));
-    CHECK_NOTNULL(new_node);
-
-    // Set the Mkl layer label for this op.
-    new_node->AddAttr("_kernel", mkl_op_registry::kMklOpLabel);
-
-    // Incoming data edges from 'pred' node and 'succ' node to new 'new_node'
-    // node are already copied in BuildNode. We handle control edges now.
-    for (const Edge* e : pred->in_edges()) {
-      if (e->IsControlEdge()) {
-        CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
-      }
-    }
-    for (const Edge* e : succ->in_edges()) {
-      if (e->IsControlEdge()) {
-        CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
-      }
-    }
-
-    // Incoming edges are fixed, we will fix the outgoing edges now.
-    // First, we will fix outgoing control edges from 'pred' node.
-    // We don't need to handle outgoing data edges from 'pred' node
-    // because pred has only 1 output going to succ node (we enforced
-    // this check for merge already).
-    for (const Edge* e : pred->out_edges()) {
-      if (e->IsControlEdge()) {
-        CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
-      }
-    }
-
-    // Second, we will fix outgoing control and data edges from 'succ' node.
-    for (const Edge* e : succ->out_edges()) {
-      if (e->IsControlEdge()) {
-        CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
-      } else {
-        CHECK_NOTNULL(
-            (*g)->AddEdge(new_node, e->src_output(), e->dst(), e->dst_input()));
-      }
-    }
-
-    // Copy device assigned to old node to new node.
-    // It's ok to use pred or succ as we have enforced a check that
-    // both have same device assigned.
-    new_node->set_assigned_device_name(pred->assigned_device_name());
-
-    VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString()
-            << ", and node: " << succ->DebugString()
-            << ", into node:" << new_node->DebugString();
-
-    (*g)->RemoveNode(succ);
-    (*g)->RemoveNode(pred);
-
-    return Status::OK();
-  }
-
-  return Status(error::Code::UNIMPLEMENTED,
-                "Unimplemented case for node merge optimization.");
-}
-
-//////////////////////////////////////////////////////////////////////////
-//           Helper functions for node rewrite
-//////////////////////////////////////////////////////////////////////////
-
-Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
-                                         Node* orig_node,
-                                         const RewriteInfo* ri) {
-  CHECK_NOTNULL(ri);
-  CHECK_NOTNULL(orig_node);
-
-  VLOG(1) << "MklLayoutRewritePass: Original node:" << orig_node->DebugString();
-
-  // Check if this is scenario 2 (context-based rewrite).
-  // Get the matching ContextInfo if it is.
-  const Node* fwd_node = nullptr;
-  const ContextInfo* ci = nullptr;
-  bool is_context_based_rewrite = false;
-  if ((ci = SearchMatchingContext(orig_node, &fwd_node)) != nullptr) {
-    is_context_based_rewrite = true;
-
-    // Sanity checks for context-based rewrite (if any)
-    if (orig_node->type_string() == csinfo_.bias_add_grad &&
-        ri->new_name == csinfo_.mkl_conv2d_with_bias_backprop_bias) {
-      CHECK_NOTNULL(fwd_node);
-      DataType orig_T, ctx_T;
-      string orig_data_format, ctx_data_format;
-      TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &orig_T));
-      TF_CHECK_OK(
-          GetNodeAttr(orig_node->def(), "data_format", &orig_data_format));
-      TF_CHECK_OK(GetNodeAttr(fwd_node->def(), "T", &ctx_T));
-      TF_CHECK_OK(
-          GetNodeAttr(fwd_node->def(), "data_format", &ctx_data_format));
-
-      if (orig_data_format != ctx_data_format || orig_T != ctx_T ||
-          orig_node->assigned_device_name() !=
-              fwd_node->assigned_device_name() ||
-          orig_node->def().device() != fwd_node->def().device()) {
-        return Status(
-            error::Code::INVALID_ARGUMENT,
-            "data_format or T attribute or devices of BiasAddGrad and "
-            "Conv2D do not match. Will skip node rewrite optimization");
-      }
-    } else if (orig_node->type_string() == csinfo_.bias_add_grad &&
-               ri->new_name == csinfo_.matmul) {
-      // When BiasAddGrad has MatMul in context, we do not do any rewrite
-      // and leave BiasAddGrad as it is. But we check for this condition
-      // when we check for node rewrite rule. So we should not even come
-      // here for MatMul. So we will fail now.
-      return Status(
-          error::Code::INVALID_ARGUMENT,
-          "No rewrite is required for BiasAddGrad for MatMul context.");
-    }
-  }
-
-  // Get all inputs.
-  int num_inputs = orig_node->in_edges().size();
-
-  // Drop count for control edges from inputs
-  for (const Edge* e : orig_node->in_edges()) {
-    if (e->IsControlEdge()) {
-      num_inputs--;
-    }
-  }
-
-  gtl::InlinedVector<Node*, 4> control_edges;
-  gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num_inputs);
-  FillInputs(orig_node, &control_edges, &inputs);
-
-  // Build new node. We use same name as original node, but change the op name.
-  NodeBuilder nb(orig_node->name().c_str(), ri->new_name.c_str());
-  // Copy user-specified device assigned to original node to new node.
-  nb.Device(orig_node->def().device());
-  // Set up new inputs to the rewritten node.
-  Status s = SetUpInputs(g, inputs, &nb, orig_node);
-  if (s != Status::OK()) {
-    return s;
-  }
-
-  // Copy attributes from original node to new node (for scenario 1).
-  // For context-based rewrite, we use context to copy the attributes.
-  if (is_context_based_rewrite) {
-    if (orig_node->type_string() == csinfo_.bias_add_grad &&
-        ri->new_name == csinfo_.mkl_conv2d_with_bias_backprop_bias) {
-      CHECK_NOTNULL(fwd_node);
-      ri->copy_attrs(fwd_node, &nb);
-    } else {
-      return Status(error::Code::UNIMPLEMENTED,
-                    "Unimplemented case for node rewrite optimization.");
-    }
-  } else {
-    ri->copy_attrs(const_cast<const Node*>(orig_node), &nb);
-  }
-  // Set the Mkl layer label for this op.
-  nb.Attr("_kernel", mkl_op_registry::kMklOpLabel);
-
-  // Finalize graph and get new node.
-  Node* new_node = nullptr;
-  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
-  CHECK_NOTNULL(new_node);
-
-  // Incoming data edges from 'orig_node' node to new 'new_node' node are
-  // already copied in BuildNode. We need to handle control edges now.
-  for (const Edge* e : orig_node->in_edges()) {
-    if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
-    }
-  }
-
-  // Copy outgoing edges from 'orig_node' node to new
-  // 'new_node' node, since the output also follows same ordering among
-  // Tensorflow tensors and Mkl tensors. We need to connect Tensorflow
-  // tensors appropriately. Specifically, nth output of the original node
-  // will become 2*nth output of the Mkl node for the interleaved ordering
-  // of the tensors. For the contiguous ordering of the tensors, it will be n.
-  // GetTensorDataIndex provides this mapping function.
-  for (const Edge* e : orig_node->out_edges()) {
-    if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
-    } else {
-      CHECK_NOTNULL((*g)->AddEdge(
-          new_node,
-          GetTensorDataIndex(e->src_output(), e->src()->num_outputs()),
-          e->dst(), e->dst_input()));
-    }
-  }
-
-  // Copy the runtime device assigned from original code to new node.
-  new_node->set_assigned_device_name(orig_node->assigned_device_name());
-
-  // Delete original node and mark new node as rewritten.
-  (*g)->RemoveNode(orig_node);
-
-  VLOG(1) << "MklLayoutRewritePass: New node:" << new_node->DebugString();
-  return Status::OK();
-}
-
-const MklLayoutRewritePass::ContextInfo*
-MklLayoutRewritePass::SearchMatchingContext(const Node* n,
-                                            const Node** fwd_node) {
-  CHECK_NOTNULL(n);
-  CHECK_NOTNULL(fwd_node);
-  *fwd_node = nullptr;
-
-  // Search for matching contextinfo based on node name and call
-  // callback function using matching contextinfo.
-  // There could be more than one matching contextinfos but whichever
-  // matches first is returned.
-  for (auto ci = cinfo_.cbegin(); ci != cinfo_.cend(); ++ci) {
-    if (n->type_string() == (*ci)->node &&
-        (*ci)->context_match_fn(n, fwd_node, *ci)) {
-      VLOG(1) << "Found context as matching: " << (*ci)->fwd;
-      return *ci;
-    }
-  }
-  return nullptr;
-}
-
-bool MklLayoutRewritePass::ContextMatchRewrite(const Node* n,
-                                               const ContextInfo* c) {
-  const Node* fwd_node = nullptr;
-  return SearchMatchingContext(n, &fwd_node) == c;
-}
-
-const MklLayoutRewritePass::RewriteInfo*
-MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
-  CHECK_NOTNULL(n);
-
-  // First check if node along with its type is supported by MKL layer.
-  // We do not want to rewrite an op into Mkl op if types are not supported.
-  // E.g., MklRelu does not support INT32. So we cannot rewrite Relu to
-  // MklRelu if type is INT32.
-  DataType T;
-  if (!GetNodeAttr(n->def(), "T", &T).ok()) {
-    return nullptr;
-  }
-
-  // BiasAddGrad is not an Mkl layer, so we make an exception for it.
-  if (n->type_string() != csinfo_.bias_add_grad) {
-    if (!mkl_op_registry::IsMklOp(
-            mkl_op_registry::GetMklOpName(n->type_string()), T)) {
-      return nullptr;
-    }
-  }
-
-  // For elementwise node, we reuse the Eigen implementation and pass the MKL
-  // metadata tensor through so we can avoid conversions. However, if all
-  // incoming edges are in TF format, we don't need all this overhead, so
-  // replace the elementwise node only if at least one of its parents is a MKL
-  // node.
-  //
-  // TODO(vrane): Add implementation for element-wise ops that doesn't reuse
-  // eigen code to reduce cross-library dependency.
-  if (mkl_op_registry::IsMklElementWiseOp(
-          mkl_op_registry::GetMklOpName(n->type_string()), T)) {
-    bool incoming_mkl_edge = false;
-    for (auto parent : n->in_edges()) {
-      if (mkl_op_registry::IsMklOp(
-              mkl_op_registry::GetMklOpName(parent->src()->type_string()), T)) {
-        incoming_mkl_edge = true;
-        break;
-      } else {
-        VLOG(1) << "Non-MKL parent is: " << parent->src()->type_string();
-      }
-    }
-    if (incoming_mkl_edge == false) {
-      VLOG(1) << "Skipping replacement of elementwise node which has no MKL "
-                 "parents.";
-      return nullptr;
-    }
-  }
-
-  // We support 2 types of node rewrites:
-  // 1. Rewriting BiasAddGrad depending on its MklConv2DWithBias context.
-  // 2. Rewriting an op to Mkl op always
-  // We return true if any of these 2 conditions is met.
-
-  // Find matching RewriteInfo and then check that rewrite rule applies.
-  for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
-    if (n->type_string().compare(ri->name) == 0 &&
-        ri->rewrite_rule(n, ri->context)) {
-      // If we are rewriting BiasAddGrad into BiasAddGrad for MatMul context,
-      // then we just return directly.
-      if (n->type_string() == csinfo_.bias_add_grad &&
-          ri->context->fwd == csinfo_.matmul &&
-          ri->new_name == csinfo_.bias_add_grad) {
-        return nullptr;
-      }
-      return &*ri;
-    }
-  }
-
-  // Else return not found.
-  return nullptr;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-//              Run function for the pass
-///////////////////////////////////////////////////////////////////////////////
-
-bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
-  bool result = false;
-  CHECK_NOTNULL(g);
-
-  DumpGraph("Before running MklLayoutRewritePass", &**g);
-
-  std::vector<Node*> order;
-  GetReversePostOrder(**g, &order);  // This will give us topological sort.
-
-  for (Node* n : order) {
-    // If node is not an op or it cannot run on CPU device, then skip.
-    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
-      continue;
-    }
-
-    const RewriteInfo* ri = nullptr;
-    Node* predn = nullptr;
-    // We will first search if node is to be rewritten
-    if ((ri = CheckForNodeRewrite(n)) != nullptr) {
-      string node_name = n->name();
-      string op_name = n->type_string();
-
-      VLOG(1) << "MklLayoutRewritePass: Scheduled node " << node_name
-              << " with op " << op_name << " for rewrite using"
-              << " layout optimization.";
-
-      if (RewriteNode(g, n, ri) == Status::OK()) {
-        VLOG(1) << "MklLayoutRewritePass: rewrote node " << node_name
-                << " with op " << op_name << " for Mkl layout optimization.";
-        result = true;
-      }
-    } else if ((predn = CheckForNodeMerge(n)) != nullptr) {
-      // Otherwise, we will check if the node is to be merged.
-      string n1_name = n->name();
-      string n2_name = predn->name();
-
-      VLOG(1) << "MklLayoutRewritePass: Scheduled nodes " << n1_name << " and "
-              << n2_name << " for merging";
-
-      if (MergeNode(g, n, predn) == Status::OK()) {
-        VLOG(1) << "MklLayoutRewritePass: Merged nodes " << n1_name << " and "
-                << n2_name;
-        result = true;
-      }
-    }
-  }
-
-  DumpGraph("After running MklLayoutRewritePass", &**g);
-
-  return result;
-}
-
-bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
-  return MklLayoutRewritePass().RunPass(g);
-}
-
-Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
-  if (options.graph == nullptr && options.partition_graphs == nullptr) {
-    return Status::OK();
-  }
-
-  auto process_graph = [&](std::unique_ptr<Graph>* g) {
-    // Get the ownership of a graph
-    std::unique_ptr<Graph>* ng = std::move(g);
-    RunPass(ng);
-    // Return the ownership of a graph back
-    g->reset(ng->release());
-  };
-
-  if (kMklLayoutRewritePassGroup !=
-      OptimizationPassRegistry::POST_PARTITIONING) {
-    // For any pre-partitioning phase, a graph is stored in options.graph.
-    process_graph(options.graph);
-  } else {
-    // For post partitioning phase, graphs are stored in
-    // options.partition_graphs.
-    for (auto& pg : *options.partition_graphs) {
-      process_graph(&pg.second);
-    }
-  }
-
-  return Status::OK();
-}
-
-#else   // INTEL_MKL_ML_ONLY
-
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
 // (B) Rewriting a node in the graph to a new node
@@ -4539,7 +2364,7 @@ Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
 
   return Status::OK();
 }
-#endif  // INTEL_MKL_ML_ONLY
+
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 77640e287c..0eda8170f8 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -37,1869 +37,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-#ifdef INTEL_MKL_ML_ONLY
-
-namespace {
-
-const char kCPUDevice[] = "/job:a/replica:0/task:0/device:CPU:0";
-const char kGPUDevice[] = "/job:a/replica:0/task:0/device:GPU:0";
-
-static void InitGraph(const string& s, Graph* graph,
-                      const string& device = kCPUDevice) {
-  GraphDef graph_def;
-
-  auto parser = protobuf::TextFormat::Parser();
-  //  parser.AllowRelaxedWhitespace(true);
-  CHECK(parser.MergeFromString(s, &graph_def)) << s;
-  GraphConstructorOptions opts;
-  TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
-
-  for (Node* node : graph->nodes()) {
-    node->set_assigned_device_name(device);
-  }
-}
-
-class MklLayoutPassTest : public ::testing::Test {
- public:
-  MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
-
-  void InitGraph(const string& s, const string& device = kCPUDevice) {
-    ::tensorflow::InitGraph(s, &graph_, device);
-    original_ = CanonicalGraphString(&graph_);
-  }
-
-  static bool IncludeNode(const Node* n) { return n->IsOp(); }
-
-  static string EdgeId(const Node* n, int index) {
-    if (index == 0) {
-      return n->name();
-    } else if (index == Graph::kControlSlot) {
-      return strings::StrCat(n->name(), ":control");
-    } else {
-      return strings::StrCat(n->name(), ":", index);
-    }
-  }
-
-  string CanonicalGraphString(Graph* g) {
-    std::vector<string> nodes;
-    std::vector<string> edges;
-    for (const Node* n : g->nodes()) {
-      if (IncludeNode(n)) {
-        nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
-      }
-    }
-    for (const Edge* e : g->edges()) {
-      if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
-        edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
-                                        EdgeId(e->dst(), e->dst_input())));
-      }
-    }
-    // Canonicalize
-    std::sort(nodes.begin(), nodes.end());
-    std::sort(edges.begin(), edges.end());
-    return strings::StrCat(str_util::Join(nodes, ";"), "|",
-                           str_util::Join(edges, ";"));
-  }
-
-  string DoMklLayoutOptimizationPass() {
-    string before = CanonicalGraphString(&graph_);
-    LOG(ERROR) << "Before MKL layout rewrite pass: " << before;
-
-    std::unique_ptr<Graph>* ug = new std::unique_ptr<Graph>(&graph_);
-    RunMklLayoutRewritePass(ug);
-
-    string result = CanonicalGraphString(&graph_);
-    LOG(ERROR) << "After MKL layout rewrite pass:  " << result;
-    return result;
-  }
-
-  const string& OriginalGraph() const { return original_; }
-
-  Graph graph_;
-  string original_;
-};
-
-REGISTER_OP("Input").Output("o: float").SetIsStateful();
-REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
-REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
-REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
-REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
-REGISTER_OP("_MklInput2")
-    .Output("o: uint8")
-    .Output("o1: uint8")
-    .SetIsStateful();
-
-/////////////////////////////////////////////////////////////////////
-//  Unit tests related to node merge optiimization
-/////////////////////////////////////////////////////////////////////
-
-TEST_F(MklLayoutPassTest, Basic) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Zeta);D(Zeta)|"
-            "A->C;A->D;B->C:1;B->D:1");
-}
-
-// Test set 1: Conv2D + AddBias
-
-// C=_MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Zeta(E,Y) (for interleaved ordering)
-// C=_MklConv2D(A,B,M,N); E=BiasAdd(C,D); Z=Zeta(E,Y) (for contiguous ordering)
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
-  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['C', 'D'] }"
-      "node { name: 'Y' op: 'Input'}"
-      "node { name: 'Z' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);D(Input);DMT/_0(Const);E(_MklConv2DWithBias);"
-            "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->E;"
-            "A:control->DMT/_0:control;B->E:1;D->E:2;DMT/_0->E:5;E->Z;M->E:3;"
-            "N->E:4;Y->Z:1");
-}
-
-// C=_MklConv2D(A,M:1,B,N:1); E=BiasAdd(C,D); Z=Zeta(E,Y) (for interleaved)
-// C=_MklConv2D(A,B,M:1,N:1); E=BiasAdd(C,D); Z=Zeta(E,Y) (for contiguous)
-// Test for correct output slots selected
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive1) {
-  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput2'}"
-      "node { name: 'N' op: '_MklInput2'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M:1', 'N:1']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['C', 'D'] }"
-      "node { name: 'Y' op: 'Input'}"
-      "node { name: 'Z' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);D(Input);DMT/_0(Const);E(_MklConv2DWithBias);"
-            "M(_MklInput2);N(_MklInput2);Y(Input);Z(Zeta)|A->E;"
-            "A:control->DMT/_0:control;B->E:1;D->E:2;DMT/_0->E:5;E->Z;"
-            "M:1->E:3;N:1->E:4;Y->Z:1");
-}
-
-// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Zeta(E,Y);
-// This is a case of node rewrite followed by node merge.
-// We will first rewrite Conv2D to _MklConv2D, and then merge _MklConv2D
-// with BiasAdd to produce _MklConv2DWithBias.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive2) {
-  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['C', 'D'] }"
-      "node { name: 'Y' op: 'Input'}"
-      "node { name: 'Z' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);E(_MklConv2DWithBias);Y(Input);Z(Zeta)|"
-            "A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;DMT/_1->E:4;"
-            "DMT/_2->E:5;E->Z;Y->Z:1");
-}
-
-// Graph contains only _MklConv2D, no AddBias.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);M(_MklInput);N(_MklInput)|"
-            "A->C;B->C:1;M->C:2;N->C:3");
-}
-
-// _MklConv2D output does not go to BiasAdd.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D', 'E'] }");  // Output of _MklConv2D does not go to BiasAdd.
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Input);E(Input);F(BiasAdd);"
-            "M(_MklInput);N(_MklInput)|A->C;B->C:1;D->F;E->F:1;M->C:2;N->C:3");
-}
-
-// _MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Zeta).
-// Merge should not be done in such case.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D', 'E'] }"  // Conv2D has two outputs.
-                              // No merge should happen.
-      "node { name: 'G' op: 'Zeta'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['C', 'E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Input);E(Input);F(BiasAdd);"
-            "G(Zeta);M(_MklInput);N(_MklInput)|A->C;B->C:1;C->G;D->F;"
-            "E->F:1;E->G:1;M->C:2;N->C:3");
-}
-
-// data_format attribute value mismatch. Merge should not be done
-// in such case.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NHCW' } }"
-      " input: ['C', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);M(_MklInput);"
-            "N(_MklInput)|A->C;B->C:1;C->E;D->E:1;M->C:2;N->C:3");
-}
-
-// Test set 2: _MklConv2D..BiasAddGrad -> _MklConv2DWithBiasBackpropBias
-// rewrite tests
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
-// and BackpropInput
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'Int32Input'}"
-      "node { name: 'I' op: '_MklConv2DBackpropInput'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
-      "node { name: 'J' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
-            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
-            "I(_MklConv2DBackpropInput);J(_MklConv2DWithBiasBackpropBias);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G;B->D:1;"
-            "B->I:1;C->D:2;D->E;DMT/_0->J:1;E->G:2;E->I:2;E->J;"
-            "E:control->DMT/_0:control;F->G:1;H->I;M->D:3;M->G:3;M->I:3;"
-            "N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
-// and BackpropInput. But nodes do not match criteria for rewrite. So
-// rewrite should not happen.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'Int32Input'}"
-      "node { name: 'I' op: '_MklConv2DBackpropInput'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
-      "node { name: 'J' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
-            "I(_MklConv2DBackpropInput);J(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
-            "B->I:1;C->D:2;D->E;E->G;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;"
-            "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
-// and BackpropInput. But nodes do not match criteria for rewrite. So
-// rewrite should not happen.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['B', 'A', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'Int32Input'}"
-      "node { name: 'I' op: '_MklConv2DBackpropInput'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
-      "node { name: 'J' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
-            "I(_MklConv2DBackpropInput);J(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;"
-            "B->I:1;C->D:2;D->E;E->G:2;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;"
-            "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
-            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);"
-            "H(_MklConv2DWithBiasBackpropBias);M(_MklInput);N(_MklInput);"
-            "O(_MklInput)|A->D;A->E:1;A->G;B->D:1;C->D:2;D->E;DMT/_0->H:1;"
-            "E->G:2;E->H;E:control->DMT/_0:control;F->G:1;M->D:3;M->G:3;"
-            "N->D:4;N->G:4;O->D:5;O->G:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
-// But BackpropFilter node inputs do not satisfy criteria for rewrite.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
-            "C->D:2;D->E;E->G;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
-            "O->G:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
-// But BackpropFilter node inputs do not satisfy criteria for rewrite.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['B', 'A', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;"
-            "C->D:2;D->E;E->G:2;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
-            "O->G:5");
-}
-
-// No _MklConv2DWithBias in context, but _MklConv2D in context.
-// No rewrite for BiasAddGrad should happen.
-// C=_MklConv2D(A,M,B,N); D=Zeta(C,A); E=BiasAddGrad(D) (for interleaved)
-// C=_MklConv2D(A,B,M,N); D=Zeta(C,A); E=BiasAddGrad(D) (for contiguous)
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Zeta);E(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput)|A->C;A->D:1;B->C:1;C->D;D->E;"
-            "M->C:2;N->C:3");
-}
-
-// No Conv2D in the context for BiasAddGrad. No rewrite should happen.
-// C=Polygamma(A,B); D=Zeta(C,A); E=BiasAddGrad(D)
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Polygamma'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|"
-            "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-
-// No Conv2D in the context for BiasAddGrad, but MatMul in context.
-// Rewrite should happen, but name of BiasAddGrad does not change.
-// C=MatMul(A,B); D=Zeta(C,A); E=BiasAddGrad(D)
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D_MatMul) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'MatMul'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'transpose_a'      value { b: false } }"
-      " attr { key: 'transpose_b'      value { b: false } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|"
-            "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-
-// Test set 3: MatMul..BiasAddGrad -> BiasAddGrad rewrite tests
-// C=MatMul(A,B); D=Zeta(C,A); E=BiasAddGrad(D)
-TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'MatMul'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'transpose_a'      value { b: false } }"
-      " attr { key: 'transpose_b'      value { b: false } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|"
-            "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-
-// No MatMul in the context for BiasAddGrad. No rewrite should happen.
-// C=Polygamma(A,B); D=Zeta(C,A); E=BiasAddGrad(D)
-TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Negative_NoMatMul) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Polygamma'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|"
-            "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-
-/////////////////////////////////////////////////////////////////////
-//  Unit tests related to rewriting node to Mkl node
-/////////////////////////////////////////////////////////////////////
-
-// Single Conv2D Op; No Mkl layer on the input and on the output.
-// We will generate dummy Mkl tensor as 2nd input of Conv2D.
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['B', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Zeta);DMT/_0(Const);"
-            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
-            "DMT/_1->C:3");
-}
-
-// 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
-// have 2 outputs, both of which will be inputs to next Conv2D.
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'C']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(_MklConv2D);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->C;A->D;"
-            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;B->C:1;C->D:1;C->E;"
-            "C:2->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2");
-}
-
-// Conv2D with INT32 which is not supported by Mkl
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
-  InitGraph(
-      "node { name: 'A' op: 'HalfInput'}"
-      "node { name: 'B' op: 'HalfInput'}"
-      "node { name: 'C' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_HALF } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_HALF } }"
-      " input: ['B', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(HalfInput);B(HalfInput);C(Conv2D);D(Zeta)|"
-            "A->C;B->C:1;B->D;C->D:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Int32Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Conv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropFilter);"
-            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
-            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
-            "DMT/_1->D:4;DMT/_2->D:5");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Int32Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Conv2DBackpropInput'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['B', 'A', 'C']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropInput);"
-            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
-            "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
-            "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
-            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
-}
-
-// Concat Op test: Concat with no Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
-  InitGraph(
-      "node { name: 'A' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'B' op: 'InputList'"
-      " attr { key: 'N'                value { i: 2 } }}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Concat'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['A', 'B:0', 'B:1']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }");
-  EXPECT_EQ(
-      DoMklLayoutOptimizationPass(),
-      "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
-      "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
-      "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
-      "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
-}
-
-// Concat with 2 Mkl layers feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'F' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['C', 'D']}"
-      "node { name: 'G' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'H' op: 'Concat'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['G', 'E', 'F']}"
-      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'H'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
-            "F(_MklConv2D);G(Const);H(_MklConcat);I(Zeta)|A->E;A->I;"
-            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
-            "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
-            "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
-            "DMT/_4->H:3;E->H:1;E:2->H:4;F->H:2;F:2->H:5;G->H;"
-            "G:control->DMT/_4:control;H->I:1");
-}
-
-// Concat with 1 Mkl and 1 non-Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D']}"
-      "node { name: 'G' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'H' op: 'Concat'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['G', 'E', 'F']}"
-      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'H'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
-            "H(_MklConcat);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
-            "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:2->H:4;F->H:2;"
-            "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1");
-}
-
-// ConcatV2 Op test: ConcatV2 with no Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) {
-  InitGraph(
-      "node { name: 'A' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'B' op: 'InputList'"
-      " attr { key: 'N'                value { i: 2 } }}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'ConcatV2'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['B:0', 'B:1', 'A']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Const);B(InputList);C(Input);D(_MklConcatV2);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D:2;B->D;B:1->D:1;"
-            "B:control->DMT/_0:control;B:control->DMT/_1:control;"
-            "B:control->DMT/_2:control;C->E;D->E:1;DMT/_0->D:3;"
-            "DMT/_1->D:4;DMT/_2->D:5");
-}
-
-// ConcatV2 with 2 Mkl layers feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'F' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['C', 'D']}"
-      "node { name: 'G' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'H' op: 'ConcatV2'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['E', 'F', 'G']}"
-      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'H'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
-            "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Zeta)|A->E;A->I;"
-            "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;"
-            "C:control->DMT/_0:control;C:control->DMT/_1:control;"
-            "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
-            "DMT/_4->H:5;E->H;E:2->H:3;E:control->DMT/_4:control;F->H:1;"
-            "F:2->H:4;G->H:2;H->I:1");
-}
-
-// ConcatV2 with 1 Mkl and 1 non-Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D']}"
-      "node { name: 'G' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'H' op: 'ConcatV2'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['E', 'F', 'G']}"
-      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'H'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
-            "H(_MklConcatV2);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
-            "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:2->H:3;"
-            "E:control->DMT/_2:control;E:control->DMT/_3:control;F->H:1;"
-            "G->H:2;H->I:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_Relu_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Relu'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklRelu);C(Zeta);DMT/_0(Const)|A->B;A->C;"
-            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'ReluGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
-            "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_ReluReluGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Relu'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'ReluGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklRelu);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
-            "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
-            "DMT/_1->C:2");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'AvgPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklAvgPool);C(Zeta);DMT/_0(Const)|A->B;A->C;"
-            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Int32Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'AvgPoolGrad' "
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A', 'B'] }"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['B', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Int32Input);B(Input);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
-            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
-            "DMT/_1->C:3");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolAvgPoolGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'I' op: 'Int32Input'}"
-      "node { name: 'B' op: 'AvgPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'AvgPoolGrad' "
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['I', 'B'] }"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklAvgPool);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
-            "DMT/_1(Const);I(Int32Input)|A->B;A->D;A:control->DMT/_0:control;"
-            "B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;DMT/_1->C:2;I->C;"
-            "I:control->DMT/_1:control");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'FusedBatchNormGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'epsilon'      value { f: 0.0001 } }"
-      " attr { key: 'is_training'  value { b: true } }"
-      " input: ['A', 'B', 'C', 'D', 'E'] }"
-      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'F'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
-            "F(_MklFusedBatchNormGrad);G(Zeta)|A->F;A->G;"
-            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
-            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
-            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
-            "E->F:4;F->G:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'FusedBatchNorm'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'epsilon'      value { f: 0.0001 } }"
-      " attr { key: 'is_training'  value { b: true } }"
-      " input: ['A', 'B', 'C', 'D', 'E'] }"
-      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'F'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
-            "F(_MklFusedBatchNorm);G(Zeta)|A->F;A->G;"
-            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
-            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
-            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
-            "E->F:4;F->G:1");
-}
-
-/////////////////////////////////////////////////////////////////////
-//  Unit tests related to rewriting node for workspace edges
-/////////////////////////////////////////////////////////////////////
-
-/* Test LRN->MaxPool->MaxPoolGrad->LRNGrad replacement by workspace nodes. */
-TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'LRN'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['B'] }"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'MaxPoolGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['B', 'C', 'D'] }"
-      "node { name: 'F' op: 'Input'}"
-      "node { name: 'G' op: 'LRNGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['E', 'F', 'B'] }"
-      "node { name: 'H' op: 'Input'}"
-      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['H', 'G'] }");
-  EXPECT_EQ(
-      DoMklLayoutOptimizationPass(),
-      "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
-      "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
-      "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
-      "B:2->C:1;B:2->E:4;B:2->G:6;B:3->G:7;B:control->DMT/_1:control;C->E:1;"
-      "C:1->E:3;C:2->E:5;C:3->E:7;D->E:2;DMT/_0->B:1;DMT/_1->E:6;DMT/_2->G:5;"
-      "E->G;E:1->G:4;E:control->DMT/_2:control;F->G:1;G->I:1;H->I");
-}
-
-/* Test LRN->LRNGrad replacement by workspace nodes. */
-TEST_F(MklLayoutPassTest, LRN_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'LRN'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'LRNGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['C', 'D', 'B'] }"
-      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);E(_MklLRNGrad);F(Zeta)|"
-            "A->B;A:control->DMT/_0:control;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;"
-            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
-            "D->E:1;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;E->F:1");
-}
-
-/* Test LRN->LRNGrad replacement when only one of them is present. */
-TEST_F(MklLayoutPassTest, LRN_Negative1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'LRN'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklLRN);C(Zeta);DMT/_0(Const)|"
-            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
-}
-
-/* Test LRN->LRNGrad replacement when only one of them is present. */
-TEST_F(MklLayoutPassTest, LRN_Negative2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'LRNGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['A', 'B', 'C'] }"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklLRNGrad);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
-            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
-            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
-            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
-}
-
-/* Test LRN->LRNGrad negative case, where single LRN feeds
-   2 LRNGrad nodes at different slots. */
-TEST_F(MklLayoutPassTest, LRN_Negative3) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'LRN'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'LRNGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['C', 'D', 'B'] }"
-      "node { name: 'F' op: 'LRNGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['C', 'B', 'D'] }"
-      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['E', 'F'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);DMT/_5(Const);"
-            "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Zeta)|A->B;"
-            "A:control->DMT/_0:control;B->E:2;"
-            "B->F:1;B:1->E:3;B:2->E:6;B:2->F:5;B:3->E:7;C->E;C->F;"
-            "C:control->DMT/_1:control;C:control->DMT/_2:control;"
-            "C:control->DMT/_3:control;C:control->DMT/_4:control;"
-            "C:control->DMT/_5:control;C:control->DMT/_6:control;"
-            "D->E:1;D->F:2;DMT/_0->B:1;DMT/_1->F:3;DMT/_2->F:7;DMT/_3->F:4;"
-            "DMT/_4->F:6;DMT/_5->E:4;DMT/_6->E:5;E->G;F->G:1");
-}
-
-/* Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes. */
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'MaxPoolGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['C', 'B', 'D'] }"
-      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(_MklMaxPoolGrad);F(Zeta)|"
-            "A->B;A:control->DMT/_0:control;B->E:1;B:1->E:3;B:2->E:5;B:3->E:7;"
-            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
-            "D->E:2;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:6;E->F:1");
-}
-
-// Test MaxPool>MaxPoolGrad replacement when only one of them is present.
-// In this case, we will rewrite MaxPool node but workspace edges will not
-// be present.
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklMaxPool);C(Zeta);DMT/_0(Const)|"
-            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
-}
-
-// Test MaxPoolGrad replacement when only one of them is present.
-// In this case, we will rewrite MaxPoolGrad and for workspace tensor and
-// its Mkl part, we will generate dummy tensor.
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'MaxPoolGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A', 'B', 'C'] }"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklMaxPoolGrad);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
-            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
-            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
-            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
-}
-
-// Test MaxPool handling for batch-wise pooling (NCHW)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative3) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for batch-wise pooling (NCHW)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative4) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for depth-wise pooling (NHWC)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative5) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:2, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for depth-wise pooling (NCHW)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative6) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:2, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for batch-wise pooling (NHWC)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative7) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NHWC' } }"
-      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for batch-wise pooling (NHWC)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative8) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NHWC' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for depth-wise pooling (NHWC)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative9) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NHWC' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:2} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for depth-wise pooling (NHWC)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative10) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NHWC' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:2} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-/////////////////////////////////////////////////////////////////////
-
-// Single Conv2D Op on GPU device
-// No rewrite should happen
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['B', 'C'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
-            "O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;E->F;"
-            "M->D:3;N->D:4;O->D:5");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Int32Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Conv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|"
-            "A->D;A->E;B->D:1;C->D:2;D->E:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Relu'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'ReluGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NHWC' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'AvgPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NHWC' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Concat Op test: Concat with no Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'B' op: 'InputList'"
-      " attr { key: 'N'                value { i: 2 } }}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Concat'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['A', 'B:0', 'B:1']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;"
-            "B->D:1;B:1->D:2;C->E;D->E:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'B' op: 'InputList'"
-      " attr { key: 'N'                value { i: 2 } }}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'ConcatV2'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['B:0', 'B:1', 'A']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|"
-            "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'FusedBatchNorm'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'epsilon'      value { f: 0.0001 } }"
-      " attr { key: 'is_training'  value { b: true } }"
-      " input: ['A', 'B', 'C', 'D', 'E'] }"
-      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'F'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);E(Input);"
-            "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
-            "E->F:4;F->G:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
-  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['C', 'D'] }"
-      "node { name: 'Y' op: 'Input'}"
-      "node { name: 'Z' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
-            "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;"
-            "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1");
-}
-
-/////////////////////////////////////////////////////////////////////
-
-static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
-  testing::StopTiming();
-  string s;
-  for (int in = 0; in < 10; in++) {
-    s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
-  }
-  random::PhiloxRandom philox(301, 17);
-  random::SimplePhilox rnd(&philox);
-  for (int op = 0; op < op_nodes; op++) {
-    s += strings::Printf(
-        "node { name: 'op%04d' op: 'Zeta' attr { key: 'T' value { "
-        "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
-        op, rnd.Uniform(10), rnd.Uniform(10));
-  }
-
-  bool first = true;
-  while (iters > 0) {
-    Graph* graph = new Graph(OpRegistry::Global());
-    InitGraph(s, graph);
-    int N = graph->num_node_ids();
-    if (first) {
-      testing::SetLabel(strings::StrCat("Per graph node.  Nodes: ", N));
-      first = false;
-    }
-    {
-      testing::StartTiming();
-      std::unique_ptr<Graph> ug(graph);
-      RunMklLayoutRewritePass(&ug);
-      testing::StopTiming();
-    }
-    iters -= N;  // Our benchmark units are individual graph nodes,
-                 // not whole graphs
-    // delete graph;
-  }
-}
-BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
-
-}  // namespace
-
-#else  // INTEL_MKL_ML_ONLY
-
 // NOTE: Unit tests in this file rely on a topological sorted graph for
 // printing. But since sibling nodes of a node in the topologically sorted graph
 // can be printed in different orders, tests may fail if the order in which
@@ -3602,8 +1739,6 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
 
 }  // namespace
 
-#endif  // INTEL_MKL_ML_ONLY
-
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL && ENABLE_MKL
-- 
GitLab


From 470101040d2174ddcb41990e5e16ed6dfa6f6436 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 13:33:43 -0700
Subject: [PATCH 488/570] Remove commented out code errantly checked in.

PiperOrigin-RevId: 215957544
---
 third_party/jpeg/workspace.bzl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/jpeg/workspace.bzl b/third_party/jpeg/workspace.bzl
index 4b517240ec..2bb7dacd32 100644
--- a/third_party/jpeg/workspace.bzl
+++ b/third_party/jpeg/workspace.bzl
@@ -12,6 +12,5 @@ def repo():
         sha256 = "f892fff427ab3adffc289363eac26d197ce3ccacefe5f5822377348a8166069b",
         strip_prefix = "libjpeg-turbo-2.0.0",
         build_file = "//third_party/jpeg:BUILD.bazel",
-        # build_file = clean_dep("//third_party/jpeg:jpeg.BUILD"),
         system_build_file = "//third_party/jpeg:BUILD.system",
     )
-- 
GitLab


From ae0bc6f006497cc04a2ee75166d4ec71c7154fd8 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Fri, 5 Oct 2018 13:34:01 -0700
Subject: [PATCH 489/570] [tf.data] Adding specialization for `MapDataset`,
 `ParallelMapDataset`, and `MapAndBatchDataset` whose user-provided functions
 have the property that each output argument take its value directly from an
 input argument (e.g. `lambda x, y: y, x`). This specialization can produce
 the result without having to schedule the function using the executor.

PiperOrigin-RevId: 215957592
---
 tensorflow/core/kernels/data/BUILD            |  14 ++
 tensorflow/core/kernels/data/dataset_utils.cc |  47 +++++
 tensorflow/core/kernels/data/dataset_utils.h  |  20 ++
 .../core/kernels/data/dataset_utils_test.cc   |  46 +++++
 .../core/kernels/data/filter_dataset_op.cc    | 162 +++++++---------
 .../kernels/data/map_and_batch_dataset_op.cc  | 180 +++++++++++-------
 .../core/kernels/data/map_dataset_op.cc       |  56 ++++--
 .../kernels/data/parallel_map_dataset_op.cc   |  73 ++++---
 .../kernels/data/parallel_map_iterator.cc     |  17 +-
 .../core/kernels/data/parallel_map_iterator.h |   2 +-
 .../kernels/data/parse_example_dataset_op.cc  |   2 +-
 .../kernel_tests/map_and_batch_test.py        |  20 ++
 .../kernel_tests/filter_dataset_op_test.py    |   2 +-
 .../data/kernel_tests/map_dataset_op_test.py  |  80 ++++++--
 .../python/data/kernel_tests/test_base.py     |  29 +++
 15 files changed, 520 insertions(+), 230 deletions(-)
 create mode 100644 tensorflow/core/kernels/data/dataset_utils_test.cc

diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 451f8c1a6c..37c1c54786 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -45,6 +45,16 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "dataset_utils_test",
+    srcs = ["dataset_utils_test.cc"],
+    deps = [
+        ":dataset_utils",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "captured_function",
     srcs = ["captured_function.cc"],
@@ -205,6 +215,7 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset",
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -232,6 +243,7 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset",
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -245,6 +257,7 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset",
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -285,6 +298,7 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset",
+        ":dataset_utils",
         ":parallel_map_iterator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index e10833f525..a40f7f2146 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -15,10 +15,57 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 
 namespace tensorflow {
 namespace data {
 
+Status ComputeShortCircuitIndices(OpKernelContext* ctx,
+                                  const NameAttrList& func,
+                                  std::vector<int>* indices) {
+  FunctionLibraryRuntime::Handle fn_handle;
+  TF_RETURN_IF_ERROR(ctx->function_library()->Instantiate(
+      func.name(), AttrSlice(&func.attr()), &fn_handle));
+  auto cleanup = gtl::MakeCleanup([ctx, fn_handle]() {
+    Status s = ctx->function_library()->ReleaseHandle(fn_handle);
+    if (!s.ok()) {
+      LOG(WARNING) << "Failed to release handle: " << s.error_message();
+    }
+  });
+
+  const FunctionBody* fn_body =
+      ctx->function_library()->GetFunctionBody(fn_handle);
+  indices->resize(fn_body->ret_nodes.size());
+  for (size_t i = 0; i < fn_body->ret_nodes.size(); ++i) {
+    Node* ret_node = fn_body->ret_nodes[i];
+    Node* ret_input_node;
+    TF_RETURN_IF_ERROR(ret_node->input_node(0, &ret_input_node));
+    if (ret_input_node->def().op() == FunctionLibraryDefinition::kArgOp) {
+      TF_RETURN_IF_ERROR(
+          GetNodeAttr(ret_input_node->def(), "index", &((*indices)[i])));
+    } else {
+      indices->clear();
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+std::vector<bool> ComputeMoveVector(const std::vector<int>& indices) {
+  std::map<int, int> last_use;
+  for (size_t i = 0; i < indices.size(); ++i) {
+    last_use[indices[i]] = i;
+  }
+  std::vector<bool> can_move;
+  can_move.resize(indices.size());
+  for (size_t i = 0; i < indices.size(); ++i) {
+    can_move[i] = last_use[indices[i]] == i;
+  }
+  return can_move;
+}
+
 Status MakeIteratorFromInputElement(
     IteratorContext* ctx, const std::vector<Tensor>& input_element,
     int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 6ec1350cd4..d777062293 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -22,6 +22,26 @@ limitations under the License.
 namespace tensorflow {
 namespace data {
 
+// This method is used to determine whether we can short-circuit the evaluation
+// of the user-defined function `func`. Short-circuting is possible if every
+// function output corresponds to one of its inputs (e.g. `f(x) = x`, `f(x,y) =
+// (y,x)`, or `f(x) = (x,x)`).
+//
+// If short-circuiting is possible, the method stores the mapping from output
+// indices to input indices in `indices`. Otherwise, `indices` will be empty.
+//
+// Returns non-ok status if analysis of the function fails.
+//
+// TODO(jsimsa): Extend this to support constants as well.
+Status ComputeShortCircuitIndices(OpKernelContext* ctx,
+                                  const NameAttrList& func,
+                                  std::vector<int>* indices);
+
+// Given a vector that maps output indices to input indices, return a vector
+// that identifies for which output indices can we move the input (assuming
+// output indices are processed left to right).
+std::vector<bool> ComputeMoveVector(const std::vector<int>& indices);
+
 Status MakeIteratorFromInputElement(
     IteratorContext* ctx, const std::vector<Tensor>& input_element,
     int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
diff --git a/tensorflow/core/kernels/data/dataset_utils_test.cc b/tensorflow/core/kernels/data/dataset_utils_test.cc
new file mode 100644
index 0000000000..43295b8ebb
--- /dev/null
+++ b/tensorflow/core/kernels/data/dataset_utils_test.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+TEST(DatasetUtils, ComputeMoveVector) {
+  struct TestCase {
+    std::vector<int> indices;
+    std::vector<bool> expected;
+  };
+
+  TestCase test_cases[] = {
+      TestCase{{}, {}},
+      TestCase{{1}, {true}},
+      TestCase{{1, 1}, {false, true}},
+      TestCase{{1, 2}, {true, true}},
+      TestCase{{1, 1, 2}, {false, true, true}},
+      TestCase{{1, 2, 2}, {true, false, true}},
+  };
+
+  for (auto& test_case : test_cases) {
+    EXPECT_EQ(test_case.expected, ComputeMoveVector(test_case.indices));
+  }
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index 00884314a9..be7d182a1f 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -18,9 +18,11 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -31,67 +33,84 @@ namespace {
 
 class FilterDatasetOp : public UnaryDatasetOpKernel {
  public:
+  using FilterIteratorPredicate =
+      std::function<Status(IteratorContext*, std::vector<Tensor>, bool*)>;
+
   explicit FilterDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
+      : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("predicate", &func_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    FunctionLibraryRuntime::Handle pred_handle;
-    OP_REQUIRES_OK(ctx,
-                   ctx->function_library()->Instantiate(
-                       func_.name(), AttrSlice(&func_.attr()), &pred_handle));
-    auto cleanup = gtl::MakeCleanup([ctx, pred_handle]() {
-      OP_REQUIRES_OK(ctx, ctx->function_library()->ReleaseHandle(pred_handle));
-    });
-
-    const FunctionBody* pred_body =
-        ctx->function_library()->GetFunctionBody(pred_handle);
-    OP_REQUIRES(ctx, pred_body->ret_nodes.size() == 1,
-                errors::InvalidArgument(
-                    "predicate function must have a single return value."));
-    Node* ret_node = pred_body->ret_nodes[0];
-    Node* ret_input_node;
-    OP_REQUIRES_OK(ctx, ret_node->input_node(0, &ret_input_node));
-
     std::unique_ptr<CapturedFunction> captured_func;
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
                                                  &captured_func));
 
-    if (ret_input_node->def().op() == "_Arg") {
-      int32 index = -1;
-      OP_REQUIRES_OK(ctx, GetNodeAttr(ret_input_node->def(), "index", &index));
-      *output = new FilterTensorDataset(ctx, input, func_,
-                                        std::move(captured_func), index);
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+    OP_REQUIRES(ctx, indices.size() <= 1,
+                errors::InvalidArgument(
+                    "predicate function has more than one return value."));
+
+    FilterIteratorPredicate filter_pred;
+    if (indices.empty()) {
+      CapturedFunction* raw_captured_func = captured_func.get();
+      filter_pred = [raw_captured_func](IteratorContext* ctx,
+                                        const std::vector<Tensor>& args,
+                                        bool* out_matched) {
+        std::vector<Tensor> result;
+        TF_RETURN_IF_ERROR(
+            raw_captured_func->RunWithBorrowedArgs(ctx, args, &result));
+
+        if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
+            result[0].NumElements() != 1) {
+          return errors::InvalidArgument(
+              "Filter predicate `f` must return a scalar bool.");
+        }
+        *out_matched = result[0].scalar<bool>()();
+        return Status::OK();
+      };
     } else {
-      *output = new FilterFunctionDataset(ctx, input, func_,
-                                          std::move(captured_func));
+      filter_pred = [indices](IteratorContext* ctx,
+                              const std::vector<Tensor>& args,
+                              bool* out_matched) {
+        const Tensor& predicate = args[indices[0]];
+        if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
+          return errors::InvalidArgument(
+              "Filter predicate `f` must return a scalar bool.");
+        }
+        *out_matched = predicate.scalar<bool>()();
+        return Status::OK();
+      };
     }
+
+    *output = new Dataset(ctx, input, func_, std::move(captured_func),
+                          std::move(filter_pred));
   }
 
  private:
-  const int graph_def_version_;
-
-  class FilterDatasetBase : public DatasetBase {
+  class Dataset : public DatasetBase {
    public:
-    FilterDatasetBase(OpKernelContext* ctx, const DatasetBase* input,
-                      const NameAttrList& func,
-                      std::unique_ptr<CapturedFunction> captured_func)
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
+            std::unique_ptr<CapturedFunction> captured_func,
+            FilterIteratorPredicate filter_pred)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
-          captured_func_(std::move(captured_func)) {
+          captured_func_(std::move(captured_func)),
+          filter_pred_(std::move(filter_pred)) {
       input_->Ref();
     }
 
-    ~FilterDatasetBase() override { input_->Unref(); }
+    ~Dataset() override { input_->Unref(); }
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Filter")}));
+      return MakeUnique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Filter")},
+          filter_pred_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -133,17 +152,15 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       return Status::OK();
     }
 
-    virtual Status EvaluatePredicate(IteratorContext* ctx,
-                                     const std::vector<Tensor>& element,
-                                     bool* out_matched) const = 0;
-
    private:
-    class Iterator : public DatasetIterator<FilterDatasetBase> {
+    class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<FilterDatasetBase>(params),
+      explicit Iterator(const Params& params,
+                        FilterIteratorPredicate filter_pred)
+          : DatasetIterator<Dataset>(params),
             filtered_elements_(0),
-            dropped_elements_(0) {
+            dropped_elements_(0),
+            filter_pred_(std::move(filter_pred)) {
         std::vector<string> components =
             str_util::Split(params.prefix, "::", str_util::SkipEmpty());
         prefix_end_ = components.back();
@@ -180,8 +197,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
             return Status::OK();
           }
 
-          TF_RETURN_IF_ERROR(
-              dataset()->EvaluatePredicate(ctx, *out_tensors, &matched));
+          TF_RETURN_IF_ERROR(filter_pred_(ctx, *out_tensors, &matched));
           if (!matched) {
             // Clear the output tensor list since it didn't match.
             out_tensors->clear();
@@ -251,64 +267,14 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       int64 filtered_elements_ GUARDED_BY(mu_);
       int64 dropped_elements_ GUARDED_BY(mu_);
+      const FilterIteratorPredicate filter_pred_;
       string prefix_end_;
     };
 
     const DatasetBase* const input_;
     const NameAttrList func_;
-
-   protected:
     const std::unique_ptr<CapturedFunction> captured_func_;
-  };
-
-  class FilterFunctionDataset : public FilterDatasetBase {
-   public:
-    using FilterDatasetBase::FilterDatasetBase;
-
-   protected:
-    Status EvaluatePredicate(IteratorContext* ctx,
-                             const std::vector<Tensor>& element,
-                             bool* out_matched) const override {
-      // TODO(mrry): Avoid blocking a threadpool thread. We will need to
-      // stack-rip the iterators and use async kernels.
-      std::vector<Tensor> result;
-      TF_RETURN_IF_ERROR(
-          captured_func_->RunWithBorrowedArgs(ctx, element, &result));
-
-      if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
-          result[0].NumElements() != 1) {
-        return errors::InvalidArgument(
-            "Filter predicate `f` must return a scalar bool.");
-      }
-      *out_matched = result[0].scalar<bool>()();
-      return Status::OK();
-    }
-  };
-
-  class FilterTensorDataset : public FilterDatasetBase {
-   public:
-    FilterTensorDataset(OpKernelContext* ctx, const DatasetBase* input,
-                        const NameAttrList& func,
-                        std::unique_ptr<CapturedFunction> captured_func,
-                        int32 index)
-        : FilterDatasetBase(ctx, input, func, std::move(captured_func)),
-          index_(index) {}
-
-   protected:
-    Status EvaluatePredicate(IteratorContext* ctx,
-                             const std::vector<Tensor>& element,
-                             bool* out_matched) const override {
-      const Tensor& predicate = element[index_];
-      if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
-        return errors::InvalidArgument(
-            "Filter predicate `f` must return a scalar bool.");
-      }
-      *out_matched = predicate.scalar<bool>()();
-      return Status::OK();
-    }
-
-   private:
-    const int32 index_;
+    const FilterIteratorPredicate filter_pred_;
   };
 
  private:
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index bf08970560..f9aaa3080e 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/inplace_ops_functor.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -29,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -41,6 +43,10 @@ namespace {
 // transformation more robust.
 class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
  public:
+  using MapAndBatchIteratorFunction =
+      std::function<void(IteratorContext*, const string&, std::vector<Tensor>,
+                         std::shared_ptr<std::vector<Tensor>>, StatusCallback)>;
+
   explicit MapAndBatchDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx),
         op_version_(ctx->def().op() == "MapAndBatchDataset" ? 1 : 2) {
@@ -91,31 +97,66 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
                                                  &captured_func));
 
-    *output = new Dataset(ctx, input, batch_size, num_parallel_calls,
-                          drop_remainder, output_types_, output_shapes_, func_,
-                          std::move(captured_func), &ctx->eigen_cpu_device());
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+
+    MapAndBatchIteratorFunction map_func;
+    if (indices.empty()) {
+      CapturedFunction* raw_captured_func = captured_func.get();
+      map_func = [raw_captured_func](
+                     IteratorContext* ctx, const string& prefix,
+                     std::vector<Tensor> args,
+                     std::shared_ptr<std::vector<Tensor>> out_tensors,
+                     StatusCallback done) {
+        raw_captured_func->RunAsync(ctx, std::move(args), out_tensors.get(),
+                                    std::move(done), prefix);
+      };
+    } else {
+      std::vector<bool> can_move = ComputeMoveVector(indices);
+      map_func = [indices, can_move](
+                     IteratorContext* ctx, const string& prefix,
+                     std::vector<Tensor> args,
+                     std::shared_ptr<std::vector<Tensor>> out_tensors,
+                     StatusCallback done) {
+        for (size_t i = 0; i < indices.size(); ++i) {
+          if (can_move[i]) {
+            out_tensors->push_back(std::move(args[indices[i]]));
+          } else {
+            out_tensors->push_back(args[indices[i]]);
+          }
+        }
+        done(Status::OK());
+      };
+    }
+
+    *output = new Dataset(ctx, input, func_, batch_size, num_parallel_calls,
+                          drop_remainder, output_types_, output_shapes_,
+                          std::move(captured_func), &ctx->eigen_cpu_device(),
+                          std::move(map_func));
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func, int64 batch_size,
             int64 num_parallel_calls, bool drop_remainder,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
-            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
-            const Eigen::ThreadPoolDevice* device)
+            const Eigen::ThreadPoolDevice* device,
+            MapAndBatchIteratorFunction map_func)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
+          func_(func),
           batch_size_(batch_size),
           num_parallel_calls_(num_parallel_calls),
           drop_remainder_(drop_remainder),
           output_types_(output_types),
           output_shapes_(output_shapes),
-          map_fn_(func),
           captured_func_(std::move(captured_func)),
-          device_(device) {
+          device_(device),
+          map_func_(std::move(map_func)) {
       input_->Ref();
     }
 
@@ -123,8 +164,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::MapAndBatch")}));
+      return MakeUnique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::MapAndBatch")},
+          map_func_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -143,7 +185,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, map_fn_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
       Node* input_graph_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* batch_size_node;
@@ -165,7 +207,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         other_arguments_types.emplace_back(t.dtype());
       }
       AttrValue f;
-      b->BuildAttrValue(map_fn_, &f);
+      b->BuildAttrValue(func_, &f);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
 
@@ -185,12 +227,14 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params)
+      explicit Iterator(const Params& params,
+                        MapAndBatchIteratorFunction map_func)
           : DatasetIterator<Dataset>(params),
             mu_(std::make_shared<mutex>()),
             cond_var_(std::make_shared<condition_variable>()),
             num_parallel_calls_(std::make_shared<model::SharedState>(
-                params.dataset->num_parallel_calls_, mu_, cond_var_)) {}
+                params.dataset->num_parallel_calls_, mu_, cond_var_)),
+            map_func_(std::move(map_func)) {}
 
       ~Iterator() override {
         mutex_lock l(*mu_);
@@ -297,44 +341,6 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         int64 num_calls;  // access guarded by owner's mutex
       };
 
-      void Callback(const std::shared_ptr<IteratorContext>& ctx,
-                    const std::shared_ptr<BatchResult>& result,
-                    const std::shared_ptr<std::vector<Tensor>>& return_values,
-                    int64 offset, const Status& status) LOCKS_EXCLUDED(*mu_) {
-        result->UpdateStatus(status);
-        if (status.ok()) {
-          EnsureOutputAllocated(ctx, result, return_values);
-          for (size_t i = 0; i < return_values->size(); ++i) {
-            const Tensor& tensor = return_values->at(i);
-            Tensor* batch = &(result->output)[i];
-            if (tensor.NumElements() !=
-                (batch->NumElements() / batch->dim_size(0))) {
-              TensorShape batch_shape = batch->shape();
-              batch_shape.RemoveDim(0);
-              result->UpdateStatus(errors::InvalidArgument(
-                  "Cannot add tensor to the batch: number of elements does not "
-                  "match. Shapes are: [tensor]: ",
-                  tensor.shape().DebugString(),
-                  ", [batch]: ", batch_shape.DebugString()));
-              break;
-            }
-            // TODO(mrry): Add a version of DoParallelConcat that allows us to
-            // move `tensor` where possible, to speed up string tensor batching.
-            Status copy_status = ::tensorflow::functor::DoParallelConcat(
-                *dataset()->device_, tensor, offset, batch);
-            if (!copy_status.ok()) {
-              result->UpdateStatus(copy_status);
-              break;
-            }
-          }
-          {
-            mutex_lock l(result->mu);
-            result->num_elements++;
-          }
-        }
-        CallCompleted(result);
-      }
-
       void CallCompleted(const std::shared_ptr<BatchResult>& result)
           LOCKS_EXCLUDED(*mu_) {
         mutex_lock l(*mu_);
@@ -363,21 +369,48 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           return;
         }
 
-        // Call `captured_func_(input_element)`, using `Callback` to store the
-        // result in `result`.
-        (*ctx->runner())(std::bind(
-            [this, result, offset](std::shared_ptr<IteratorContext> ctx,
-                                   std::vector<Tensor> input_element) {
-              std::shared_ptr<std::vector<Tensor>> return_values(
-                  new std::vector<Tensor>());
-              dataset()->captured_func_->RunAsync(
-                  ctx.get(), std::move(input_element), return_values.get(),
-                  [this, ctx, result, return_values, offset](Status status) {
-                    Callback(ctx, result, return_values, offset, status);
-                  },
-                  prefix());
-            },
-            ctx, std::move(input_element)));
+        std::shared_ptr<std::vector<Tensor>> return_values =
+            std::make_shared<std::vector<Tensor>>();
+        auto done = [this, ctx, result, return_values, offset](Status status) {
+          result->UpdateStatus(status);
+          if (status.ok()) {
+            EnsureOutputAllocated(ctx, result, return_values);
+            for (size_t i = 0; i < return_values->size(); ++i) {
+              const Tensor& tensor = return_values->at(i);
+              Tensor* batch = &(result->output)[i];
+              if (tensor.NumElements() !=
+                  (batch->NumElements() / batch->dim_size(0))) {
+                TensorShape batch_shape = batch->shape();
+                batch_shape.RemoveDim(0);
+                result->UpdateStatus(errors::InvalidArgument(
+                    "Cannot add tensor to the batch: number of elements does "
+                    "not match. Shapes are: [tensor]: ",
+                    tensor.shape().DebugString(),
+                    ", [batch]: ", batch_shape.DebugString()));
+                break;
+              }
+              // TODO(mrry): Add a version of DoParallelConcat that allows us to
+              // move `tensor` where possible, to speed up string tensor
+              // batching.
+              Status copy_status = ::tensorflow::functor::DoParallelConcat(
+                  *dataset()->device_, tensor, offset, batch);
+              if (!copy_status.ok()) {
+                result->UpdateStatus(copy_status);
+                break;
+              }
+            }
+            {
+              mutex_lock l(result->mu);
+              result->num_elements++;
+            }
+          }
+          CallCompleted(result);
+        };
+
+        // Apply the map function on `input_element`, storing the result in
+        // `return_values`, and invoking `done` when finished.
+        map_func_(ctx.get(), prefix(), std::move(input_element),
+                  std::move(return_values), std::move(done));
       }
 
       Status CopyPartialBatch(Tensor* output, const Tensor& value,
@@ -404,7 +437,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       void EnsureRunnerThreadStarted(IteratorContext* ctx)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
-          std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
+          auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
           runner_thread_.reset(ctx->env()->StartThread(
               {}, "runner_thread",
               std::bind(&Iterator::RunnerThread, this, ctx_copy)));
@@ -509,8 +542,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
             while (!busy()) {
               if (call_counter_ % dataset()->batch_size_ == 0) {
-                batch_results_.emplace_back(
-                    new BatchResult(dataset()->batch_size_));
+                batch_results_.push_back(
+                    std::make_shared<BatchResult>(dataset()->batch_size_));
               }
               int64 offset = call_counter_++ % dataset()->batch_size_;
               new_calls.emplace_back(batch_results_.back(), offset);
@@ -527,7 +560,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       Status ReadBatchResult(IteratorContext* ctx, IteratorStateReader* reader,
                              size_t index) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        batch_results_.emplace_back(new BatchResult(dataset()->batch_size_));
+        batch_results_.push_back(
+            std::make_shared<BatchResult>(dataset()->batch_size_));
         std::shared_ptr<BatchResult> result = batch_results_.back();
         string prefix = strings::StrCat("batch_results_", index);
         mutex_lock l(result->mu);
@@ -653,6 +687,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       const std::shared_ptr<condition_variable> cond_var_;
       // Identifies the maximum number of parallel calls.
       const std::shared_ptr<model::SharedState> num_parallel_calls_;
+      const MapAndBatchIteratorFunction map_func_;
+
       // Counts the number of outstanding calls for this batch.
       int64 num_calls_ GUARDED_BY(*mu_) = 0;
       // Counts the total number of calls.
@@ -671,9 +707,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     const bool drop_remainder_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
-    const NameAttrList map_fn_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const Eigen::ThreadPoolDevice* device_;  // not owned
+    const MapAndBatchIteratorFunction map_func_;
   };
 
   const int op_version_;
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index f112e1dc43..0abb2eb4f3 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -17,7 +17,9 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -28,6 +30,9 @@ namespace {
 
 class MapDatasetOp : public UnaryDatasetOpKernel {
  public:
+  using MapIteratorFunction = std::function<Status(
+      IteratorContext*, std::vector<Tensor>, std::vector<Tensor>*)>;
+
   explicit MapDatasetOp(OpKernelConstruction* ctx) : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
@@ -43,8 +48,36 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
                                                  use_inter_op_parallelism_,
                                                  &captured_func));
 
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+
+    MapIteratorFunction map_func;
+    if (indices.empty()) {
+      CapturedFunction* raw_captured_func = captured_func.get();
+      map_func = [raw_captured_func](IteratorContext* ctx,
+                                     std::vector<Tensor> args,
+                                     std::vector<Tensor>* out_tensors) {
+        return raw_captured_func->Run(ctx, std::move(args), out_tensors);
+      };
+    } else {
+      std::vector<bool> can_move = ComputeMoveVector(indices);
+      map_func = [indices, can_move](IteratorContext* ctx,
+                                     std::vector<Tensor> args,
+                                     std::vector<Tensor>* out_tensors) {
+        std::map<int, int> counts;
+        for (size_t i = 0; i < indices.size(); ++i) {
+          if (can_move[i]) {
+            out_tensors->push_back(std::move(args[indices[i]]));
+          } else {
+            out_tensors->push_back(args[indices[i]]);
+          }
+        }
+        return Status::OK();
+      };
+    }
+
     *output = new Dataset(ctx, input, func_, std::move(captured_func),
-                          output_types_, output_shapes_);
+                          output_types_, output_shapes_, std::move(map_func));
   }
 
  private:
@@ -54,13 +87,15 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
             const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
+            const std::vector<PartialTensorShape>& output_shapes,
+            MapIteratorFunction map_func)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
           captured_func_(std::move(captured_func)),
           output_types_(output_types),
-          output_shapes_(output_shapes) {
+          output_shapes_(output_shapes),
+          map_func_(std::move(map_func)) {
       input_->Ref();
     }
 
@@ -68,8 +103,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Map")}));
+      return MakeUnique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Map")}, map_func_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -116,8 +151,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
+      explicit Iterator(const Params& params, MapIteratorFunction map_func)
+          : DatasetIterator<Dataset>(params), map_func_(std::move(map_func)) {}
 
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
@@ -139,10 +174,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
           return Status::OK();
         }
 
-        // TODO(mrry): Avoid blocking a threadpool thread. We will need to
-        // stack-rip the iterators and use async kernels.
-        Status s =
-            dataset()->captured_func_->Run(ctx, std::move(args), out_tensors);
+        Status s = map_func_(ctx, args, out_tensors);
         if (errors::IsOutOfRange(s)) {
           // `f` may deliberately raise `errors::OutOfRange` to indicate
           // that we should terminate the iteration early.
@@ -167,6 +199,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
      private:
       std::unique_ptr<IteratorBase> input_impl_;
+      const MapIteratorFunction map_func_;
     };
 
     const DatasetBase* const input_;
@@ -174,6 +207,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
     const std::unique_ptr<CapturedFunction> captured_func_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
+    const MapIteratorFunction map_func_;
   };
 
   DataTypeVector output_types_;
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index 6abe6c8338..a34bb172d4 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/data/parallel_map_iterator.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -56,9 +57,49 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
                                                  use_inter_op_parallelism_,
                                                  &captured_func));
 
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+
+    ParallelMapIteratorFunction map_func;
+    if (indices.empty()) {
+      CapturedFunction* raw_captured_func = captured_func.get();
+      map_func = [raw_captured_func](IteratorContext* ctx, const string& prefix,
+                                     std::vector<Tensor> args,
+                                     std::vector<Tensor>* out_tensors,
+                                     StatusCallback done) {
+        raw_captured_func->RunAsync(ctx, std::move(args), out_tensors,
+                                    std::move(done), prefix);
+      };
+      if (!use_inter_op_parallelism_) {
+        map_func = [map_func](IteratorContext* ctx, const string& prefix,
+                              std::vector<Tensor> args,
+                              std::vector<Tensor>* out_tensors,
+                              StatusCallback done) {
+          (*ctx->runner())(std::bind(map_func, ctx, prefix, std::move(args),
+                                     out_tensors, std::move(done)));
+        };
+      }
+    } else {
+      std::vector<bool> can_move = ComputeMoveVector(indices);
+      map_func = [indices, can_move](IteratorContext* ctx, const string& prefix,
+                                     std::vector<Tensor> args,
+                                     std::vector<Tensor>* out_tensors,
+                                     StatusCallback done) {
+        std::map<int, int> counts;
+        for (size_t i = 0; i < indices.size(); ++i) {
+          if (can_move[i]) {
+            out_tensors->push_back(std::move(args[indices[i]]));
+          } else {
+            out_tensors->push_back(args[indices[i]]);
+          }
+        }
+        done(Status::OK());
+      };
+    }
+
     *output = new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
                           output_shapes_, use_inter_op_parallelism_,
-                          std::move(captured_func));
+                          std::move(captured_func), std::move(map_func));
   }
 
  private:
@@ -69,7 +110,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
             bool use_inter_op_parallelism,
-            std::unique_ptr<CapturedFunction> captured_func)
+            std::unique_ptr<CapturedFunction> captured_func,
+            ParallelMapIteratorFunction map_func)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
@@ -77,7 +119,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
           output_types_(output_types),
           output_shapes_(output_shapes),
           use_inter_op_parallelism_(use_inter_op_parallelism),
-          captured_func_(std::move(captured_func)) {
+          captured_func_(std::move(captured_func)),
+          map_func_(std::move(map_func)) {
       input_->Ref();
     }
 
@@ -89,26 +132,9 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
         return captured_func_->Instantiate(ctx);
       };
 
-      const string& new_prefix = strings::StrCat(prefix, "::ParallelMap");
-      ParallelMapIteratorFunction map_func =
-          [this, new_prefix](IteratorContext* ctx,
-                             std::vector<Tensor> input_element,
-                             std::vector<Tensor>* result, StatusCallback done) {
-            captured_func_->RunAsync(ctx, std::move(input_element), result,
-                                     std::move(done), new_prefix);
-          };
-      if (!use_inter_op_parallelism_) {
-        map_func = [map_func](
-                       IteratorContext* ctx, std::vector<Tensor> input_element,
-                       std::vector<Tensor>* result, StatusCallback done) {
-          (*ctx->runner())(std::bind(map_func, ctx, std::move(input_element),
-                                     result, std::move(done)));
-        };
-      }
-
-      return NewParallelMapIterator({this, new_prefix}, input_,
-                                    std::move(init_func), std::move(map_func),
-                                    num_parallel_calls_);
+      return NewParallelMapIterator(
+          {this, strings::StrCat(prefix, "::ParallelMap")}, input_,
+          std::move(init_func), map_func_, num_parallel_calls_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -176,6 +202,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     const std::vector<PartialTensorShape> output_shapes_;
     const bool use_inter_op_parallelism_;
     const std::unique_ptr<CapturedFunction> captured_func_;
+    const ParallelMapIteratorFunction map_func_;
   };
 
   DataTypeVector output_types_;
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index 13bd4b6036..ebf41925c9 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -179,7 +180,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   void EnsureRunnerThreadStarted(IteratorContext* ctx)
       EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
     if (!runner_thread_) {
-      std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
+      auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
       runner_thread_.reset(ctx->env()->StartThread(
           {}, "runner_thread",
           std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy)));
@@ -208,15 +209,15 @@ class ParallelMapIterator : public DatasetBaseIterator {
       return;
     }
 
-    // Call `func_(input_element)`, store the result in `result->return_values`,
-    // and notify `result->notification` to unblock a consumer.
     auto done = [this, result](Status status) {
       result->status.Update(status);
       CallCompleted(result);
     };
 
-    map_func_(ctx.get(), std::move(input_element), &result->return_values,
-              std::move(done));
+    // Apply the map function on `input_element`, storing the result in
+    // `result->return_values`, and invoking `done` when finished.
+    map_func_(ctx.get(), prefix(), std::move(input_element),
+              &result->return_values, std::move(done));
   }
 
   Status ProcessResult(const std::shared_ptr<InvocationResult>& result,
@@ -349,9 +350,9 @@ std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBase* input_dataset,
     std::function<Status(IteratorContext*)> init_func,
     ParallelMapIteratorFunction map_func, int32 num_parallel_calls) {
-  return std::unique_ptr<IteratorBase>(
-      new ParallelMapIterator(params, input_dataset, std::move(init_func),
-                              std::move(map_func), num_parallel_calls));
+  return MakeUnique<ParallelMapIterator>(
+      params, input_dataset, std::move(init_func), std::move(map_func),
+      num_parallel_calls);
 }
 
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.h b/tensorflow/core/kernels/data/parallel_map_iterator.h
index dc26c5cf25..813f13c9e4 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.h
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.h
@@ -30,7 +30,7 @@ namespace data {
 // 3. A `std::vector<Tensor>*` to which the function will write the result.
 // 4. A `StatusCallback` that should be invoked when the function is complete.
 using ParallelMapIteratorFunction =
-    std::function<void(IteratorContext*, std::vector<Tensor>,
+    std::function<void(IteratorContext*, const string&, std::vector<Tensor>,
                        std::vector<Tensor>*, StatusCallback)>;
 
 // Returns a new iterator that applies `map_func` to the elements of
diff --git a/tensorflow/core/kernels/data/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/parse_example_dataset_op.cc
index 1d1a717062..7de5ea8860 100644
--- a/tensorflow/core/kernels/data/parse_example_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parse_example_dataset_op.cc
@@ -182,7 +182,7 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      auto map_fn = [this](IteratorContext* ctx,
+      auto map_fn = [this](IteratorContext* ctx, const string& prefix,
                            std::vector<Tensor> input_element,
                            std::vector<Tensor>* result, StatusCallback done) {
         (*ctx->runner())([this, ctx, input_element, result, done]() {
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
index afd0fc3abf..0703955fd4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
@@ -332,6 +332,26 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       for _ in range(10):
         self.assertAllEqual([element for _ in range(10)], sess.run(get_next))
 
+  @parameterized.named_parameters(
+      ("Identity", None, lambda x: x, None),
+      ("Replicate", None, lambda x: (x, x), None),
+      ("Swap", (None, None), lambda x, y: (y, x), None),
+      ("Project", (None, None), lambda x, y: x, None),
+  )
+  def testShortCircuit(self, structure, map_fn, num_parallel_calls):
+    dataset = self.structuredDataset(structure).repeat().apply(
+        batching.map_and_batch(map_fn, batch_size=10))
+    get_next = dataset.make_one_shot_iterator().get_next()
+
+    with self.cached_session() as sess:
+      if isinstance(structure, tuple):
+        expected = map_fn(
+            *sess.run(self.structuredElement(structure, shape=[10])))
+      else:
+        expected = map_fn(
+            sess.run(self.structuredElement(structure, shape=[10])))
+      self.assertAllEqual(expected, sess.run(get_next))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
index 6b7afafa5d..a0c6b37a6d 100644
--- a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
@@ -156,7 +156,7 @@ class FilterDatasetTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testReturnComponent(self):
+  def testShortCircuit(self):
     iterator = (
         dataset_ops.Dataset.zip(
             (dataset_ops.Dataset.range(10),
diff --git a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index 0c372ebb10..6efbe31ca1 100644
--- a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -622,7 +622,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op)
       for i in range(10):
         actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
+        self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
         self.assertSparseValuesEqual(actual, _sparse(i))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -649,7 +649,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op)
       for i in range(10):
         actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
+        self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
         self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -783,19 +783,57 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertTrue(all(tids[0] == tid for tid in tids))
 # pylint: enable=g-long-lambda
 
+  @parameterized.named_parameters(
+      ("SequentialIdentity", None, lambda x: x, None),
+      ("SequentialReplicate", None, lambda x: (x, x), None),
+      ("SequentialSwap", (None, None), lambda x, y: (y, x), None),
+      ("SequentialProject", (None, None), lambda x, y: x, None),
+      ("ParallelIdentity", None, lambda x: x, 10),
+      ("ParallelReplicate", None, lambda x: (x, x), 10),
+      ("ParallelSwap", (None, None), lambda x, y: (y, x), 10),
+      ("ParallelProject", (None, None), lambda x, y: x, 10),
+  )
+  def testShortCircuit(self, structure, map_fn, num_parallel_calls):
+    dataset = self.structuredDataset(structure).repeat().map(
+        map_fn, num_parallel_calls=num_parallel_calls)
+    get_next = dataset.make_one_shot_iterator().get_next()
+
+    with self.cached_session() as sess:
+      if isinstance(structure, tuple):
+        expected = map_fn(*sess.run(self.structuredElement(structure)))
+      else:
+        expected = map_fn(sess.run(self.structuredElement(structure)))
+      self.assertEqual(expected, sess.run(get_next))
+
 
 class MapDatasetBenchmark(test.Benchmark):
 
   def benchmarkChainOfMaps(self):
     chain_lengths = [0, 1, 2, 5, 10, 20, 50]
     for chain_length in chain_lengths:
-      for use_inter_op_parallelism in [False, True]:
+      for mode in ["general", "single-threaded", "short-circuit"]:
+        if mode == "general":
+          map_fn = lambda x: x + 1
+          use_inter_op_parallelism = True
+          print_label = ""
+          benchmark_label = ""
+        if mode == "single-threaded":
+          map_fn = lambda x: x + 1
+          use_inter_op_parallelism = False
+          print_label = " (single threaded mode)"
+          benchmark_label = "_single_threaded"
+        if mode == "short-circuit":
+          map_fn = lambda x: x
+          use_inter_op_parallelism = True  # should not have any significance
+          print_label = " (short circuit mode)"
+          benchmark_label = "_short_circuit"
+
         with ops.Graph().as_default():
           dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
           for _ in range(chain_length):
             dataset = dataset_ops.MapDataset(
                 dataset,
-                lambda x: x,
+                map_fn,
                 use_inter_op_parallelism=use_inter_op_parallelism)
           iterator = dataset.make_one_shot_iterator()
           next_element = iterator.get_next()
@@ -813,25 +851,39 @@ class MapDatasetBenchmark(test.Benchmark):
 
             median_wall_time = np.median(deltas) / 100
             print("Map dataset chain length%s: %d Median wall time: %f" %
-                  (" (single threaded mode)" if not use_inter_op_parallelism
-                   else "", chain_length, median_wall_time))
+                  (print_label, chain_length, median_wall_time))
             self.report_benchmark(
                 iters=1000,
                 wall_time=median_wall_time,
                 name="benchmark_map_dataset_chain_latency_%d%s" %
-                (chain_length, "_single_threaded"
-                 if not use_inter_op_parallelism else ""))
+                (chain_length, benchmark_label))
 
   def benchmarkMapFanOut(self):
     fan_outs = [1, 2, 5, 10, 20, 50, 100]
     for fan_out in fan_outs:
-      for use_inter_op_parallelism in [False, True]:
+      for mode in ["general", "single-threaded", "short-circuit"]:
+        if mode == "general":
+          map_fn = lambda *xs: [x + 1 for x in xs]
+          use_inter_op_parallelism = True
+          print_label = ""
+          benchmark_label = ""
+        if mode == "single-threaded":
+          map_fn = lambda *xs: [x + 1 for x in xs]
+          use_inter_op_parallelism = False
+          print_label = " (single threaded mode)"
+          benchmark_label = "_single_threaded"
+        if mode == "short-circuit":
+          map_fn = lambda *xs: xs
+          use_inter_op_parallelism = True  # should not have any significance
+          print_label = " (short circuit mode)"
+          benchmark_label = "_short_circuit"
+
         with ops.Graph().as_default():
           dataset = dataset_ops.Dataset.from_tensors(
               tuple(0 for _ in range(fan_out))).repeat(None)
           dataset = dataset_ops.MapDataset(
               dataset,
-              lambda *xs: xs,
+              map_fn,
               use_inter_op_parallelism=use_inter_op_parallelism)
           iterator = dataset.make_one_shot_iterator()
           next_element = iterator.get_next()
@@ -849,14 +901,12 @@ class MapDatasetBenchmark(test.Benchmark):
 
             median_wall_time = np.median(deltas) / 100
             print("Map dataset fan out%s: %d Median wall time: %f" %
-                  (" (single threaded mode)" if not use_inter_op_parallelism
-                   else "", fan_out, median_wall_time))
+                  (print_label, fan_out, median_wall_time))
             self.report_benchmark(
                 iters=1000,
                 wall_time=median_wall_time,
-                name="benchmark_map_dataset_fan_out_%d%s" %
-                (fan_out, "_single_threaded"
-                 if not use_inter_op_parallelism else ""))
+                name="benchmark_map_dataset_fan_out_%d%s" % (fan_out,
+                                                             benchmark_label))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py
index b730e10949..b73a94e683 100644
--- a/tensorflow/python/data/kernel_tests/test_base.py
+++ b/tensorflow/python/data/kernel_tests/test_base.py
@@ -19,10 +19,13 @@ from __future__ import print_function
 
 import re
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -107,3 +110,29 @@ class DatasetTestBase(test.TestCase):
       with self.assertRaisesRegexp(exception_class,
                                    re.escape(expected_message)):
         self.evaluate(next2())
+
+  def structuredDataset(self, structure, shape=None, dtype=dtypes.int64):
+    """Returns a singleton dataset with the given structure."""
+    if shape is None:
+      shape = []
+    if structure is None:
+      return dataset_ops.Dataset.from_tensors(
+          array_ops.zeros(shape, dtype=dtype))
+    else:
+      return dataset_ops.Dataset.zip(
+          tuple([
+              self.structuredDataset(substructure, shape, dtype)
+              for substructure in structure
+          ]))
+
+  def structuredElement(self, structure, shape=None, dtype=dtypes.int64):
+    """Returns an element with the given structure."""
+    if shape is None:
+      shape = []
+    if structure is None:
+      return array_ops.zeros(shape, dtype=dtype)
+    else:
+      return tuple([
+          self.structuredElement(substructure, shape, dtype)
+          for substructure in structure
+      ])
-- 
GitLab


From 6123677f264c615042a816e713f7f1204685e544 Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Fri, 5 Oct 2018 14:18:41 -0700
Subject: [PATCH 490/570] Fix bug in nonpip builds in ci_parameterized_build.sh

The extra spaces were confusing bash's string-line-continuation from
the backslash `\` on the previous line.

PiperOrigin-RevId: 215964853
---
 tensorflow/tools/ci_build/ci_parameterized_build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index fdff867ff0..489722c0e9 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -423,7 +423,7 @@ if [[ ${TF_BUILD_IS_PIP} == "no_pip" ]] ||
      [[ ${CTYPE} == "debian.jessie.cpu" ]]; then
     # CPU only command, fully parallel.
     NO_PIP_MAIN_CMD="${MAIN_CMD} ${BAZEL_CMD} ${OPT_FLAG} "\
-      "${EXTRA_ARGS} -- ${BAZEL_TARGET}"
+"${EXTRA_ARGS} -- ${BAZEL_TARGET}"
   elif [[ ${CTYPE} == gpu* ]]; then
     # GPU only command, run as many jobs as the GPU count only.
     NO_PIP_MAIN_CMD="${BAZEL_CMD} ${OPT_FLAG} "\
-- 
GitLab


From c221f04b7efff5929f3a6d090983b52f3aa16166 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Fri, 5 Oct 2018 14:44:47 -0700
Subject: [PATCH 491/570] Automated rollback of commit
 ae0bc6f006497cc04a2ee75166d4ec71c7154fd8

PiperOrigin-RevId: 215969360
---
 tensorflow/core/kernels/data/BUILD            |  14 --
 tensorflow/core/kernels/data/dataset_utils.cc |  47 -----
 tensorflow/core/kernels/data/dataset_utils.h  |  20 --
 .../core/kernels/data/dataset_utils_test.cc   |  46 -----
 .../core/kernels/data/filter_dataset_op.cc    | 162 +++++++++-------
 .../kernels/data/map_and_batch_dataset_op.cc  | 180 +++++++-----------
 .../core/kernels/data/map_dataset_op.cc       |  56 ++----
 .../kernels/data/parallel_map_dataset_op.cc   |  73 +++----
 .../kernels/data/parallel_map_iterator.cc     |  17 +-
 .../core/kernels/data/parallel_map_iterator.h |   2 +-
 .../kernels/data/parse_example_dataset_op.cc  |   2 +-
 .../kernel_tests/map_and_batch_test.py        |  20 --
 .../kernel_tests/filter_dataset_op_test.py    |   2 +-
 .../data/kernel_tests/map_dataset_op_test.py  |  80 ++------
 .../python/data/kernel_tests/test_base.py     |  29 ---
 15 files changed, 230 insertions(+), 520 deletions(-)
 delete mode 100644 tensorflow/core/kernels/data/dataset_utils_test.cc

diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 37c1c54786..451f8c1a6c 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -45,16 +45,6 @@ cc_library(
     ],
 )
 
-tf_cc_test(
-    name = "dataset_utils_test",
-    srcs = ["dataset_utils_test.cc"],
-    deps = [
-        ":dataset_utils",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
-
 cc_library(
     name = "captured_function",
     srcs = ["captured_function.cc"],
@@ -215,7 +205,6 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset",
-        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -243,7 +232,6 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset",
-        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -257,7 +245,6 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset",
-        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -298,7 +285,6 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset",
-        ":dataset_utils",
         ":parallel_map_iterator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index a40f7f2146..e10833f525 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -15,57 +15,10 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/lib/gtl/cleanup.h"
 
 namespace tensorflow {
 namespace data {
 
-Status ComputeShortCircuitIndices(OpKernelContext* ctx,
-                                  const NameAttrList& func,
-                                  std::vector<int>* indices) {
-  FunctionLibraryRuntime::Handle fn_handle;
-  TF_RETURN_IF_ERROR(ctx->function_library()->Instantiate(
-      func.name(), AttrSlice(&func.attr()), &fn_handle));
-  auto cleanup = gtl::MakeCleanup([ctx, fn_handle]() {
-    Status s = ctx->function_library()->ReleaseHandle(fn_handle);
-    if (!s.ok()) {
-      LOG(WARNING) << "Failed to release handle: " << s.error_message();
-    }
-  });
-
-  const FunctionBody* fn_body =
-      ctx->function_library()->GetFunctionBody(fn_handle);
-  indices->resize(fn_body->ret_nodes.size());
-  for (size_t i = 0; i < fn_body->ret_nodes.size(); ++i) {
-    Node* ret_node = fn_body->ret_nodes[i];
-    Node* ret_input_node;
-    TF_RETURN_IF_ERROR(ret_node->input_node(0, &ret_input_node));
-    if (ret_input_node->def().op() == FunctionLibraryDefinition::kArgOp) {
-      TF_RETURN_IF_ERROR(
-          GetNodeAttr(ret_input_node->def(), "index", &((*indices)[i])));
-    } else {
-      indices->clear();
-      break;
-    }
-  }
-  return Status::OK();
-}
-
-std::vector<bool> ComputeMoveVector(const std::vector<int>& indices) {
-  std::map<int, int> last_use;
-  for (size_t i = 0; i < indices.size(); ++i) {
-    last_use[indices[i]] = i;
-  }
-  std::vector<bool> can_move;
-  can_move.resize(indices.size());
-  for (size_t i = 0; i < indices.size(); ++i) {
-    can_move[i] = last_use[indices[i]] == i;
-  }
-  return can_move;
-}
-
 Status MakeIteratorFromInputElement(
     IteratorContext* ctx, const std::vector<Tensor>& input_element,
     int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index d777062293..6ec1350cd4 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -22,26 +22,6 @@ limitations under the License.
 namespace tensorflow {
 namespace data {
 
-// This method is used to determine whether we can short-circuit the evaluation
-// of the user-defined function `func`. Short-circuting is possible if every
-// function output corresponds to one of its inputs (e.g. `f(x) = x`, `f(x,y) =
-// (y,x)`, or `f(x) = (x,x)`).
-//
-// If short-circuiting is possible, the method stores the mapping from output
-// indices to input indices in `indices`. Otherwise, `indices` will be empty.
-//
-// Returns non-ok status if analysis of the function fails.
-//
-// TODO(jsimsa): Extend this to support constants as well.
-Status ComputeShortCircuitIndices(OpKernelContext* ctx,
-                                  const NameAttrList& func,
-                                  std::vector<int>* indices);
-
-// Given a vector that maps output indices to input indices, return a vector
-// that identifies for which output indices can we move the input (assuming
-// output indices are processed left to right).
-std::vector<bool> ComputeMoveVector(const std::vector<int>& indices);
-
 Status MakeIteratorFromInputElement(
     IteratorContext* ctx, const std::vector<Tensor>& input_element,
     int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
diff --git a/tensorflow/core/kernels/data/dataset_utils_test.cc b/tensorflow/core/kernels/data/dataset_utils_test.cc
deleted file mode 100644
index 43295b8ebb..0000000000
--- a/tensorflow/core/kernels/data/dataset_utils_test.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/kernels/data/dataset_utils.h"
-
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-TEST(DatasetUtils, ComputeMoveVector) {
-  struct TestCase {
-    std::vector<int> indices;
-    std::vector<bool> expected;
-  };
-
-  TestCase test_cases[] = {
-      TestCase{{}, {}},
-      TestCase{{1}, {true}},
-      TestCase{{1, 1}, {false, true}},
-      TestCase{{1, 2}, {true, true}},
-      TestCase{{1, 1, 2}, {false, true, true}},
-      TestCase{{1, 2, 2}, {true, false, true}},
-  };
-
-  for (auto& test_case : test_cases) {
-    EXPECT_EQ(test_case.expected, ComputeMoveVector(test_case.indices));
-  }
-}
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index be7d182a1f..00884314a9 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -18,11 +18,9 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -33,84 +31,67 @@ namespace {
 
 class FilterDatasetOp : public UnaryDatasetOpKernel {
  public:
-  using FilterIteratorPredicate =
-      std::function<Status(IteratorContext*, std::vector<Tensor>, bool*)>;
-
   explicit FilterDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {
+      : UnaryDatasetOpKernel(ctx),
+        graph_def_version_(ctx->graph_def_version()) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("predicate", &func_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
+    FunctionLibraryRuntime::Handle pred_handle;
+    OP_REQUIRES_OK(ctx,
+                   ctx->function_library()->Instantiate(
+                       func_.name(), AttrSlice(&func_.attr()), &pred_handle));
+    auto cleanup = gtl::MakeCleanup([ctx, pred_handle]() {
+      OP_REQUIRES_OK(ctx, ctx->function_library()->ReleaseHandle(pred_handle));
+    });
+
+    const FunctionBody* pred_body =
+        ctx->function_library()->GetFunctionBody(pred_handle);
+    OP_REQUIRES(ctx, pred_body->ret_nodes.size() == 1,
+                errors::InvalidArgument(
+                    "predicate function must have a single return value."));
+    Node* ret_node = pred_body->ret_nodes[0];
+    Node* ret_input_node;
+    OP_REQUIRES_OK(ctx, ret_node->input_node(0, &ret_input_node));
+
     std::unique_ptr<CapturedFunction> captured_func;
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
                                                  &captured_func));
 
-    std::vector<int> indices;
-    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
-    OP_REQUIRES(ctx, indices.size() <= 1,
-                errors::InvalidArgument(
-                    "predicate function has more than one return value."));
-
-    FilterIteratorPredicate filter_pred;
-    if (indices.empty()) {
-      CapturedFunction* raw_captured_func = captured_func.get();
-      filter_pred = [raw_captured_func](IteratorContext* ctx,
-                                        const std::vector<Tensor>& args,
-                                        bool* out_matched) {
-        std::vector<Tensor> result;
-        TF_RETURN_IF_ERROR(
-            raw_captured_func->RunWithBorrowedArgs(ctx, args, &result));
-
-        if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
-            result[0].NumElements() != 1) {
-          return errors::InvalidArgument(
-              "Filter predicate `f` must return a scalar bool.");
-        }
-        *out_matched = result[0].scalar<bool>()();
-        return Status::OK();
-      };
+    if (ret_input_node->def().op() == "_Arg") {
+      int32 index = -1;
+      OP_REQUIRES_OK(ctx, GetNodeAttr(ret_input_node->def(), "index", &index));
+      *output = new FilterTensorDataset(ctx, input, func_,
+                                        std::move(captured_func), index);
     } else {
-      filter_pred = [indices](IteratorContext* ctx,
-                              const std::vector<Tensor>& args,
-                              bool* out_matched) {
-        const Tensor& predicate = args[indices[0]];
-        if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
-          return errors::InvalidArgument(
-              "Filter predicate `f` must return a scalar bool.");
-        }
-        *out_matched = predicate.scalar<bool>()();
-        return Status::OK();
-      };
+      *output = new FilterFunctionDataset(ctx, input, func_,
+                                          std::move(captured_func));
     }
-
-    *output = new Dataset(ctx, input, func_, std::move(captured_func),
-                          std::move(filter_pred));
   }
 
  private:
-  class Dataset : public DatasetBase {
+  const int graph_def_version_;
+
+  class FilterDatasetBase : public DatasetBase {
    public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func,
-            std::unique_ptr<CapturedFunction> captured_func,
-            FilterIteratorPredicate filter_pred)
+    FilterDatasetBase(OpKernelContext* ctx, const DatasetBase* input,
+                      const NameAttrList& func,
+                      std::unique_ptr<CapturedFunction> captured_func)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
-          captured_func_(std::move(captured_func)),
-          filter_pred_(std::move(filter_pred)) {
+          captured_func_(std::move(captured_func)) {
       input_->Ref();
     }
 
-    ~Dataset() override { input_->Unref(); }
+    ~FilterDatasetBase() override { input_->Unref(); }
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::Filter")},
-          filter_pred_);
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Filter")}));
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -152,15 +133,17 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       return Status::OK();
     }
 
+    virtual Status EvaluatePredicate(IteratorContext* ctx,
+                                     const std::vector<Tensor>& element,
+                                     bool* out_matched) const = 0;
+
    private:
-    class Iterator : public DatasetIterator<Dataset> {
+    class Iterator : public DatasetIterator<FilterDatasetBase> {
      public:
-      explicit Iterator(const Params& params,
-                        FilterIteratorPredicate filter_pred)
-          : DatasetIterator<Dataset>(params),
+      explicit Iterator(const Params& params)
+          : DatasetIterator<FilterDatasetBase>(params),
             filtered_elements_(0),
-            dropped_elements_(0),
-            filter_pred_(std::move(filter_pred)) {
+            dropped_elements_(0) {
         std::vector<string> components =
             str_util::Split(params.prefix, "::", str_util::SkipEmpty());
         prefix_end_ = components.back();
@@ -197,7 +180,8 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
             return Status::OK();
           }
 
-          TF_RETURN_IF_ERROR(filter_pred_(ctx, *out_tensors, &matched));
+          TF_RETURN_IF_ERROR(
+              dataset()->EvaluatePredicate(ctx, *out_tensors, &matched));
           if (!matched) {
             // Clear the output tensor list since it didn't match.
             out_tensors->clear();
@@ -267,14 +251,64 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       int64 filtered_elements_ GUARDED_BY(mu_);
       int64 dropped_elements_ GUARDED_BY(mu_);
-      const FilterIteratorPredicate filter_pred_;
       string prefix_end_;
     };
 
     const DatasetBase* const input_;
     const NameAttrList func_;
+
+   protected:
     const std::unique_ptr<CapturedFunction> captured_func_;
-    const FilterIteratorPredicate filter_pred_;
+  };
+
+  class FilterFunctionDataset : public FilterDatasetBase {
+   public:
+    using FilterDatasetBase::FilterDatasetBase;
+
+   protected:
+    Status EvaluatePredicate(IteratorContext* ctx,
+                             const std::vector<Tensor>& element,
+                             bool* out_matched) const override {
+      // TODO(mrry): Avoid blocking a threadpool thread. We will need to
+      // stack-rip the iterators and use async kernels.
+      std::vector<Tensor> result;
+      TF_RETURN_IF_ERROR(
+          captured_func_->RunWithBorrowedArgs(ctx, element, &result));
+
+      if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
+          result[0].NumElements() != 1) {
+        return errors::InvalidArgument(
+            "Filter predicate `f` must return a scalar bool.");
+      }
+      *out_matched = result[0].scalar<bool>()();
+      return Status::OK();
+    }
+  };
+
+  class FilterTensorDataset : public FilterDatasetBase {
+   public:
+    FilterTensorDataset(OpKernelContext* ctx, const DatasetBase* input,
+                        const NameAttrList& func,
+                        std::unique_ptr<CapturedFunction> captured_func,
+                        int32 index)
+        : FilterDatasetBase(ctx, input, func, std::move(captured_func)),
+          index_(index) {}
+
+   protected:
+    Status EvaluatePredicate(IteratorContext* ctx,
+                             const std::vector<Tensor>& element,
+                             bool* out_matched) const override {
+      const Tensor& predicate = element[index_];
+      if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
+        return errors::InvalidArgument(
+            "Filter predicate `f` must return a scalar bool.");
+      }
+      *out_matched = predicate.scalar<bool>()();
+      return Status::OK();
+    }
+
+   private:
+    const int32 index_;
   };
 
  private:
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index f9aaa3080e..bf08970560 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/inplace_ops_functor.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -30,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/tracing.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -43,10 +41,6 @@ namespace {
 // transformation more robust.
 class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
  public:
-  using MapAndBatchIteratorFunction =
-      std::function<void(IteratorContext*, const string&, std::vector<Tensor>,
-                         std::shared_ptr<std::vector<Tensor>>, StatusCallback)>;
-
   explicit MapAndBatchDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx),
         op_version_(ctx->def().op() == "MapAndBatchDataset" ? 1 : 2) {
@@ -97,66 +91,31 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
                                                  &captured_func));
 
-    std::vector<int> indices;
-    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
-
-    MapAndBatchIteratorFunction map_func;
-    if (indices.empty()) {
-      CapturedFunction* raw_captured_func = captured_func.get();
-      map_func = [raw_captured_func](
-                     IteratorContext* ctx, const string& prefix,
-                     std::vector<Tensor> args,
-                     std::shared_ptr<std::vector<Tensor>> out_tensors,
-                     StatusCallback done) {
-        raw_captured_func->RunAsync(ctx, std::move(args), out_tensors.get(),
-                                    std::move(done), prefix);
-      };
-    } else {
-      std::vector<bool> can_move = ComputeMoveVector(indices);
-      map_func = [indices, can_move](
-                     IteratorContext* ctx, const string& prefix,
-                     std::vector<Tensor> args,
-                     std::shared_ptr<std::vector<Tensor>> out_tensors,
-                     StatusCallback done) {
-        for (size_t i = 0; i < indices.size(); ++i) {
-          if (can_move[i]) {
-            out_tensors->push_back(std::move(args[indices[i]]));
-          } else {
-            out_tensors->push_back(args[indices[i]]);
-          }
-        }
-        done(Status::OK());
-      };
-    }
-
-    *output = new Dataset(ctx, input, func_, batch_size, num_parallel_calls,
-                          drop_remainder, output_types_, output_shapes_,
-                          std::move(captured_func), &ctx->eigen_cpu_device(),
-                          std::move(map_func));
+    *output = new Dataset(ctx, input, batch_size, num_parallel_calls,
+                          drop_remainder, output_types_, output_shapes_, func_,
+                          std::move(captured_func), &ctx->eigen_cpu_device());
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func, int64 batch_size,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size,
             int64 num_parallel_calls, bool drop_remainder,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
+            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
-            const Eigen::ThreadPoolDevice* device,
-            MapAndBatchIteratorFunction map_func)
+            const Eigen::ThreadPoolDevice* device)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
-          func_(func),
           batch_size_(batch_size),
           num_parallel_calls_(num_parallel_calls),
           drop_remainder_(drop_remainder),
           output_types_(output_types),
           output_shapes_(output_shapes),
+          map_fn_(func),
           captured_func_(std::move(captured_func)),
-          device_(device),
-          map_func_(std::move(map_func)) {
+          device_(device) {
       input_->Ref();
     }
 
@@ -164,9 +123,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::MapAndBatch")},
-          map_func_);
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::MapAndBatch")}));
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -185,7 +143,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, map_fn_.name()));
       Node* input_graph_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* batch_size_node;
@@ -207,7 +165,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         other_arguments_types.emplace_back(t.dtype());
       }
       AttrValue f;
-      b->BuildAttrValue(func_, &f);
+      b->BuildAttrValue(map_fn_, &f);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
 
@@ -227,14 +185,12 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params,
-                        MapAndBatchIteratorFunction map_func)
+      explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
             mu_(std::make_shared<mutex>()),
             cond_var_(std::make_shared<condition_variable>()),
             num_parallel_calls_(std::make_shared<model::SharedState>(
-                params.dataset->num_parallel_calls_, mu_, cond_var_)),
-            map_func_(std::move(map_func)) {}
+                params.dataset->num_parallel_calls_, mu_, cond_var_)) {}
 
       ~Iterator() override {
         mutex_lock l(*mu_);
@@ -341,6 +297,44 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         int64 num_calls;  // access guarded by owner's mutex
       };
 
+      void Callback(const std::shared_ptr<IteratorContext>& ctx,
+                    const std::shared_ptr<BatchResult>& result,
+                    const std::shared_ptr<std::vector<Tensor>>& return_values,
+                    int64 offset, const Status& status) LOCKS_EXCLUDED(*mu_) {
+        result->UpdateStatus(status);
+        if (status.ok()) {
+          EnsureOutputAllocated(ctx, result, return_values);
+          for (size_t i = 0; i < return_values->size(); ++i) {
+            const Tensor& tensor = return_values->at(i);
+            Tensor* batch = &(result->output)[i];
+            if (tensor.NumElements() !=
+                (batch->NumElements() / batch->dim_size(0))) {
+              TensorShape batch_shape = batch->shape();
+              batch_shape.RemoveDim(0);
+              result->UpdateStatus(errors::InvalidArgument(
+                  "Cannot add tensor to the batch: number of elements does not "
+                  "match. Shapes are: [tensor]: ",
+                  tensor.shape().DebugString(),
+                  ", [batch]: ", batch_shape.DebugString()));
+              break;
+            }
+            // TODO(mrry): Add a version of DoParallelConcat that allows us to
+            // move `tensor` where possible, to speed up string tensor batching.
+            Status copy_status = ::tensorflow::functor::DoParallelConcat(
+                *dataset()->device_, tensor, offset, batch);
+            if (!copy_status.ok()) {
+              result->UpdateStatus(copy_status);
+              break;
+            }
+          }
+          {
+            mutex_lock l(result->mu);
+            result->num_elements++;
+          }
+        }
+        CallCompleted(result);
+      }
+
       void CallCompleted(const std::shared_ptr<BatchResult>& result)
           LOCKS_EXCLUDED(*mu_) {
         mutex_lock l(*mu_);
@@ -369,48 +363,21 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           return;
         }
 
-        std::shared_ptr<std::vector<Tensor>> return_values =
-            std::make_shared<std::vector<Tensor>>();
-        auto done = [this, ctx, result, return_values, offset](Status status) {
-          result->UpdateStatus(status);
-          if (status.ok()) {
-            EnsureOutputAllocated(ctx, result, return_values);
-            for (size_t i = 0; i < return_values->size(); ++i) {
-              const Tensor& tensor = return_values->at(i);
-              Tensor* batch = &(result->output)[i];
-              if (tensor.NumElements() !=
-                  (batch->NumElements() / batch->dim_size(0))) {
-                TensorShape batch_shape = batch->shape();
-                batch_shape.RemoveDim(0);
-                result->UpdateStatus(errors::InvalidArgument(
-                    "Cannot add tensor to the batch: number of elements does "
-                    "not match. Shapes are: [tensor]: ",
-                    tensor.shape().DebugString(),
-                    ", [batch]: ", batch_shape.DebugString()));
-                break;
-              }
-              // TODO(mrry): Add a version of DoParallelConcat that allows us to
-              // move `tensor` where possible, to speed up string tensor
-              // batching.
-              Status copy_status = ::tensorflow::functor::DoParallelConcat(
-                  *dataset()->device_, tensor, offset, batch);
-              if (!copy_status.ok()) {
-                result->UpdateStatus(copy_status);
-                break;
-              }
-            }
-            {
-              mutex_lock l(result->mu);
-              result->num_elements++;
-            }
-          }
-          CallCompleted(result);
-        };
-
-        // Apply the map function on `input_element`, storing the result in
-        // `return_values`, and invoking `done` when finished.
-        map_func_(ctx.get(), prefix(), std::move(input_element),
-                  std::move(return_values), std::move(done));
+        // Call `captured_func_(input_element)`, using `Callback` to store the
+        // result in `result`.
+        (*ctx->runner())(std::bind(
+            [this, result, offset](std::shared_ptr<IteratorContext> ctx,
+                                   std::vector<Tensor> input_element) {
+              std::shared_ptr<std::vector<Tensor>> return_values(
+                  new std::vector<Tensor>());
+              dataset()->captured_func_->RunAsync(
+                  ctx.get(), std::move(input_element), return_values.get(),
+                  [this, ctx, result, return_values, offset](Status status) {
+                    Callback(ctx, result, return_values, offset, status);
+                  },
+                  prefix());
+            },
+            ctx, std::move(input_element)));
       }
 
       Status CopyPartialBatch(Tensor* output, const Tensor& value,
@@ -437,7 +404,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       void EnsureRunnerThreadStarted(IteratorContext* ctx)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
-          auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
+          std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
           runner_thread_.reset(ctx->env()->StartThread(
               {}, "runner_thread",
               std::bind(&Iterator::RunnerThread, this, ctx_copy)));
@@ -542,8 +509,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
             while (!busy()) {
               if (call_counter_ % dataset()->batch_size_ == 0) {
-                batch_results_.push_back(
-                    std::make_shared<BatchResult>(dataset()->batch_size_));
+                batch_results_.emplace_back(
+                    new BatchResult(dataset()->batch_size_));
               }
               int64 offset = call_counter_++ % dataset()->batch_size_;
               new_calls.emplace_back(batch_results_.back(), offset);
@@ -560,8 +527,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       Status ReadBatchResult(IteratorContext* ctx, IteratorStateReader* reader,
                              size_t index) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        batch_results_.push_back(
-            std::make_shared<BatchResult>(dataset()->batch_size_));
+        batch_results_.emplace_back(new BatchResult(dataset()->batch_size_));
         std::shared_ptr<BatchResult> result = batch_results_.back();
         string prefix = strings::StrCat("batch_results_", index);
         mutex_lock l(result->mu);
@@ -687,8 +653,6 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       const std::shared_ptr<condition_variable> cond_var_;
       // Identifies the maximum number of parallel calls.
       const std::shared_ptr<model::SharedState> num_parallel_calls_;
-      const MapAndBatchIteratorFunction map_func_;
-
       // Counts the number of outstanding calls for this batch.
       int64 num_calls_ GUARDED_BY(*mu_) = 0;
       // Counts the total number of calls.
@@ -707,9 +671,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     const bool drop_remainder_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
+    const NameAttrList map_fn_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const Eigen::ThreadPoolDevice* device_;  // not owned
-    const MapAndBatchIteratorFunction map_func_;
   };
 
   const int op_version_;
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index 0abb2eb4f3..f112e1dc43 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -17,9 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -30,9 +28,6 @@ namespace {
 
 class MapDatasetOp : public UnaryDatasetOpKernel {
  public:
-  using MapIteratorFunction = std::function<Status(
-      IteratorContext*, std::vector<Tensor>, std::vector<Tensor>*)>;
-
   explicit MapDatasetOp(OpKernelConstruction* ctx) : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
@@ -48,36 +43,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
                                                  use_inter_op_parallelism_,
                                                  &captured_func));
 
-    std::vector<int> indices;
-    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
-
-    MapIteratorFunction map_func;
-    if (indices.empty()) {
-      CapturedFunction* raw_captured_func = captured_func.get();
-      map_func = [raw_captured_func](IteratorContext* ctx,
-                                     std::vector<Tensor> args,
-                                     std::vector<Tensor>* out_tensors) {
-        return raw_captured_func->Run(ctx, std::move(args), out_tensors);
-      };
-    } else {
-      std::vector<bool> can_move = ComputeMoveVector(indices);
-      map_func = [indices, can_move](IteratorContext* ctx,
-                                     std::vector<Tensor> args,
-                                     std::vector<Tensor>* out_tensors) {
-        std::map<int, int> counts;
-        for (size_t i = 0; i < indices.size(); ++i) {
-          if (can_move[i]) {
-            out_tensors->push_back(std::move(args[indices[i]]));
-          } else {
-            out_tensors->push_back(args[indices[i]]);
-          }
-        }
-        return Status::OK();
-      };
-    }
-
     *output = new Dataset(ctx, input, func_, std::move(captured_func),
-                          output_types_, output_shapes_, std::move(map_func));
+                          output_types_, output_shapes_);
   }
 
  private:
@@ -87,15 +54,13 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
             const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes,
-            MapIteratorFunction map_func)
+            const std::vector<PartialTensorShape>& output_shapes)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
           captured_func_(std::move(captured_func)),
           output_types_(output_types),
-          output_shapes_(output_shapes),
-          map_func_(std::move(map_func)) {
+          output_shapes_(output_shapes) {
       input_->Ref();
     }
 
@@ -103,8 +68,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::Map")}, map_func_);
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Map")}));
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -151,8 +116,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params, MapIteratorFunction map_func)
-          : DatasetIterator<Dataset>(params), map_func_(std::move(map_func)) {}
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
@@ -174,7 +139,10 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
           return Status::OK();
         }
 
-        Status s = map_func_(ctx, args, out_tensors);
+        // TODO(mrry): Avoid blocking a threadpool thread. We will need to
+        // stack-rip the iterators and use async kernels.
+        Status s =
+            dataset()->captured_func_->Run(ctx, std::move(args), out_tensors);
         if (errors::IsOutOfRange(s)) {
           // `f` may deliberately raise `errors::OutOfRange` to indicate
           // that we should terminate the iteration early.
@@ -199,7 +167,6 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
      private:
       std::unique_ptr<IteratorBase> input_impl_;
-      const MapIteratorFunction map_func_;
     };
 
     const DatasetBase* const input_;
@@ -207,7 +174,6 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
     const std::unique_ptr<CapturedFunction> captured_func_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
-    const MapIteratorFunction map_func_;
   };
 
   DataTypeVector output_types_;
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index a34bb172d4..6abe6c8338 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/data/parallel_map_iterator.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -57,49 +56,9 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
                                                  use_inter_op_parallelism_,
                                                  &captured_func));
 
-    std::vector<int> indices;
-    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
-
-    ParallelMapIteratorFunction map_func;
-    if (indices.empty()) {
-      CapturedFunction* raw_captured_func = captured_func.get();
-      map_func = [raw_captured_func](IteratorContext* ctx, const string& prefix,
-                                     std::vector<Tensor> args,
-                                     std::vector<Tensor>* out_tensors,
-                                     StatusCallback done) {
-        raw_captured_func->RunAsync(ctx, std::move(args), out_tensors,
-                                    std::move(done), prefix);
-      };
-      if (!use_inter_op_parallelism_) {
-        map_func = [map_func](IteratorContext* ctx, const string& prefix,
-                              std::vector<Tensor> args,
-                              std::vector<Tensor>* out_tensors,
-                              StatusCallback done) {
-          (*ctx->runner())(std::bind(map_func, ctx, prefix, std::move(args),
-                                     out_tensors, std::move(done)));
-        };
-      }
-    } else {
-      std::vector<bool> can_move = ComputeMoveVector(indices);
-      map_func = [indices, can_move](IteratorContext* ctx, const string& prefix,
-                                     std::vector<Tensor> args,
-                                     std::vector<Tensor>* out_tensors,
-                                     StatusCallback done) {
-        std::map<int, int> counts;
-        for (size_t i = 0; i < indices.size(); ++i) {
-          if (can_move[i]) {
-            out_tensors->push_back(std::move(args[indices[i]]));
-          } else {
-            out_tensors->push_back(args[indices[i]]);
-          }
-        }
-        done(Status::OK());
-      };
-    }
-
     *output = new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
                           output_shapes_, use_inter_op_parallelism_,
-                          std::move(captured_func), std::move(map_func));
+                          std::move(captured_func));
   }
 
  private:
@@ -110,8 +69,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
             bool use_inter_op_parallelism,
-            std::unique_ptr<CapturedFunction> captured_func,
-            ParallelMapIteratorFunction map_func)
+            std::unique_ptr<CapturedFunction> captured_func)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
@@ -119,8 +77,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
           output_types_(output_types),
           output_shapes_(output_shapes),
           use_inter_op_parallelism_(use_inter_op_parallelism),
-          captured_func_(std::move(captured_func)),
-          map_func_(std::move(map_func)) {
+          captured_func_(std::move(captured_func)) {
       input_->Ref();
     }
 
@@ -132,9 +89,26 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
         return captured_func_->Instantiate(ctx);
       };
 
-      return NewParallelMapIterator(
-          {this, strings::StrCat(prefix, "::ParallelMap")}, input_,
-          std::move(init_func), map_func_, num_parallel_calls_);
+      const string& new_prefix = strings::StrCat(prefix, "::ParallelMap");
+      ParallelMapIteratorFunction map_func =
+          [this, new_prefix](IteratorContext* ctx,
+                             std::vector<Tensor> input_element,
+                             std::vector<Tensor>* result, StatusCallback done) {
+            captured_func_->RunAsync(ctx, std::move(input_element), result,
+                                     std::move(done), new_prefix);
+          };
+      if (!use_inter_op_parallelism_) {
+        map_func = [map_func](
+                       IteratorContext* ctx, std::vector<Tensor> input_element,
+                       std::vector<Tensor>* result, StatusCallback done) {
+          (*ctx->runner())(std::bind(map_func, ctx, std::move(input_element),
+                                     result, std::move(done)));
+        };
+      }
+
+      return NewParallelMapIterator({this, new_prefix}, input_,
+                                    std::move(init_func), std::move(map_func),
+                                    num_parallel_calls_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -202,7 +176,6 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     const std::vector<PartialTensorShape> output_shapes_;
     const bool use_inter_op_parallelism_;
     const std::unique_ptr<CapturedFunction> captured_func_;
-    const ParallelMapIteratorFunction map_func_;
   };
 
   DataTypeVector output_types_;
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index ebf41925c9..13bd4b6036 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -22,7 +22,6 @@ limitations under the License.
 
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -180,7 +179,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   void EnsureRunnerThreadStarted(IteratorContext* ctx)
       EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
     if (!runner_thread_) {
-      auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
+      std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
       runner_thread_.reset(ctx->env()->StartThread(
           {}, "runner_thread",
           std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy)));
@@ -209,15 +208,15 @@ class ParallelMapIterator : public DatasetBaseIterator {
       return;
     }
 
+    // Call `func_(input_element)`, store the result in `result->return_values`,
+    // and notify `result->notification` to unblock a consumer.
     auto done = [this, result](Status status) {
       result->status.Update(status);
       CallCompleted(result);
     };
 
-    // Apply the map function on `input_element`, storing the result in
-    // `result->return_values`, and invoking `done` when finished.
-    map_func_(ctx.get(), prefix(), std::move(input_element),
-              &result->return_values, std::move(done));
+    map_func_(ctx.get(), std::move(input_element), &result->return_values,
+              std::move(done));
   }
 
   Status ProcessResult(const std::shared_ptr<InvocationResult>& result,
@@ -350,9 +349,9 @@ std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBase* input_dataset,
     std::function<Status(IteratorContext*)> init_func,
     ParallelMapIteratorFunction map_func, int32 num_parallel_calls) {
-  return MakeUnique<ParallelMapIterator>(
-      params, input_dataset, std::move(init_func), std::move(map_func),
-      num_parallel_calls);
+  return std::unique_ptr<IteratorBase>(
+      new ParallelMapIterator(params, input_dataset, std::move(init_func),
+                              std::move(map_func), num_parallel_calls));
 }
 
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.h b/tensorflow/core/kernels/data/parallel_map_iterator.h
index 813f13c9e4..dc26c5cf25 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.h
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.h
@@ -30,7 +30,7 @@ namespace data {
 // 3. A `std::vector<Tensor>*` to which the function will write the result.
 // 4. A `StatusCallback` that should be invoked when the function is complete.
 using ParallelMapIteratorFunction =
-    std::function<void(IteratorContext*, const string&, std::vector<Tensor>,
+    std::function<void(IteratorContext*, std::vector<Tensor>,
                        std::vector<Tensor>*, StatusCallback)>;
 
 // Returns a new iterator that applies `map_func` to the elements of
diff --git a/tensorflow/core/kernels/data/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/parse_example_dataset_op.cc
index 7de5ea8860..1d1a717062 100644
--- a/tensorflow/core/kernels/data/parse_example_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parse_example_dataset_op.cc
@@ -182,7 +182,7 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      auto map_fn = [this](IteratorContext* ctx, const string& prefix,
+      auto map_fn = [this](IteratorContext* ctx,
                            std::vector<Tensor> input_element,
                            std::vector<Tensor>* result, StatusCallback done) {
         (*ctx->runner())([this, ctx, input_element, result, done]() {
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
index 0703955fd4..afd0fc3abf 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
@@ -332,26 +332,6 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       for _ in range(10):
         self.assertAllEqual([element for _ in range(10)], sess.run(get_next))
 
-  @parameterized.named_parameters(
-      ("Identity", None, lambda x: x, None),
-      ("Replicate", None, lambda x: (x, x), None),
-      ("Swap", (None, None), lambda x, y: (y, x), None),
-      ("Project", (None, None), lambda x, y: x, None),
-  )
-  def testShortCircuit(self, structure, map_fn, num_parallel_calls):
-    dataset = self.structuredDataset(structure).repeat().apply(
-        batching.map_and_batch(map_fn, batch_size=10))
-    get_next = dataset.make_one_shot_iterator().get_next()
-
-    with self.cached_session() as sess:
-      if isinstance(structure, tuple):
-        expected = map_fn(
-            *sess.run(self.structuredElement(structure, shape=[10])))
-      else:
-        expected = map_fn(
-            sess.run(self.structuredElement(structure, shape=[10])))
-      self.assertAllEqual(expected, sess.run(get_next))
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
index a0c6b37a6d..6b7afafa5d 100644
--- a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
@@ -156,7 +156,7 @@ class FilterDatasetTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testShortCircuit(self):
+  def testReturnComponent(self):
     iterator = (
         dataset_ops.Dataset.zip(
             (dataset_ops.Dataset.range(10),
diff --git a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index 6efbe31ca1..0c372ebb10 100644
--- a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -622,7 +622,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op)
       for i in range(10):
         actual = sess.run(get_next)
-        self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
+        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
         self.assertSparseValuesEqual(actual, _sparse(i))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -649,7 +649,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op)
       for i in range(10):
         actual = sess.run(get_next)
-        self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
+        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
         self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -783,57 +783,19 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertTrue(all(tids[0] == tid for tid in tids))
 # pylint: enable=g-long-lambda
 
-  @parameterized.named_parameters(
-      ("SequentialIdentity", None, lambda x: x, None),
-      ("SequentialReplicate", None, lambda x: (x, x), None),
-      ("SequentialSwap", (None, None), lambda x, y: (y, x), None),
-      ("SequentialProject", (None, None), lambda x, y: x, None),
-      ("ParallelIdentity", None, lambda x: x, 10),
-      ("ParallelReplicate", None, lambda x: (x, x), 10),
-      ("ParallelSwap", (None, None), lambda x, y: (y, x), 10),
-      ("ParallelProject", (None, None), lambda x, y: x, 10),
-  )
-  def testShortCircuit(self, structure, map_fn, num_parallel_calls):
-    dataset = self.structuredDataset(structure).repeat().map(
-        map_fn, num_parallel_calls=num_parallel_calls)
-    get_next = dataset.make_one_shot_iterator().get_next()
-
-    with self.cached_session() as sess:
-      if isinstance(structure, tuple):
-        expected = map_fn(*sess.run(self.structuredElement(structure)))
-      else:
-        expected = map_fn(sess.run(self.structuredElement(structure)))
-      self.assertEqual(expected, sess.run(get_next))
-
 
 class MapDatasetBenchmark(test.Benchmark):
 
   def benchmarkChainOfMaps(self):
     chain_lengths = [0, 1, 2, 5, 10, 20, 50]
     for chain_length in chain_lengths:
-      for mode in ["general", "single-threaded", "short-circuit"]:
-        if mode == "general":
-          map_fn = lambda x: x + 1
-          use_inter_op_parallelism = True
-          print_label = ""
-          benchmark_label = ""
-        if mode == "single-threaded":
-          map_fn = lambda x: x + 1
-          use_inter_op_parallelism = False
-          print_label = " (single threaded mode)"
-          benchmark_label = "_single_threaded"
-        if mode == "short-circuit":
-          map_fn = lambda x: x
-          use_inter_op_parallelism = True  # should not have any significance
-          print_label = " (short circuit mode)"
-          benchmark_label = "_short_circuit"
-
+      for use_inter_op_parallelism in [False, True]:
         with ops.Graph().as_default():
           dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
           for _ in range(chain_length):
             dataset = dataset_ops.MapDataset(
                 dataset,
-                map_fn,
+                lambda x: x,
                 use_inter_op_parallelism=use_inter_op_parallelism)
           iterator = dataset.make_one_shot_iterator()
           next_element = iterator.get_next()
@@ -851,39 +813,25 @@ class MapDatasetBenchmark(test.Benchmark):
 
             median_wall_time = np.median(deltas) / 100
             print("Map dataset chain length%s: %d Median wall time: %f" %
-                  (print_label, chain_length, median_wall_time))
+                  (" (single threaded mode)" if not use_inter_op_parallelism
+                   else "", chain_length, median_wall_time))
             self.report_benchmark(
                 iters=1000,
                 wall_time=median_wall_time,
                 name="benchmark_map_dataset_chain_latency_%d%s" %
-                (chain_length, benchmark_label))
+                (chain_length, "_single_threaded"
+                 if not use_inter_op_parallelism else ""))
 
   def benchmarkMapFanOut(self):
     fan_outs = [1, 2, 5, 10, 20, 50, 100]
     for fan_out in fan_outs:
-      for mode in ["general", "single-threaded", "short-circuit"]:
-        if mode == "general":
-          map_fn = lambda *xs: [x + 1 for x in xs]
-          use_inter_op_parallelism = True
-          print_label = ""
-          benchmark_label = ""
-        if mode == "single-threaded":
-          map_fn = lambda *xs: [x + 1 for x in xs]
-          use_inter_op_parallelism = False
-          print_label = " (single threaded mode)"
-          benchmark_label = "_single_threaded"
-        if mode == "short-circuit":
-          map_fn = lambda *xs: xs
-          use_inter_op_parallelism = True  # should not have any significance
-          print_label = " (short circuit mode)"
-          benchmark_label = "_short_circuit"
-
+      for use_inter_op_parallelism in [False, True]:
         with ops.Graph().as_default():
           dataset = dataset_ops.Dataset.from_tensors(
               tuple(0 for _ in range(fan_out))).repeat(None)
           dataset = dataset_ops.MapDataset(
               dataset,
-              map_fn,
+              lambda *xs: xs,
               use_inter_op_parallelism=use_inter_op_parallelism)
           iterator = dataset.make_one_shot_iterator()
           next_element = iterator.get_next()
@@ -901,12 +849,14 @@ class MapDatasetBenchmark(test.Benchmark):
 
             median_wall_time = np.median(deltas) / 100
             print("Map dataset fan out%s: %d Median wall time: %f" %
-                  (print_label, fan_out, median_wall_time))
+                  (" (single threaded mode)" if not use_inter_op_parallelism
+                   else "", fan_out, median_wall_time))
             self.report_benchmark(
                 iters=1000,
                 wall_time=median_wall_time,
-                name="benchmark_map_dataset_fan_out_%d%s" % (fan_out,
-                                                             benchmark_label))
+                name="benchmark_map_dataset_fan_out_%d%s" %
+                (fan_out, "_single_threaded"
+                 if not use_inter_op_parallelism else ""))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py
index b73a94e683..b730e10949 100644
--- a/tensorflow/python/data/kernel_tests/test_base.py
+++ b/tensorflow/python/data/kernel_tests/test_base.py
@@ -19,13 +19,10 @@ from __future__ import print_function
 
 import re
 
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.eager import context
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -110,29 +107,3 @@ class DatasetTestBase(test.TestCase):
       with self.assertRaisesRegexp(exception_class,
                                    re.escape(expected_message)):
         self.evaluate(next2())
-
-  def structuredDataset(self, structure, shape=None, dtype=dtypes.int64):
-    """Returns a singleton dataset with the given structure."""
-    if shape is None:
-      shape = []
-    if structure is None:
-      return dataset_ops.Dataset.from_tensors(
-          array_ops.zeros(shape, dtype=dtype))
-    else:
-      return dataset_ops.Dataset.zip(
-          tuple([
-              self.structuredDataset(substructure, shape, dtype)
-              for substructure in structure
-          ]))
-
-  def structuredElement(self, structure, shape=None, dtype=dtypes.int64):
-    """Returns an element with the given structure."""
-    if shape is None:
-      shape = []
-    if structure is None:
-      return array_ops.zeros(shape, dtype=dtype)
-    else:
-      return tuple([
-          self.structuredElement(substructure, shape, dtype)
-          for substructure in structure
-      ])
-- 
GitLab


From 07921022ddc68aacbf210acc62545a90e3091fb1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 14:57:15 -0700
Subject: [PATCH 492/570] Add deprecation call-out for tf_mobile

PiperOrigin-RevId: 215971335
---
 .../lite/g3doc/tfmobile/android_build.md       | 18 +++++++++++++++++-
 .../contrib/lite/g3doc/tfmobile/index.md       | 18 +++++++++++++++++-
 .../contrib/lite/g3doc/tfmobile/ios_build.md   | 18 +++++++++++++++++-
 .../lite/g3doc/tfmobile/linking_libs.md        | 18 +++++++++++++++++-
 .../contrib/lite/g3doc/tfmobile/optimizing.md  | 18 +++++++++++++++++-
 .../lite/g3doc/tfmobile/prepare_models.md      | 18 +++++++++++++++++-
 6 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md b/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
index b0f32a8d6c..2eb776d10c 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
@@ -1,6 +1,22 @@
-
 # Building TensorFlow on Android
 
+Warning: We expect to deprecate TensorFlow Mobile in early 2019
+
+<div class="caution">
+  <p>
+    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
+    working hard to close the feature gap between TensorFlow Mobile and
+    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
+    will give ample notice to our users when we get to that point and will
+    provide help and support to ensure easy migrations.
+  </p>
+  <p>
+    In the meantime, please use TensorFlow Lite. If you have a feature request,
+    such as a missing op, please post to our <a
+    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
+  </p>
+</div>
+
 To get you started working with TensorFlow on Android, we'll walk through two
 ways to build our TensorFlow mobile demos and deploying them on an Android
 device. The first is Android Studio, which lets you build and deploy in an
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/index.md b/tensorflow/contrib/lite/g3doc/tfmobile/index.md
index 49ad35d4e6..15f0fd3961 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/index.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/index.md
@@ -1,6 +1,22 @@
-
 # Overview
 
+Warning: We expect to deprecate TensorFlow Mobile in early 2019
+
+<div class="caution">
+  <p>
+    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
+    working hard to close the feature gap between TensorFlow Mobile and
+    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
+    will give ample notice to our users when we get to that point and will
+    provide help and support to ensure easy migrations.
+  </p>
+  <p>
+    In the meantime, please use TensorFlow Lite. If you have a feature request,
+    such as a missing op, please post to our <a
+    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
+  </p>
+</div>
+
 TensorFlow was designed to be a good deep learning solution for mobile
 platforms. Currently we have two solutions for deploying machine learning
 applications on mobile and embedded devices: TensorFlow for Mobile and
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md b/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
index be8b4100c8..d922907cdc 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
@@ -1,6 +1,22 @@
-
 # Building TensorFlow on iOS
 
+Warning: We expect to deprecate TensorFlow Mobile in early 2019
+
+<div class="caution">
+  <p>
+    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
+    working hard to close the feature gap between TensorFlow Mobile and
+    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
+    will give ample notice to our users when we get to that point and will
+    provide help and support to ensure easy migrations.
+  </p>
+  <p>
+    In the meantime, please use TensorFlow Lite. If you have a feature request,
+    such as a missing op, please post to our <a
+    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
+  </p>
+</div>
+
 ## Using CocoaPods
 
 The simplest way to get started with TensorFlow on iOS is using the CocoaPods
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md b/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
index 4d4bb3bc08..fd0e322c93 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
@@ -1,6 +1,22 @@
-
 # Integrating TensorFlow libraries
 
+Warning: We expect to deprecate TensorFlow Mobile in early 2019
+
+<div class="caution">
+  <p>
+    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
+    working hard to close the feature gap between TensorFlow Mobile and
+    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
+    will give ample notice to our users when we get to that point and will
+    provide help and support to ensure easy migrations.
+  </p>
+  <p>
+    In the meantime, please use TensorFlow Lite. If you have a feature request,
+    such as a missing op, please post to our <a
+    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
+  </p>
+</div>
+
 Once you have made some progress on a model that addresses the problem you’re
 trying to solve, it’s important to test it out inside your application
 immediately. There are often unexpected differences between your training data
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md b/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
index 7436594fd8..59ff8e774c 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
@@ -1,6 +1,22 @@
-
 # Optimizing for mobile
 
+Warning: We expect to deprecate TensorFlow Mobile in early 2019
+
+<div class="caution">
+  <p>
+    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
+    working hard to close the feature gap between TensorFlow Mobile and
+    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
+    will give ample notice to our users when we get to that point and will
+    provide help and support to ensure easy migrations.
+  </p>
+  <p>
+    In the meantime, please use TensorFlow Lite. If you have a feature request,
+    such as a missing op, please post to our <a
+    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
+  </p>
+</div>
+
 There are some special issues that you have to deal with when you’re trying to
 ship on mobile or embedded devices, and you’ll need to think about these as
 you’re developing your model.
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md b/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
index d1c67d4c61..1d373251dd 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
@@ -1,6 +1,22 @@
-
 # Preparing models for mobile deployment
 
+Warning: We expect to deprecate TensorFlow Mobile in early 2019
+
+<div class="caution">
+  <p>
+    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
+    working hard to close the feature gap between TensorFlow Mobile and
+    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
+    will give ample notice to our users when we get to that point and will
+    provide help and support to ensure easy migrations.
+  </p>
+  <p>
+    In the meantime, please use TensorFlow Lite. If you have a feature request,
+    such as a missing op, please post to our <a
+    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
+  </p>
+</div>
+
 The requirements for storing model information during training are very
 different from when you want to release it as part of a mobile app. This section
 covers the tools involved in converting from a training model to something
-- 
GitLab


From 1e104d80826fed95f9fad6f07f68e35cae3527b2 Mon Sep 17 00:00:00 2001
From: Geoffrey Irving <irving@naml.us>
Date: Wed, 19 Sep 2018 09:33:19 -0700
Subject: [PATCH 493/570] Expand stateless random generators to match their
 stateful cousins

stateless_random_uniform now take minval+maxval and handles ints,
and stateless_normal/stateless_truncated_normal take mean+stddev.
Additionally, all of the stateless functions now have proper doc
strings.

This is step one of moving stateless random numbers out of contrib.
---
 tensorflow/contrib/stateless/BUILD            |   5 +-
 tensorflow/contrib/stateless/__init__.py      |   9 +-
 .../kernel_tests/stateless_random_ops_test.py | 156 ++++++-------
 .../contrib/stateless/python/stateless_ops.py | 214 ++++++++++++++++++
 .../api_def_StatelessRandomUniformInt.pbtxt   |  46 ++++
 tensorflow/core/kernels/random_op.cc          |  34 +--
 .../core/kernels/stateless_random_ops.cc      | 155 ++++++++-----
 tensorflow/core/ops/stateless_random_ops.cc   |  53 +++--
 8 files changed, 491 insertions(+), 181 deletions(-)
 create mode 100644 tensorflow/contrib/stateless/python/stateless_ops.py
 create mode 100644 tensorflow/core/api_def/base_api/api_def_StatelessRandomUniformInt.pbtxt

diff --git a/tensorflow/contrib/stateless/BUILD b/tensorflow/contrib/stateless/BUILD
index a217397c1a..e9ddec8889 100644
--- a/tensorflow/contrib/stateless/BUILD
+++ b/tensorflow/contrib/stateless/BUILD
@@ -11,7 +11,10 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 
 py_library(
     name = "stateless",
-    srcs = ["__init__.py"],
+    srcs = [
+        "__init__.py",
+        "python/stateless_ops.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:framework_ops",
diff --git a/tensorflow/contrib/stateless/__init__.py b/tensorflow/contrib/stateless/__init__.py
index fe23fe0dd8..30d0a7ab6a 100644
--- a/tensorflow/contrib/stateless/__init__.py
+++ b/tensorflow/contrib/stateless/__init__.py
@@ -32,16 +32,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import ops
-
 # pylint: disable=wildcard-import
-from tensorflow.python.ops.gen_stateless_random_ops import *
+from tensorflow.contrib.stateless.python.stateless_ops import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 
-ops.NotDifferentiable("StatelessMultinomial")
-ops.NotDifferentiable("StatelessRandomNormal")
-ops.NotDifferentiable("StatelessRandomUniform")
-ops.NotDifferentiable("StatelessTruncatedNormal")
-
 remove_undocumented(__name__)
diff --git a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
index d724a5c014..c0c1430d84 100644
--- a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
+++ b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 import numpy as np
 from tensorflow.contrib import stateless
 from tensorflow.python.framework import constant_op
@@ -27,10 +29,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
-CASES = [(stateless.stateless_random_uniform, random_ops.random_uniform),
-         (stateless.stateless_random_normal, random_ops.random_normal),
-         (stateless.stateless_truncated_normal, random_ops.truncated_normal)]
-
 
 def invert_philox(key, value):
   """Invert the Philox bijection."""
@@ -51,96 +49,102 @@ def invert_philox(key, value):
 
 class StatelessOpsTest(test.TestCase):
 
-  def testMatchStateful(self):
+  def _test_match(self, cases):
     # Stateless ops should be the same as stateful ops on the first call
     # after seed scrambling.
+    cases = tuple(cases)
     key = 0x3ec8f720, 0x02461e29
     for seed in (7, 17), (11, 5), (2, 3):
       preseed = invert_philox(key, (seed[0], 0, seed[1], 0)).astype(np.uint64)
       preseed = preseed[::2] | preseed[1::2] << 32
       random_seed.set_random_seed(seed[0])
       with self.test_session(use_gpu=True):
-        for stateless_op, stateful_op in CASES:
-          for shape in (), (3,), (2, 5):
-            stateful = stateful_op(shape, seed=seed[1])
-            pure = stateless_op(shape, seed=preseed)
-            self.assertAllEqual(stateful.eval(), pure.eval())
+        for stateless_op, stateful_op in cases:
+          stateful = stateful_op(seed=seed[1])
+          pure = stateless_op(seed=preseed)
+          self.assertAllEqual(stateful.eval(), pure.eval())
 
-  def testDeterminism(self):
+  def _test_determinism(self, cases):
     # Stateless values should be equal iff the seeds are equal (roughly)
+    cases = tuple(cases)
     with self.test_session(use_gpu=True):
       for seed_type in [dtypes.int32, dtypes.int64]:
         seed_t = array_ops.placeholder(seed_type, shape=[2])
         seeds = [(x, y) for x in range(5) for y in range(5)] * 3
-        for stateless_op, _ in CASES:
-          for shape in (), (3,), (2, 5):
-            pure = stateless_op(shape, seed=seed_t)
-            values = [(seed, pure.eval(feed_dict={seed_t: seed}))
-                      for seed in seeds]
-            for s0, v0 in values:
-              for s1, v1 in values:
-                self.assertEqual(s0 == s1, np.all(v0 == v1))
-
-  def testShapeType(self):
-    with self.test_session(use_gpu=True):
-      for shape_dtype in [dtypes.int32, dtypes.int64]:
-        seed_t = array_ops.placeholder(dtypes.int64, shape=[2])
-        seeds = [(x, y) for x in range(5) for y in range(5)] * 3
-        for stateless_op, _ in CASES:
-          for shape in (), (3,), (2, 5):
-            pure = stateless_op(constant_op.constant(shape, dtype=shape_dtype),
-                                seed=seed_t)
-            values = [(seed, pure.eval(feed_dict={seed_t: seed}))
-                      for seed in seeds]
-            for s0, v0 in values:
-              for s1, v1 in values:
-                self.assertEqual(s0 == s1, np.all(v0 == v1))
-
-  def testMatchStatefulMultinomial(self):
-    # Stateless ops should be the same as stateful ops on the first call
-    # after seed scrambling.
-    key = 0x3ec8f720, 0x02461e29
-    num_samples = 4
-    for logits_dtype in np.float16, np.float32, np.float64:
-      for output_dtype in dtypes.int32, dtypes.int64:
-        for seed in (7, 17), (11, 5), (2, 3):
-          preseed = invert_philox(key,
-                                  (seed[0], 0, seed[1], 0)).astype(np.uint64)
-          preseed = preseed[::2] | preseed[1::2] << 32
-          random_seed.set_random_seed(seed[0])
-          with self.test_session(use_gpu=True):
-            for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
-                                                      [0.25, 0.75]]):
-              logits_t = constant_op.constant(logits, dtype=logits_dtype)
-              stateful = random_ops.multinomial(
-                  logits_t,
-                  num_samples,
-                  seed=seed[1],
-                  output_dtype=output_dtype)
-              pure = stateless.stateless_multinomial(
-                  logits_t,
-                  num_samples,
-                  seed=preseed,
-                  output_dtype=output_dtype)
-              self.assertAllEqual(stateful.eval(), pure.eval())
+        for stateless_op, _ in cases:
+          pure = stateless_op(seed=seed_t)
+          values = [(seed, pure.eval(feed_dict={seed_t: seed}))
+                    for seed in seeds]
+          for s0, v0 in values:
+            for s1, v1 in values:
+              self.assertEqual(s0 == s1, np.all(v0 == v1))
 
-  def testDeterminismMultinomial(self):
-    # Stateless values should be equal iff the seeds are equal (roughly)
+  def _float_cases(self, shape_dtypes=(None,)):
+    float_cases = (
+        # Uniform distribution, with and without range
+        (stateless.stateless_random_uniform, random_ops.random_uniform, {}),
+        (stateless.stateless_random_uniform, random_ops.random_uniform,
+         dict(minval=2.2, maxval=7.1)),
+        # Normal distribution, with and without mean+stddev
+        (stateless.stateless_random_normal, random_ops.random_normal, {}),
+        (stateless.stateless_random_normal, random_ops.random_normal,
+         dict(mean=2, stddev=3)),
+        # Truncated normal distribution, with and without mean+stddev
+        (stateless.stateless_truncated_normal, random_ops.truncated_normal, {}),
+        (stateless.stateless_truncated_normal, random_ops.truncated_normal,
+         dict(mean=3, stddev=4)),
+    )
+    for dtype in dtypes.float16, dtypes.float32, dtypes.float64:
+      for shape_dtype in shape_dtypes:
+        for shape in (), (3,), (2, 5):
+          if shape_dtype is not None:
+            shape = constant_op.constant(shape, dtype=shape_dtype)
+          for stateless_op, stateful_op, kwds in float_cases:
+            kwds = dict(shape=shape, dtype=dtype, **kwds)
+            yield (functools.partial(stateless_op, **kwds),
+                   functools.partial(stateful_op, **kwds))
+
+  def _int_cases(self, shape_dtypes=(None,)):
+    for shape_dtype in shape_dtypes:
+      for shape in (), (3,), (2, 5):
+        if shape_dtype is not None:
+          shape = constant_op.constant(shape, dtype=shape_dtype)
+        for dtype in dtypes.int32, dtypes.int64:
+          kwds = dict(minval=2, maxval=11111, dtype=dtype, shape=shape)
+          yield (functools.partial(stateless.stateless_random_uniform, **kwds),
+                 functools.partial(random_ops.random_uniform, **kwds))
+
+  def _multinomial_cases(self):
     num_samples = 10
-    with self.test_session(use_gpu=True):
-      for seed_type in [dtypes.int32, dtypes.int64]:
-        seed_t = array_ops.placeholder(seed_type, shape=[2])
-        seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+    for logits_dtype in np.float16, np.float32, np.float64:
+      for output_dtype in dtypes.int32, dtypes.int64:
         for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
                                                   [0.25, 0.75]]):
-          pure = stateless.stateless_multinomial(
-              logits, num_samples, seed=seed_t)
-          values = [
-              (seed, pure.eval(feed_dict={seed_t: seed})) for seed in seeds
-          ]
-          for s0, v0 in values:
-            for s1, v1 in values:
-              self.assertEqual(s0 == s1, np.all(v0 == v1))
+          kwds = dict(logits=constant_op.constant(logits, dtype=logits_dtype),
+                      num_samples=num_samples,
+                      output_dtype=output_dtype)
+          yield (functools.partial(stateless.stateless_multinomial, **kwds),
+                 functools.partial(random_ops.multinomial, **kwds))
+
+  def testMatchFloat(self):
+    self._test_match(self._float_cases())
+
+  def testMatchInt(self):
+    self._test_match(self._int_cases())
+
+  def testMatchMultinomial(self):
+    self._test_match(self._multinomial_cases())
+
+  def testDeterminismFloat(self):
+    self._test_determinism(self._float_cases(
+        shape_dtypes=(dtypes.int32, dtypes.int64)))
+
+  def testDeterminismInt(self):
+    self._test_determinism(self._int_cases(
+        shape_dtypes=(dtypes.int32, dtypes.int64)))
+
+  def testDeterminismMultinomial(self):
+    self._test_determinism(self._multinomial_cases())
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/stateless/python/stateless_ops.py b/tensorflow/contrib/stateless/python/stateless_ops.py
new file mode 100644
index 0000000000..db9b7a87f2
--- /dev/null
+++ b/tensorflow/contrib/stateless/python/stateless_ops.py
@@ -0,0 +1,214 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stateless random ops which take seed as a tensor input."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import gen_stateless_random_ops
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import math_ops
+
+ops.NotDifferentiable("StatelessMultinomial")
+ops.NotDifferentiable("StatelessRandomNormal")
+ops.NotDifferentiable("StatelessRandomUniform")
+ops.NotDifferentiable("StatelessRandomUniformInt")
+ops.NotDifferentiable("StatelessTruncatedNormal")
+
+
+def stateless_random_uniform(shape,
+                             seed,
+                             minval=0,
+                             maxval=None,
+                             dtype=dtypes.float32,
+                             name=None):
+  """Outputs deterministic pseudorandom values from a uniform distribution.
+
+  This is a stateless version of `tf.random_uniform`: if run twice with the
+  same seeds, it will produce the same pseudorandom numbers.  The output is
+  consistent across multiple runs on the same hardware (and between CPU
+  and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
+  hardware.
+
+  The generated values follow a uniform distribution in the range
+  `[minval, maxval)`. The lower bound `minval` is included in the range, while
+  the upper bound `maxval` is excluded.
+
+  For floats, the default range is `[0, 1)`.  For ints, at least `maxval` must
+  be specified explicitly.
+
+  In the integer case, the random integers are slightly biased unless
+  `maxval - minval` is an exact power of two.  The bias is small for values of
+  `maxval - minval` significantly smaller than the range of the output (either
+  `2**32` or `2**64`).
+
+  Args:
+    shape: A 1-D integer Tensor or Python array. The shape of the output tensor.
+    seed: A shape [2] integer Tensor of seeds to the random number generator.
+    minval: A 0-D Tensor or Python value of type `dtype`. The lower bound on the
+      range of random values to generate.  Defaults to 0.
+    maxval: A 0-D Tensor or Python value of type `dtype`. The upper bound on
+      the range of random values to generate.  Defaults to 1 if `dtype` is
+      floating point.
+    dtype: The type of the output: `float16`, `float32`, `float64`, `int32`,
+      or `int64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tensor of the specified shape filled with random uniform values.
+
+  Raises:
+    ValueError: If `dtype` is integral and `maxval` is not specified.
+  """
+  dtype = dtypes.as_dtype(dtype)
+  if dtype not in (dtypes.float16, dtypes.bfloat16, dtypes.float32,
+                   dtypes.float64, dtypes.int32, dtypes.int64):
+    raise ValueError("Invalid dtype %r" % dtype)
+  if maxval is None:
+    if dtype.is_integer:
+      raise ValueError("Must specify maxval for integer dtype %r" % dtype)
+    maxval = 1
+  with ops.name_scope(name, "stateless_random_uniform",
+                      [shape, seed, minval, maxval]) as name:
+    shape = random_ops._ShapeTensor(shape)  # pylint: disable=protected-access
+    minval = ops.convert_to_tensor(minval, dtype=dtype, name="min")
+    maxval = ops.convert_to_tensor(maxval, dtype=dtype, name="max")
+    if dtype.is_integer:
+      return gen_stateless_random_ops.stateless_random_uniform_int(
+          shape, seed=seed, minval=minval, maxval=maxval, name=name)
+    else:
+      rnd = gen_stateless_random_ops.stateless_random_uniform(
+          shape, seed=seed, dtype=dtype)
+      return math_ops.add(rnd * (maxval - minval), minval, name=name)
+
+
+def stateless_random_normal(shape,
+                            seed,
+                            mean=0.0,
+                            stddev=1.0,
+                            dtype=dtypes.float32,
+                            name=None):
+  """Outputs deterministic pseudorandom values from a normal distribution.
+
+  This is a stateless version of `tf.random_normal`: if run twice with the
+  same seeds, it will produce the same pseudorandom numbers.  The output is
+  consistent across multiple runs on the same hardware (and between CPU
+  and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
+  hardware.
+
+  Args:
+    shape: A 1-D integer Tensor or Python array. The shape of the output tensor.
+    seed: A shape [2] integer Tensor of seeds to the random number generator.
+    mean: A 0-D Tensor or Python value of type `dtype`. The mean of the normal
+      distribution.
+    stddev: A 0-D Tensor or Python value of type `dtype`. The standard deviation
+      of the normal distribution.
+    dtype: The type of the output.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tensor of the specified shape filled with random normal values.
+  """
+  with ops.name_scope(name, "stateless_random_normal",
+                      [shape, seed, mean, stddev]) as name:
+    shape = random_ops._ShapeTensor(shape)  # pylint: disable=protected-access
+    mean = ops.convert_to_tensor(mean, dtype=dtype, name="mean")
+    stddev = ops.convert_to_tensor(stddev, dtype=dtype, name="stddev")
+    rnd = gen_stateless_random_ops.stateless_random_normal(shape, seed, dtype)
+    return math_ops.add(rnd * stddev, mean, name=name)
+
+
+def stateless_truncated_normal(shape,
+                               seed,
+                               mean=0.0,
+                               stddev=1.0,
+                               dtype=dtypes.float32,
+                               name=None):
+  """Outputs deterministic pseudorandom values, truncated normally distributed.
+
+  This is a stateless version of `tf.truncated_normal`: if run twice with the
+  same seeds, it will produce the same pseudorandom numbers.  The output is
+  consistent across multiple runs on the same hardware (and between CPU
+  and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
+  hardware.
+
+  The generated values follow a normal distribution with specified mean and
+  standard deviation, except that values whose magnitude is more than 2 standard
+  deviations from the mean are dropped and re-picked.
+
+  Args:
+    shape: A 1-D integer Tensor or Python array. The shape of the output tensor.
+    seed: A shape [2] integer Tensor of seeds to the random number generator.
+    mean: A 0-D Tensor or Python value of type `dtype`. The mean of the
+      truncated normal distribution.
+    stddev: A 0-D Tensor or Python value of type `dtype`. The standard deviation
+      of the normal distribution, before truncation.
+    dtype: The type of the output.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tensor of the specified shape filled with random truncated normal values.
+  """
+  with ops.name_scope(name, "stateless_truncated_normal",
+                      [shape, seed, mean, stddev]) as name:
+    shape = random_ops._ShapeTensor(shape)  # pylint: disable=protected-access
+    mean = ops.convert_to_tensor(mean, dtype=dtype, name="mean")
+    stddev = ops.convert_to_tensor(stddev, dtype=dtype, name="stddev")
+    rnd = gen_stateless_random_ops.stateless_truncated_normal(
+        shape, seed, dtype)
+    return math_ops.add(rnd * stddev, mean, name=name)
+
+
+def stateless_multinomial(logits,
+                          num_samples,
+                          seed,
+                          output_dtype=dtypes.int64,
+                          name=None):
+  """Draws deterministic pseudorandom samples from a multinomial distribution.
+
+  This is a stateless version of `tf.multinomial`: if run twice with the
+  same seeds, it will produce the same pseudorandom numbers.  The output is
+  consistent across multiple runs on the same hardware (and between CPU
+  and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
+  hardware.
+
+  Example:
+
+  ```python
+  # samples has shape [1, 5], where each value is either 0 or 1 with equal
+  # probability.
+  samples = tf.contrib.stateless.stateless_multinomial(
+      tf.log([[10., 10.]]), 5, seed=[7, 17])
+  ```
+
+  Args:
+    logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice
+      `[i, :]` represents the unnormalized log-probabilities for all classes.
+    num_samples: 0-D.  Number of independent samples to draw for each row slice.
+    seed: A shape [2] integer Tensor of seeds to the random number generator.
+    name: Optional name for the operation.
+    output_dtype: integer type to use for the output. Defaults to int64.
+
+  Returns:
+    The drawn samples of shape `[batch_size, num_samples]`.
+  """
+  with ops.name_scope(name, "stateless_multinomial", [logits, seed]):
+    logits = ops.convert_to_tensor(logits, name="logits")
+    return gen_stateless_random_ops.stateless_multinomial(
+        logits, num_samples, seed, output_dtype=output_dtype)
diff --git a/tensorflow/core/api_def/base_api/api_def_StatelessRandomUniformInt.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatelessRandomUniformInt.pbtxt
new file mode 100644
index 0000000000..b6a6dbdf54
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatelessRandomUniformInt.pbtxt
@@ -0,0 +1,46 @@
+op {
+  graph_op_name: "StatelessRandomUniformInt"
+  visibility: HIDDEN
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+2 seeds (shape [2]).
+END
+  }
+  in_arg {
+    name: "minval"
+    description: <<END
+Minimum value (inclusive, scalar).
+END
+  }
+  in_arg {
+    name: "maxval"
+    description: <<END
+Maximum value (exclusive, scalar).
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Random values with specified shape.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs deterministic pseudorandom random integers from a uniform distribution."
+  description: <<END
+The generated values follow a uniform distribution in the range `[minval, maxval)`.
+
+The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
+END
+}
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index 04a53697c0..3810d817ca 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -489,13 +489,15 @@ class RandomGammaOp : public OpKernel {
       Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"),        \
       RandomGammaOp<TYPE>)
 
-#define REGISTER_INT(IntType)                                   \
-  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")              \
-                              .Device(DEVICE_CPU)               \
-                              .HostMemory("shape")              \
-                              .HostMemory("minval")             \
-                              .HostMemory("maxval")             \
-                              .TypeConstraint<IntType>("Tout"), \
+#define REGISTER_INT(IntType)                                                 \
+  template struct functor::FillPhiloxRandom<                                  \
+      CPUDevice, random::UniformDistribution<random::PhiloxRandom, IntType>>; \
+  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")                            \
+                              .Device(DEVICE_CPU)                             \
+                              .HostMemory("shape")                            \
+                              .HostMemory("minval")                           \
+                              .HostMemory("maxval")                           \
+                              .TypeConstraint<IntType>("Tout"),               \
                           RandomUniformIntOp<CPUDevice, IntType>);
 
 TF_CALL_half(REGISTER);
@@ -538,14 +540,16 @@ TF_CALL_int64(REGISTER_INT);
           random::TruncatedNormalDistribution<                                 \
               random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);
 
-#define REGISTER_INT(IntType)                                   \
-  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")              \
-                              .Device(DEVICE_GPU)               \
-                              .HostMemory("shape")              \
-                              .HostMemory("minval")             \
-                              .HostMemory("maxval")             \
-                              .TypeConstraint<int32>("T")       \
-                              .TypeConstraint<IntType>("Tout"), \
+#define REGISTER_INT(IntType)                                                 \
+  template struct functor::FillPhiloxRandom<                                  \
+      GPUDevice, random::UniformDistribution<random::PhiloxRandom, IntType>>; \
+  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")                            \
+                              .Device(DEVICE_GPU)                             \
+                              .HostMemory("shape")                            \
+                              .HostMemory("minval")                           \
+                              .HostMemory("maxval")                           \
+                              .TypeConstraint<int32>("T")                     \
+                              .TypeConstraint<IntType>("Tout"),               \
                           RandomUniformIntOp<GPUDevice, IntType>);
 
 TF_CALL_half(REGISTER);
diff --git a/tensorflow/core/kernels/stateless_random_ops.cc b/tensorflow/core/kernels/stateless_random_ops.cc
index eab176c7fb..925f5291a6 100644
--- a/tensorflow/core/kernels/stateless_random_ops.cc
+++ b/tensorflow/core/kernels/stateless_random_ops.cc
@@ -113,74 +113,109 @@ class StatelessRandomOp : public StatelessRandomOpBase {
   }
 };
 
-#define REGISTER(TYPE)                                                 \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("StatelessRandomUniform")                                   \
-          .Device(DEVICE_CPU)                                          \
-          .HostMemory("shape")                                         \
-          .TypeConstraint<TYPE>("dtype"),                              \
-      StatelessRandomOp<CPUDevice, random::UniformDistribution<        \
-                                       random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("StatelessRandomNormal")                                    \
-          .Device(DEVICE_CPU)                                          \
-          .HostMemory("shape")                                         \
-          .TypeConstraint<TYPE>("dtype"),                              \
-      StatelessRandomOp<CPUDevice, random::NormalDistribution<         \
-                                       random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("StatelessTruncatedNormal")                                 \
-          .Device(DEVICE_CPU)                                          \
-          .HostMemory("shape")                                         \
-          .TypeConstraint<TYPE>("dtype"),                              \
-      StatelessRandomOp<                                               \
-          CPUDevice,                                                   \
-          random::TruncatedNormalDistribution<                         \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
+template <typename Device, typename IntType>
+class StatelessRandomUniformIntOp : public StatelessRandomOpBase {
+ public:
+  using StatelessRandomOpBase::StatelessRandomOpBase;
 
-TF_CALL_half(REGISTER);
-TF_CALL_float(REGISTER);
-TF_CALL_double(REGISTER);
+  void Fill(OpKernelContext* context, random::PhiloxRandom random,
+            Tensor* output) override {
+    const Tensor& minval = context->input(2);
+    const Tensor& maxval = context->input(3);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(minval.shape()),
+                errors::InvalidArgument("minval must be 0-D, got shape ",
+                                        minval.shape().DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(maxval.shape()),
+                errors::InvalidArgument("maxval must be 0-D, got shape ",
+                                        maxval.shape().DebugString()));
+
+    // Verify that minval < maxval.  Note that we'll never reach this point for
+    // empty output.  Zero impossible things are fine.
+    const auto lo = minval.scalar<IntType>()();
+    const auto hi = maxval.scalar<IntType>()();
+    OP_REQUIRES(
+        context, lo < hi,
+        errors::InvalidArgument("Need minval < maxval, got ", lo, " >= ", hi));
+
+    // Build distribution
+    typedef random::UniformDistribution<random::PhiloxRandom, IntType>
+        Distribution;
+    Distribution dist(lo, hi);
+
+    auto flat = output->flat<IntType>();
+    // Reuse the compute kernels from the stateful random ops
+    functor::FillPhiloxRandom<Device, Distribution>()(
+        context, context->eigen_device<Device>(), random, flat.data(),
+        flat.size(), dist);
+  }
+};
 
-#undef REGISTER
+#define REGISTER(DEVICE, TYPE)                                              \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("StatelessRandomUniform")                                        \
+          .Device(DEVICE_##DEVICE)                                          \
+          .HostMemory("shape")                                              \
+          .HostMemory("seed")                                               \
+          .TypeConstraint<TYPE>("dtype"),                                   \
+      StatelessRandomOp<DEVICE##Device, random::UniformDistribution<        \
+                                            random::PhiloxRandom, TYPE> >); \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("StatelessRandomNormal")                                         \
+          .Device(DEVICE_##DEVICE)                                          \
+          .HostMemory("shape")                                              \
+          .HostMemory("seed")                                               \
+          .TypeConstraint<TYPE>("dtype"),                                   \
+      StatelessRandomOp<DEVICE##Device, random::NormalDistribution<         \
+                                            random::PhiloxRandom, TYPE> >); \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("StatelessTruncatedNormal")                                      \
+          .Device(DEVICE_##DEVICE)                                          \
+          .HostMemory("shape")                                              \
+          .HostMemory("seed")                                               \
+          .TypeConstraint<TYPE>("dtype"),                                   \
+      StatelessRandomOp<                                                    \
+          DEVICE##Device,                                                   \
+          random::TruncatedNormalDistribution<                              \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
+
+#define REGISTER_INT(DEVICE, TYPE)                            \
+  REGISTER_KERNEL_BUILDER(Name("StatelessRandomUniformInt")   \
+                              .Device(DEVICE_##DEVICE)        \
+                              .HostMemory("shape")            \
+                              .HostMemory("seed")             \
+                              .HostMemory("minval")           \
+                              .HostMemory("maxval")           \
+                              .TypeConstraint<TYPE>("dtype"), \
+                          StatelessRandomUniformIntOp<DEVICE##Device, TYPE>);
+
+#define REGISTER_CPU(TYPE) REGISTER(CPU, TYPE)
+#define REGISTER_GPU(TYPE) REGISTER(GPU, TYPE)
+#define REGISTER_INT_CPU(TYPE) REGISTER_INT(CPU, TYPE)
+#define REGISTER_INT_GPU(TYPE) REGISTER_INT(GPU, TYPE)
+
+TF_CALL_half(REGISTER_CPU);
+TF_CALL_bfloat16(REGISTER_CPU);
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
+TF_CALL_int32(REGISTER_INT_CPU);
+TF_CALL_int64(REGISTER_INT_CPU);
 
 #if GOOGLE_CUDA
 
-#define REGISTER(TYPE)                                                 \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("StatelessRandomUniform")                                   \
-          .Device(DEVICE_GPU)                                          \
-          .HostMemory("shape")                                         \
-          .HostMemory("seed")                                          \
-          .TypeConstraint<TYPE>("dtype"),                              \
-      StatelessRandomOp<GPUDevice, random::UniformDistribution<        \
-                                       random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("StatelessRandomNormal")                                    \
-          .Device(DEVICE_GPU)                                          \
-          .HostMemory("shape")                                         \
-          .HostMemory("seed")                                          \
-          .TypeConstraint<TYPE>("dtype"),                              \
-      StatelessRandomOp<GPUDevice, random::NormalDistribution<         \
-                                       random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("StatelessTruncatedNormal")                                 \
-          .Device(DEVICE_GPU)                                          \
-          .HostMemory("shape")                                         \
-          .HostMemory("seed")                                          \
-          .TypeConstraint<TYPE>("dtype"),                              \
-      StatelessRandomOp<                                               \
-          GPUDevice,                                                   \
-          random::TruncatedNormalDistribution<                         \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+TF_CALL_int32(REGISTER_INT_GPU);
+TF_CALL_int64(REGISTER_INT_GPU);
 
-TF_CALL_half(REGISTER);
-TF_CALL_float(REGISTER);
-TF_CALL_double(REGISTER);
+#endif  // GOOGLE_CUDA
 
 #undef REGISTER
-
-#endif  // GOOGLE_CUDA
+#undef REGISTER_INT
+#undef REGISTER_CPU
+#undef REGISTER_GPU
+#undef REGISTER_INT_CPU
+#undef REGISTER_INT_GPU
 
 }  // namespace
 
diff --git a/tensorflow/core/ops/stateless_random_ops.cc b/tensorflow/core/ops/stateless_random_ops.cc
index 742709fb18..f919a21d60 100644
--- a/tensorflow/core/ops/stateless_random_ops.cc
+++ b/tensorflow/core/ops/stateless_random_ops.cc
@@ -19,42 +19,55 @@ limitations under the License.
 namespace tensorflow {
 
 using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
-static Status StatelessShape(shape_inference::InferenceContext* context) {
+static Status StatelessShape(InferenceContext* c) {
   // Check seed shape
   ShapeHandle seed;
-  TF_RETURN_IF_ERROR(context->WithRank(context->input(1), 1, &seed));
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &seed));
   DimensionHandle unused;
-  TF_RETURN_IF_ERROR(context->WithValue(context->Dim(seed, 0), 2, &unused));
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(seed, 0), 2, &unused));
 
   // Set output shape
   ShapeHandle out;
-  TF_RETURN_IF_ERROR(context->MakeShapeFromShapeTensor(0, &out));
-  context->set_output(0, out);
+  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
+  c->set_output(0, out);
   return Status::OK();
 }
 
-#define REGISTER_STATELESS_OP(name)                  \
-  REGISTER_OP(name)                                  \
-      .Input("shape: T")                             \
-      .Input("seed: Tseed")                          \
-      .Output("output: dtype")                       \
-      .Attr("dtype: {half,float,double} = DT_FLOAT") \
-      .Attr("T: {int32, int64} = DT_INT32")          \
-      .Attr("Tseed: {int32, int64} = DT_INT64")      \
+#define REGISTER_STATELESS_OP(name)                           \
+  REGISTER_OP(name)                                           \
+      .Input("shape: T")                                      \
+      .Input("seed: Tseed")                                   \
+      .Output("output: dtype")                                \
+      .Attr("dtype: {half,bfloat16,float,double} = DT_FLOAT") \
+      .Attr("T: {int32, int64} = DT_INT32")                   \
+      .Attr("Tseed: {int32, int64} = DT_INT64")               \
       .SetShapeFn(StatelessShape)
 
-// This op is exposed through contrib/stateless only.  The interface may change.
 REGISTER_STATELESS_OP("StatelessRandomUniform");
-
-// This op is exposed through contrib/stateless only.  The interface may change.
 REGISTER_STATELESS_OP("StatelessRandomNormal");
-
-// This op is exposed through contrib/stateless only.  The interface may change.
 REGISTER_STATELESS_OP("StatelessTruncatedNormal");
 
-// This op is exposed through contrib/stateless only.  The interface may change.
+#undef REGISTER_STATELESS_OP
+
+REGISTER_OP("StatelessRandomUniformInt")
+    .Input("shape: T")
+    .Input("seed: Tseed")
+    .Input("minval: dtype")
+    .Input("maxval: dtype")
+    .Output("output: dtype")
+    .Attr("dtype: {int32, int64}")
+    .Attr("T: {int32, int64}")
+    .Attr("Tseed: {int32, int64} = DT_INT64")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      return StatelessShape(c);
+    });
+
 REGISTER_OP("StatelessMultinomial")
     .Input("logits: T")
     .Input("num_samples: int32")
@@ -80,6 +93,4 @@ REGISTER_OP("StatelessMultinomial")
       return Status::OK();
     });
 
-#undef REGISTER_STATELESS_OP
-
 }  // namespace tensorflow
-- 
GitLab


From c966b5eed60a570f2121cb84ddb4ece84c413719 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 15:08:18 -0700
Subject: [PATCH 494/570] Add DistributionStrategy support to moving average
 APIs.

Fixes #21405.

PiperOrigin-RevId: 215973401
---
 tensorflow/contrib/distribute/python/BUILD    |  18 +++
 .../distribute/python/moving_averages_test.py | 141 ++++++++++++++++++
 tensorflow/python/training/moving_averages.py |  49 +++---
 3 files changed, 189 insertions(+), 19 deletions(-)
 create mode 100644 tensorflow/contrib/distribute/python/moving_averages_test.py

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 8267612236..76d5b59ce1 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -411,6 +411,24 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "moving_averages_test",
+    srcs = ["moving_averages_test.py"],
+    additional_deps = [
+        ":combinations",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+    tags = [
+        "no_pip",
+    ],
+)
+
 cuda_py_test(
     name = "optimizer_v2_test",
     srcs = ["optimizer_v2_test.py"],
diff --git a/tensorflow/contrib/distribute/python/moving_averages_test.py b/tensorflow/contrib/distribute/python/moving_averages_test.py
new file mode 100644
index 0000000000..119352ad91
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/moving_averages_test.py
@@ -0,0 +1,141 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training.moving_averages when using a DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training import moving_averages
+
+
+all_combinations = combinations.combine(
+    distribution=[combinations.default_strategy,
+                  combinations.one_device_strategy,
+                  combinations.mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph"])
+
+
+class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(all_combinations)
+  def testTowerModeWithoutZeroDebias(self, distribution):
+    tower_id = [0]
+
+    def tower_fn():
+      var = variables.Variable([10.0, 11.0])
+      val = constant_op.constant([1.0 + tower_id[0], 2.0 - tower_id[0]])
+      tower_id[0] += 1
+      decay = 0.25
+      assign = moving_averages.assign_moving_average(
+          var, val, decay, zero_debias=False)
+      return var, assign
+
+    with distribution.scope(), self.cached_session() as sess:
+      var, assign = distribution.call_for_each_tower(tower_fn)
+      variables.global_variables_initializer().run()
+      self.assertAllClose([10.0, 11.0], var.eval())
+      sess.run(distribution.unwrap(assign))
+      # Mean of val across calls to tower_fn().
+      average_val = [1.0 + 0.5 * (tower_id[0] - 1),
+                     2.0 - 0.5 * (tower_id[0] - 1)]
+      val_weight = 1.0 - 0.25
+      self.assertAllClose(
+          [10.0 * 0.25 + average_val[0] * val_weight,
+           11.0 * 0.25 + average_val[1] * val_weight],
+          var.eval())
+
+  @combinations.generate(all_combinations)
+  def testTowerMode(self, distribution):
+    tower_id = [0]
+
+    def tower_fn():
+      var = variables.Variable([0.0, 0.0])
+      val = constant_op.constant([1.0 + tower_id[0], 2.0 - tower_id[0]])
+      tower_id[0] += 1
+      decay = 0.25
+      assign = moving_averages.assign_moving_average(var, val, decay)
+      return var, assign.op
+
+    with distribution.scope(), self.cached_session() as sess:
+      var, assign_op = distribution.call_for_each_tower(tower_fn)
+      variables.global_variables_initializer().run()
+      self.assertAllClose([0.0, 0.0], var.eval())
+      sess.run(distribution.unwrap(assign_op))
+      # Mean of val across calls to tower_fn().
+      average_val = [1.0 + 0.5 * (tower_id[0] - 1),
+                     2.0 - 0.5 * (tower_id[0] - 1)]
+      self.assertAllClose(average_val, var.eval())
+
+  @combinations.generate(all_combinations)
+  def testCrossTowerWithoutZeroDebias(self, distribution):
+    with distribution.scope(), self.cached_session() as sess:
+      var = variables.Variable([10.0, 11.0])
+      val = constant_op.constant([1.0, 2.0])
+      decay = 0.25
+      # NOTE(josh11b): We currently generate an error if val is a PerDevice value.
+      assign = moving_averages.assign_moving_average(
+          var, val, decay, zero_debias=False)
+
+      variables.global_variables_initializer().run()
+      self.assertAllClose([10.0, 11.0], var.eval())
+      sess.run(assign)
+      average_val = [1.0, 2.0]
+      val_weight = 1.0 - 0.25
+      self.assertAllClose(
+          [10.0 * 0.25 + average_val[0] * val_weight,
+           11.0 * 0.25 + average_val[1] * val_weight],
+          var.eval())
+      # Also try assign.op.
+      sess.run(assign.op)
+      orig_weight = 0.25 * 0.25
+      val_weight = 1.0 - orig_weight
+      self.assertAllClose(
+          [10.0 * orig_weight + average_val[0] * val_weight,
+           11.0 * orig_weight + average_val[1] * val_weight],
+          var.eval())
+
+  @combinations.generate(all_combinations)
+  def testCrossTower(self, distribution):
+    with distribution.scope(), self.cached_session() as sess:
+      var = variables.Variable([0.0, 0.0])
+      val = array_ops.placeholder(dtypes.float32)
+      decay = 0.25
+      # NOTE(josh11b): We currently generate an error if val is a PerDevice value.
+      assign = moving_averages.assign_moving_average(var, val, decay)
+
+      variables.global_variables_initializer().run()
+      self.assertAllClose([0.0, 0.0], var.eval())
+      sess.run(assign, feed_dict={val: [1.0, 2.0]})
+      self.assertAllClose([1.0, 2.0], var.eval())
+
+      # Also try assign.op.
+      sess.run(assign.op, feed_dict={val: [10.0, 0.0]})
+      self.assertAllClose(
+          [(1.0 * 0.25 + 10.0) / (1.0 * 0.25 + 1.0),
+           (2.0 * 0.25 + 0.0) / (1.0 * 0.25 + 1.0)],
+          var.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 041266da3e..89bfcaf4ad 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -25,6 +25,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import slot_creator
 from tensorflow.python.util.tf_export import tf_export
 
@@ -36,9 +37,8 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
   The moving average of 'variable' updated with 'value' is:
     variable * decay + value * (1 - decay)
 
-  The returned Operation sets 'variable' to the newly computed moving average.
-
-  The new value of 'variable' can be set with the 'AssignSub' op as:
+  The returned Operation sets 'variable' to the newly computed moving average,
+  by performing this subtraction:
      variable -= (1 - decay) * (variable - value)
 
   Since variables that are initialized to a `0` value will be `0` biased,
@@ -50,7 +50,7 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
 
   The names of the debias shadow variables, by default, include both the scope
   they were created in and the scope of the variables they debias. They are also
-  given a uniqifying-suffix.
+  given a uniquifying-suffix.
 
   E.g.:
 
@@ -58,8 +58,8 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
     with tf.variable_scope('scope1'):
       with tf.variable_scope('scope2'):
         var = tf.get_variable('foo')
-        tf.assign_moving_average(var, 0.0, 1.0)
-        tf.assign_moving_average(var, 0.0, 0.9)
+        update_1 = tf.assign_moving_average(var, 0.0, 1.0)
+        update_2 = tf.assign_moving_average(var, 0.0, 0.9)
 
     # var.name: 'scope1/scope2/foo'
     # shadow var names: 'scope1/scope2/scope1/scope2/foo/biased'
@@ -76,20 +76,33 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
     name: Optional name of the returned operation.
 
   Returns:
-    A reference to the input 'variable' tensor with the newly computed
-    moving average.
+    A tensor which if evaluated will compute and return the new moving average.
   """
+  def update_fn(v, value, decay=decay):
+    decay = ops.convert_to_tensor(1.0 - decay, name="decay")
+    if decay.dtype != v.dtype.base_dtype:
+      decay = math_ops.cast(decay, v.dtype.base_dtype)
+    if zero_debias:
+      update_delta = _zero_debias(v, value, decay)
+    else:
+      update_delta = (v - value) * decay
+    return state_ops.assign_sub(v, update_delta, name=scope)
+
   with ops.name_scope(name, "AssignMovingAvg",
                       [variable, value, decay]) as scope:
-    with ops.colocate_with(variable):
-      decay = ops.convert_to_tensor(1.0 - decay, name="decay")
-      if decay.dtype != variable.dtype.base_dtype:
-        decay = math_ops.cast(decay, variable.dtype.base_dtype)
-      if zero_debias:
-        update_delta = _zero_debias(variable, value, decay)
-      else:
-        update_delta = (variable - value) * decay
-      return state_ops.assign_sub(variable, update_delta, name=scope)
+    tower_context = distribution_strategy_context.get_tower_context()
+    if tower_context:
+      # In a tower context, we update variable using the mean of value across
+      # towers.
+      def merge_fn(strategy, v, value):
+        value = strategy.reduce(
+            variable_scope.VariableAggregation.MEAN, value, v)
+        return strategy.update(v, update_fn, value)
+
+      return tower_context.merge_call(merge_fn, variable, value)
+    else:
+      strategy = distribution_strategy_context.get_cross_tower_context()
+      return strategy.update(variable, update_fn, value)
 
 
 def weighted_moving_average(value,
@@ -379,8 +392,6 @@ class ExponentialMovingAverage(object):
 
     Raises:
       TypeError: If the arguments are not an allowed type.
-      ValueError: If the moving average of one of the variables is already
-        being computed.
     """
     # TODO(touts): op_scope
     if var_list is None:
-- 
GitLab


From 5ac6e1e4b8318bad2f2bc7e5a08a58a7ed31e4c6 Mon Sep 17 00:00:00 2001
From: Penporn Koanantakool <penporn@google.com>
Date: Fri, 5 Oct 2018 15:43:32 -0700
Subject: [PATCH 495/570] Removes the INTEL_MKL_ML_ONLY option from the
 CMakeLists build file since the main logic for INTEL_MKL_ML_ONLY is getting
 removed in PR#22783. #22783

PiperOrigin-RevId: 215978712
---
 tensorflow/contrib/cmake/CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index f675c135f4..60f53b8b75 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -352,9 +352,7 @@ if (tensorflow_ENABLE_MKL_SUPPORT)
     list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkldnn_STATIC_LIBRARIES})
     list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn_copy_shared_to_destination)
     include_directories(${mkldnn_INCLUDE_DIRS})
-  else (tensorflow_ENABLE_MKLDNN_SUPPORT)
-    add_definitions(-DINTEL_MKL_ML_ONLY)
-  endif()
+  endif(tensorflow_ENABLE_MKLDNN_SUPPORT)
 endif (tensorflow_ENABLE_MKL_SUPPORT)
 
 if (tensorflow_ENABLE_GPU)
-- 
GitLab


From 4aad5382f0e7148d8489d24d8355b828b3f7811b Mon Sep 17 00:00:00 2001
From: Jared Duke <jdduke@google.com>
Date: Fri, 5 Oct 2018 15:43:58 -0700
Subject: [PATCH 496/570] Internal change

PiperOrigin-RevId: 215978771
---
 tensorflow/contrib/lite/java/BUILD            | 95 ++++++++++++++-----
 tensorflow/contrib/lite/java/aar_with_jni.bzl |  5 +-
 .../org/tensorflow/lite/TensorFlowLite.java   | 20 +++-
 .../tensorflow/lite/InterpreterFlexTest.java  | 46 +++++++++
 .../org/tensorflow/lite/InterpreterTest.java  | 14 +++
 5 files changed, 153 insertions(+), 27 deletions(-)
 create mode 100644 tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterFlexTest.java

diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
index 098ba7e773..e68cd26f81 100644
--- a/tensorflow/contrib/lite/java/BUILD
+++ b/tensorflow/contrib/lite/java/BUILD
@@ -11,6 +11,10 @@ load("//tensorflow/java:build_defs.bzl", "JAVACOPTS")
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_jni_binary")
 load("//tensorflow/contrib/lite/java:aar_with_jni.bzl", "aar_with_jni")
 
+JAVA_SRCS = glob([
+    "src/main/java/org/tensorflow/lite/*.java",
+])
+
 # Building tensorflow-lite.aar including 4 variants of .so
 # To build an aar for release, run below command:
 # bazel build --cxxopt='--std=c++11' -c opt --fat_apk_cpu=x86,x86_64,arm64-v8a,armeabi-v7a \
@@ -20,28 +24,38 @@ aar_with_jni(
     android_library = ":tensorflowlite",
 )
 
+# EXPERIMENTAL: AAR target that supports TensorFlow op execution with TFLite.
+aar_with_jni(
+    name = "tensorflow-lite-flex",
+    android_library = ":tensorflowlite_flex",
+)
+
 android_library(
     name = "tensorflowlite",
-    srcs = glob(
-        [
-            "src/main/java/org/tensorflow/lite/*.java",
-        ],
-    ),
+    srcs = JAVA_SRCS,
+    manifest = "AndroidManifest.xml",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":tensorflowlite_native",
+        "@org_checkerframework_qual",
+    ],
+)
+
+# EXPERIMENTAL: Android target that supports TensorFlow op execution with TFLite.
+android_library(
+    name = "tensorflowlite_flex",
+    srcs = JAVA_SRCS,
     manifest = "AndroidManifest.xml",
     visibility = ["//visibility:public"],
     deps = [
-        ":tflite_runtime",
+        ":tensorflowlite_native_flex",
         "@org_checkerframework_qual",
     ],
 )
 
 android_library(
     name = "tensorflowlite_java",
-    srcs = glob(
-        [
-            "src/main/java/org/tensorflow/lite/*.java",
-        ],
-    ),
+    srcs = JAVA_SRCS,
     visibility = ["//visibility:public"],
     deps = [
         "@org_checkerframework_qual",
@@ -50,16 +64,23 @@ android_library(
 
 java_library(
     name = "tensorflowlitelib",
-    srcs = glob(
-        [
-            "src/main/java/org/tensorflow/lite/*.java",
-        ],
-    ),
+    srcs = JAVA_SRCS,
     javacopts = JAVACOPTS,
     visibility = ["//visibility:public"],
     deps = [
         ":libtensorflowlite_jni.so",
-        "//tensorflow/contrib/lite/java/src/main/native",
+        "@org_checkerframework_qual",
+    ],
+)
+
+# EXPERIMENTAL: Java target that supports TensorFlow op execution with TFLite.
+java_library(
+    name = "tensorflowlitelib_flex",
+    srcs = JAVA_SRCS,
+    javacopts = JAVACOPTS,
+    visibility = ["//visibility:public"],
+    deps = [
+        ":libtensorflowlite_flex_jni.so",
         "@org_checkerframework_qual",
     ],
 )
@@ -72,7 +93,6 @@ java_test(
     tags = ["no_oss"],
     test_class = "org.tensorflow.lite.TensorFlowLiteTest",
     deps = [
-        ":libtensorflowlite_jni.so",
         ":tensorflowlitelib",
         "@com_google_truth",
         "@junit",
@@ -87,7 +107,6 @@ java_test(
     tags = ["no_oss"],
     test_class = "org.tensorflow.lite.DataTypeTest",
     deps = [
-        ":libtensorflowlite_jni.so",
         ":tensorflowlitelib",
         "@com_google_truth",
         "@junit",
@@ -110,7 +129,6 @@ java_test(
     tags = ["no_oss"],
     test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest",
     deps = [
-        ":libtensorflowlite_jni.so",
         ":tensorflowlitelib",
         "@com_google_truth",
         "@junit",
@@ -125,19 +143,37 @@ java_test(
     data = [
         "src/testdata/add.bin",
         "src/testdata/mobilenet.tflite.bin",
+        "//tensorflow/contrib/lite:testdata/multi_add_flex.bin",
     ],
     javacopts = JAVACOPTS,
     tags = ["no_oss"],
     test_class = "org.tensorflow.lite.InterpreterTest",
     visibility = ["//visibility:private"],
     deps = [
-        ":libtensorflowlite_jni.so",
         ":tensorflowlitelib",
         "@com_google_truth",
         "@junit",
     ],
 )
 
+java_test(
+    name = "InterpreterFlexTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/InterpreterFlexTest.java"],
+    data = [
+        "//tensorflow/contrib/lite:testdata/multi_add_flex.bin",
+    ],
+    javacopts = JAVACOPTS,
+    tags = ["no_oss"],
+    test_class = "org.tensorflow.lite.InterpreterFlexTest",
+    visibility = ["//visibility:private"],
+    deps = [
+        ":tensorflowlitelib_flex",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
 java_test(
     name = "TensorTest",
     size = "small",
@@ -164,14 +200,29 @@ filegroup(
 )
 
 cc_library(
-    name = "tflite_runtime",
+    name = "tensorflowlite_native",
     srcs = ["libtensorflowlite_jni.so"],
     visibility = ["//visibility:public"],
 )
 
+cc_library(
+    name = "tensorflowlite_native_flex",
+    srcs = ["libtensorflowlite_flex_jni.so"],
+    visibility = ["//visibility:public"],
+)
+
 tflite_jni_binary(
     name = "libtensorflowlite_jni.so",
     deps = [
         "//tensorflow/contrib/lite/java/src/main/native",
     ],
 )
+
+# EXPERIMENTAL: Native target that supports TensorFlow op execution with TFLite.
+tflite_jni_binary(
+    name = "libtensorflowlite_flex_jni.so",
+    deps = [
+        "//tensorflow/contrib/lite/delegates/flex:delegate",
+        "//tensorflow/contrib/lite/java/src/main/native",
+    ],
+)
diff --git a/tensorflow/contrib/lite/java/aar_with_jni.bzl b/tensorflow/contrib/lite/java/aar_with_jni.bzl
index 9d2aead266..360d622b1b 100644
--- a/tensorflow/contrib/lite/java/aar_with_jni.bzl
+++ b/tensorflow/contrib/lite/java/aar_with_jni.bzl
@@ -30,7 +30,10 @@ EOF
         # In some platforms we don't have an Android SDK/NDK and this target
         # can't be built. We need to prevent the build system from trying to
         # use the target in that case.
-        tags = ["manual"],
+        tags = [
+            "manual",
+            "no_cuda_on_cpu_tap",
+        ],
     )
 
     native.genrule(
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java
index 711638a9f9..d5447b3bf8 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java
@@ -18,7 +18,8 @@ package org.tensorflow.lite;
 /** Static utility methods loading the TensorFlowLite runtime. */
 public final class TensorFlowLite {
 
-  private static final String LIBNAME = "tensorflowlite_jni";
+  private static final String PRIMARY_LIBNAME = "tensorflowlite_jni";
+  private static final String FALLBACK_LIBNAME = "tensorflowlite_flex_jni";
 
   private TensorFlowLite() {}
 
@@ -29,13 +30,24 @@ public final class TensorFlowLite {
    * Load the TensorFlowLite runtime C library.
    */
   static boolean init() {
+    Throwable primaryLibException;
     try {
-      System.loadLibrary(LIBNAME);
+      System.loadLibrary(PRIMARY_LIBNAME);
       return true;
     } catch (UnsatisfiedLinkError e) {
-      System.err.println("TensorFlowLite: failed to load native library: " + e.getMessage());
-      return false;
+      primaryLibException = e;
     }
+
+    try {
+      System.loadLibrary(FALLBACK_LIBNAME);
+      return true;
+    } catch (UnsatisfiedLinkError e) {
+      // If the fallback fails, log the error for the primary load instead.
+      System.err.println(
+          "TensorFlowLite: failed to load native library: " + primaryLibException.getMessage());
+    }
+
+    return false;
   }
 
   static {
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterFlexTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterFlexTest.java
new file mode 100644
index 0000000000..2791c3864b
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterFlexTest.java
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import java.io.File;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Unit tests for {@link org.tensorflow.lite.Interpreter} that validate execution with models that
+ * have TensorFlow ops.
+ */
+@RunWith(JUnit4.class)
+public final class InterpreterFlexTest {
+
+  private static final File FLEX_MODEL_FILE =
+      new File("tensorflow/contrib/lite/testdata/multi_add_flex.bin");
+
+  /** Smoke test validating that flex model loading works when the flex delegate is linked. */
+  @Test
+  public void testFlexModel() throws Exception {
+    try (Interpreter interpreter = new Interpreter(FLEX_MODEL_FILE)) {
+      assertThat(interpreter.getInputTensorCount()).isEqualTo(4);
+      assertThat(interpreter.getInputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
+      assertThat(interpreter.getOutputTensorCount()).isEqualTo(4);
+      assertThat(interpreter.getOutputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
+      interpreter.run(new float[1], new float[1]);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
index a98fca0132..f8b73c7cf3 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -43,6 +43,9 @@ public final class InterpreterTest {
   private static final File MOBILENET_MODEL_FILE =
       new File("tensorflow/contrib/lite/java/src/testdata/mobilenet.tflite.bin");
 
+  private static final File FLEX_MODEL_FILE =
+      new File("tensorflow/contrib/lite/testdata/multi_add_flex.bin");
+
   @Test
   public void testInterpreter() throws Exception {
     Interpreter interpreter = new Interpreter(MODEL_FILE);
@@ -345,4 +348,15 @@ public final class InterpreterTest {
     interpreter.close();
     interpreter.close();
   }
+
+  /** Smoke test validating that flex model loading fails when the flex delegate is not linked. */
+  @Test
+  public void testFlexModel() throws Exception {
+    try {
+      new Interpreter(FLEX_MODEL_FILE);
+      fail();
+    } catch (IllegalStateException e) {
+      // Expected failure.
+    }
+  }
 }
-- 
GitLab


From 89c887558d8b0067213c39a79d5d048d3422b6dd Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Fri, 5 Oct 2018 16:02:49 -0700
Subject: [PATCH 497/570] [TF:XLA] Bump open source abseil revision to
 e821380d69a549dc64900693942789d21aa4df5e

PiperOrigin-RevId: 215981413
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index b9ced1bd6c..6f5aa85b01 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -112,11 +112,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "com_google_absl",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
-        sha256 = "507903ef9353cb25cccd0a6840048fdd348fd20e98314d694f04a990c0f277e3",
-        strip_prefix = "abseil-cpp-f21d187b80e3b7f08fb279775ea9c8b48c636030",
+        sha256 = "f186bf5d9fce3037c602a21f86facbdd317adecef36e1726ec7bc7b496943a82",
+        strip_prefix = "abseil-cpp-e821380d69a549dc64900693942789d21aa4df5e",
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/f21d187b80e3b7f08fb279775ea9c8b48c636030.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/f21d187b80e3b7f08fb279775ea9c8b48c636030.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/e821380d69a549dc64900693942789d21aa4df5e.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/e821380d69a549dc64900693942789d21aa4df5e.tar.gz",
         ],
     )
 
-- 
GitLab


From 1daaf0fabee1c59af00e14f358d08ac9f5390b9f Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Fri, 5 Oct 2018 16:32:30 -0700
Subject: [PATCH 498/570] Orders non-resource-affecting stateful ops in defuns.

PiperOrigin-RevId: 215985679
---
 tensorflow/python/eager/function.py                |  7 +++++++
 tensorflow/python/kernel_tests/logging_ops_test.py | 13 +++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 2750461fb2..f06148b5d2 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -1906,8 +1906,10 @@ class AutomaticControlDependencies(object):
               last_op_using_resource_tensor[inp] = op
         ops_which_must_run = set([op])
         continue
+      found_resource = False
       for inp in op.inputs:
         if inp.dtype == dtypes_module.resource:
+          found_resource = True
           # Deal with switches, finally.
           if inp.op.type == "Switch":
             self._process_switch(inp.op, ops_which_must_run,
@@ -1922,6 +1924,11 @@ class AutomaticControlDependencies(object):
           if inp in merge_for_resource:
             merge_for_resource[inp]._add_control_input(op)  # pylint: disable=protected-access
           last_op_using_resource_tensor[inp] = op
+      if (op.op_def.is_stateful and not found_resource
+          and op._control_flow_context is None):  # pylint: disable=protected-access
+        if None in last_op_using_resource_tensor:
+          op._add_control_input(last_op_using_resource_tensor[None])  # pylint: disable=protected-access
+        last_op_using_resource_tensor[None] = op
       control_inputs = [c for c in control_inputs
                         if c._control_flow_context is op._control_flow_context]  # pylint: disable=protected-access
       op._add_control_inputs(control_inputs)  # pylint: disable=protected-access
diff --git a/tensorflow/python/kernel_tests/logging_ops_test.py b/tensorflow/python/kernel_tests/logging_ops_test.py
index 4beddd00bb..2f19ecc0e6 100644
--- a/tensorflow/python/kernel_tests/logging_ops_test.py
+++ b/tensorflow/python/kernel_tests/logging_ops_test.py
@@ -306,6 +306,19 @@ class PrintV2Test(test.TestCase):
           logging_ops.print_v2(tensor)
         self.assertTrue((expected + "\n") in printed.contents())
 
+  def testPrintsOrderedInDefun(self):
+    with context.eager_mode():
+
+      @function.defun
+      def prints():
+        logging_ops.print_v2("A")
+        logging_ops.print_v2("B")
+        logging_ops.print_v2("C")
+
+      with self.captureWritesToStream(sys.stderr) as printed:
+        prints()
+      self.assertTrue(("A\nB\nC\n") in printed.contents())
+
   @test_util.run_in_graph_and_eager_modes()
   def testPrintInDefunWithoutExplicitEvalOfPrint(self):
     @function.defun
-- 
GitLab


From 29af23aeadd1d6fccbfa4223b58dad8f5b8df4f8 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Fri, 5 Oct 2018 16:47:07 -0700
Subject: [PATCH 499/570] Fix api_compatibility_test diff for large files.
 assertEqual might be applied instead of assertMultiLineEqual if input is too
 large (https://bugs.python.org/issue11763). This change is switching to use
 unified_diff in that case.

PiperOrigin-RevId: 215987656
---
 tensorflow/python/util/protobuf/compare.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/util/protobuf/compare.py b/tensorflow/python/util/protobuf/compare.py
index a0e6bf65cf..3a3af4bffa 100644
--- a/tensorflow/python/util/protobuf/compare.py
+++ b/tensorflow/python/util/protobuf/compare.py
@@ -63,6 +63,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import difflib
 
 import six
 
@@ -101,10 +102,19 @@ def assertProtoEqual(self, a, b, check_initialized=True,  # pylint: disable=inva
     if normalize_numbers:
       NormalizeNumberFields(pb)
 
-  self.assertMultiLineEqual(
-      text_format.MessageToString(a, descriptor_pool=pool),
-      text_format.MessageToString(b, descriptor_pool=pool),
-      msg=msg)
+  a_str = text_format.MessageToString(a, descriptor_pool=pool)
+  b_str = text_format.MessageToString(b, descriptor_pool=pool)
+
+  # Some Python versions would perform regular diff instead of multi-line
+  # diff if string is longer than 2**16. We substitute this behavior
+  # with a call to unified_diff instead to have easier-to-read diffs.
+  # For context, see: https://bugs.python.org/issue11763.
+  if len(a_str) < 2**16 and len(b_str) < 2**16:
+    self.assertMultiLineEqual(a_str, b_str, msg=msg)
+  else:
+    diff = '\n' + ''.join(difflib.unified_diff(a_str.splitlines(True),
+                                               b_str.splitlines(True)))
+    self.fail('%s : %s' % (msg, diff))
 
 
 def NormalizeNumberFields(pb):
-- 
GitLab


From 55081a9d21ab42834ac4fb70351e3d2ee13ef78b Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Fri, 5 Oct 2018 16:47:51 -0700
Subject: [PATCH 500/570] [XLA:GPU] Use a struct for the return value of
 CudnnConvolutionAlgorithmPicker::PickBestAlgorithm.

Using a struct lets us return additional data -- namely, the elapsed time to
run the best algo -- without adding a fourth entry to the tuple, which would be
confusing.

No functional change.

PiperOrigin-RevId: 215987795
---
 tensorflow/compiler/xla/service/gpu/BUILD     |  1 +
 .../gpu/cudnn_convolution_algorithm_picker.cc | 40 ++++++++-----------
 .../gpu/cudnn_convolution_algorithm_picker.h  | 11 ++++-
 3 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 522e9f5948..7b84f691f6 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -404,6 +404,7 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:optional",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 7125673887..590c0a7d54 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -145,7 +145,7 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
 // cache misses and doing extra work.  Overall, caching doesn't seem worth the
 // trouble, but we may want to revisit this if we ever find a model where
 // caching would speed up compilation a lot.
-StatusOr<std::tuple<int64, bool, int64>>
+StatusOr<CudnnConvolutionAlgorithmPicker::AutotuneResult>
 CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
     HloCustomCallInstruction* instr) {
   // TODO(timshen): for now only check fp16. It can be expanded to other types,
@@ -316,9 +316,10 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
             << AlgorithmToString(best_result.algorithm()) << ", takes "
             << best_result.elapsed_time_in_ms() << "ms, and uses "
             << best_result_bytes_used << "B of scratch memory.";
-    return std::make_tuple(best_result.algorithm().algo_id(),
-                           best_result.algorithm().tensor_ops_enabled(),
-                           best_result_bytes_used);
+    return AutotuneResult{best_result.algorithm().algo_id(),
+                          best_result.algorithm().tensor_ops_enabled(),
+                          best_result_bytes_used,
+                          absl::Milliseconds(best_result.elapsed_time_in_ms())};
   }
 
   return InternalError(
@@ -331,37 +332,30 @@ StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
     HloInstruction* instr) {
   CHECK(IsCustomCallToDnnConvolution(*instr));
 
-  StatusOr<std::tuple<int64, bool, int64>> alg_scratch_and_tc =
+  StatusOr<AutotuneResult> best_algo_or =
       PickBestAlgorithm(Cast<HloCustomCallInstruction>(instr));
-
-  if (!alg_scratch_and_tc.ok()) {
-    LOG(ERROR) << alg_scratch_and_tc.status();
+  if (!best_algo_or.ok()) {
+    LOG(ERROR) << best_algo_or.status();
     return false;
   }
 
-  int64 algorithm;
-  bool tensor_ops_enabled;
-  int64 scratch_bytes;
-
-  std::tie(algorithm, tensor_ops_enabled, scratch_bytes) =
-      alg_scratch_and_tc.ConsumeValueOrDie();
-
-  VLOG(1) << "Setting cudnn conv to use algorithm " << algorithm << " and "
-          << NumBytesToString(scratch_bytes)
+  auto best_algo = std::move(best_algo_or).ValueOrDie();
+  VLOG(1) << "Setting cudnn conv to use algorithm " << best_algo.algorithm
+          << " and " << NumBytesToString(best_algo.scratch_bytes)
           << " of scratch memory: " << instr->ToString()
-          << " tensor_ops_enabled: " << tensor_ops_enabled;
+          << " tensor_ops_enabled: " << best_algo.tensor_ops_enabled;
 
   // Replace instr with a new CustomCall which has the correct algorithm, and
   // whose output shape has the appropriate amount of scratch memory.
   HloComputation* computation = instr->parent();
-  Shape new_call_shape =
-      ShapeUtil::MakeTupleShape({instr->shape().tuple_shapes(0),
-                                 ShapeUtil::MakeShape(U8, {scratch_bytes})});
+  Shape new_call_shape = ShapeUtil::MakeTupleShape(
+      {instr->shape().tuple_shapes(0),
+       ShapeUtil::MakeShape(U8, {best_algo.scratch_bytes})});
 
   TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
                       instr->backend_config<CudnnConvBackendConfig>());
-  backend_config.set_algorithm(algorithm);
-  backend_config.set_tensor_ops_enabled(tensor_ops_enabled);
+  backend_config.set_algorithm(best_algo.algorithm);
+  backend_config.set_tensor_ops_enabled(best_algo.tensor_ops_enabled);
 
   HloInstruction* new_call = computation->AddInstruction(
       instr->CloneWithNewOperands(new_call_shape, instr->operands()));
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
index aeda2fc7f8..136c32210a 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
 
+#include "absl/time/time.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -47,10 +48,16 @@ class CudnnConvolutionAlgorithmPicker : public HloModulePass {
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
+  struct AutotuneResult {
+    int64 algorithm;
+    bool tensor_ops_enabled;
+    int64 scratch_bytes;
+    absl::Duration runtime;
+  };
+
   StatusOr<bool> RunOnComputation(HloComputation* computation);
   StatusOr<bool> RunOnInstruction(HloInstruction* instr);
-  StatusOr<std::tuple<int64, bool, int64>> PickBestAlgorithm(
-      HloCustomCallInstruction* instr);
+  StatusOr<AutotuneResult> PickBestAlgorithm(HloCustomCallInstruction* instr);
 
   se::StreamExecutor* stream_exec_;                   // never null
   DeviceMemoryAllocator* allocator_;                  // may be null
-- 
GitLab


From ab97f1323bd2a98d20ed82dc3ff8585481961f0d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 16:59:39 -0700
Subject: [PATCH 501/570] Automated rollback of commit
 d258207f1583df4faa452265b051879af6c15dac

PiperOrigin-RevId: 215989111
---
 tensorflow/python/ops/array_ops.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 4be9c532f4..e3e4d5f910 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1407,8 +1407,13 @@ def transpose(a, perm=None, name="transpose", conjugate=False):
         gen_array_ops.conjugate_transpose
         if (conjugate and a.dtype.is_complex) else gen_array_ops.transpose)
     if perm is None:
-      rank = gen_array_ops.rank(a)
-      perm = (rank - 1) - gen_math_ops._range(0, rank, 1)
+      a = ops.convert_to_tensor(a, name="a")
+      if not a.get_shape().ndims:
+        rank = gen_array_ops.rank(a)
+        perm = (rank - 1) - gen_math_ops._range(0, rank, 1)
+      else:
+        rank = a.get_shape().ndims
+        perm = (rank - 1) - np.arange(rank)
       ret = transpose_fn(a, perm, name=name)
       # NOTE(mrry): Setting the shape explicitly because
       #   reverse is not handled by the shape function.
-- 
GitLab


From 15d399cd8590c18dc643d979883fe4201c8ea631 Mon Sep 17 00:00:00 2001
From: Rachel Lim <rachelim@google.com>
Date: Fri, 5 Oct 2018 17:01:01 -0700
Subject: [PATCH 502/570] [tf.data vectorization] Feed inputs to vectorizers
 with notion of stackedness

PiperOrigin-RevId: 215989259
---
 .../optimizers/data/vectorization/BUILD       |  10 ++
 .../data/vectorization/cast_vectorizer.cc     |  16 +--
 .../data/vectorization/unpack_vectorizer.cc   |  16 +--
 .../data/vectorization/vectorizer.h           |  19 ++-
 .../data/vectorization/vectorizer_registry.cc |   2 -
 .../data/vectorization/vectorizer_registry.h  |  15 +--
 .../vectorization/vectorizer_registry_test.cc |  11 +-
 .../data/vectorization/wrapped_tensor.h       |  44 +++++++
 .../optimizers/data/vectorization_utils.cc    | 116 +++++++++---------
 9 files changed, 144 insertions(+), 105 deletions(-)
 create mode 100644 tensorflow/core/grappler/optimizers/data/vectorization/wrapped_tensor.h

diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/BUILD b/tensorflow/core/grappler/optimizers/data/vectorization/BUILD
index 37aa24b947..985d6c6c3a 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/BUILD
@@ -12,10 +12,20 @@ VECTORIZER_DEPS = [
     "//tensorflow/core/grappler/optimizers/data:graph_utils",
 ] + tf_protos_all()
 
+cc_library(
+    name = "wrapped_tensor",
+    hdrs = ["wrapped_tensor.h"],
+    deps = [
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "vectorizer",
     hdrs = ["vectorizer.h"],
     deps = [
+        ":wrapped_tensor",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib",
     ] + tf_protos_all(),
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/cast_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/cast_vectorizer.cc
index 3af6bab409..f445157531 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/cast_vectorizer.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/cast_vectorizer.cc
@@ -19,13 +19,13 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace vectorization_utils {
+namespace {
 
 class CastVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<Port>* input_ports,
-                   std::vector<Port>* output_ports) override {
+                   std::vector<WrappedTensor>&& inputs,
+                   std::vector<WrappedTensor>* outputs) override {
     Status s;
     if (node.num_inputs() != 1) {
       return errors::Internal("Cast op should only have one input.");
@@ -35,15 +35,17 @@ class CastVectorizer : public Vectorizer {
     auto new_cast_node = outer_scope->AddNode(node.def(), &s);
     TF_RETURN_IF_ERROR(s);
 
-    // Add input and output mappings
-    input_ports->push_back({new_cast_node, 0});
-    output_ports->push_back({new_cast_node, 0});
+    outer_scope->AddEdge(inputs[0].node, inputs[0].output_index, new_cast_node,
+                         0);
+
+    // Add output mappings
+    outputs->push_back({new_cast_node, 0, true});
     return Status::OK();
   }
 };
 
 REGISTER_VECTORIZER("Cast", CastVectorizer);
 
-}  // namespace vectorization_utils
+}  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc
index 74ce520ce1..f1ba741821 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc
@@ -19,15 +19,15 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace vectorization_utils {
+namespace {
 
 class UnpackVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<Port>* input_ports,
-                   std::vector<Port>* output_ports) override {
+                   std::vector<WrappedTensor>&& inputs,
+                   std::vector<WrappedTensor>* outputs) override {
     Status s;
-    if (node.num_inputs() != 1) {
+    if (node.num_inputs() != 1 || inputs.size() != 1) {
       return errors::Internal("Unpack op should only have one input.");
     }
 
@@ -39,13 +39,13 @@ class UnpackVectorizer : public Vectorizer {
     int new_axis = node.def().attr().at("axis").i() + 1;
     new_unpack_node->AddAttr("axis", new_axis);
 
-    // Add the input mappings
-    input_ports->push_back({new_unpack_node, 0});
+    outer_scope->AddEdge(inputs[0].node, inputs[0].output_index,
+                         new_unpack_node, 0);
 
     // Add the output mappings
     int num = node.def().attr().at("num").i();
     for (int i = 0; i < num; ++i) {
-      output_ports->push_back({new_unpack_node, i});
+      outputs->push_back({new_unpack_node, i, true});
     }
 
     return Status::OK();
@@ -54,6 +54,6 @@ class UnpackVectorizer : public Vectorizer {
 
 REGISTER_VECTORIZER("Unpack", UnpackVectorizer);
 
-}  // namespace vectorization_utils
+}  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h
index 56eb88c95e..8d4676aae0 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h
@@ -18,15 +18,12 @@ limitations under the License.
 
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/grappler/optimizers/data/vectorization/wrapped_tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
-namespace vectorization_utils {
-
-// Describes a tensor with its operation Node and output position
-typedef std::pair<Node*, int> Port;
 
 // Interface for vectorization of TensorFlow operations. See `CastVectorizer`
 // for an example.
@@ -36,17 +33,17 @@ class Vectorizer {
 
   // Vectorizes an operation, `node`, by adding Node(s) to `outer_scope`
   // that produce the same vector output(s) as executing `node`'s op
-  // on elements of the vector inputs. The new Node(s) collectively have the
+  // on elements of `inputs`. The new Node(s) collectively have the
   // same number of input and output ports as the node being converted.
-  // Adds mappings for the new nodes' input and output ports to `inputs` and
-  // `outputs` respectively, where the i'th Port in inputs/outputs
-  // corresponds to the i'th input/output port of the node to be converted.
+  // Adds edges between the newly created nodes and nodes in `inputs`, and adds
+  // mappings to the new nodes' output ports to `outputs`, where the i'th
+  // value in `outputs` corresponds to the i'th output port of the node
+  // to be converted.
   virtual Status Vectorize(const Node& node, Graph* outer_scope,
-                           std::vector<Port>* input_ports,
-                           std::vector<Port>* output_ports) = 0;
+                           std::vector<WrappedTensor>&& inputs,
+                           std::vector<WrappedTensor>* outputs) = 0;
 };
 
-}  // namespace vectorization_utils
 }  // namespace grappler
 }  // namespace tensorflow
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_VECTORIZATION_VECTORIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.cc b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.cc
index a6551e36ac..e1cf77a7d5 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace vectorization_utils {
 
 VectorizerRegistry* VectorizerRegistry::Global() {
   static VectorizerRegistry* registry = new VectorizerRegistry;
@@ -42,6 +41,5 @@ void VectorizerRegistry::Register(const string& op_type,
   vectorizers_.insert(std::pair<const string&, std::unique_ptr<Vectorizer>>(
       op_type, std::move(vectorizer)));
 }
-}  // namespace vectorization_utils
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h
index 16159d47ca..ad54c74933 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h
@@ -23,7 +23,6 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace vectorization_utils {
 
 // A global VectorizerRegistry is used to hold all the vectorizers.
 class VectorizerRegistry {
@@ -59,16 +58,12 @@ class VectorizerRegistration {
 #define REGISTER_VECTORIZER_UNIQ_HELPER(ctr, op_type, vectorizer) \
   REGISTER_VECTORIZER_UNIQ(ctr, op_type, vectorizer)
 
-#define REGISTER_VECTORIZER_UNIQ(ctr, op_type, vectorizer)                  \
-  static ::tensorflow::grappler::vectorization_utils::                      \
-      vectorizer_registration::VectorizerRegistration                       \
-          vectorizer_registration_##ctr(                                    \
-              op_type,                                                      \
-              ::std::unique_ptr<                                            \
-                  ::tensorflow::grappler::vectorization_utils::Vectorizer>( \
-                  new vectorizer()))
+#define REGISTER_VECTORIZER_UNIQ(ctr, op_type, vectorizer)                \
+  static ::tensorflow::grappler::vectorizer_registration::                \
+      VectorizerRegistration vectorizer_registration_##ctr(               \
+          op_type, ::std::unique_ptr<::tensorflow::grappler::Vectorizer>( \
+                       new vectorizer()))
 
-}  // namespace vectorization_utils
 }  // namespace grappler
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
index 663ceba027..054aeb9a8f 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
@@ -20,13 +20,12 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace vectorization_utils {
 
 class TestVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<Port>* inputs,
-                   std::vector<Port>* outputs) override {
+                   std::vector<WrappedTensor>&& inputs,
+                   std::vector<WrappedTensor>* outputs) override {
     return Status::OK();
   }
 };
@@ -43,10 +42,10 @@ TEST(TestVectorizer, TestTestVectorizer) {
   NodeDef node_def;
   Status s;
   Node* node = g.AddNode(node_def, &s);
-  std::vector<Port> inputs, outputs;
-  EXPECT_TRUE(vectorizer->Vectorize(*node, &g, &inputs, &outputs).ok());
+  std::vector<WrappedTensor> inputs, outputs;
+  EXPECT_TRUE(
+      vectorizer->Vectorize(*node, &g, std::move(inputs), &outputs).ok());
 }
 
-}  // namespace vectorization_utils
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/wrapped_tensor.h b/tensorflow/core/grappler/optimizers/data/vectorization/wrapped_tensor.h
new file mode 100644
index 0000000000..4439b4ab4e
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/wrapped_tensor.h
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_VECTORIZATION_WRAPPED_TENSOR_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_VECTORIZATION_WRAPPED_TENSOR_H_
+
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Represents a tensor that has been vectorized.
+struct WrappedTensor {
+  Node* const node;
+  const int output_index;
+
+  // Whether the tensor is stacked, i.e. represents the results of applying
+  // the operation on all slices of the input, where each row i of the
+  // tensor corresponds to the op's output on slice i of the input. False
+  // if the tensor is not stacked, i.e. represents the result of the op on
+  // a single slice of the input, where the result does not vary between
+  // slices.
+  bool stacked;
+
+  WrappedTensor(Node* node, int output_index, bool stacked)
+      : node(node), output_index(output_index), stacked(stacked) {}
+};
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_VECTORIZATION_WRAPPED_TENSOR_H_
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
index 344c420902..ba857ab5d9 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
@@ -45,22 +45,6 @@ namespace {
 // Describes a tensor with its operation Node and output position
 typedef std::pair<Node*, int> TensorDesc;
 
-// Equivalent to python Pfor's WrappedTensor struct
-struct WrappedTensor {
-  TensorDesc tensor;
-
-  // Whether the tensor is stacked, i.e. represents the results of applying
-  // the operation on all slices of the input, where each row i of the
-  // tensor corresponds to the op's output on slice i of the input. False
-  // if the tensor is not stacked, i.e. represents the result of the op on
-  // a single slice of the input, where the result does not vary between
-  // slices.
-  bool stacked;
-
-  WrappedTensor(TensorDesc&& tensor, bool stacked)
-      : tensor(std::move(tensor)), stacked(stacked) {}
-};
-
 const char* const kRetValOp = "_Retval";
 
 void ReplaceEdgeSources(const TensorDesc& old_src, const TensorDesc& new_src,
@@ -239,34 +223,48 @@ Status Vectorization::AddConversionMapping(Node* op_node) {
     return errors::Unimplemented("No vectorizer registered for op: ",
                                  op_node->type_string());
   }
-  std::vector<Port> input_ports, output_ports;
-  input_ports.reserve(op_node->num_inputs());
-  output_ports.reserve(op_node->num_outputs());
-  TF_RETURN_IF_ERROR(vectorizer->Vectorize(*op_node, outer_scope_.get(),
-                                           &input_ports, &output_ports));
+  std::vector<WrappedTensor> inputs, outputs;
+  inputs.reserve(op_node->num_inputs());
+  outputs.reserve(op_node->num_outputs());
 
   std::vector<const Edge*> input_edges;
   TF_RETURN_IF_ERROR(op_node->input_edges(&input_edges));
 
-  if (op_node->num_outputs() != output_ports.size() ||
-      op_node->num_inputs() != input_ports.size() ||
-      input_edges.size() != input_ports.size()) {
-    return errors::Internal("Vectorizer inputs/outputs don't match.");
-  }
-
-  // Promote the inputs of the op to MapDefun outputs and connect the edges
-  // accordingly.
+  // The inputs for the node to be converted may already have been converted
+  // themselves. For those that are not, we promote them to MapDefun outputs.
   for (size_t i = 0; i < op_node->num_inputs(); ++i) {
     auto edge = input_edges[i];
-    TF_RETURN_IF_ERROR(AddMapDefunOutput(map_defun_fn_.get(), map_defun_node_,
-                                         {edge->src(), edge->src_output()}));
-    outer_scope_->AddEdge(map_defun_node_, map_defun_fn_->ret_nodes.size() - 1,
-                          input_ports[i].first, input_ports[i].second);
+    if (auto found = gtl::FindOrNull(conversion_map_,
+                                     {edge->src(), edge->src_output()})) {
+      inputs.push_back(*found);
+    } else {
+      // TODO(rachelim): Handle the case where unconverted inputs are unstacked.
+      // We assume that all unconverted inputs will be stacked, since we
+      // converted all unstacked nodes in `Initialize`. However, it's actually
+      // possible that yet-unconverted nodes may produce unstacked outputs after
+      // they are vectorized. (For example, see the "Shape" converter in
+      // tensorflow/python/ops/parallel_for/pfor.py). If a vectorizer expects
+      // an unstacked input but receives a stacked one, vectorizer->Vectorize
+      // will return an error.
+      TF_RETURN_IF_ERROR(AddMapDefunOutput(map_defun_fn_.get(), map_defun_node_,
+                                           {edge->src(), edge->src_output()}));
+      int output_index = map_defun_fn_->ret_nodes.size() - 1;
+      inputs.push_back({map_defun_node_, output_index, true});
+    }
+  }
+
+  TF_RETURN_IF_ERROR(vectorizer->Vectorize(*op_node, outer_scope_.get(),
+                                           std::move(inputs), &outputs));
+
+  if (op_node->num_outputs() != outputs.size()) {
+    return errors::Internal(
+        "Number of vectorizer outputs does not match. Expected: ",
+        op_node->num_outputs(), " Actual: ", outputs.size());
   }
 
   // Add output mappings.
   for (size_t i = 0; i < op_node->num_outputs(); ++i) {
-    conversion_map_.insert({{op_node, i}, {std::move(output_ports[i]), true}});
+    conversion_map_.insert({{op_node, i}, outputs[i]});
   }
 
   return Status::OK();
@@ -281,25 +279,22 @@ Status Vectorization::ConvertOutput(int output_position) {
 
   TensorDesc output({ret_edge->src(), ret_edge->src_output()});
   TensorDesc converted_output;
-  if (auto found = gtl::FindOrNull(conversion_map_, output)) {
-    // It's possible the output already has a mapping, if it comes from a node
-    // that has already been converted.
-    if (found->stacked) {
-      converted_output = found->tensor;
-    } else {
-      // Some outputs may be unstacked if they don't derive from arg nodes
-      // (for example, if a function returns a constant). For these, we
-      // have to add extra nodes to tile it in the 0th dimension.
-      TF_RETURN_IF_ERROR(StackTensor(found, &converted_output));
-    }
-  } else {
-    // Note: All unstacked nodes are converted ahead of time in `Initialize`,
-    // and here we assume that all op vectorizers create only stacked outputs.
-    // This may not hold in the future, as more vectorizers are added that
-    // may actually create unstacked outputs. For example, see the `Shape`
-    // converter in third_party/tensorflow/python/ops/parallel_for/pfor.py
+
+  // It's possible the output already has a mapping, if it comes from a node
+  // that has already been converted.
+  auto found = gtl::FindOrNull(conversion_map_, output);
+  if (!found) {
     TF_RETURN_IF_ERROR(AddConversionMapping(output.first));
-    converted_output = conversion_map_.at(output).tensor;
+    found = &conversion_map_.at(output);
+  }
+
+  if (found->stacked) {
+    converted_output = {found->node, found->output_index};
+  } else {
+    // Some outputs may be unstacked if they don't derive from arg nodes
+    // (for example, if a function returns a constant). For these, we
+    // have to add extra nodes to tile it in the 0th dimension.
+    TF_RETURN_IF_ERROR(StackTensor(found, &converted_output));
   }
 
   ReplaceEdgeSources({map_defun_node_, output_position}, converted_output,
@@ -455,7 +450,7 @@ Status Vectorization::StackTensor(WrappedTensor* unstacked,
 
   Node* ones_shape;
   TF_RETURN_IF_ERROR(node_builder("Shape")
-                         .Input(unstacked->tensor.first)  // input
+                         .Input(unstacked->node)  // input
                          .Finalize(g, &ones_shape));
 
   Node* ones;
@@ -473,8 +468,8 @@ Status Vectorization::StackTensor(WrappedTensor* unstacked,
 
   Node* expand_dims;
   TF_RETURN_IF_ERROR(node_builder("ExpandDims")
-                         .Input(unstacked->tensor.first)  // input
-                         .Input(const_0)                  // dim
+                         .Input(unstacked->node)  // input
+                         .Input(const_0)          // dim
                          .Finalize(g, &expand_dims));
 
   TF_RETURN_IF_ERROR(node_builder("Tile")
@@ -491,11 +486,11 @@ Status Vectorization::AddArgNodeMappings() {
     TF_RETURN_IF_ERROR(map_defun_node_->input_node(
         arg_node->attrs().Find("index")->i(), &input_node));
 
-    conversion_map_.insert({{arg_node, 0}, {{input_node, 0}, true}});
+    conversion_map_.insert({{arg_node, 0}, {input_node, 0, true}});
 
     // Control inputs
     conversion_map_.insert({{arg_node, Graph::kControlSlot},
-                            {{input_node, Graph::kControlSlot}, true}});
+                            {input_node, Graph::kControlSlot, true}});
   }
   return Status::OK();
 }
@@ -541,7 +536,7 @@ bool Vectorization::AddUnstackedNodeMappingsHelper(TensorDesc&& tensor,
 
     if (auto found = gtl::FindOrNull(conversion_map_,
                                      {edge->src(), edge->src_output()})) {
-      outer_scope_->AddEdge(found->tensor.first, found->tensor.second, node,
+      outer_scope_->AddEdge(found->node, found->output_index, node,
                             edge->dst_input());
     } else {
       status->Update(errors::Internal(
@@ -552,11 +547,10 @@ bool Vectorization::AddUnstackedNodeMappingsHelper(TensorDesc&& tensor,
 
   // Add output mappings
   for (int i = 0; i < tensor.first->num_outputs(); ++i) {
-    conversion_map_.insert(
-        {{tensor.first, i}, WrappedTensor({node, i}, false)});
+    conversion_map_.insert({{tensor.first, i}, WrappedTensor(node, i, false)});
   }
   conversion_map_.insert({{tensor.first, Graph::kControlSlot},
-                          WrappedTensor({node, Graph::kControlSlot}, false)});
+                          WrappedTensor(node, Graph::kControlSlot, false)});
 
   return true;
 }
-- 
GitLab


From 4831740f90eaf266a99d3ffa7d390d54325b689f Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Fri, 5 Oct 2018 17:05:17 -0700
Subject: [PATCH 503/570] [XLA:GPU] Remove hidden flag for disabling heuristic
 layout assignment.

Heuristic NCHW/NHWC layout assignment works great; we've never had to flip this
flag.  Might as well remove it and simplify things a bit.

PiperOrigin-RevId: 215989807
---
 tensorflow/compiler/xla/service/gpu/BUILD     | 11 -------
 .../xla/service/gpu/gpu_layout_assignment.cc  | 11 ++-----
 .../compiler/xla/service/gpu/gpu_options.cc   | 28 ----------------
 .../compiler/xla/service/gpu/gpu_options.h    | 33 -------------------
 4 files changed, 2 insertions(+), 81 deletions(-)
 delete mode 100644 tensorflow/compiler/xla/service/gpu/gpu_options.cc
 delete mode 100644 tensorflow/compiler/xla/service/gpu/gpu_options.h

diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 7b84f691f6..350fd32537 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -781,7 +781,6 @@ cc_library(
     srcs = ["gpu_layout_assignment.cc"],
     hdrs = ["gpu_layout_assignment.h"],
     deps = [
-        ":gpu_options",
         ":ir_emission_utils",
         ":stream_executor_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -882,16 +881,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "gpu_options",
-    srcs = ["gpu_options.cc"],
-    hdrs = ["gpu_options.h"],
-    deps = [
-        "//tensorflow/compiler/xla/service:hlo_module_config",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 cc_library(
     name = "stream_executor_util",
     srcs = ["stream_executor_util.cc"],
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index 74352f26aa..1ffe855750 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/service/gpu/gpu_options.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
@@ -125,14 +124,8 @@ Status GpuLayoutAssignment::AddBackendConstraintsToDnnConvCustomCall(
     DataLayout input;
     FilterLayout filter;
     DataLayout output;
-    if (ConvUseLayoutHeuristic(instr->GetModule()->config())) {
-      std::tie(input, filter, output) =
-          HeuristicLayoutAssignment(instr, stream_executor_);
-    } else {
-      input = DataLayout::kBatchDepthYX;
-      filter = FilterLayout::kOutputInputYX;
-      output = DataLayout::kBatchDepthYX;
-    }
+    std::tie(input, filter, output) =
+        HeuristicLayoutAssignment(instr, stream_executor_);
 
     TF_ASSIGN_OR_RETURN(
         std::tie(*input_shape->mutable_layout(),
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_options.cc b/tensorflow/compiler/xla/service/gpu/gpu_options.cc
deleted file mode 100644
index 35b4b4e20b..0000000000
--- a/tensorflow/compiler/xla/service/gpu/gpu_options.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/gpu_options.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
-
-namespace xla {
-namespace gpu {
-
-bool ConvUseLayoutHeuristic(const HloModuleConfig& config) {
-  return !config.debug_options().xla_backend_extra_options().count(
-      "xla_gpu_experimental_conv_disable_layout_heuristic");
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_options.h b/tensorflow/compiler/xla/service/gpu/gpu_options.h
deleted file mode 100644
index 498d4a9495..0000000000
--- a/tensorflow/compiler/xla/service/gpu/gpu_options.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_OPTIONS_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_OPTIONS_H_
-
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
-
-// Helper functions for querying options that are specific to the GPU backend.
-
-namespace xla {
-namespace gpu {
-
-// Returns true if we should use heuristics to assign convolution layouts, as
-// opposed to always assigning NCHW.
-bool ConvUseLayoutHeuristic(const HloModuleConfig& config);
-
-}  // namespace gpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_OPTIONS_H_
-- 
GitLab


From 213d76a6ed77a696883502c53a3a4f81d2ee4042 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Fri, 5 Oct 2018 17:34:30 -0700
Subject: [PATCH 504/570] Simply the logic for bubbling captured tensors when
 building cond_v2 grad. The current logic tries to bubble the forward pass
 tensor to the outermost graph. That might not always be do-able e.g. when the
 cond is inside a while loop it will need to know accumulator logic for
 while_loop. So instead, the cond_grad now captures tensors from the forward
 If op's graph. When the grad If op is built these tensors will be
 appropriately captured by the surrounding FuncGraph.

PiperOrigin-RevId: 215993009
---
 .../kernel_tests/control_flow_ops_py_test.py  |  6 +--
 tensorflow/python/ops/cond_v2_impl.py         | 48 ++++++++-----------
 2 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 7fae5249aa..baea5c0f6d 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -661,8 +661,7 @@ class ControlFlowTest(test.TestCase):
       sess.run(r)
 
   def testCondGrad_1(self):
-    graph = ops.Graph()
-    with graph.as_default():
+    with self.cached_session():
       x = constant_op.constant(10.0, name="x")
       pred = math_ops.less(1, 2)
       fn1 = lambda: array_ops.identity(x)
@@ -670,8 +669,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       grad = gradients_impl.gradients(r, [x])[0]
-      with self.cached_session():
-        self.assertAllEqual(1.0, grad.eval())
+      self.assertAllEqual(1.0, grad.eval())
 
   def testCondGrad_2(self):
     with self.cached_session():
diff --git a/tensorflow/python/ops/cond_v2_impl.py b/tensorflow/python/ops/cond_v2_impl.py
index 195ad11c71..c9aa4d4889 100644
--- a/tensorflow/python/ops/cond_v2_impl.py
+++ b/tensorflow/python/ops/cond_v2_impl.py
@@ -282,9 +282,10 @@ def _resolve_grad_inputs(cond_graph, grad_graph):
      as is.
   2. Tensors in the forward pass graph. These tensors may not be "live"
      when the gradient is being computed. We replace such references by their
-     corresponding tensor in the least common ancestor graph of `grad_graph` and
-     `cond_graph`. Since we export intermediate tensors for all branch
-     functions, this is always possible.
+     corresponding tensor in `cond_graph.outer_graph`. In the case of nested
+     control flow or functions, the gradient logic handling
+     `grad_graph.outer_graph` will make sure the tensor from
+     `cond_graph.outer_graph` is also correctly captured.
 
   Args:
     cond_graph: function.FuncGraph. The forward-pass function.
@@ -296,24 +297,23 @@ def _resolve_grad_inputs(cond_graph, grad_graph):
   new_inputs = []
 
   for t in grad_graph.external_captures:
+    # `t` must either be in `grad_graph.outer_graph` or in the forward
+    # `cond_graph`.
     if t.graph != grad_graph.outer_graph:
-      # `t` is a tensor in `cond_graph` or one of its ancestors. We bubble this
-      # tensor to the least common ancestor of the `cond_graph` and
-      # `grad_graph` so that it is "in-scope" for `grad_graph`.
-      # TODO(srbs): `_is_ancestor` calls may be expensive. Compute the least
-      # common ancestor once and re-use.
-      assert _is_ancestor(cond_graph, t.graph)
-      while not _is_ancestor(grad_graph, t.graph):
-        assert isinstance(t.graph, _function.FuncGraph)
-        if t in t.graph.internal_captures:
-          # TODO(srbs): Consider building a map of internal_captures ->
-          # external_captures instead of searching for `t` twice.
-          t = t.graph.external_captures[t.graph.internal_captures.index(t)]
-        else:
-          # Note: All intermediate tensors are output by the If op.
-          # TODO(srbs): .index() calls may be expensive. Optimize.
-          t = t.graph._if.outputs[t.graph.outputs.index(t)]
-      assert _is_ancestor(grad_graph, t.graph)
+      assert t.graph == cond_graph
+      # `internal_captures` are not treated as intermediates and hence not added
+      # to If op outputs. So we get the outer tensor corresponding to those
+      # from the list of `external_captures`.
+      try:
+        t = t.graph._if.outputs[t.graph.outputs.index(t)]
+      except ValueError:
+        index = t.graph.internal_captures.index(t)
+        t = t.graph.external_captures[index]
+
+      # Note: We rely on the capturing logic of the gradient If op graph to
+      # correctly capture the tensors in `cond_graph.outer_graph`. Both cond_v2
+      # and while_v2 handle this while building their gradient functions.
+      assert t.graph == cond_graph.outer_graph
     new_inputs.append(t)
 
   return new_inputs
@@ -492,11 +492,3 @@ def _get_output_shapes(true_graph_outputs, false_graph_outputs):
       for t_out, f_out in zip(true_graph_outputs, false_graph_outputs)
   ]
   return output_shapes
-
-
-def _is_ancestor(graph, maybe_ancestor):
-  if maybe_ancestor == graph:
-    return True
-  if isinstance(graph, _function.FuncGraph):
-    return _is_ancestor(graph.outer_graph, maybe_ancestor)
-  return False
-- 
GitLab


From 1484bad99cfd46cb63a839643cfce917b6f0cdd8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 19:18:32 -0700
Subject: [PATCH 505/570] Update ops-related pbtxt files.

PiperOrigin-RevId: 216000752
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 224 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 |  59 +++++
 2 files changed, 283 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 780c6f6448..0753316724 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -70896,6 +70896,62 @@ op {
     }
   }
 }
+op {
+  name: "StatelessRandomNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessRandomUniform"
   input_arg {
@@ -70993,6 +71049,118 @@ op {
     }
   }
 }
+op {
+  name: "StatelessRandomUniform"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomUniformInt"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  input_arg {
+    name: "minval"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxval"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessTruncatedNormal"
   input_arg {
@@ -71090,6 +71258,62 @@ op {
     }
   }
 }
+op {
+  name: "StatelessTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessWhile"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 0d8997c1bd..14cc9df9a2 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -32978,6 +32978,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -33033,6 +33034,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -33065,6 +33067,62 @@ op {
     }
   }
 }
+op {
+  name: "StatelessRandomUniformInt"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  input_arg {
+    name: "minval"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxval"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessTruncatedNormal"
   input_arg {
@@ -33088,6 +33146,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
-- 
GitLab


From 45f594a0bce42787356700c0e20f5fbc47193fa3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 5 Oct 2018 19:45:59 -0700
Subject: [PATCH 506/570] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 216001984

---
 tensorflow/go/op/wrappers.go | 712 +++++++++++++++++------------------
 1 file changed, 356 insertions(+), 356 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index a7bbb80c82..5d17605e37 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -9640,36 +9640,6 @@ func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// Returns the element-wise sum of a list of tensors.
-//
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
-//
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
-//
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
-//
-// Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shape": shape}
-	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // RandomShuffleAttr is an optional argument to RandomShuffle.
 type RandomShuffleAttr func(optionalAttr)
 
@@ -10383,206 +10353,65 @@ func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.
 	return scope.AddOperation(opspec)
 }
 
-// Encode audio data using the WAV file format.
-//
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
-//
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
-//
-// Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
-//
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "EncodeWav",
-		Input: []tf.Input{
-			audio, sample_rate,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atan",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
-type ResourceApplyAdaMaxAttr func(optionalAttr)
-
-// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the AdaMax algorithm.
+// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// v_t <- max(beta2 * v_{t-1}, abs(g))
-// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+// is alive, any other request to use `MutexLock` with this mutex will wait.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+// This is particularly useful for creating a critical section when used in
+// conjunction with `MutexLockIdentity`:
 //
-// Returns the created operation.
-func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdaMax",
-		Input: []tf.Input{
-			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
-
-// AssertSummarize sets the optional summarize attribute to value.
+// ```python
 //
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
-	}
-}
-
-// Asserts that the given condition is true.
+// mutex = mutex_v2(
+//   shared_name=handle_name, container=container, name=name)
 //
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
+// def execute_in_critical_section(fn, *args, **kwargs):
+//   lock = gen_resource_variable_ops.mutex_lock(mutex)
 //
-// Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
+//   with ops.control_dependencies([lock]):
+//     r = fn(*args, **kwargs)
 //
-// Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Assert",
-		Input: []tf.Input{
-			condition, tf.OutputList(data),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
+//   with ops.control_dependencies(nest.flatten(r)):
+//     with ops.colocate_with(mutex):
+//       ensure_lock_exists = mutex_lock_identity(lock)
 //
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
+//     # Make sure that if any element of r is accessed, all of
+//     # them are executed together.
+//     r = nest.map_structure(tf.identity, r)
 //
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
+//   with ops.control_dependencies([ensure_lock_exists]):
+//     return nest.map_structure(tf.identity, r)
+// ```
 //
-// Graphically the output tensors are:
+// While `fn` is running in the critical section, no other functions which wish to
+// use this critical section may run.
 //
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// Often the use case is that two executions of the same graph, in parallel,
+// wish to run `fn`; and we wish to ensure that only one of them executes
+// at a time.  This is especially important if `fn` modifies one or more
+// variables at a time.
 //
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// It is also useful if two separate functions must share a resource, but we
+// wish to ensure the usage is exclusive.
 //
 // Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
+//	mutex: The mutex resource to lock.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+// Returns A tensor that keeps a shared pointer to a lock on the mutex;
+// when the Tensor is destroyed, the use count on the shared pointer is decreased
+// by 1.  When it reaches 0, the lock is released.
+func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "SparseSplit",
+		Type: "MutexLock",
 		Input: []tf.Input{
-			split_dim, indices, values, shape,
+			mutex,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	return output_indices, output_values, output_shape
+	return op.Output(0)
 }
 
 // ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
@@ -11611,89 +11440,321 @@ func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToN
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringToNumber",
+		Type: "StringToNumber",
+		Input: []tf.Input{
+			string_tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyFtrlV2",
+		Input: []tf.Input{
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+//
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Encode audio data using the WAV file format.
+//
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
+//
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+//
+// Arguments:
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
+//
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeWav",
+		Input: []tf.Input{
+			audio, sample_rate,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atan",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
+type ResourceApplyAdaMaxAttr func(optionalAttr)
+
+// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AdaMax algorithm.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// v_t <- max(beta2 * v_{t-1}, abs(g))
+// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdaMax",
+		Input: []tf.Input{
+			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
+
+// AssertSummarize sets the optional summarize attribute to value.
+//
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Asserts that the given condition is true.
+//
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
+//
+// Arguments:
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
+//
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Assert",
 		Input: []tf.Input{
-			string_tensor,
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the Ftrl-proximal scheme.
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
 //
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
+// Graphically the output tensors are:
 //
-//	lr_power: Scaling factor. Must be a scalar.
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
 //
-// Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
+		Type: "SparseSplit",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+			split_dim, indices, values, shape,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	return output_indices, output_values, output_shape
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+// Returns the element-wise sum of a list of tensors.
 //
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
+//
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+//
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
+		Type: "AccumulateNV2",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			tf.OutputList(inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -13925,67 +13986,6 @@ func CudnnRNNBackpropV2(scope *Scope, input tf.Output, input_h tf.Output, input_
 	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
-//
-// is alive, any other request to use `MutexLock` with this mutex will wait.
-//
-// This is particularly useful for creating a critical section when used in
-// conjunction with `MutexLockIdentity`:
-//
-// ```python
-//
-// mutex = mutex_v2(
-//   shared_name=handle_name, container=container, name=name)
-//
-// def execute_in_critical_section(fn, *args, **kwargs):
-//   lock = gen_resource_variable_ops.mutex_lock(mutex)
-//
-//   with ops.control_dependencies([lock]):
-//     r = fn(*args, **kwargs)
-//
-//   with ops.control_dependencies(nest.flatten(r)):
-//     with ops.colocate_with(mutex):
-//       ensure_lock_exists = mutex_lock_identity(lock)
-//
-//     # Make sure that if any element of r is accessed, all of
-//     # them are executed together.
-//     r = nest.map_structure(tf.identity, r)
-//
-//   with ops.control_dependencies([ensure_lock_exists]):
-//     return nest.map_structure(tf.identity, r)
-// ```
-//
-// While `fn` is running in the critical section, no other functions which wish to
-// use this critical section may run.
-//
-// Often the use case is that two executions of the same graph, in parallel,
-// wish to run `fn`; and we wish to ensure that only one of them executes
-// at a time.  This is especially important if `fn` modifies one or more
-// variables at a time.
-//
-// It is also useful if two separate functions must share a resource, but we
-// wish to ensure the usage is exclusive.
-//
-// Arguments:
-//	mutex: The mutex resource to lock.
-//
-// Returns A tensor that keeps a shared pointer to a lock on the mutex;
-// when the Tensor is destroyed, the use count on the shared pointer is decreased
-// by 1.  When it reaches 0, the lock is released.
-func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MutexLock",
-		Input: []tf.Input{
-			mutex,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // StringFormatAttr is an optional argument to StringFormat.
 type StringFormatAttr func(optionalAttr)
 
@@ -16807,26 +16807,6 @@ func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values
 	return op.Output(0), op.Output(1)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
-//
-// The Hurwitz zeta function is defined as:
-//
-//
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Zeta",
-		Input: []tf.Input{
-			x, q,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Returns a list of tensors with the same shapes and contents as the input
 //
 // tensors.
@@ -18873,6 +18853,26 @@ func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.D
 	return op.Output(0)
 }
 
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+//
+// The Hurwitz zeta function is defined as:
+//
+//
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Zeta",
+		Input: []tf.Input{
+			x, q,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Inverse fast Fourier transform.
 //
 // Computes the inverse 1-dimensional discrete Fourier transform over the
@@ -22757,6 +22757,21 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 	return op.Output(0)
 }
 
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the maximum along segments of a tensor.
 //
 // Read
@@ -22794,21 +22809,6 @@ func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.
 	return op.Output(0)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tanh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Creates a dataset that skips `count` elements from the `input_dataset`.
 //
 // Arguments:
-- 
GitLab


From 7d3bfc143a74d8e49f138841a07f7f4693b0a911 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Fri, 5 Oct 2018 20:07:12 -0700
Subject: [PATCH 507/570] Add the plumbing for an autograph flag to defun.
 Disabled and experimental for now.

PiperOrigin-RevId: 216003028
---
 tensorflow/python/eager/BUILD       |  1 +
 tensorflow/python/eager/function.py | 61 +++++++++++++++++++++++------
 2 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index d0c1a93118..cae809a7c3 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -251,6 +251,7 @@ py_library(
         "//tensorflow/python:gradients_impl",
         "//tensorflow/python:graph_to_function_def",
         "//tensorflow/python:util",
+        "//tensorflow/python/autograph",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:core",
         "//tensorflow/python/eager:execute",
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index f06148b5d2..bafe07de2b 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -31,6 +31,7 @@ import six
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import function_pb2
+from tensorflow.python import autograph
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import execute
@@ -877,7 +878,8 @@ def func_graph_from_py_func(name,
                             args,
                             kwargs,
                             signature=None,
-                            func_graph=None):
+                            func_graph=None,
+                            experimental_autograph=False):
   """Returns a `FuncGraph` generated from `python_func`.
 
   Args:
@@ -894,6 +896,8 @@ def func_graph_from_py_func(name,
       inputs.
     func_graph: Optional. An instance of FuncGraph. If provided, we will use
       this graph else a new one is built and returned.
+    experimental_autograph: whether to use autograph to compile `python_func`.
+      See https://www.tensorflow.org/guide/autograph for more information.
 
   Returns:
     A FuncGraph.
@@ -939,7 +943,17 @@ def func_graph_from_py_func(name,
 
     this_tape = tape.push_new_tape()
     try:
-      func_outputs = python_func(*func_args, **func_kwargs)
+      if experimental_autograph:
+        func_outputs = autograph.converted_call(
+            python_func,
+            autograph.ConversionOptions(
+                verbose=True,
+                recursive=True,
+                force_conversion=False,
+                strip_decorators=(defun,),
+                arg_types={}), *func_args, **func_kwargs)
+      else:
+        func_outputs = python_func(*func_args, **func_kwargs)
       # invariant: `func_outputs` contains only Tensors and `None`s.
       func_outputs = nest.map_structure(convert, func_outputs)
 
@@ -1035,7 +1049,8 @@ class PolymorphicFunction(object):
                python_function,
                name,
                input_signature=None,
-               attributes=None):
+               attributes=None,
+               experimental_autograph=False):
     """Initializes a polymorphic function.
 
     Args:
@@ -1045,7 +1060,10 @@ class PolymorphicFunction(object):
         specifying the input signature of this function. If `None`, a separate
         function is instantiated for each inferred input signature.
       attributes: dict, extra keyword arguments that will be added as attribute
-         of the function.
+        of the function.
+      experimental_autograph: whether to use autograph to compile
+        `python_function`. See https://www.tensorflow.org/guide/autograph for
+        more information.
 
     Raises:
       ValueError: if `input_signature` is not None and the `python_function`'s
@@ -1061,6 +1079,7 @@ class PolymorphicFunction(object):
       self._args_to_prepend = tuple()
       self._kwargs_to_include = {}
     self._name = name
+    self._experimental_autograph = experimental_autograph
     self._function_cache = collections.OrderedDict()
     self._function_attributes = attributes or {}
 
@@ -1286,8 +1305,13 @@ class PolymorphicFunction(object):
 
       if graph_function is None:
         graph_function = Function(
-            func_graph_from_py_func(self._name, self._python_function, args,
-                                    kwargs, self._input_signature),
+            func_graph_from_py_func(
+                self._name,
+                self._python_function,
+                args,
+                kwargs,
+                self._input_signature,
+                experimental_autograph=self._experimental_autograph),
             self._function_attributes)
         self._function_cache[cache_key] = graph_function
       return graph_function, [
@@ -1348,7 +1372,7 @@ def _validate_signature(signature):
                     "a possibly nested sequence of TensorSpec objects.")
 
 
-def defun(func=None, input_signature=None):
+def defun(func=None, input_signature=None, experimental_autograph=False):
   """Compiles a Python function into a callable TensorFlow graph.
 
   `defun` (short for "define function") trace-compiles a Python function
@@ -1657,6 +1681,10 @@ def defun(func=None, input_signature=None):
       function is instantiated for each inferred input signature.  If a
       signature is specified, every input to `func` must be a `Tensor`, and
       `func` cannot accept `**kwargs`.
+    experimental_autograph: Whether `func` should be compiled before
+      constructing the graph. See https://www.tensorflow.org/guide/autograph
+      for more information.
+
 
   Returns:
      If `func` is not None, returns a callable that will execute the compiled
@@ -1668,10 +1696,16 @@ def defun(func=None, input_signature=None):
     TypeError: If `input_signature` is neither `None` nor a sequence of
       `tf.contrib.eager.TensorSpec` objects.
   """
-  return defun_with_attributes(func=func, input_signature=input_signature)
+  return defun_with_attributes(
+      func=func,
+      input_signature=input_signature,
+      experimental_autograph=experimental_autograph)
 
 
-def defun_with_attributes(func=None, input_signature=None, attributes=None):
+def defun_with_attributes(func=None,
+                          input_signature=None,
+                          attributes=None,
+                          experimental_autograph=False):
   """Compiles a Python function into a callable TensorFlow graph.
 
   This function supports adding extra function attributes. See detailed
@@ -1686,6 +1720,7 @@ def defun_with_attributes(func=None, input_signature=None, attributes=None):
       attributes. Currently only support primitive types as value, and only
       whitelisted attribute name is allowed. Unwhitelisted attribute name or
       unsupported value will result into ValueError.
+    experimental_autograph: same as defun()'s experimental_autograph.
 
   Returns:
     Same as the return value of defun, with attributes added to the function in
@@ -1702,8 +1737,12 @@ def defun_with_attributes(func=None, input_signature=None, attributes=None):
       name = "function"
     return tf_decorator.make_decorator(
         function,
-        PolymorphicFunction(function, name, input_signature=input_signature,
-                            attributes=attributes))
+        PolymorphicFunction(
+            function,
+            name,
+            input_signature=input_signature,
+            attributes=attributes,
+            experimental_autograph=experimental_autograph))
 
   # This code path is for the `foo = tfe.defun(foo, ...)` use case
   if func is not None:
-- 
GitLab


From fb92d456476c36210cea3b76393f584a306f092b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 6 Oct 2018 02:01:17 -0700
Subject: [PATCH 508/570] compat: Update forward compatibility horizon to
 2018-10-06

PiperOrigin-RevId: 216021117
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 8f4e8e0b98..d85fb00414 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 5)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 6)
 
 
 @tf_export("compat.forward_compatible")
-- 
GitLab


From 5c0a6bdfeb1848b0146a36706d921dde06ba160a Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Sat, 6 Oct 2018 10:04:16 -0700
Subject: [PATCH 509/570] [XLA] Add base and window dilation support to
 ReduceWindow

PiperOrigin-RevId: 216041507
---
 .../tf2xla/kernels/reduce_window_op.cc        | 21 +++++++-
 .../compiler/tf2xla/kernels/scan_ops.cc       |  3 +-
 tensorflow/compiler/tf2xla/ops/xla_ops.cc     |  2 +
 tensorflow/compiler/tf2xla/python/xla.py      |  6 +++
 tensorflow/compiler/xla/client/xla_builder.cc | 15 ++++--
 tensorflow/compiler/xla/client/xla_builder.h  |  6 +++
 .../xla/python/local_computation_builder.cc   |  5 +-
 .../xla/python/local_computation_builder.h    |  2 +
 tensorflow/compiler/xla/python/xla_client.py  | 25 ++++++++-
 .../xla/service/algebraic_simplifier.cc       |  6 +++
 .../compiler/xla/service/cpu/ir_emitter.cc    | 27 +++++++---
 .../xla/service/gpu/elemental_ir_emitter.cc   | 26 ++++++----
 .../xla/service/hlo_evaluator_test.cc         | 52 +++++++++++++++++++
 .../xla/service/hlo_evaluator_typed_visitor.h | 13 ++++-
 .../compiler/xla/tests/reduce_window_test.cc  | 12 ++++-
 15 files changed, 191 insertions(+), 30 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
index 8102faad28..8eee5b1299 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
@@ -40,10 +40,16 @@ class ReduceWindowOp : public XlaOpKernel {
 
     std::vector<int64> window_dimensions;
     std::vector<int64> window_strides;
+    std::vector<int64> base_dilations;
+    std::vector<int64> window_dilations;
     OP_REQUIRES_OK(context, context->ConstantInputAsIntVector(
                                 "window_dimensions", &window_dimensions));
     OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("window_strides",
                                                               &window_strides));
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("base_dilations",
+                                                              &base_dilations));
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector(
+                                "window_dilations", &window_dilations));
 
     const int rank = input_shape.dims();
     OP_REQUIRES(context, rank == window_dimensions.size(),
@@ -56,6 +62,16 @@ class ReduceWindowOp : public XlaOpKernel {
                     "The size of window_strides must be equal to the input "
                     "rank (",
                     window_strides.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == base_dilations.size(),
+                errors::InvalidArgument(
+                    "The size of base_dilations must be equal to the input "
+                    "rank (",
+                    base_dilations.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == window_dilations.size(),
+                errors::InvalidArgument(
+                    "The size of window_dilations must be equal to the input "
+                    "rank (",
+                    window_dilations.size(), " vs. ", rank, ")"));
 
     // Build the reducer function.
     XlaCompiler::Argument reducer_arg;
@@ -102,7 +118,8 @@ class ReduceWindowOp : public XlaOpKernel {
 
     xla::XlaOp output = xla::ReduceWindowWithGeneralPadding(
         context->Input(0), context->Input(1), *reducer.computation,
-        window_dimensions, window_strides, padding);
+        window_dimensions, window_strides, base_dilations, window_dilations,
+        padding);
     context->SetOutput(0, output);
   }
 
@@ -115,6 +132,8 @@ class ReduceWindowOp : public XlaOpKernel {
 REGISTER_XLA_OP(Name("XlaReduceWindow")
                     .CompileTimeConstInput("window_dimensions")
                     .CompileTimeConstInput("window_strides")
+                    .CompileTimeConstInput("base_dilations")
+                    .CompileTimeConstInput("window_dilations")
                     .CompileTimeConstInput("padding"),
                 ReduceWindowOp);
 
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
index ab094d7dd1..57afd608de 100644
--- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -104,7 +104,8 @@ class ScanOp : public XlaOpKernel {
     }
     auto output = xla::ReduceWindowWithGeneralPadding(
         XlaHelpers::ConvertElementType(builder, ctx->Input(0), dtype), init,
-        *reducer, window_dims, window_strides, padding);
+        *reducer, window_dims, window_strides,
+        /*base_dilations=*/{}, /*window_dilations=*/{}, padding);
     output =
         XlaHelpers::ConvertElementType(builder, output, ctx->input_type(0));
 
diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
index 557911553d..bd2c0a5ee8 100644
--- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
@@ -283,6 +283,8 @@ REGISTER_OP("XlaReduceWindow")
     .Input("init_value: T")
     .Input("window_dimensions: Tindices")
     .Input("window_strides: Tindices")
+    .Input("base_dilations: Tindices")
+    .Input("window_dilations: Tindices")
     .Input("padding: Tindices")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
index bc7924c371..5e86b5d8ec 100644
--- a/tensorflow/compiler/tf2xla/python/xla.py
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -320,6 +320,8 @@ def reduce_window(operand,
                   reducer,
                   window_dimensions,
                   window_strides=None,
+                  base_dilations=None,
+                  window_dilations=None,
                   padding=None,
                   name=None):
   """Wraps the XLA ReduceWindow operator.
@@ -343,12 +345,16 @@ def reduce_window(operand,
     A tensor that represents the output of the reduce_window operator.
   """
   window_strides = window_strides or [1] * len(window_dimensions)
+  base_dilations = base_dilations or [1] * len(window_dimensions)
+  window_dilations = window_dilations or [1] * len(window_dimensions)
   padding = padding or [(0, 0)] * len(window_dimensions)
   return gen_xla_ops.xla_reduce_window(
       input=operand,
       init_value=init,
       window_dimensions=window_dimensions,
       window_strides=window_strides,
+      base_dilations=base_dilations,
+      window_dilations=window_dilations,
       padding=padding,
       computation=reducer,
       name=name)
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index d196252db1..6b31831010 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -1789,9 +1789,9 @@ XlaOp XlaBuilder::ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
     std::vector<std::pair<int64, int64>> padding_values =
         MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions,
                     window_strides, padding);
-    return ReduceWindowWithGeneralPadding(operand, init_value, computation,
-                                          window_dimensions, window_strides,
-                                          padding_values);
+    return ReduceWindowWithGeneralPadding(
+        operand, init_value, computation, window_dimensions, window_strides,
+        /*base_dilations=*/{}, /*window_dilations=*/{}, padding_values);
   });
 }
 
@@ -1800,6 +1800,8 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
     const XlaComputation& computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
+    absl::Span<const int64> base_dilations,
+    absl::Span<const int64> window_dilations,
     absl::Span<const std::pair<int64, int64>> padding) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
@@ -1810,7 +1812,8 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
                         computation.GetProgramShape());
     TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
                         MakeWindow(window_dimensions, window_strides, padding,
-                                   /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
+                                   /*lhs_dilation=*/base_dilations,
+                                   /*rhs_dilation=*/window_dilations));
     TF_ASSIGN_OR_RETURN(
         *instr.mutable_shape(),
         ShapeInference::InferReduceWindowShape(operand_shape, init_shape,
@@ -2800,10 +2803,12 @@ XlaOp ReduceWindowWithGeneralPadding(
     const XlaComputation& computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
+    absl::Span<const int64> base_dilations,
+    absl::Span<const int64> window_dilations,
     absl::Span<const std::pair<int64, int64>> padding) {
   return operand.builder()->ReduceWindowWithGeneralPadding(
       operand, init_value, computation, window_dimensions, window_strides,
-      padding);
+      base_dilations, window_dilations, padding);
 }
 
 XlaOp CrossReplicaSum(const XlaOp& operand,
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index cd0d5ca5d3..2e14e47a35 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -671,6 +671,8 @@ class XlaBuilder {
       const XlaComputation& computation,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
+      absl::Span<const int64> base_dilations,
+      absl::Span<const int64> window_dilations,
       absl::Span<const std::pair<int64, int64>> padding);
 
   // Returns the sum of the operand value within each subgroup of replicas. All
@@ -1245,6 +1247,8 @@ class XlaBuilder {
       const XlaComputation& computation,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
+      absl::Span<const int64> base_dilations,
+      absl::Span<const int64> window_dilations,
       absl::Span<const std::pair<int64, int64>> padding);
   friend XlaOp CrossReplicaSum(const XlaOp& operand,
                                absl::Span<const ReplicaGroup> replica_groups);
@@ -1818,6 +1822,8 @@ XlaOp ReduceWindowWithGeneralPadding(
     const XlaComputation& computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
+    absl::Span<const int64> base_dilations,
+    absl::Span<const int64> window_dilations,
     absl::Span<const std::pair<int64, int64>> padding);
 
 // Returns the sum of the operand value within each subgroup of replicas. All
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index cd5fd33029..ffa336f304 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -532,10 +532,13 @@ LocalOp LocalComputationBuilder::ReduceWindowWithGeneralPadding(
     const LocalComputation& local_computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
+    absl::Span<const int64> base_dilations,
+    absl::Span<const int64> window_dilations,
     absl::Span<const std::pair<int64, int64>> padding) {
   return xla::ReduceWindowWithGeneralPadding(
       operand.op(), init_value.op(), local_computation.computation(),
-      window_dimensions, window_strides, padding);
+      window_dimensions, window_strides, base_dilations, window_dilations,
+      padding);
 }
 
 LocalOp LocalComputationBuilder::RngNormal(const LocalOp& mu,
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 2166bb6721..43332e0abd 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -278,6 +278,8 @@ class LocalComputationBuilder {
       const LocalComputation& local_computation,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
+      absl::Span<const int64> base_dilations,
+      absl::Span<const int64> window_dilations,
       absl::Span<const std::pair<int64, int64> > padding);
 
   LocalOp RngNormal(const LocalOp& mu, const LocalOp& sigma,
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index bb303c5678..f8197488fb 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -995,7 +995,30 @@ class ComputationBuilder(object):
         window_strides)
     return self._client.ReduceWindowWithGeneralPadding(
         operand, init_value, computation_to_apply.c_local_computation,
-        window_dimensions, window_strides, pads)
+        window_dimensions, window_strides, (), (), pads)
+
+  def ReduceWindowWithGeneralPadding(
+      self, operand, init_value, computation_to_apply, window_dimensions,
+      window_strides, base_dilations, window_dilations, padding):
+    """Enqueues a windowed reduction operation onto the computation.
+
+    Args:
+      operand: reduction operand (LocalOp).
+      init_value: reduction initial value (LocalOp).
+      computation_to_apply: a binary reduction function (Computation).
+      window_dimensions: dimensions of window (sequence of integers).
+      window_strides: strides for window (sequence of integers).
+      base_dilations: dilations for the base (sequence of integers).
+      window_dilations: dilations for window (sequence of integers).
+      padding: length-N array-like of pairs of integers of (low, high) padding.
+
+    Returns:
+      A LocalOp representing the added ReduceWindow op.
+    """
+    return self._client.ReduceWindowWithGeneralPadding(
+        operand, init_value, computation_to_apply.c_local_computation,
+        window_dimensions, window_strides, base_dilations, window_dilations,
+        padding)
 
   def RngNormal(self, mu, sigma, dims):
     """Enqueues an RngNormal operation onto the computation.
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 75dae7a714..86d9dbea90 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -2057,6 +2057,12 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     return Status::OK();
   }
 
+  // Bail on dilation.
+  if (window_util::HasDilation(window)) {
+    VLOG(10) << "Not folding pad into reduce-window as there is dilation.";
+    return Status::OK();
+  }
+
   VLOG(10) << "Considering folding Pad: " << pad->ToString()
            << "\ninto reduce-window: " << reduce_window->ToString()
            << (convert != nullptr
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index a70abb117a..b2abdb39a5 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -688,8 +688,25 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduceWindow(
   for (size_t i = 0; i < index.size(); ++i) {
     llvm::Value* strided_index =
         NSWMul(index[i], b_.getInt64(window.dimensions(i).stride()));
-    input_index[i] = NSWSub(NSWAdd(strided_index, window_index[i]),
-                            b_.getInt64(window.dimensions(i).padding_low()));
+    input_index[i] = NSWSub(
+        NSWAdd(strided_index,
+               NSWMul(window_index[i],
+                      b_.getInt64(window.dimensions(i).window_dilation()))),
+        b_.getInt64(window.dimensions(i).padding_low()));
+
+    // We need to verify that we are not in the dilated base area.
+    llvm::Value* dilation_condition = ICmpEQ(
+        SRem(input_index[i], b_.getInt64(window.dimensions(i).base_dilation())),
+        b_.getInt64(0));
+    if (in_bounds_condition == nullptr) {
+      in_bounds_condition = dilation_condition;
+    } else {
+      in_bounds_condition = And(in_bounds_condition, dilation_condition);
+    }
+
+    // Apply base dilation to the index.
+    input_index[i] =
+        SDiv(input_index[i], b_.getInt64(window.dimensions(i).base_dilation()));
 
     // We need to check if 0 <= input_index[i] < bound, as otherwise we are in
     // the padding so that we can skip the computation. That is equivalent to
@@ -728,12 +745,6 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
       /*operands=*/{reduce_window->operand(0)},
       /*supported_types=*/{F32, BF16, S32, F16}));
 
-  // TODO(b/31410564): Implement dilation for reduce-window.
-  if (window_util::HasDilation(reduce_window->window())) {
-    return Unimplemented(
-        "Dilation for ReduceWindow is not implemented on CPU.");
-  }
-
   // Pseudo code for reduce window:
   //
   //   for (coordinates O in the output)
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index c1aaa4bf04..6dcdaf1cfe 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -358,13 +358,6 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
         const HloInstruction* operand = hlo->operand(0);
         const Window& window = hlo->window();
 
-        // TODO(b/31410564): Implement dilation for reduce-window.
-        if (window_util::HasDilation(window)) {
-          return Unimplemented(
-              "Dilation for reduce-window not implemented on GPU. "
-              "See b/31410564.");
-        }
-
         PrimitiveType operand_element_type = operand->shape().element_type();
         llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry(
             llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_),
@@ -397,9 +390,24 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
         for (size_t i = 0; i < index.size(); ++i) {
           llvm::Value* stridden_index = NSWMul(
               index[i], index_typed_const(window.dimensions(i).stride()));
+          input_index[i] = NSWSub(
+              NSWAdd(stridden_index,
+                     NSWMul(window_index[i],
+                            index_typed_const(
+                                window.dimensions(i).window_dilation()))),
+              index_typed_const(window.dimensions(i).padding_low()));
+
+          // We need to verify that we are not in the dilated base area.
+          llvm::Value* dilation_condition = ICmpEQ(
+              SRem(input_index[i],
+                   index_typed_const(window.dimensions(i).base_dilation())),
+              index_typed_const(0));
+          in_bounds = And(in_bounds, dilation_condition);
+
+          // Apply base dilation to the index.
           input_index[i] =
-              NSWSub(NSWAdd(stridden_index, window_index[i]),
-                     index_typed_const(window.dimensions(i).padding_low()));
+              SDiv(input_index[i],
+                   index_typed_const(window.dimensions(i).base_dilation()));
 
           // We must check whether 0 ≤ input_index[i] < bound, as otherwise
           // we are in the pad and so can skip the computation. This
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index cee11a8a21..608a42bb60 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -1463,6 +1463,58 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
+TEST_P(HloEvaluatorTest, ReduceWindowMaxWindowDilation) {
+  HloComputation::Builder b(TestName());
+
+  // arg:
+  // f32[3,3] {
+  //  { 1, 2, 3 },
+  //  { 5, 6, 7 },
+  //  { 9, 10, 11 },
+  // }
+  auto arg_array = absl::make_unique<Array2D<float>>(3, 3);
+  arg_array->FillUnique(1.0f);
+  auto arg_literal = LiteralUtil::CreateR2FromArray2D<float>(*arg_array);
+
+  HloInstruction* arg_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal)));
+
+  auto init_value = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.f)));
+
+  HloComputation::Builder max_computation("max");
+  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  auto param_lhs = max_computation.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
+  auto param_rhs = max_computation.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
+  max_computation.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape, HloOpcode::kMaximum, param_lhs, param_rhs));
+  auto max_func = module().AddEmbeddedComputation(max_computation.Build());
+
+  Window window;
+  WindowDimension dim;
+  dim.set_size(2);
+  dim.set_stride(1);
+  dim.set_padding_low(0);
+  dim.set_padding_high(0);
+  dim.set_window_dilation(2);
+  dim.set_base_dilation(1);
+  *window.add_dimensions() = dim;
+  *window.add_dimensions() = dim;
+
+  Shape shape = ShapeUtil::MakeShape(F32, {1, 1});
+  b.AddInstruction(HloInstruction::CreateReduceWindow(
+      shape, arg_instruction, init_value, window, max_func));
+
+  module().AddEntryComputation(b.Build());
+
+  Literal result = Evaluate();
+
+  auto expected = LiteralUtil::CreateR2<float>({{11}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
+}
+
 TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
   HloComputation::Builder b(TestName());
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index b2d12c94b8..a450dc6ff5 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -2613,8 +2613,17 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       std::vector<int64> base_index(rank);
       bool out_of_bound = false;
       for (int64 i = 0; i < rank; ++i) {
-        base_index[i] = window_count_index[i] * window.dimensions(i).stride() +
-                        window_index[i] - window.dimensions(i).padding_low();
+        base_index[i] =
+            window_count_index[i] * window.dimensions(i).stride() +
+            window_index[i] * window.dimensions(i).window_dilation() -
+            window.dimensions(i).padding_low();
+        // We are not in the base area if the dilation placed us out of bounds.
+        if (base_index[i] % window.dimensions(i).base_dilation() != 0) {
+          out_of_bound = true;
+          break;
+        }
+        // Apply the dilation to the base area.
+        base_index[i] /= window.dimensions(i).base_dilation();
         if (base_index[i] < 0 || base_index[i] >= base_shape.dimensions(i)) {
           out_of_bound = true;
           break;
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index c25ccafaf8..22fe4a2670 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -638,6 +638,8 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
         /*computation=*/computation,
         /*window_dimensions=*/param.window_bounds,
         /*window_strides=*/param.strides,
+        /*base_dilations=*/{},
+        /*window_dilations=*/{},
         /*padding=*/padding);
 
     CHECK(reducer == kAdd || reducer == kMax);
@@ -1158,7 +1160,10 @@ class R2ReduceWindowTest : public ReduceWindowTestBase,
         /*init_value=*/init_value,
         /*computation=*/computation,
         /*window_dimensions=*/param.window_bounds,
-        /*window_strides=*/param.strides, /*padding=*/padding);
+        /*window_strides=*/param.strides,
+        /*base_dilations=*/{},
+        /*window_dilations=*/{},
+        /*padding=*/padding);
 
     auto reduce_func = param.reducer == kAdd
                            ? +[](float a, float b) { return a + b; }
@@ -1369,7 +1374,10 @@ TEST_P(R1ReduceWindowTest, DoIt) {
       /*init_value=*/init_value,
       /*computation=*/computation,
       /*window_dimensions=*/param.window_bounds,
-      /*window_strides=*/param.strides, /*padding=*/padding);
+      /*window_strides=*/param.strides,
+      /*base_dilations=*/{},
+      /*window_dilations=*/{},
+      /*padding=*/padding);
 
   auto reduce_func = param.reducer == kAdd
                          ? +[](float a, float b) { return a + b; }
-- 
GitLab


From e93a18954689b6d522560f5273f6d3320d545b2e Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Sat, 6 Oct 2018 13:49:25 -0700
Subject: [PATCH 510/570] Mark tensorflow/contrib/tpu:datasets_test flaky

It fails 1/1000 runs in OSS builds.

PiperOrigin-RevId: 216050192
---
 tensorflow/contrib/tpu/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 10ed1c2891..8c36d5a297 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -302,6 +302,7 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
         ":datasets",
     ],
+    flaky = 1,  # TODO(b/117363808): fails 1/1000 OSS runs
     grpc_enabled = True,
 )
 
-- 
GitLab


From 7fa6a6b42bc9d562e2b1cc765ca78d281b51f734 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 6 Oct 2018 21:00:57 -0700
Subject: [PATCH 511/570] Add SequenceLSTMOptions to schema to decouple the
 sequential Op from the LSTM.

PiperOrigin-RevId: 216066634
---
 tensorflow/contrib/lite/c/builtin_op_data.h   |   7 +
 .../lite/core/api/flatbuffer_conversions.cc   |  15 +-
 .../kernels/unidirectional_sequence_lstm.cc   |  14 +-
 .../unidirectional_sequence_lstm_test.cc      |  11 +-
 tensorflow/contrib/lite/schema/schema.fbs     |   8 +
 .../contrib/lite/schema/schema_generated.h    | 162 +++++++++++++++++-
 6 files changed, 205 insertions(+), 12 deletions(-)

diff --git a/tensorflow/contrib/lite/c/builtin_op_data.h b/tensorflow/contrib/lite/c/builtin_op_data.h
index 44daf7adaa..1e65c3cee2 100644
--- a/tensorflow/contrib/lite/c/builtin_op_data.h
+++ b/tensorflow/contrib/lite/c/builtin_op_data.h
@@ -186,6 +186,13 @@ typedef struct {
   TfLiteLSTMKernelType kernel_type;
 } TfLiteLSTMParams;
 
+typedef struct {
+  // Parameters for the LSTM kernel.
+  TfLiteFusedActivation activation;
+  float cell_clip;
+  float proj_clip;
+} TfLiteUnidirectionalSequenceLSTMParams;
+
 typedef struct {
   // Parameters for the LSTM kernel.
   TfLiteFusedActivation activation;
diff --git a/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc b/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc
index eac7db9a88..b092e5ee54 100644
--- a/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc
@@ -371,7 +371,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
     case BuiltinOperator_LSTM: {
       auto params = allocator->AllocatePOD<TfLiteLSTMParams>();
       if (auto* lstm_params = op->builtin_options_as_LSTMOptions()) {
@@ -391,6 +390,20 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM: {
+      auto* params =
+          allocator->AllocatePOD<TfLiteUnidirectionalSequenceLSTMParams>();
+      if (auto* seq_lstm_params =
+              op->builtin_options_as_UnidirectionalSequenceLSTMOptions()) {
+        params->activation =
+            parse_activation(seq_lstm_params->fused_activation_function());
+        params->cell_clip = seq_lstm_params->cell_clip();
+        params->proj_clip = seq_lstm_params->proj_clip();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+
     case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM: {
       auto params =
           allocator->AllocatePOD<TfLiteBidirectionalSequenceLSTMParams>();
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
index ec9cf38b83..89d57e4599 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
@@ -431,7 +431,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+  const auto* params =
+      reinterpret_cast<TfLiteUnidirectionalSequenceLSTMParams*>(
+          node->builtin_data);
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
 
   const TfLiteTensor* input_to_input_weights =
@@ -482,6 +484,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
+  // Copy out the LSTM specific params so they can be passed in the function.
+  TfLiteLSTMParams lstm_params;
+  lstm_params.activation = params->activation;
+  lstm_params.cell_clip = params->cell_clip;
+  lstm_params.proj_clip = params->proj_clip;
+
   switch (input_to_output_weights->type) {
     case kTfLiteFloat32: {
       return lstm_eval::EvalFloat(
@@ -496,7 +504,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*aux_input_to_cell_weights=*/nullptr,
           /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
           forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
-          projection_bias, params, /*forward_sequence=*/true,
+          projection_bias, &lstm_params, /*forward_sequence=*/true,
           /*output_offset=*/0, scratch_buffer, activation_state, cell_state,
           output);
     }
@@ -523,7 +531,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*aux_input_to_cell_weights=*/nullptr,
           /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
           forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
-          projection_bias, params, /*forward_sequence=*/true,
+          projection_bias, &lstm_params, /*forward_sequence=*/true,
           /*output_offset=*/0, scratch_buffer, scaling_factors,
           prod_scaling_factors, recovered_cell_weights, input_quantized,
           /*aux_input_quantized=*/nullptr, activation_state_quantized,
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc
index cd3aac0532..c97b0fdd61 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc
@@ -110,11 +110,12 @@ class UnidirectionalLSTMOpModel : public SingleOpModel {
 
     output_ = AddOutput(TensorType_FLOAT32);
 
-    SetBuiltinOp(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
-                 BuiltinOptions_LSTMOptions,
-                 CreateLSTMOptions(builder_, ActivationFunctionType_TANH,
-                                   cell_clip, proj_clip)
-                     .Union());
+    SetBuiltinOp(
+        BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
+        BuiltinOptions_UnidirectionalSequenceLSTMOptions,
+        CreateUnidirectionalSequenceLSTMOptions(
+            builder_, ActivationFunctionType_TANH, cell_clip, proj_clip)
+            .Union());
     BuildInterpreter(input_shapes);
   }
 
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index ff8430827c..cb7a282743 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -250,6 +250,7 @@ union BuiltinOptions {
   FillOptions,
   BidirectionalSequenceLSTMOptions,
   BidirectionalSequenceRNNOptions,
+  UnidirectionalSequenceLSTMOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -394,6 +395,13 @@ table LSTMOptions {
   kernel_type: LSTMKernelType = FULL;
 }
 
+// An implementation of TensorFlow dynamic_rnn with LSTMCell.
+table UnidirectionalSequenceLSTMOptions {
+  fused_activation_function:ActivationFunctionType;
+  cell_clip: float; // Optional, 0.0 means no clipping
+  proj_clip: float; // Optional, 0.0 means no clipping
+}
+
 table BidirectionalSequenceLSTMOptions {
   fused_activation_function:ActivationFunctionType;
   cell_clip: float; // Optional, 0.0 means no clipping
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index f3cb113c9c..e7b7a59def 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -79,6 +79,9 @@ struct LocalResponseNormalizationOptionsT;
 struct LSTMOptions;
 struct LSTMOptionsT;
 
+struct UnidirectionalSequenceLSTMOptions;
+struct UnidirectionalSequenceLSTMOptionsT;
+
 struct BidirectionalSequenceLSTMOptions;
 struct BidirectionalSequenceLSTMOptionsT;
 
@@ -681,11 +684,12 @@ enum BuiltinOptions {
   BuiltinOptions_FillOptions = 68,
   BuiltinOptions_BidirectionalSequenceLSTMOptions = 69,
   BuiltinOptions_BidirectionalSequenceRNNOptions = 70,
+  BuiltinOptions_UnidirectionalSequenceLSTMOptions = 71,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_BidirectionalSequenceRNNOptions
+  BuiltinOptions_MAX = BuiltinOptions_UnidirectionalSequenceLSTMOptions
 };
 
-inline const BuiltinOptions (&EnumValuesBuiltinOptions())[71] {
+inline const BuiltinOptions (&EnumValuesBuiltinOptions())[72] {
   static const BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -757,7 +761,8 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[71] {
     BuiltinOptions_ZerosLikeOptions,
     BuiltinOptions_FillOptions,
     BuiltinOptions_BidirectionalSequenceLSTMOptions,
-    BuiltinOptions_BidirectionalSequenceRNNOptions
+    BuiltinOptions_BidirectionalSequenceRNNOptions,
+    BuiltinOptions_UnidirectionalSequenceLSTMOptions
   };
   return values;
 }
@@ -835,6 +840,7 @@ inline const char * const *EnumNamesBuiltinOptions() {
     "FillOptions",
     "BidirectionalSequenceLSTMOptions",
     "BidirectionalSequenceRNNOptions",
+    "UnidirectionalSequenceLSTMOptions",
     nullptr
   };
   return names;
@@ -1129,6 +1135,10 @@ template<> struct BuiltinOptionsTraits<BidirectionalSequenceRNNOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_BidirectionalSequenceRNNOptions;
 };
 
+template<> struct BuiltinOptionsTraits<UnidirectionalSequenceLSTMOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_UnidirectionalSequenceLSTMOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1720,6 +1730,14 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_BidirectionalSequenceRNNOptions ?
       reinterpret_cast<const BidirectionalSequenceRNNOptionsT *>(value) : nullptr;
   }
+  UnidirectionalSequenceLSTMOptionsT *AsUnidirectionalSequenceLSTMOptions() {
+    return type == BuiltinOptions_UnidirectionalSequenceLSTMOptions ?
+      reinterpret_cast<UnidirectionalSequenceLSTMOptionsT *>(value) : nullptr;
+  }
+  const UnidirectionalSequenceLSTMOptionsT *AsUnidirectionalSequenceLSTMOptions() const {
+    return type == BuiltinOptions_UnidirectionalSequenceLSTMOptions ?
+      reinterpret_cast<const UnidirectionalSequenceLSTMOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -3469,6 +3487,84 @@ inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
 
 flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct UnidirectionalSequenceLSTMOptionsT : public flatbuffers::NativeTable {
+  typedef UnidirectionalSequenceLSTMOptions TableType;
+  ActivationFunctionType fused_activation_function;
+  float cell_clip;
+  float proj_clip;
+  UnidirectionalSequenceLSTMOptionsT()
+      : fused_activation_function(ActivationFunctionType_NONE),
+        cell_clip(0.0f),
+        proj_clip(0.0f) {
+  }
+};
+
+struct UnidirectionalSequenceLSTMOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef UnidirectionalSequenceLSTMOptionsT NativeTableType;
+  enum {
+    VT_FUSED_ACTIVATION_FUNCTION = 4,
+    VT_CELL_CLIP = 6,
+    VT_PROJ_CLIP = 8
+  };
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  float cell_clip() const {
+    return GetField<float>(VT_CELL_CLIP, 0.0f);
+  }
+  float proj_clip() const {
+    return GetField<float>(VT_PROJ_CLIP, 0.0f);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           VerifyField<float>(verifier, VT_CELL_CLIP) &&
+           VerifyField<float>(verifier, VT_PROJ_CLIP) &&
+           verifier.EndTable();
+  }
+  UnidirectionalSequenceLSTMOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(UnidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<UnidirectionalSequenceLSTMOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const UnidirectionalSequenceLSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct UnidirectionalSequenceLSTMOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(UnidirectionalSequenceLSTMOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  }
+  void add_cell_clip(float cell_clip) {
+    fbb_.AddElement<float>(UnidirectionalSequenceLSTMOptions::VT_CELL_CLIP, cell_clip, 0.0f);
+  }
+  void add_proj_clip(float proj_clip) {
+    fbb_.AddElement<float>(UnidirectionalSequenceLSTMOptions::VT_PROJ_CLIP, proj_clip, 0.0f);
+  }
+  explicit UnidirectionalSequenceLSTMOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  UnidirectionalSequenceLSTMOptionsBuilder &operator=(const UnidirectionalSequenceLSTMOptionsBuilder &);
+  flatbuffers::Offset<UnidirectionalSequenceLSTMOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<UnidirectionalSequenceLSTMOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<UnidirectionalSequenceLSTMOptions> CreateUnidirectionalSequenceLSTMOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE,
+    float cell_clip = 0.0f,
+    float proj_clip = 0.0f) {
+  UnidirectionalSequenceLSTMOptionsBuilder builder_(_fbb);
+  builder_.add_proj_clip(proj_clip);
+  builder_.add_cell_clip(cell_clip);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<UnidirectionalSequenceLSTMOptions> CreateUnidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const UnidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct BidirectionalSequenceLSTMOptionsT : public flatbuffers::NativeTable {
   typedef BidirectionalSequenceLSTMOptions TableType;
   ActivationFunctionType fused_activation_function;
@@ -6488,6 +6584,9 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const BidirectionalSequenceRNNOptions *builtin_options_as_BidirectionalSequenceRNNOptions() const {
     return builtin_options_type() == BuiltinOptions_BidirectionalSequenceRNNOptions ? static_cast<const BidirectionalSequenceRNNOptions *>(builtin_options()) : nullptr;
   }
+  const UnidirectionalSequenceLSTMOptions *builtin_options_as_UnidirectionalSequenceLSTMOptions() const {
+    return builtin_options_type() == BuiltinOptions_UnidirectionalSequenceLSTMOptions ? static_cast<const UnidirectionalSequenceLSTMOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -6799,6 +6898,10 @@ template<> inline const BidirectionalSequenceRNNOptions *Operator::builtin_optio
   return builtin_options_as_BidirectionalSequenceRNNOptions();
 }
 
+template<> inline const UnidirectionalSequenceLSTMOptions *Operator::builtin_options_as<UnidirectionalSequenceLSTMOptions>() const {
+  return builtin_options_as_UnidirectionalSequenceLSTMOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -7809,6 +7912,38 @@ inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(flatbuffers::FlatBuffe
       _kernel_type);
 }
 
+inline UnidirectionalSequenceLSTMOptionsT *UnidirectionalSequenceLSTMOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new UnidirectionalSequenceLSTMOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void UnidirectionalSequenceLSTMOptions::UnPackTo(UnidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  { auto _e = cell_clip(); _o->cell_clip = _e; };
+  { auto _e = proj_clip(); _o->proj_clip = _e; };
+}
+
+inline flatbuffers::Offset<UnidirectionalSequenceLSTMOptions> UnidirectionalSequenceLSTMOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const UnidirectionalSequenceLSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateUnidirectionalSequenceLSTMOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<UnidirectionalSequenceLSTMOptions> CreateUnidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const UnidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const UnidirectionalSequenceLSTMOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _fused_activation_function = _o->fused_activation_function;
+  auto _cell_clip = _o->cell_clip;
+  auto _proj_clip = _o->proj_clip;
+  return tflite::CreateUnidirectionalSequenceLSTMOptions(
+      _fbb,
+      _fused_activation_function,
+      _cell_clip,
+      _proj_clip);
+}
+
 inline BidirectionalSequenceLSTMOptionsT *BidirectionalSequenceLSTMOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new BidirectionalSequenceLSTMOptionsT();
   UnPackTo(_o, _resolver);
@@ -9620,6 +9755,10 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const BidirectionalSequenceRNNOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_UnidirectionalSequenceLSTMOptions: {
+      auto ptr = reinterpret_cast<const UnidirectionalSequenceLSTMOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -9918,6 +10057,10 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const BidirectionalSequenceRNNOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_UnidirectionalSequenceLSTMOptions: {
+      auto ptr = reinterpret_cast<const UnidirectionalSequenceLSTMOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -10204,6 +10347,10 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const BidirectionalSequenceRNNOptionsT *>(value);
       return CreateBidirectionalSequenceRNNOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_UnidirectionalSequenceLSTMOptions: {
+      auto ptr = reinterpret_cast<const UnidirectionalSequenceLSTMOptionsT *>(value);
+      return CreateUnidirectionalSequenceLSTMOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -10490,6 +10637,10 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new BidirectionalSequenceRNNOptionsT(*reinterpret_cast<BidirectionalSequenceRNNOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_UnidirectionalSequenceLSTMOptions: {
+      value = new UnidirectionalSequenceLSTMOptionsT(*reinterpret_cast<UnidirectionalSequenceLSTMOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -10847,6 +10998,11 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_UnidirectionalSequenceLSTMOptions: {
+      auto ptr = reinterpret_cast<UnidirectionalSequenceLSTMOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
-- 
GitLab


From 367f7d651f19c5b111ea0292243eab81fb4058c7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 7 Oct 2018 02:01:04 -0700
Subject: [PATCH 512/570] compat: Update forward compatibility horizon to
 2018-10-07

PiperOrigin-RevId: 216079665
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index d85fb00414..ee56480b00 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 6)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 7)
 
 
 @tf_export("compat.forward_compatible")
-- 
GitLab


From 393a13c1b1a7d51b0871a6d4b3d3413d8e1765bf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 02:03:43 -0700
Subject: [PATCH 513/570] compat: Update forward compatibility horizon to
 2018-10-08

PiperOrigin-RevId: 216151605
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index ee56480b00..349c84e13c 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 7)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 8)
 
 
 @tf_export("compat.forward_compatible")
-- 
GitLab


From 3bdf3c592472c2b54c513417de8d9b538d3f6078 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 08:08:31 -0700
Subject: [PATCH 514/570] Make ExecutorState preserve the thread context.

PiperOrigin-RevId: 216187878
---
 tensorflow/core/common_runtime/executor.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 2c48084cab..40ec1502da 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -54,6 +54,7 @@ limitations under the License.
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/context.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -1240,6 +1241,7 @@ class ExecutorState {
   StepStatsCollectorInterface* const stats_collector_;
   const tracing::TraceCollector* const trace_collector_;
   const tracing::EventCollector* const event_collector_;
+  Context context_;
 
   // QUESTION: Make it a checkpoint::TensorSliceReaderCacheWrapper
   // instead of a pointer?  (avoids having to delete).
@@ -1367,6 +1369,7 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl)
       trace_collector_(tracing::GetTraceCollector()),
       event_collector_(
           tracing::GetEventCollector(tracing::EventCategory::kCompute)),
+      context_(ContextKind::kThread),
       slice_reader_cache_(new checkpoint::TensorSliceReaderCacheWrapper),
       call_frame_(args.call_frame),
       impl_(impl),
@@ -1586,6 +1589,7 @@ bool MightTrace(const NodeItem& item,
 }
 
 void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
+  WithContext wc(context_);
   const GraphView& gview = impl_->gview_;
   TaggedNodeSeq ready;
   TaggedNodeReadyQueue inline_ready;
-- 
GitLab


From 53961cc2f16dea9d9b2286950c1e4d4c0a3743c5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 08:22:48 -0700
Subject: [PATCH 515/570] Improve const correctness of HloDomainMap

PiperOrigin-RevId: 216189458
---
 tensorflow/compiler/xla/service/hlo_domain_map.cc | 12 +++++++-----
 tensorflow/compiler/xla/service/hlo_domain_map.h  | 14 +++++++-------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc
index 6ca1255ede..c6d02f9f67 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc
@@ -42,18 +42,19 @@ namespace xla {
   return std::move(domain_map);
 }
 
-bool HloDomainMap::InSameDomain(HloInstruction* instruction1,
-                                HloInstruction* instruction2) const {
+bool HloDomainMap::InSameDomain(const HloInstruction* instruction1,
+                                const HloInstruction* instruction2) const {
   int64 domain_id1 = GetDomainId(instruction1);
   int64 domain_id2 = GetDomainId(instruction2);
   return domain_id1 >= 0 && domain_id1 == domain_id2;
 }
 
-int64 HloDomainMap::GetDomainId(HloInstruction* instruction) const {
+int64 HloDomainMap::GetDomainId(const HloInstruction* instruction) const {
   return FindOrDefault(instruction_to_domain_, instruction, -1);
 }
 
-int64 HloDomainMap::GetDomainMetadataId(HloInstruction* instruction) const {
+int64 HloDomainMap::GetDomainMetadataId(
+    const HloInstruction* instruction) const {
   return FindOrDie(domain_metadata_id_, instruction);
 }
 
@@ -200,7 +201,8 @@ StatusOr<std::unique_ptr<DomainMetadata::Domain>> HloDomainMap::CreateDomain(
   return std::move(domain);
 }
 
-bool HloDomainMap::IsDomainInstruction(HloInstruction* instruction) const {
+bool HloDomainMap::IsDomainInstruction(
+    const HloInstruction* instruction) const {
   if (instruction->opcode() != HloOpcode::kDomain) {
     return false;
   }
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.h b/tensorflow/compiler/xla/service/hlo_domain_map.h
index c8d581b746..bce7d1aa7c 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.h
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.h
@@ -58,21 +58,21 @@ class HloDomainMap {
   }
 
   // Checks whether two instructions are within the same domain.
-  bool InSameDomain(HloInstruction* instruction1,
-                    HloInstruction* instruction2) const;
+  bool InSameDomain(const HloInstruction* instruction1,
+                    const HloInstruction* instruction2) const;
 
   // Checks whether instruction is a kDomain instruction of the kind we are
   // currently processing.
-  bool IsDomainInstruction(HloInstruction* instruction) const;
+  bool IsDomainInstruction(const HloInstruction* instruction) const;
 
   // Retrieves the domain identifier of the instruction, or -1 in case
   // instruction is not found within any domain.
-  int64 GetDomainId(HloInstruction* instruction) const;
+  int64 GetDomainId(const HloInstruction* instruction) const;
 
   // Returns the unique id of the domain metadata for the domain the given
   // instruction belongs to. The given instruction must not be a kDomain
   // instruction since each domain instruction is associated with 2 domains.
-  int64 GetDomainMetadataId(HloInstruction* instruction) const;
+  int64 GetDomainMetadataId(const HloInstruction* instruction) const;
 
  private:
   // Map used for representing instruction ordering, i.e.
@@ -119,8 +119,8 @@ class HloDomainMap {
 
   string domain_kind_;
   std::vector<std::unique_ptr<DomainMetadata::Domain>> instruction_domains_;
-  absl::flat_hash_map<HloInstruction*, int64> instruction_to_domain_;
-  absl::flat_hash_map<HloInstruction*, int64> domain_metadata_id_;
+  absl::flat_hash_map<const HloInstruction*, int64> instruction_to_domain_;
+  absl::flat_hash_map<const HloInstruction*, int64> domain_metadata_id_;
 };
 
 }  // namespace xla
-- 
GitLab


From 75f57a8b7836a1ed3cda8ba81c88f6caf15cf0c6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 08:35:14 -0700
Subject: [PATCH 516/570] Remove Dims from types.h, create build structure.

PiperOrigin-RevId: 216191084
---
 .../contrib/lite/kernels/internal/BUILD       | 16 ++++++++++++
 .../lite/kernels/internal/legacy_types.h      | 26 +++++++++++++++++++
 .../internal/reference/legacy_reference_ops.h |  7 ++++-
 .../internal/reference/reference_ops.h        |  5 ----
 4 files changed, 48 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/internal/legacy_types.h

diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index afb5ec05df..5c9ca6e910 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -49,6 +49,20 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "legacy_types",
+    srcs = [],
+    hdrs = [
+        "compatibility.h",
+        "legacy_types.h",
+        "types.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite/kernels:op_macros",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
 config_setting(
     name = "arm",
     values = {
@@ -198,6 +212,7 @@ cc_library(
         ":strided_slice_logic",
         ":tensor_utils",
         ":types",
+        ":legacy_types",
         ":legacy_reference_base",
         ":round",
         "//third_party/eigen3",
@@ -336,6 +351,7 @@ cc_library(
         ":quantization_util",
         ":round",
         ":strided_slice_logic",
+        ":legacy_types",
         ":types",
         "@gemmlowp",
         "//tensorflow/contrib/lite/c:c_api_internal",
diff --git a/tensorflow/contrib/lite/kernels/internal/legacy_types.h b/tensorflow/contrib/lite/kernels/internal/legacy_types.h
new file mode 100644
index 0000000000..2e4d3137f5
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/legacy_types.h
@@ -0,0 +1,26 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_LEGACY_TYPES_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_LEGACY_TYPES_H_
+
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+// TODO(b/116772710): Insert legacy Dims<> code in here.
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_LEGACY_TYPES_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
index be99240b1f..c8b64cfd96 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
@@ -19,10 +19,10 @@ limitations under the License.
 #include <sys/types.h>
 
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/legacy_types.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
 
 namespace tflite {
 
@@ -30,6 +30,11 @@ namespace reference_ops {
 
 static constexpr int kDepthwiseReverseShift = -1;
 
+inline void ShapeFromDims(const tflite::Dims<4>& dims, RuntimeShape* shape) {
+  shape->BuildFrom(
+      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
+}
+
 inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
                           const float* filter_data, const Dims<4>& filter_dims,
                           const float* bias_data, const Dims<4>& bias_dims,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 59f17ae854..19d23fa80b 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -100,11 +100,6 @@ gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
 
 namespace reference_ops {
 
-inline void ShapeFromDims(const tflite::Dims<4>& dims, RuntimeShape* shape) {
-  shape->BuildFrom(
-      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
-}
-
 template <typename T>
 int CountLeadingZeros(T integer_input) {
   static_assert(std::is_unsigned<T>::value,
-- 
GitLab


From 5f308cb408eb46ec9af0546be6b9ae1d5166b185 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 09:06:04 -0700
Subject: [PATCH 517/570] Optimize PinToHostOptimizer by adding cache, also add
 PinToHostOptimizer to benchmarks.

original runtime: 4.83492736816 secs
w/ cache runtime: 2.19033999443 secs

PiperOrigin-RevId: 216195286
---
 tensorflow/core/grappler/op_types.cc          |  22 ++-
 .../optimizers/pin_to_host_optimizer.cc       | 162 ++++++++++++------
 .../optimizers/pin_to_host_optimizer.h        |   4 +-
 .../optimizers/pin_to_host_optimizer_test.cc  |  76 +++++---
 4 files changed, 179 insertions(+), 85 deletions(-)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 1b5a215987..cbf5c8e038 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -102,15 +102,19 @@ bool IsConjugateTranspose(const NodeDef& node) {
 }
 
 bool IsControlFlow(const NodeDef& node) {
-  // clang-format off
-  return node.op() == "ControlTrigger" ||
-         node.op() == "Enter" ||
-         node.op() == "Exit" ||
-         node.op() == "LoopCond" ||
-         node.op() == "Merge" ||
-         node.op() == "NextIteration" ||
-         node.op() == "Switch";
-  // clang-format on
+  // TODO(williamchan): Add a microbenchmark to compare FlatSet vs. iterative
+  // string comparison.
+  static const gtl::FlatSet<string>* const kControFlowOps =
+      CHECK_NOTNULL((new gtl::FlatSet<string>{
+          "ControlTrigger",
+          "Enter",
+          "Exit",
+          "LoopCond",
+          "Merge",
+          "NextIteration",
+          "Switch",
+      }));
+  return kControFlowOps->count(node.op()) > 0;
 }
 
 bool IsConv2D(const NodeDef& node) { return node.op() == "Conv2D"; }
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
index 8ed4271fa4..29a3b2b74c 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
@@ -25,16 +25,29 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace grappler {
+
 namespace internal {
 
+namespace {
 // TODO(williamchan): Change this constant to be something smarter, maybe
 // dynamically determined.
 constexpr int64 kTensorMaxSize = 64;
 
+struct OpDevicePortHasher {
+  std::size_t operator()(const std::tuple<string, string, int>& x) const {
+    uint64 code = Hash64Combine(Hash64(std::get<0>(x)), Hash64(std::get<1>(x)));
+
+    return Hash64Combine(code, hash<int>()(std::get<2>(x)));
+  }
+};
+using OpDevicePortOnHostMap =
+    gtl::FlatMap<std::tuple<string, string, int>, bool, OpDevicePortHasher>;
+
 // All the nodes that should be blacklisted and not swapped.
 bool IsBlacklisted(const NodeDef& node) {
   return
@@ -82,10 +95,10 @@ Status TryFindKernelDef(const std::vector<DeviceType>& devices,
 
 // Checks if a node's output port is host friendly.
 // Roughly this means checking if the output port is on Host memory.
-Status IsNodeOutputPortHostFriendly(const GraphView& graph,
-                                    GraphProperties* properties,
-                                    const NodeDef& node, int port_id,
-                                    bool* is_candidate) {
+Status IsNodeOutputPortHostFriendly(
+    const GraphView& graph, GraphProperties* properties, const NodeDef& node,
+    int port_id, OpDevicePortOnHostMap* op_device_outport_pinned_to_host_cache,
+    bool* is_candidate) {
   *is_candidate = false;
 
   // Make sure we are not a blacklisted op.
@@ -117,7 +130,8 @@ Status IsNodeOutputPortHostFriendly(const GraphView& graph,
     for (const auto& fanin : graph.GetFanins(node, false)) {
       bool fanin_candidate = false;
       TF_RETURN_IF_ERROR(IsNodeOutputPortHostFriendly(
-          graph, properties, *fanin.node, fanin.port_id, &fanin_candidate));
+          graph, properties, *fanin.node, fanin.port_id,
+          op_device_outport_pinned_to_host_cache, &fanin_candidate));
       if (!fanin_candidate) {
         return Status::OK();
       }
@@ -132,11 +146,22 @@ Status IsNodeOutputPortHostFriendly(const GraphView& graph,
     return Status::OK();
   }
 
+  // Check `op_device_outport_pinned_to_host_cache` for our
+  // {op, device, port_id} combo to see if the arg is pinned on Host.
+  const std::tuple<string, string, int> cache_key(node.op(), node.device(),
+                                                  port_id);
+  auto it = op_device_outport_pinned_to_host_cache->find(cache_key);
+  if (it != op_device_outport_pinned_to_host_cache->end()) {
+    *is_candidate = it->second;
+    return Status::OK();
+  }
+
   // Check if op's output port is pinned to HostMemory.
   const OpDef* op = nullptr;
   Status s = OpRegistry::Global()->LookUpOpDef(node.op(), &op);
   if (!s.ok()) {
     LOG(WARNING) << "Could not find OpDef for : " << node.op();
+    op_device_outport_pinned_to_host_cache->emplace(cache_key, false);
     return Status::OK();
   }
 
@@ -146,6 +171,7 @@ Status IsNodeOutputPortHostFriendly(const GraphView& graph,
     LOG(WARNING) << "Invalid port: " << port_id << "!\n"
                  << node.DebugString() << "\n"
                  << op->DebugString();
+    op_device_outport_pinned_to_host_cache->emplace(cache_key, false);
     return Status::OK();
   }
 
@@ -155,6 +181,7 @@ Status IsNodeOutputPortHostFriendly(const GraphView& graph,
                        &kernel);
   if (!s.ok()) {
     LOG(INFO) << "Could not find KernelDef for: " << node.op();
+    op_device_outport_pinned_to_host_cache->emplace(cache_key, false);
     return Status::OK();
   }
 
@@ -166,22 +193,35 @@ Status IsNodeOutputPortHostFriendly(const GraphView& graph,
     }
   }
 
+  op_device_outport_pinned_to_host_cache->emplace(cache_key, *is_candidate);
+
   return Status::OK();
 }
 
 // Checks if a node's input port is Host friendly.
 // Roughly this means checking if the input port is on Host memory.
-bool IsNodeInputPortHostFriendly(const NodeDef& node, int port_id) {
+bool IsNodeInputPortHostFriendly(
+    const NodeDef& node, int port_id,
+    OpDevicePortOnHostMap* op_device_inport_pinned_to_host_cache) {
   // If node is on Host, assume its inputs are Host friendly.
   if (str_util::StrContains(node.device(), DEVICE_CPU)) {
     return true;
   }
 
+  // Check `op_device_inport_pinned_to_host_cache` for our
+  // {op, device, port_id} combo to see if the arg is pinned on Host.
+  std::tuple<string, string, int> cache_key(node.op(), node.device(), port_id);
+  auto it = op_device_inport_pinned_to_host_cache->find(cache_key);
+  if (it != op_device_inport_pinned_to_host_cache->end()) {
+    return it->second;
+  }
+
   // Check if op's input port is pinned to HostMemory.
   const OpDef* op = nullptr;
   Status s = OpRegistry::Global()->LookUpOpDef(node.op(), &op);
   if (!s.ok()) {
     LOG(WARNING) << "Could not find OpDef for : " << node.op();
+    op_device_inport_pinned_to_host_cache->emplace(cache_key, false);
     return false;
   }
   const int input_arg_id = OpInputPortIdToArgId(node, *op, port_id);
@@ -192,16 +232,20 @@ bool IsNodeInputPortHostFriendly(const NodeDef& node, int port_id) {
       {node.device().c_str(), DEVICE_GPU, DEVICE_CPU}, node, &kernel);
   if (!s.ok()) {
     LOG(INFO) << "Could not find KernelDef for: " << node.op();
+    op_device_inport_pinned_to_host_cache->emplace(cache_key, false);
     return false;
   }
 
   // Check if the input_arg is pinned to Host.
   for (const string& host_memory_arg : kernel->host_memory_arg()) {
     if (op->input_arg(input_arg_id).name() == host_memory_arg) {
+      op_device_inport_pinned_to_host_cache->emplace(cache_key, true);
       return true;
     }
   }
 
+  op_device_inport_pinned_to_host_cache->emplace(cache_key, false);
+
   return false;
 }
 
@@ -211,18 +255,20 @@ bool IsNodeInputPortHostFriendly(const NodeDef& node, int port_id) {
 // 2] Check if node can run on Host.
 // 3] Check all input/outputs are Host "friendly" (atm, friendly means small,
 //    ints, and pinned to Host).
-Status IsNodeHostCandidate(const GraphView& graph, GraphProperties* properties,
-                           const NodeDef& node, bool* is_candidate) {
+Status IsNodeHostCandidate(
+    const GraphView& graph, GraphProperties* properties, const NodeDef& node,
+    OpDevicePortOnHostMap* op_device_outport_pinned_to_host_cache,
+    bool* is_candidate) {
   *is_candidate = false;
 
-  // Check if node already on CPU.
-  if (str_util::StrContains(node.device(), DEVICE_CPU)) {
-    *is_candidate = true;
+  // Skip these node types.
+  if (IsBlacklisted(node)) {
     return Status::OK();
   }
 
-  // Skip these node types.
-  if (IsBlacklisted(node)) {
+  // Check if node already on CPU.
+  if (str_util::StrContains(node.device(), DEVICE_CPU)) {
+    *is_candidate = true;
     return Status::OK();
   }
 
@@ -232,17 +278,6 @@ Status IsNodeHostCandidate(const GraphView& graph, GraphProperties* properties,
     return Status::OK();
   }
 
-  // Check all inputs are Host friendly.
-  for (const GraphView::OutputPort& fanin :
-       graph.GetFanins(node, /*include_controlling_nodes=*/false)) {
-    bool fanin_candidate = false;
-    TF_RETURN_IF_ERROR(IsNodeOutputPortHostFriendly(
-        graph, properties, *fanin.node, fanin.port_id, &fanin_candidate));
-    if (!fanin_candidate) {
-      return Status::OK();
-    }
-  }
-
   // Check all outputs are Host friendly.
   if (!properties->has_properties()) {
     // This is an expensive call, call it lazily.
@@ -255,16 +290,42 @@ Status IsNodeHostCandidate(const GraphView& graph, GraphProperties* properties,
     }
   }
 
+  // Check all inputs are Host friendly.
+  for (const GraphView::OutputPort& fanin :
+       graph.GetFanins(node, /*include_controlling_nodes=*/false)) {
+    bool fanin_candidate = false;
+    TF_RETURN_IF_ERROR(IsNodeOutputPortHostFriendly(
+        graph, properties, *fanin.node, fanin.port_id,
+        op_device_outport_pinned_to_host_cache, &fanin_candidate));
+    if (!fanin_candidate) {
+      return Status::OK();
+    }
+  }
+
   *is_candidate = true;
   return Status::OK();
 }
 
-string TryFindHostDevice(const gtl::FlatSet<string>& devices,
-                         bool has_device_cpu, const string& device) {
+bool IsTPUGraphDef(const GraphDef& def) {
+  for (const auto& node : def.node()) {
+    if (node.op() == "TPUCompile" || node.op() == "TPUExecute" ||
+        node.op() == "TPUPartitionedCall") {
+      return true;
+    }
+  }
+  return false;
+}
+}  // end namespace
+
+// Tries to swap `device` to a Host device from `devices`. Returns true iff
+// there was a swap.
+bool TrySwapToHostDevice(const gtl::FlatSet<string>& devices,
+                         bool has_device_cpu, string* device) {
   // Force this node onto the CPU.
-  if (device.empty() && has_device_cpu) {
-    return "/device:CPU:0";
-  } else if (str_util::StrContains(device, DEVICE_GPU)) {
+  if (device->empty() && has_device_cpu) {
+    *device = "/device:CPU:0";
+    return true;
+  } else if (str_util::StrContains(*device, DEVICE_GPU)) {
     // Sometimes the cluster can have:
     //   devices = {"/device:CPU:0", "/device:XLA_GPU:0"}
     // and we need to handle them properly.
@@ -272,27 +333,19 @@ string TryFindHostDevice(const gtl::FlatSet<string>& devices,
          {std::pair<string, string>("GPU", "CPU:0"),
           std::pair<string, string>("/device", "/device:CPU:0")}) {
       const string device_host =
-          strings::StrCat(device.substr(0, device.rfind(device_match.first)),
+          strings::StrCat(device->substr(0, device->rfind(device_match.first)),
                           device_match.second);
       if (devices.find(device_host) != devices.end()) {
-        return device_host;
+        *device = device_host;
+        return true;
       }
     }
   }
 
-  // We couldn't find an appropriate Host device, return original device.
-  return device;
-}
-
-bool IsTPUGraphDef(const GraphDef& def) {
-  for (const auto& node : def.node()) {
-    if (node.op() == "TPUCompile" || node.op() == "TPUExecute" ||
-        node.op() == "TPUPartitionedCall") {
-      return true;
-    }
-  }
+  // We couldn't find an appropriate Host device, return false.
   return false;
 }
+
 }  // end namespace internal
 
 Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
@@ -324,20 +377,26 @@ Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   // All the Const nodes, and their original devices in topological order.
   std::vector<std::pair<NodeDef*, string>> const_nodes;
 
+  // Cache to map {op, device, port} -> bool on whether it is pinned to host.
+  internal::OpDevicePortOnHostMap op_device_outport_pinned_to_host_cache;
+  internal::OpDevicePortOnHostMap op_device_inport_pinned_to_host_cache;
+
   for (auto& node : *optimized_graph->mutable_node()) {
     bool is_candidate = false;
-    TF_RETURN_IF_ERROR(
-        internal::IsNodeHostCandidate(graph, &properties, node, &is_candidate));
+    TF_RETURN_IF_ERROR(internal::IsNodeHostCandidate(
+        graph, &properties, node, &op_device_outport_pinned_to_host_cache,
+        &is_candidate));
     if (!is_candidate) {
       continue;
     }
 
-    if (IsConstant(node)) {
-      const_nodes.emplace_back(&node, node.device());
+    const string original_device = node.device();
+    const bool swapped = internal::TrySwapToHostDevice(devices, has_device_cpu,
+                                                       node.mutable_device());
+    // Keep track of all Const nodes that we swapped.
+    if (swapped && IsConstant(node)) {
+      const_nodes.emplace_back(&node, original_device);
     }
-    // Try and swap the device to Host.
-    node.set_device(
-        internal::TryFindHostDevice(devices, has_device_cpu, node.device()));
   }
 
   // Traverse all `const_nodes`, and map them back to GPU greedily.
@@ -349,8 +408,9 @@ Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     // this node back onto the original device.
     for (const GraphView::InputPort& fanout : graph.GetFanouts(*node, false)) {
       // The consumer is not Host friendly, swap it back to the original device.
-      if (!internal::IsNodeInputPortHostFriendly(*fanout.node,
-                                                 fanout.port_id)) {
+      if (!internal::IsNodeInputPortHostFriendly(
+              *fanout.node, fanout.port_id,
+              &op_device_inport_pinned_to_host_cache)) {
         node->set_device(device);
         break;
       }
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h
index d557a03463..bed4a9ef95 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h
@@ -26,8 +26,8 @@ namespace tensorflow {
 namespace grappler {
 namespace internal {
 // Try and find an appropriate Host device in `devices` given `device`.
-string TryFindHostDevice(const gtl::FlatSet<string>& devices,
-                         bool has_device_cpu, const string& device);
+bool TrySwapToHostDevice(const gtl::FlatSet<string>& devices,
+                         bool has_device_cpu, string* device);
 }  // end namespace internal
 
 // Optimize TensorFlow ops that should be swapped into the CPU to avoid
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
index 7c64529441..9bb030b220 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
@@ -28,30 +28,60 @@ namespace {
 
 class PinToHostOptimizerTest : public GrapplerTest {};
 
-TEST_F(PinToHostOptimizerTest, TryFindHostDevice) {
+TEST_F(PinToHostOptimizerTest, TrySwapToHostDeviceNoDevices) {
   gtl::FlatSet<string> devices = {};
-  EXPECT_EQ("ABC", internal::TryFindHostDevice(devices, false, "ABC"));
-
-  devices = {"/device:CPU:0", "/device:XLA_GPU:0"};
-  EXPECT_EQ(internal::TryFindHostDevice(devices, true, ""), "/device:CPU:0");
-  EXPECT_EQ(internal::TryFindHostDevice(devices, true, "/device:XLA_GPU:0"),
-            "/device:CPU:0");
-  EXPECT_EQ(internal::TryFindHostDevice(devices, true, "/device:XLA_GPU:*"),
-            "/device:CPU:0");
-
-  devices = {"/device:XLA_CPU:0", "/device:XLA_GPU:0"};
-  EXPECT_EQ(internal::TryFindHostDevice(devices, false, ""), "");
-  EXPECT_EQ(internal::TryFindHostDevice(devices, false, "/device:XLA_GPU:0"),
-            "/device:XLA_CPU:0");
-  EXPECT_EQ(internal::TryFindHostDevice(devices, false, "/device:XLA_GPU:*"),
-            "/device:XLA_CPU:0");
-
-  devices = {"/device:XLA_GPU:0"};
-  EXPECT_EQ(internal::TryFindHostDevice(devices, false, ""), "");
-  EXPECT_EQ(internal::TryFindHostDevice(devices, false, "/device:XLA_GPU:0"),
-            "/device:XLA_GPU:0");
-  EXPECT_EQ(internal::TryFindHostDevice(devices, false, "/device:XLA_GPU:*"),
-            "/device:XLA_GPU:*");
+
+  string device = "ABC";
+  EXPECT_FALSE(internal::TrySwapToHostDevice(devices, false, &device));
+  EXPECT_EQ(device, "ABC");
+}
+
+TEST_F(PinToHostOptimizerTest, TrySwapToHostDeviceCpuXlaGpu) {
+  gtl::FlatSet<string> devices = {"/device:CPU:0", "/device:XLA_GPU:0"};
+
+  string device = "";
+  EXPECT_TRUE(internal::TrySwapToHostDevice(devices, true, &device));
+  EXPECT_EQ(device, "/device:CPU:0");
+
+  device = "/device:XLA_GPU:0";
+  EXPECT_TRUE(internal::TrySwapToHostDevice(devices, true, &device));
+  EXPECT_EQ(device, "/device:CPU:0");
+
+  device = "/device:XLA_GPU:*";
+  EXPECT_TRUE(internal::TrySwapToHostDevice(devices, true, &device));
+  EXPECT_EQ(device, "/device:CPU:0");
+}
+
+TEST_F(PinToHostOptimizerTest, TrySwapToHostDeviceXlaCpuXlaGpu) {
+  gtl::FlatSet<string> devices = {"/device:XLA_CPU:0", "/device:XLA_GPU:0"};
+
+  string device = "";
+  EXPECT_FALSE(internal::TrySwapToHostDevice(devices, false, &device));
+  EXPECT_TRUE(device.empty());
+
+  device = "/device:XLA_GPU:0";
+  EXPECT_TRUE(internal::TrySwapToHostDevice(devices, false, &device));
+  EXPECT_EQ(device, "/device:XLA_CPU:0");
+
+  device = "/device:XLA_GPU:*";
+  EXPECT_TRUE(internal::TrySwapToHostDevice(devices, false, &device));
+  EXPECT_EQ(device, "/device:XLA_CPU:0");
+}
+
+TEST_F(PinToHostOptimizerTest, TrySwapToHostDeviceXlaGpu) {
+  gtl::FlatSet<string> devices = {"/device:XLA_GPU:0"};
+
+  string device = "";
+  EXPECT_FALSE(internal::TrySwapToHostDevice(devices, false, &device));
+  EXPECT_TRUE(device.empty());
+
+  device = "/device:XLA_GPU:0";
+  EXPECT_FALSE(internal::TrySwapToHostDevice(devices, false, &device));
+  EXPECT_EQ(device, "/device:XLA_GPU:0");
+
+  device = "/device:XLA_GPU:*";
+  EXPECT_FALSE(internal::TrySwapToHostDevice(devices, false, &device));
+  EXPECT_EQ(device, "/device:XLA_GPU:*");
 }
 
 TEST_F(PinToHostOptimizerTest, OptimizeSmallOpsToHost) {
-- 
GitLab


From 411b9baa39636030181fdff15d2e985824b03d61 Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Mon, 8 Oct 2018 09:42:50 -0700
Subject: [PATCH 518/570] Reduce tolerances for rmsprop_test float16, to fix
 OSS builds.

PiperOrigin-RevId: 216200439
---
 tensorflow/contrib/optimizer_v2/rmsprop_test.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/rmsprop_test.py b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
index 44301ffe9e..83f5971039 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
@@ -157,8 +157,11 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
         self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
         self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
         self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-        self.assertAllCloseAccordingToType(var0_np, var0.eval())
-        self.assertAllCloseAccordingToType(var1_np, var1.eval())
+        # TODO(b/117393988): Reduce tolerances for float16.
+        self.assertAllCloseAccordingToType(
+            var0_np, var0.eval(), half_rtol=3e-3, half_atol=3e-3)
+        self.assertAllCloseAccordingToType(
+            var1_np, var1.eval(), half_rtol=3e-3, half_atol=3e-3)
 
   @parameterized.parameters([dtypes.float32, dtypes.float64])
   def testMinimizeSparseResourceVariable(self, dtype):
-- 
GitLab


From f435e776216c7a86f619a17064fd6e1deee638b3 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Mon, 8 Oct 2018 09:49:38 -0700
Subject: [PATCH 519/570] Avoid adding spurious ops when colocating with
 resource variables.

Prior to this change, tf.colocate_with(v) would insert spurious operations (a ReadVariableOp and an Identity) in the graph when v is a resource variable, and then
colocate the operations within the block with those newly added, otherwise disconnected, operations.

This commit avoids adding the unnecessary ReadVariableOp/Identity nodes and colocates
operations within the block with the VarHandleOp.

PiperOrigin-RevId: 216201638
---
 .../python/parameter_server_strategy_test.py  |  4 ++-
 tensorflow/python/framework/ops.py            | 28 ++++++++++++++++---
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index 353d11a583..9c112e4f85 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -262,7 +262,9 @@ class ParameterServerStrategyTestBase(
           h = f + 1.0
         self.assertEqual(
             device_util.canonicalize(u.device), tower_variable_device)
-        self.assertEqual(device_util.canonicalize(x.device), h.device)
+        self.assertEqual(
+            device_util.canonicalize(x.device),
+            device_util.canonicalize(h.device))
         return y_add, z_add, f
 
       y, z, f = d.call_for_each_tower(model_fn)
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 8bb177939e..77c2bc930e 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -4140,10 +4140,7 @@ class Graph(object):
     if op is None and not ignore_existing:
       raise ValueError("Trying to reset colocation (op is None) but "
                        "ignore_existing is not True")
-
-    if op is not None and not isinstance(op, Operation):
-      # We always want to colocate with the reference op.
-      op = internal_convert_to_tensor_or_indexed_slices(op, as_ref=True).op
+    op = _op_to_colocate_with(op)
 
     # By default, colocate_with resets the device function stack,
     # since colocate_with is typically used in specific internal
@@ -6168,4 +6165,27 @@ def _operation_conversion_error(op, dtype=None, name=None, as_ref=False):
                                                                name, as_ref))
 
 
+def _op_to_colocate_with(v):
+  """Operation object corresponding to v to use for colocation constraints."""
+  if v is None:
+    return None
+  if isinstance(v, Operation):
+    return v
+  # We always want to colocate with the reference op.
+  # When 'v' is a ResourceVariable, the reference op is the handle creating op.
+  #
+  # What this should be is:
+  # if isinstance(v, ResourceVariable):
+  #   return v.handle.op
+  # However, that would require a circular import dependency.
+  # As of October 2018, there were attempts underway to remove
+  # colocation constraints altogether. Assuming that will
+  # happen soon, perhaps this hack to work around the circular
+  # import dependency is acceptable.
+  if hasattr(v, "handle") and hasattr(v.handle, "op") and isinstance(
+      v.handle.op, Operation):
+    return v.handle.op
+  return internal_convert_to_tensor_or_indexed_slices(v, as_ref=True).op
+
+
 register_tensor_conversion_function(Operation, _operation_conversion_error)
-- 
GitLab


From 87315f41ced19136819cef56ef37636c52c474de Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 09:49:59 -0700
Subject: [PATCH 520/570] Remove Raises documentation on imperative_grads for
 ValueErrror not raised.

PiperOrigin-RevId: 216201714
---
 tensorflow/python/eager/imperative_grad.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tensorflow/python/eager/imperative_grad.py b/tensorflow/python/eager/imperative_grad.py
index 5f5af4ab6c..5c35860e9d 100644
--- a/tensorflow/python/eager/imperative_grad.py
+++ b/tensorflow/python/eager/imperative_grad.py
@@ -51,11 +51,6 @@ def imperative_grad(
 
   Raises:
     RuntimeError: if something goes wrong.
-    ValueError: if there is no sequence of differentiable operations connecting
-     a source and any target Tensor. This can happen either if the target is
-     not computed based on the source, if the tracing was set up incorrectly,
-     or if only non-differentiable functions of the source were used in the
-     computation of target.
   """
   return pywrap_tensorflow.TFE_Py_TapeGradient(
       tape._tape,  # pylint: disable=protected-access
-- 
GitLab


From 07df147ab20c4a5329148e5fb5f7f6b187cb73a4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 09:50:08 -0700
Subject: [PATCH 521/570] Enable PinToHostOptimizer.

PiperOrigin-RevId: 216201732
---
 tensorflow/core/grappler/optimizers/meta_optimizer.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index c3d70a1fdf..3f33b16ba8 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -107,7 +107,8 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
   MK_OPT("scoped_allocator",
          new ScopedAllocatorOptimizer(cfg_.scoped_allocator_optimization(),
                                       cfg_.scoped_allocator_opts()));
-  MK_OPT("small_op", new PinToHostOptimizer(cfg_.pin_to_host_optimization()));
+  MK_OPT("pin_to_host",
+         new PinToHostOptimizer(cfg_.pin_to_host_optimization()));
 
   return std::unique_ptr<GraphOptimizer>();
 }
@@ -139,7 +140,7 @@ Status MetaOptimizer::InitializeOptimizers(
   if (cfg_.remapping() != RewriterConfig::OFF) {
     optimizers->push_back(MakeUnique<Remapper>(cfg_.remapping()));
   }
-  if (cfg_.pin_to_host_optimization() == RewriterConfig::ON) {
+  if (cfg_.pin_to_host_optimization() != RewriterConfig::OFF) {
     optimizers->push_back(MakeUnique<PinToHostOptimizer>());
   }
   if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
@@ -527,7 +528,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
          cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
          cfg.debug_stripper() == RewriterConfig::ON ||
          cfg.scoped_allocator_optimization() == RewriterConfig::ON ||
-         cfg.pin_to_host_optimization() == RewriterConfig::ON ||
+         cfg.pin_to_host_optimization() != RewriterConfig::OFF ||
          !cfg.optimizers().empty() || !cfg.custom_optimizers().empty();
 }
 
-- 
GitLab


From da3abf6afeaf781b932bce9ccb6c17da911e49b6 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 8 Oct 2018 09:53:31 -0700
Subject: [PATCH 522/570] Benchmark for comparing original cond and cond_v2
 performance.

This benchmark creates many intermediates values, so we can make sure there's no performance overhead (it looks like there might be currently, or it might be from some other difference). It also runs in a defun and in legacy graph mode.

Results from my machine:

entry {
  name: "CondWithManyIntermediatesBenchmark.benchmark_cond_v1_defun"
  iters: 500
  wall_time: 1.25822591782
}

entry {
  name: "CondWithManyIntermediatesBenchmark.benchmark_cond_v2_defun"
  iters: 500
  wall_time: 5.99376106262
}

entry {
  name: "CondWithManyIntermediatesBenchmark.benchmark_cond_v1_graph"
  iters: 500
  wall_time: 2.05277585983
}

entry {
  name: "CondWithManyIntermediatesBenchmark.benchmark_cond_v2_graph"
  iters: 500
  wall_time: 2.84808516502
}

Clearly we have some work to do! I haven't looked into the time differences at all yet.

PiperOrigin-RevId: 216202325
---
 tensorflow/python/BUILD                       |  13 ++
 .../python/ops/control_flow_ops_benchmark.py  | 122 ++++++++++++++++++
 2 files changed, 135 insertions(+)
 create mode 100644 tensorflow/python/ops/control_flow_ops_benchmark.py

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index da3c56db92..822d596995 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -5196,6 +5196,19 @@ cuda_py_test(
     main = "ops/concat_benchmark.py",
 )
 
+cuda_py_test(
+    name = "control_flow_ops_benchmark",
+    srcs = ["ops/control_flow_ops_benchmark.py"],
+    additional_deps = [
+        ":client_testlib",
+        ":constant_op",
+        ":control_flow_ops",
+        ":framework_ops",
+        "//tensorflow/python/eager:function",
+    ],
+    main = "ops/control_flow_ops_benchmark.py",
+)
+
 cuda_py_test(
     name = "conv2d_benchmark",
     size = "large",
diff --git a/tensorflow/python/ops/control_flow_ops_benchmark.py b/tensorflow/python/ops/control_flow_ops_benchmark.py
new file mode 100644
index 0000000000..9ba5ff2c0f
--- /dev/null
+++ b/tensorflow/python/ops/control_flow_ops_benchmark.py
@@ -0,0 +1,122 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmark for control flow ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+
+class CondWithManyIntermediatesBenchmark(test.Benchmark):
+  """Checks the runtime performance of outputting all intermediates."""
+
+  NUM_INTERMEDIATES = 1000
+  NUM_ITERS = 500
+  NUM_WARM_UP_ITERS = 50
+
+  def _create_cond(self, x):
+
+    def branch_fn():
+      # Use a random value so the adds can't be constant folded.
+      return x + sum(random_ops.random_normal([])
+                     for _ in range(self.NUM_INTERMEDIATES))
+
+    # Use a dynamic predicate to make sure the cond isn't constant folded.
+    return control_flow_ops.cond(math_ops.not_equal(x, -1),
+                                 branch_fn, lambda: 0.0)
+
+  def _benchmark_defun(self):
+    """Benchmarks cond in a defun."""
+
+    @function.defun
+    def cond_fn(x):
+      return self._create_cond(x)
+
+    # Warm up
+    for _ in range(self.NUM_WARM_UP_ITERS):
+      cond_fn(0.0)
+
+    start_time = time.time()
+
+    for _ in range(self.NUM_ITERS):
+      cond_fn(0.0)
+
+    self.report_benchmark(
+        wall_time=time.time() - start_time,
+        iters=self.NUM_ITERS)
+
+  def _benchmark_graph(self):
+    """Benchmarks cond in legacy graph mode."""
+    with context.graph_mode():
+      with ops.Graph().as_default():
+        x = array_ops.placeholder(dtypes.float32)
+        cond_val = self._create_cond(x)
+
+        with session.Session() as sess:
+          cond_fn = sess.make_callable(cond_val, [x])
+
+          # Warm up
+          for _ in range(self.NUM_WARM_UP_ITERS):
+            cond_fn(0.0)
+
+          start_time = time.time()
+
+          for _ in range(self.NUM_ITERS):
+            cond_fn(0.0)
+
+          self.report_benchmark(
+              wall_time=time.time() - start_time,
+              iters=self.NUM_ITERS)
+
+  def benchmark_cond_v1_defun(self):
+    old_val = control_flow_ops.ENABLE_COND_V2
+    control_flow_ops.ENABLE_COND_V2 = False
+    self._benchmark_defun()
+    control_flow_ops.ENABLE_COND_V2 = old_val
+
+  def benchmark_cond_v2_defun(self):
+    old_val = control_flow_ops.ENABLE_COND_V2
+    control_flow_ops.ENABLE_COND_V2 = True
+    self._benchmark_defun()
+    control_flow_ops.ENABLE_COND_V2 = old_val
+
+  def benchmark_cond_v1_graph(self):
+    old_val = control_flow_ops.ENABLE_COND_V2
+    control_flow_ops.ENABLE_COND_V2 = False
+    self._benchmark_graph()
+    control_flow_ops.ENABLE_COND_V2 = old_val
+
+  def benchmark_cond_v2_graph(self):
+    old_val = control_flow_ops.ENABLE_COND_V2
+    control_flow_ops.ENABLE_COND_V2 = True
+    self._benchmark_graph()
+    control_flow_ops.ENABLE_COND_V2 = old_val
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
-- 
GitLab


From 6dd826b856acf6b060379251bfd91a950ee2b0af Mon Sep 17 00:00:00 2001
From: Makoto Uchida <muchida@google.com>
Date: Mon, 8 Oct 2018 10:00:18 -0700
Subject: [PATCH 523/570] Fix typo

PiperOrigin-RevId: 216203408
---
 .../experimental/kernel_tests/reader_dataset_ops_test_base.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
index fe0b3b5f3b..77df8310d4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
@@ -64,7 +64,7 @@ class FixedLengthRecordDatasetTestBase(test_base.DatasetTestBase):
 
 
 class MakeBatchedFeaturesDatasetTestBase(test_base.DatasetTestBase):
-  """Base class for setting up and testing `make_batched_feature_dataset`."""
+  """Base class for setting up and testing `make_batched_features_dataset`."""
 
   def setUp(self):
     super(MakeBatchedFeaturesDatasetTestBase, self).setUp()
-- 
GitLab


From 0e1ba8886b6a333b1ed8ed7548c55041c34e9623 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 10:09:50 -0700
Subject: [PATCH 524/570] Fix compilation in unique_op when Eigen::Index !=
 int64.

PiperOrigin-RevId: 216205396
---
 tensorflow/core/kernels/unique_op.cc | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 3559baa18e..3bdcfc90b8 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -108,7 +108,7 @@ class UniqueOp : public OpKernel {
 
       std::unordered_map<T, TIndex> uniq;
       uniq.reserve(2 * N);
-      for (int64 i = 0, j = 0; i < N; ++i) {
+      for (Eigen::Index i = 0, j = 0; i < N; ++i) {
         auto it = uniq.insert(std::make_pair(Tin(i), j));
         idx_vec(i) = it.first->second;
         if (it.second) {
@@ -131,19 +131,20 @@ class UniqueOp : public OpKernel {
       // General implementation when unique is run over multiple elements.
       auto Tin = input.shaped<T, 3>(new_sizes);
 
-      auto hash_fn = [&Tin](const int64& key) {
+      auto hash_fn = [&Tin](const Eigen::Index& key) {
         size_t h = 0;
-        for (int64 i = 0; i < Tin.dimension(0); i++) {
-          for (int64 j = 0; j < Tin.dimension(2); j++) {
+        for (Eigen::Index i = 0; i < Tin.dimension(0); i++) {
+          for (Eigen::Index j = 0; j < Tin.dimension(2); j++) {
             h = Hash64Combine(h, hash<T>{}(Tin(i, key, j)));
           }
         }
         return h;
       };
 
-      auto equal_to_fn = [&Tin](const int64& lhs, const int64& rhs) {
-        for (int64 i = 0; i < Tin.dimension(0); i++) {
-          for (int64 j = 0; j < Tin.dimension(2); j++) {
+      auto equal_to_fn = [&Tin](const Eigen::Index& lhs,
+                                const Eigen::Index& rhs) {
+        for (Eigen::Index i = 0; i < Tin.dimension(0); i++) {
+          for (Eigen::Index j = 0; j < Tin.dimension(2); j++) {
             if (Tin(i, lhs, j) != Tin(i, rhs, j)) {
               return false;
             }
-- 
GitLab


From 0e42fd6d0a88b30ab57959f38c79bea19d745ec3 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Mon, 8 Oct 2018 10:14:58 -0700
Subject: [PATCH 525/570] [tf.data] Adding specialization for `MapDataset`,
 `ParallelMapDataset`, and `MapAndBatchDataset` whose user-provided functions
 have the property that each output argument take its value directly from an
 input argument (e.g. `lambda x, y: y, x`). This specialization can produce
 the result without having to schedule the function using the executor.

PiperOrigin-RevId: 216206232
---
 tensorflow/core/kernels/data/BUILD            |  14 ++
 tensorflow/core/kernels/data/dataset_utils.cc |  47 +++++
 tensorflow/core/kernels/data/dataset_utils.h  |  20 ++
 .../core/kernels/data/dataset_utils_test.cc   |  46 +++++
 .../core/kernels/data/filter_dataset_op.cc    | 162 ++++++---------
 .../kernels/data/map_and_batch_dataset_op.cc  | 187 +++++++++++-------
 .../core/kernels/data/map_dataset_op.cc       |  62 ++++--
 .../kernels/data/parallel_map_dataset_op.cc   |  79 +++++---
 .../kernels/data/parallel_map_iterator.cc     |  17 +-
 .../core/kernels/data/parallel_map_iterator.h |   2 +-
 .../kernels/data/parse_example_dataset_op.cc  |   2 +-
 .../kernel_tests/map_and_batch_test.py        |  31 +++
 .../kernel_tests/filter_dataset_op_test.py    |   2 +-
 .../data/kernel_tests/map_dataset_op_test.py  |  95 +++++++--
 .../python/data/kernel_tests/test_base.py     |  29 +++
 15 files changed, 565 insertions(+), 230 deletions(-)
 create mode 100644 tensorflow/core/kernels/data/dataset_utils_test.cc

diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 451f8c1a6c..37c1c54786 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -45,6 +45,16 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "dataset_utils_test",
+    srcs = ["dataset_utils_test.cc"],
+    deps = [
+        ":dataset_utils",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "captured_function",
     srcs = ["captured_function.cc"],
@@ -205,6 +215,7 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset",
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -232,6 +243,7 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset",
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -245,6 +257,7 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset",
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -285,6 +298,7 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset",
+        ":dataset_utils",
         ":parallel_map_iterator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index e10833f525..a40f7f2146 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -15,10 +15,57 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 
 namespace tensorflow {
 namespace data {
 
+Status ComputeShortCircuitIndices(OpKernelContext* ctx,
+                                  const NameAttrList& func,
+                                  std::vector<int>* indices) {
+  FunctionLibraryRuntime::Handle fn_handle;
+  TF_RETURN_IF_ERROR(ctx->function_library()->Instantiate(
+      func.name(), AttrSlice(&func.attr()), &fn_handle));
+  auto cleanup = gtl::MakeCleanup([ctx, fn_handle]() {
+    Status s = ctx->function_library()->ReleaseHandle(fn_handle);
+    if (!s.ok()) {
+      LOG(WARNING) << "Failed to release handle: " << s.error_message();
+    }
+  });
+
+  const FunctionBody* fn_body =
+      ctx->function_library()->GetFunctionBody(fn_handle);
+  indices->resize(fn_body->ret_nodes.size());
+  for (size_t i = 0; i < fn_body->ret_nodes.size(); ++i) {
+    Node* ret_node = fn_body->ret_nodes[i];
+    Node* ret_input_node;
+    TF_RETURN_IF_ERROR(ret_node->input_node(0, &ret_input_node));
+    if (ret_input_node->def().op() == FunctionLibraryDefinition::kArgOp) {
+      TF_RETURN_IF_ERROR(
+          GetNodeAttr(ret_input_node->def(), "index", &((*indices)[i])));
+    } else {
+      indices->clear();
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+std::vector<bool> ComputeMoveVector(const std::vector<int>& indices) {
+  std::map<int, int> last_use;
+  for (size_t i = 0; i < indices.size(); ++i) {
+    last_use[indices[i]] = i;
+  }
+  std::vector<bool> can_move;
+  can_move.resize(indices.size());
+  for (size_t i = 0; i < indices.size(); ++i) {
+    can_move[i] = last_use[indices[i]] == i;
+  }
+  return can_move;
+}
+
 Status MakeIteratorFromInputElement(
     IteratorContext* ctx, const std::vector<Tensor>& input_element,
     int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 6ec1350cd4..d777062293 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -22,6 +22,26 @@ limitations under the License.
 namespace tensorflow {
 namespace data {
 
+// This method is used to determine whether we can short-circuit the evaluation
+// of the user-defined function `func`. Short-circuting is possible if every
+// function output corresponds to one of its inputs (e.g. `f(x) = x`, `f(x,y) =
+// (y,x)`, or `f(x) = (x,x)`).
+//
+// If short-circuiting is possible, the method stores the mapping from output
+// indices to input indices in `indices`. Otherwise, `indices` will be empty.
+//
+// Returns non-ok status if analysis of the function fails.
+//
+// TODO(jsimsa): Extend this to support constants as well.
+Status ComputeShortCircuitIndices(OpKernelContext* ctx,
+                                  const NameAttrList& func,
+                                  std::vector<int>* indices);
+
+// Given a vector that maps output indices to input indices, return a vector
+// that identifies for which output indices can we move the input (assuming
+// output indices are processed left to right).
+std::vector<bool> ComputeMoveVector(const std::vector<int>& indices);
+
 Status MakeIteratorFromInputElement(
     IteratorContext* ctx, const std::vector<Tensor>& input_element,
     int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
diff --git a/tensorflow/core/kernels/data/dataset_utils_test.cc b/tensorflow/core/kernels/data/dataset_utils_test.cc
new file mode 100644
index 0000000000..43295b8ebb
--- /dev/null
+++ b/tensorflow/core/kernels/data/dataset_utils_test.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+TEST(DatasetUtils, ComputeMoveVector) {
+  struct TestCase {
+    std::vector<int> indices;
+    std::vector<bool> expected;
+  };
+
+  TestCase test_cases[] = {
+      TestCase{{}, {}},
+      TestCase{{1}, {true}},
+      TestCase{{1, 1}, {false, true}},
+      TestCase{{1, 2}, {true, true}},
+      TestCase{{1, 1, 2}, {false, true, true}},
+      TestCase{{1, 2, 2}, {true, false, true}},
+  };
+
+  for (auto& test_case : test_cases) {
+    EXPECT_EQ(test_case.expected, ComputeMoveVector(test_case.indices));
+  }
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index 00884314a9..be7d182a1f 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -18,9 +18,11 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -31,67 +33,84 @@ namespace {
 
 class FilterDatasetOp : public UnaryDatasetOpKernel {
  public:
+  using FilterIteratorPredicate =
+      std::function<Status(IteratorContext*, std::vector<Tensor>, bool*)>;
+
   explicit FilterDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
+      : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("predicate", &func_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    FunctionLibraryRuntime::Handle pred_handle;
-    OP_REQUIRES_OK(ctx,
-                   ctx->function_library()->Instantiate(
-                       func_.name(), AttrSlice(&func_.attr()), &pred_handle));
-    auto cleanup = gtl::MakeCleanup([ctx, pred_handle]() {
-      OP_REQUIRES_OK(ctx, ctx->function_library()->ReleaseHandle(pred_handle));
-    });
-
-    const FunctionBody* pred_body =
-        ctx->function_library()->GetFunctionBody(pred_handle);
-    OP_REQUIRES(ctx, pred_body->ret_nodes.size() == 1,
-                errors::InvalidArgument(
-                    "predicate function must have a single return value."));
-    Node* ret_node = pred_body->ret_nodes[0];
-    Node* ret_input_node;
-    OP_REQUIRES_OK(ctx, ret_node->input_node(0, &ret_input_node));
-
     std::unique_ptr<CapturedFunction> captured_func;
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
                                                  &captured_func));
 
-    if (ret_input_node->def().op() == "_Arg") {
-      int32 index = -1;
-      OP_REQUIRES_OK(ctx, GetNodeAttr(ret_input_node->def(), "index", &index));
-      *output = new FilterTensorDataset(ctx, input, func_,
-                                        std::move(captured_func), index);
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+    OP_REQUIRES(ctx, indices.size() <= 1,
+                errors::InvalidArgument(
+                    "predicate function has more than one return value."));
+
+    FilterIteratorPredicate filter_pred;
+    if (indices.empty()) {
+      CapturedFunction* raw_captured_func = captured_func.get();
+      filter_pred = [raw_captured_func](IteratorContext* ctx,
+                                        const std::vector<Tensor>& args,
+                                        bool* out_matched) {
+        std::vector<Tensor> result;
+        TF_RETURN_IF_ERROR(
+            raw_captured_func->RunWithBorrowedArgs(ctx, args, &result));
+
+        if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
+            result[0].NumElements() != 1) {
+          return errors::InvalidArgument(
+              "Filter predicate `f` must return a scalar bool.");
+        }
+        *out_matched = result[0].scalar<bool>()();
+        return Status::OK();
+      };
     } else {
-      *output = new FilterFunctionDataset(ctx, input, func_,
-                                          std::move(captured_func));
+      filter_pred = [indices](IteratorContext* ctx,
+                              const std::vector<Tensor>& args,
+                              bool* out_matched) {
+        const Tensor& predicate = args[indices[0]];
+        if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
+          return errors::InvalidArgument(
+              "Filter predicate `f` must return a scalar bool.");
+        }
+        *out_matched = predicate.scalar<bool>()();
+        return Status::OK();
+      };
     }
+
+    *output = new Dataset(ctx, input, func_, std::move(captured_func),
+                          std::move(filter_pred));
   }
 
  private:
-  const int graph_def_version_;
-
-  class FilterDatasetBase : public DatasetBase {
+  class Dataset : public DatasetBase {
    public:
-    FilterDatasetBase(OpKernelContext* ctx, const DatasetBase* input,
-                      const NameAttrList& func,
-                      std::unique_ptr<CapturedFunction> captured_func)
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
+            std::unique_ptr<CapturedFunction> captured_func,
+            FilterIteratorPredicate filter_pred)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
-          captured_func_(std::move(captured_func)) {
+          captured_func_(std::move(captured_func)),
+          filter_pred_(std::move(filter_pred)) {
       input_->Ref();
     }
 
-    ~FilterDatasetBase() override { input_->Unref(); }
+    ~Dataset() override { input_->Unref(); }
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Filter")}));
+      return MakeUnique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Filter")},
+          filter_pred_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -133,17 +152,15 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       return Status::OK();
     }
 
-    virtual Status EvaluatePredicate(IteratorContext* ctx,
-                                     const std::vector<Tensor>& element,
-                                     bool* out_matched) const = 0;
-
    private:
-    class Iterator : public DatasetIterator<FilterDatasetBase> {
+    class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<FilterDatasetBase>(params),
+      explicit Iterator(const Params& params,
+                        FilterIteratorPredicate filter_pred)
+          : DatasetIterator<Dataset>(params),
             filtered_elements_(0),
-            dropped_elements_(0) {
+            dropped_elements_(0),
+            filter_pred_(std::move(filter_pred)) {
         std::vector<string> components =
             str_util::Split(params.prefix, "::", str_util::SkipEmpty());
         prefix_end_ = components.back();
@@ -180,8 +197,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
             return Status::OK();
           }
 
-          TF_RETURN_IF_ERROR(
-              dataset()->EvaluatePredicate(ctx, *out_tensors, &matched));
+          TF_RETURN_IF_ERROR(filter_pred_(ctx, *out_tensors, &matched));
           if (!matched) {
             // Clear the output tensor list since it didn't match.
             out_tensors->clear();
@@ -251,64 +267,14 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       int64 filtered_elements_ GUARDED_BY(mu_);
       int64 dropped_elements_ GUARDED_BY(mu_);
+      const FilterIteratorPredicate filter_pred_;
       string prefix_end_;
     };
 
     const DatasetBase* const input_;
     const NameAttrList func_;
-
-   protected:
     const std::unique_ptr<CapturedFunction> captured_func_;
-  };
-
-  class FilterFunctionDataset : public FilterDatasetBase {
-   public:
-    using FilterDatasetBase::FilterDatasetBase;
-
-   protected:
-    Status EvaluatePredicate(IteratorContext* ctx,
-                             const std::vector<Tensor>& element,
-                             bool* out_matched) const override {
-      // TODO(mrry): Avoid blocking a threadpool thread. We will need to
-      // stack-rip the iterators and use async kernels.
-      std::vector<Tensor> result;
-      TF_RETURN_IF_ERROR(
-          captured_func_->RunWithBorrowedArgs(ctx, element, &result));
-
-      if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
-          result[0].NumElements() != 1) {
-        return errors::InvalidArgument(
-            "Filter predicate `f` must return a scalar bool.");
-      }
-      *out_matched = result[0].scalar<bool>()();
-      return Status::OK();
-    }
-  };
-
-  class FilterTensorDataset : public FilterDatasetBase {
-   public:
-    FilterTensorDataset(OpKernelContext* ctx, const DatasetBase* input,
-                        const NameAttrList& func,
-                        std::unique_ptr<CapturedFunction> captured_func,
-                        int32 index)
-        : FilterDatasetBase(ctx, input, func, std::move(captured_func)),
-          index_(index) {}
-
-   protected:
-    Status EvaluatePredicate(IteratorContext* ctx,
-                             const std::vector<Tensor>& element,
-                             bool* out_matched) const override {
-      const Tensor& predicate = element[index_];
-      if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
-        return errors::InvalidArgument(
-            "Filter predicate `f` must return a scalar bool.");
-      }
-      *out_matched = predicate.scalar<bool>()();
-      return Status::OK();
-    }
-
-   private:
-    const int32 index_;
+    const FilterIteratorPredicate filter_pred_;
   };
 
  private:
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index bf08970560..f45a239793 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/inplace_ops_functor.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -29,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -41,6 +43,10 @@ namespace {
 // transformation more robust.
 class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
  public:
+  using MapAndBatchIteratorFunction =
+      std::function<void(IteratorContext*, const string&, std::vector<Tensor>,
+                         std::shared_ptr<std::vector<Tensor>>, StatusCallback)>;
+
   explicit MapAndBatchDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx),
         op_version_(ctx->def().op() == "MapAndBatchDataset" ? 1 : 2) {
@@ -91,31 +97,73 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
                                                  &captured_func));
 
-    *output = new Dataset(ctx, input, batch_size, num_parallel_calls,
-                          drop_remainder, output_types_, output_shapes_, func_,
-                          std::move(captured_func), &ctx->eigen_cpu_device());
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+
+    MapAndBatchIteratorFunction map_func;
+    CapturedFunction* raw_captured_func = captured_func.get();
+    if (indices.empty()) {
+      map_func = [raw_captured_func](
+                     IteratorContext* ctx, const string& prefix,
+                     std::vector<Tensor> args,
+                     std::shared_ptr<std::vector<Tensor>> out_tensors,
+                     StatusCallback done) {
+        raw_captured_func->RunAsync(ctx, std::move(args), out_tensors.get(),
+                                    std::move(done), prefix);
+      };
+    } else {
+      std::vector<bool> can_move = ComputeMoveVector(indices);
+      map_func = [raw_captured_func, indices, can_move](
+                     IteratorContext* ctx, const string& prefix,
+                     std::vector<Tensor> args,
+                     std::shared_ptr<std::vector<Tensor>> out_tensors,
+                     StatusCallback done) {
+        const std::vector<Tensor>& captured_inputs =
+            raw_captured_func->captured_inputs();
+        size_t num_args = args.size();
+        for (size_t i = 0; i < indices.size(); ++i) {
+          if (indices[i] < num_args) {
+            if (can_move[i]) {
+              out_tensors->push_back(std::move(args[indices[i]]));
+            } else {
+              out_tensors->push_back(args[indices[i]]);
+            }
+          } else {
+            out_tensors->push_back(captured_inputs[indices[i] - num_args]);
+          }
+        }
+        done(Status::OK());
+      };
+    }
+
+    *output = new Dataset(ctx, input, func_, batch_size, num_parallel_calls,
+                          drop_remainder, output_types_, output_shapes_,
+                          std::move(captured_func), &ctx->eigen_cpu_device(),
+                          std::move(map_func));
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func, int64 batch_size,
             int64 num_parallel_calls, bool drop_remainder,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
-            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
-            const Eigen::ThreadPoolDevice* device)
+            const Eigen::ThreadPoolDevice* device,
+            MapAndBatchIteratorFunction map_func)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
+          func_(func),
           batch_size_(batch_size),
           num_parallel_calls_(num_parallel_calls),
           drop_remainder_(drop_remainder),
           output_types_(output_types),
           output_shapes_(output_shapes),
-          map_fn_(func),
           captured_func_(std::move(captured_func)),
-          device_(device) {
+          device_(device),
+          map_func_(std::move(map_func)) {
       input_->Ref();
     }
 
@@ -123,8 +171,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::MapAndBatch")}));
+      return MakeUnique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::MapAndBatch")},
+          map_func_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -143,7 +192,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, map_fn_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
       Node* input_graph_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* batch_size_node;
@@ -165,7 +214,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         other_arguments_types.emplace_back(t.dtype());
       }
       AttrValue f;
-      b->BuildAttrValue(map_fn_, &f);
+      b->BuildAttrValue(func_, &f);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
 
@@ -185,12 +234,14 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params)
+      explicit Iterator(const Params& params,
+                        MapAndBatchIteratorFunction map_func)
           : DatasetIterator<Dataset>(params),
             mu_(std::make_shared<mutex>()),
             cond_var_(std::make_shared<condition_variable>()),
             num_parallel_calls_(std::make_shared<model::SharedState>(
-                params.dataset->num_parallel_calls_, mu_, cond_var_)) {}
+                params.dataset->num_parallel_calls_, mu_, cond_var_)),
+            map_func_(std::move(map_func)) {}
 
       ~Iterator() override {
         mutex_lock l(*mu_);
@@ -297,44 +348,6 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         int64 num_calls;  // access guarded by owner's mutex
       };
 
-      void Callback(const std::shared_ptr<IteratorContext>& ctx,
-                    const std::shared_ptr<BatchResult>& result,
-                    const std::shared_ptr<std::vector<Tensor>>& return_values,
-                    int64 offset, const Status& status) LOCKS_EXCLUDED(*mu_) {
-        result->UpdateStatus(status);
-        if (status.ok()) {
-          EnsureOutputAllocated(ctx, result, return_values);
-          for (size_t i = 0; i < return_values->size(); ++i) {
-            const Tensor& tensor = return_values->at(i);
-            Tensor* batch = &(result->output)[i];
-            if (tensor.NumElements() !=
-                (batch->NumElements() / batch->dim_size(0))) {
-              TensorShape batch_shape = batch->shape();
-              batch_shape.RemoveDim(0);
-              result->UpdateStatus(errors::InvalidArgument(
-                  "Cannot add tensor to the batch: number of elements does not "
-                  "match. Shapes are: [tensor]: ",
-                  tensor.shape().DebugString(),
-                  ", [batch]: ", batch_shape.DebugString()));
-              break;
-            }
-            // TODO(mrry): Add a version of DoParallelConcat that allows us to
-            // move `tensor` where possible, to speed up string tensor batching.
-            Status copy_status = ::tensorflow::functor::DoParallelConcat(
-                *dataset()->device_, tensor, offset, batch);
-            if (!copy_status.ok()) {
-              result->UpdateStatus(copy_status);
-              break;
-            }
-          }
-          {
-            mutex_lock l(result->mu);
-            result->num_elements++;
-          }
-        }
-        CallCompleted(result);
-      }
-
       void CallCompleted(const std::shared_ptr<BatchResult>& result)
           LOCKS_EXCLUDED(*mu_) {
         mutex_lock l(*mu_);
@@ -363,21 +376,48 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           return;
         }
 
-        // Call `captured_func_(input_element)`, using `Callback` to store the
-        // result in `result`.
-        (*ctx->runner())(std::bind(
-            [this, result, offset](std::shared_ptr<IteratorContext> ctx,
-                                   std::vector<Tensor> input_element) {
-              std::shared_ptr<std::vector<Tensor>> return_values(
-                  new std::vector<Tensor>());
-              dataset()->captured_func_->RunAsync(
-                  ctx.get(), std::move(input_element), return_values.get(),
-                  [this, ctx, result, return_values, offset](Status status) {
-                    Callback(ctx, result, return_values, offset, status);
-                  },
-                  prefix());
-            },
-            ctx, std::move(input_element)));
+        std::shared_ptr<std::vector<Tensor>> return_values =
+            std::make_shared<std::vector<Tensor>>();
+        auto done = [this, ctx, result, return_values, offset](Status status) {
+          result->UpdateStatus(status);
+          if (status.ok()) {
+            EnsureOutputAllocated(ctx, result, return_values);
+            for (size_t i = 0; i < return_values->size(); ++i) {
+              const Tensor& tensor = return_values->at(i);
+              Tensor* batch = &(result->output)[i];
+              if (tensor.NumElements() !=
+                  (batch->NumElements() / batch->dim_size(0))) {
+                TensorShape batch_shape = batch->shape();
+                batch_shape.RemoveDim(0);
+                result->UpdateStatus(errors::InvalidArgument(
+                    "Cannot add tensor to the batch: number of elements does "
+                    "not match. Shapes are: [tensor]: ",
+                    tensor.shape().DebugString(),
+                    ", [batch]: ", batch_shape.DebugString()));
+                break;
+              }
+              // TODO(mrry): Add a version of DoParallelConcat that allows us to
+              // move `tensor` where possible, to speed up string tensor
+              // batching.
+              Status copy_status = ::tensorflow::functor::DoParallelConcat(
+                  *dataset()->device_, tensor, offset, batch);
+              if (!copy_status.ok()) {
+                result->UpdateStatus(copy_status);
+                break;
+              }
+            }
+            {
+              mutex_lock l(result->mu);
+              result->num_elements++;
+            }
+          }
+          CallCompleted(result);
+        };
+
+        // Apply the map function on `input_element`, storing the result in
+        // `return_values`, and invoking `done` when finished.
+        map_func_(ctx.get(), prefix(), std::move(input_element),
+                  std::move(return_values), std::move(done));
       }
 
       Status CopyPartialBatch(Tensor* output, const Tensor& value,
@@ -404,7 +444,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       void EnsureRunnerThreadStarted(IteratorContext* ctx)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
-          std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
+          auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
           runner_thread_.reset(ctx->env()->StartThread(
               {}, "runner_thread",
               std::bind(&Iterator::RunnerThread, this, ctx_copy)));
@@ -509,8 +549,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
             while (!busy()) {
               if (call_counter_ % dataset()->batch_size_ == 0) {
-                batch_results_.emplace_back(
-                    new BatchResult(dataset()->batch_size_));
+                batch_results_.push_back(
+                    std::make_shared<BatchResult>(dataset()->batch_size_));
               }
               int64 offset = call_counter_++ % dataset()->batch_size_;
               new_calls.emplace_back(batch_results_.back(), offset);
@@ -527,7 +567,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       Status ReadBatchResult(IteratorContext* ctx, IteratorStateReader* reader,
                              size_t index) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        batch_results_.emplace_back(new BatchResult(dataset()->batch_size_));
+        batch_results_.push_back(
+            std::make_shared<BatchResult>(dataset()->batch_size_));
         std::shared_ptr<BatchResult> result = batch_results_.back();
         string prefix = strings::StrCat("batch_results_", index);
         mutex_lock l(result->mu);
@@ -653,6 +694,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       const std::shared_ptr<condition_variable> cond_var_;
       // Identifies the maximum number of parallel calls.
       const std::shared_ptr<model::SharedState> num_parallel_calls_;
+      const MapAndBatchIteratorFunction map_func_;
+
       // Counts the number of outstanding calls for this batch.
       int64 num_calls_ GUARDED_BY(*mu_) = 0;
       // Counts the total number of calls.
@@ -671,9 +714,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     const bool drop_remainder_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
-    const NameAttrList map_fn_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const Eigen::ThreadPoolDevice* device_;  // not owned
+    const MapAndBatchIteratorFunction map_func_;
   };
 
   const int op_version_;
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index f112e1dc43..6b6ffabf4f 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -17,7 +17,9 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -28,6 +30,9 @@ namespace {
 
 class MapDatasetOp : public UnaryDatasetOpKernel {
  public:
+  using MapIteratorFunction = std::function<Status(
+      IteratorContext*, std::vector<Tensor>, std::vector<Tensor>*)>;
+
   explicit MapDatasetOp(OpKernelConstruction* ctx) : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
@@ -43,8 +48,42 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
                                                  use_inter_op_parallelism_,
                                                  &captured_func));
 
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+
+    MapIteratorFunction map_func;
+    CapturedFunction* raw_captured_func = captured_func.get();
+    if (indices.empty()) {
+      map_func = [raw_captured_func](IteratorContext* ctx,
+                                     std::vector<Tensor> args,
+                                     std::vector<Tensor>* out_tensors) {
+        return raw_captured_func->Run(ctx, std::move(args), out_tensors);
+      };
+    } else {
+      std::vector<bool> can_move = ComputeMoveVector(indices);
+      map_func = [raw_captured_func, indices, can_move](
+                     IteratorContext* ctx, std::vector<Tensor> args,
+                     std::vector<Tensor>* out_tensors) {
+        const std::vector<Tensor>& captured_inputs =
+            raw_captured_func->captured_inputs();
+        size_t num_args = args.size();
+        for (size_t i = 0; i < indices.size(); ++i) {
+          if (indices[i] < num_args) {
+            if (can_move[i]) {
+              out_tensors->push_back(std::move(args[indices[i]]));
+            } else {
+              out_tensors->push_back(args[indices[i]]);
+            }
+          } else {
+            out_tensors->push_back(captured_inputs[indices[i] - num_args]);
+          }
+        }
+        return Status::OK();
+      };
+    }
+
     *output = new Dataset(ctx, input, func_, std::move(captured_func),
-                          output_types_, output_shapes_);
+                          output_types_, output_shapes_, std::move(map_func));
   }
 
  private:
@@ -54,13 +93,15 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
             const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
+            const std::vector<PartialTensorShape>& output_shapes,
+            MapIteratorFunction map_func)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
           captured_func_(std::move(captured_func)),
           output_types_(output_types),
-          output_shapes_(output_shapes) {
+          output_shapes_(output_shapes),
+          map_func_(std::move(map_func)) {
       input_->Ref();
     }
 
@@ -68,8 +109,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Map")}));
+      return MakeUnique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Map")}, map_func_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -116,8 +157,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
+      explicit Iterator(const Params& params, MapIteratorFunction map_func)
+          : DatasetIterator<Dataset>(params), map_func_(std::move(map_func)) {}
 
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
@@ -139,10 +180,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
           return Status::OK();
         }
 
-        // TODO(mrry): Avoid blocking a threadpool thread. We will need to
-        // stack-rip the iterators and use async kernels.
-        Status s =
-            dataset()->captured_func_->Run(ctx, std::move(args), out_tensors);
+        Status s = map_func_(ctx, args, out_tensors);
         if (errors::IsOutOfRange(s)) {
           // `f` may deliberately raise `errors::OutOfRange` to indicate
           // that we should terminate the iteration early.
@@ -167,6 +205,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
      private:
       std::unique_ptr<IteratorBase> input_impl_;
+      const MapIteratorFunction map_func_;
     };
 
     const DatasetBase* const input_;
@@ -174,6 +213,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
     const std::unique_ptr<CapturedFunction> captured_func_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
+    const MapIteratorFunction map_func_;
   };
 
   DataTypeVector output_types_;
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index 6abe6c8338..3a14924fba 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/data/parallel_map_iterator.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -56,9 +57,55 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
                                                  use_inter_op_parallelism_,
                                                  &captured_func));
 
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+
+    ParallelMapIteratorFunction map_func;
+    CapturedFunction* raw_captured_func = captured_func.get();
+    if (indices.empty()) {
+      map_func = [raw_captured_func](IteratorContext* ctx, const string& prefix,
+                                     std::vector<Tensor> args,
+                                     std::vector<Tensor>* out_tensors,
+                                     StatusCallback done) {
+        raw_captured_func->RunAsync(ctx, std::move(args), out_tensors,
+                                    std::move(done), prefix);
+      };
+      if (!use_inter_op_parallelism_) {
+        map_func = [map_func](IteratorContext* ctx, const string& prefix,
+                              std::vector<Tensor> args,
+                              std::vector<Tensor>* out_tensors,
+                              StatusCallback done) {
+          (*ctx->runner())(std::bind(map_func, ctx, prefix, std::move(args),
+                                     out_tensors, std::move(done)));
+        };
+      }
+    } else {
+      std::vector<bool> can_move = ComputeMoveVector(indices);
+      map_func = [raw_captured_func, indices, can_move](
+                     IteratorContext* ctx, const string& prefix,
+                     std::vector<Tensor> args, std::vector<Tensor>* out_tensors,
+                     StatusCallback done) {
+        const std::vector<Tensor>& captured_inputs =
+            raw_captured_func->captured_inputs();
+        size_t num_args = args.size();
+        for (size_t i = 0; i < indices.size(); ++i) {
+          if (indices[i] < num_args) {
+            if (can_move[i]) {
+              out_tensors->push_back(std::move(args[indices[i]]));
+            } else {
+              out_tensors->push_back(args[indices[i]]);
+            }
+          } else {
+            out_tensors->push_back(captured_inputs[indices[i] - num_args]);
+          }
+        }
+        done(Status::OK());
+      };
+    }
+
     *output = new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
                           output_shapes_, use_inter_op_parallelism_,
-                          std::move(captured_func));
+                          std::move(captured_func), std::move(map_func));
   }
 
  private:
@@ -69,7 +116,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
             bool use_inter_op_parallelism,
-            std::unique_ptr<CapturedFunction> captured_func)
+            std::unique_ptr<CapturedFunction> captured_func,
+            ParallelMapIteratorFunction map_func)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
@@ -77,7 +125,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
           output_types_(output_types),
           output_shapes_(output_shapes),
           use_inter_op_parallelism_(use_inter_op_parallelism),
-          captured_func_(std::move(captured_func)) {
+          captured_func_(std::move(captured_func)),
+          map_func_(std::move(map_func)) {
       input_->Ref();
     }
 
@@ -89,26 +138,9 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
         return captured_func_->Instantiate(ctx);
       };
 
-      const string& new_prefix = strings::StrCat(prefix, "::ParallelMap");
-      ParallelMapIteratorFunction map_func =
-          [this, new_prefix](IteratorContext* ctx,
-                             std::vector<Tensor> input_element,
-                             std::vector<Tensor>* result, StatusCallback done) {
-            captured_func_->RunAsync(ctx, std::move(input_element), result,
-                                     std::move(done), new_prefix);
-          };
-      if (!use_inter_op_parallelism_) {
-        map_func = [map_func](
-                       IteratorContext* ctx, std::vector<Tensor> input_element,
-                       std::vector<Tensor>* result, StatusCallback done) {
-          (*ctx->runner())(std::bind(map_func, ctx, std::move(input_element),
-                                     result, std::move(done)));
-        };
-      }
-
-      return NewParallelMapIterator({this, new_prefix}, input_,
-                                    std::move(init_func), std::move(map_func),
-                                    num_parallel_calls_);
+      return NewParallelMapIterator(
+          {this, strings::StrCat(prefix, "::ParallelMap")}, input_,
+          std::move(init_func), map_func_, num_parallel_calls_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -176,6 +208,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     const std::vector<PartialTensorShape> output_shapes_;
     const bool use_inter_op_parallelism_;
     const std::unique_ptr<CapturedFunction> captured_func_;
+    const ParallelMapIteratorFunction map_func_;
   };
 
   DataTypeVector output_types_;
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index 13bd4b6036..ebf41925c9 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -179,7 +180,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   void EnsureRunnerThreadStarted(IteratorContext* ctx)
       EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
     if (!runner_thread_) {
-      std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
+      auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
       runner_thread_.reset(ctx->env()->StartThread(
           {}, "runner_thread",
           std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy)));
@@ -208,15 +209,15 @@ class ParallelMapIterator : public DatasetBaseIterator {
       return;
     }
 
-    // Call `func_(input_element)`, store the result in `result->return_values`,
-    // and notify `result->notification` to unblock a consumer.
     auto done = [this, result](Status status) {
       result->status.Update(status);
       CallCompleted(result);
     };
 
-    map_func_(ctx.get(), std::move(input_element), &result->return_values,
-              std::move(done));
+    // Apply the map function on `input_element`, storing the result in
+    // `result->return_values`, and invoking `done` when finished.
+    map_func_(ctx.get(), prefix(), std::move(input_element),
+              &result->return_values, std::move(done));
   }
 
   Status ProcessResult(const std::shared_ptr<InvocationResult>& result,
@@ -349,9 +350,9 @@ std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBase* input_dataset,
     std::function<Status(IteratorContext*)> init_func,
     ParallelMapIteratorFunction map_func, int32 num_parallel_calls) {
-  return std::unique_ptr<IteratorBase>(
-      new ParallelMapIterator(params, input_dataset, std::move(init_func),
-                              std::move(map_func), num_parallel_calls));
+  return MakeUnique<ParallelMapIterator>(
+      params, input_dataset, std::move(init_func), std::move(map_func),
+      num_parallel_calls);
 }
 
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.h b/tensorflow/core/kernels/data/parallel_map_iterator.h
index dc26c5cf25..813f13c9e4 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.h
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.h
@@ -30,7 +30,7 @@ namespace data {
 // 3. A `std::vector<Tensor>*` to which the function will write the result.
 // 4. A `StatusCallback` that should be invoked when the function is complete.
 using ParallelMapIteratorFunction =
-    std::function<void(IteratorContext*, std::vector<Tensor>,
+    std::function<void(IteratorContext*, const string&, std::vector<Tensor>,
                        std::vector<Tensor>*, StatusCallback)>;
 
 // Returns a new iterator that applies `map_func` to the elements of
diff --git a/tensorflow/core/kernels/data/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/parse_example_dataset_op.cc
index 1d1a717062..7de5ea8860 100644
--- a/tensorflow/core/kernels/data/parse_example_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parse_example_dataset_op.cc
@@ -182,7 +182,7 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      auto map_fn = [this](IteratorContext* ctx,
+      auto map_fn = [this](IteratorContext* ctx, const string& prefix,
                            std::vector<Tensor> input_element,
                            std::vector<Tensor>* result, StatusCallback done) {
         (*ctx->runner())([this, ctx, input_element, result, done]() {
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
index afd0fc3abf..d444c4082e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
@@ -332,6 +332,37 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       for _ in range(10):
         self.assertAllEqual([element for _ in range(10)], sess.run(get_next))
 
+  @parameterized.named_parameters(
+      ("Identity", None, lambda x: x, None),
+      ("Replicate", None, lambda x: (x, x), None),
+      ("Swap", (None, None), lambda x, y: (y, x), None),
+      ("Project", (None, None), lambda x, y: x, None),
+  )
+  def testShortCircuit(self, structure, map_fn, num_parallel_calls):
+    dataset = self.structuredDataset(structure).repeat().apply(
+        batching.map_and_batch(map_fn, batch_size=10))
+    get_next = dataset.make_one_shot_iterator().get_next()
+
+    with self.cached_session() as sess:
+      if isinstance(structure, tuple):
+        expected = map_fn(
+            *sess.run(self.structuredElement(structure, shape=[10])))
+      else:
+        expected = map_fn(
+            sess.run(self.structuredElement(structure, shape=[10])))
+      self.assertAllEqual(expected, sess.run(get_next))
+
+  def testShortCircuitCapturedInput(self):
+    captured_t = array_ops.placeholder(dtypes.int64, shape=[])
+    dataset = self.structuredDataset(None).repeat().apply(
+        batching.map_and_batch(lambda x: captured_t, batch_size=10))
+    iterator = dataset.make_initializable_iterator()
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(iterator.initializer, feed_dict={captured_t: 42})
+      self.assertAllEqual([42] * 10, sess.run(get_next))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
index 6b7afafa5d..a0c6b37a6d 100644
--- a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
@@ -156,7 +156,7 @@ class FilterDatasetTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testReturnComponent(self):
+  def testShortCircuit(self):
     iterator = (
         dataset_ops.Dataset.zip(
             (dataset_ops.Dataset.range(10),
diff --git a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index 0c372ebb10..4683b1db91 100644
--- a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -622,7 +622,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op)
       for i in range(10):
         actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
+        self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
         self.assertSparseValuesEqual(actual, _sparse(i))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -649,7 +649,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op)
       for i in range(10):
         actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
+        self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
         self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -783,19 +783,72 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertTrue(all(tids[0] == tid for tid in tids))
 # pylint: enable=g-long-lambda
 
+  @parameterized.named_parameters(
+      ("SequentialIdentity", None, lambda x: x, None),
+      ("SequentialReplicate", None, lambda x: (x, x), None),
+      ("SequentialSwap", (None, None), lambda x, y: (y, x), None),
+      ("SequentialProject", (None, None), lambda x, y: x, None),
+      ("ParallelIdentity", None, lambda x: x, 10),
+      ("ParallelReplicate", None, lambda x: (x, x), 10),
+      ("ParallelSwap", (None, None), lambda x, y: (y, x), 10),
+      ("ParallelProject", (None, None), lambda x, y: x, 10),
+  )
+  def testShortCircuit(self, structure, map_fn, num_parallel_calls):
+    dataset = self.structuredDataset(structure).repeat().map(
+        map_fn, num_parallel_calls=num_parallel_calls)
+    get_next = dataset.make_one_shot_iterator().get_next()
+
+    with self.cached_session() as sess:
+      if isinstance(structure, tuple):
+        expected = map_fn(*sess.run(self.structuredElement(structure)))
+      else:
+        expected = map_fn(sess.run(self.structuredElement(structure)))
+      self.assertEqual(expected, sess.run(get_next))
+
+  @parameterized.named_parameters(
+      ("Sequential", None),
+      ("Parallel", 10),
+  )
+  def testShortCircuitCapturedInput(self, num_parallel_calls):
+    captured_t = array_ops.placeholder(dtypes.int64, shape=[])
+    dataset = self.structuredDataset(None).repeat().map(
+        lambda x: captured_t, num_parallel_calls=num_parallel_calls)
+    iterator = dataset.make_initializable_iterator()
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(iterator.initializer, feed_dict={captured_t: 42})
+      self.assertEqual(42, sess.run(get_next))
+
 
 class MapDatasetBenchmark(test.Benchmark):
 
   def benchmarkChainOfMaps(self):
     chain_lengths = [0, 1, 2, 5, 10, 20, 50]
     for chain_length in chain_lengths:
-      for use_inter_op_parallelism in [False, True]:
+      for mode in ["general", "single-threaded", "short-circuit"]:
+        if mode == "general":
+          map_fn = lambda x: x + 1
+          use_inter_op_parallelism = True
+          print_label = ""
+          benchmark_label = ""
+        if mode == "single-threaded":
+          map_fn = lambda x: x + 1
+          use_inter_op_parallelism = False
+          print_label = " (single threaded mode)"
+          benchmark_label = "_single_threaded"
+        if mode == "short-circuit":
+          map_fn = lambda x: x
+          use_inter_op_parallelism = True  # should not have any significance
+          print_label = " (short circuit mode)"
+          benchmark_label = "_short_circuit"
+
         with ops.Graph().as_default():
           dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
           for _ in range(chain_length):
             dataset = dataset_ops.MapDataset(
                 dataset,
-                lambda x: x,
+                map_fn,
                 use_inter_op_parallelism=use_inter_op_parallelism)
           iterator = dataset.make_one_shot_iterator()
           next_element = iterator.get_next()
@@ -813,25 +866,39 @@ class MapDatasetBenchmark(test.Benchmark):
 
             median_wall_time = np.median(deltas) / 100
             print("Map dataset chain length%s: %d Median wall time: %f" %
-                  (" (single threaded mode)" if not use_inter_op_parallelism
-                   else "", chain_length, median_wall_time))
+                  (print_label, chain_length, median_wall_time))
             self.report_benchmark(
                 iters=1000,
                 wall_time=median_wall_time,
                 name="benchmark_map_dataset_chain_latency_%d%s" %
-                (chain_length, "_single_threaded"
-                 if not use_inter_op_parallelism else ""))
+                (chain_length, benchmark_label))
 
   def benchmarkMapFanOut(self):
     fan_outs = [1, 2, 5, 10, 20, 50, 100]
     for fan_out in fan_outs:
-      for use_inter_op_parallelism in [False, True]:
+      for mode in ["general", "single-threaded", "short-circuit"]:
+        if mode == "general":
+          map_fn = lambda *xs: [x + 1 for x in xs]
+          use_inter_op_parallelism = True
+          print_label = ""
+          benchmark_label = ""
+        if mode == "single-threaded":
+          map_fn = lambda *xs: [x + 1 for x in xs]
+          use_inter_op_parallelism = False
+          print_label = " (single threaded mode)"
+          benchmark_label = "_single_threaded"
+        if mode == "short-circuit":
+          map_fn = lambda *xs: xs
+          use_inter_op_parallelism = True  # should not have any significance
+          print_label = " (short circuit mode)"
+          benchmark_label = "_short_circuit"
+
         with ops.Graph().as_default():
           dataset = dataset_ops.Dataset.from_tensors(
               tuple(0 for _ in range(fan_out))).repeat(None)
           dataset = dataset_ops.MapDataset(
               dataset,
-              lambda *xs: xs,
+              map_fn,
               use_inter_op_parallelism=use_inter_op_parallelism)
           iterator = dataset.make_one_shot_iterator()
           next_element = iterator.get_next()
@@ -849,14 +916,12 @@ class MapDatasetBenchmark(test.Benchmark):
 
             median_wall_time = np.median(deltas) / 100
             print("Map dataset fan out%s: %d Median wall time: %f" %
-                  (" (single threaded mode)" if not use_inter_op_parallelism
-                   else "", fan_out, median_wall_time))
+                  (print_label, fan_out, median_wall_time))
             self.report_benchmark(
                 iters=1000,
                 wall_time=median_wall_time,
-                name="benchmark_map_dataset_fan_out_%d%s" %
-                (fan_out, "_single_threaded"
-                 if not use_inter_op_parallelism else ""))
+                name="benchmark_map_dataset_fan_out_%d%s" % (fan_out,
+                                                             benchmark_label))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py
index b730e10949..b73a94e683 100644
--- a/tensorflow/python/data/kernel_tests/test_base.py
+++ b/tensorflow/python/data/kernel_tests/test_base.py
@@ -19,10 +19,13 @@ from __future__ import print_function
 
 import re
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -107,3 +110,29 @@ class DatasetTestBase(test.TestCase):
       with self.assertRaisesRegexp(exception_class,
                                    re.escape(expected_message)):
         self.evaluate(next2())
+
+  def structuredDataset(self, structure, shape=None, dtype=dtypes.int64):
+    """Returns a singleton dataset with the given structure."""
+    if shape is None:
+      shape = []
+    if structure is None:
+      return dataset_ops.Dataset.from_tensors(
+          array_ops.zeros(shape, dtype=dtype))
+    else:
+      return dataset_ops.Dataset.zip(
+          tuple([
+              self.structuredDataset(substructure, shape, dtype)
+              for substructure in structure
+          ]))
+
+  def structuredElement(self, structure, shape=None, dtype=dtypes.int64):
+    """Returns an element with the given structure."""
+    if shape is None:
+      shape = []
+    if structure is None:
+      return array_ops.zeros(shape, dtype=dtype)
+    else:
+      return tuple([
+          self.structuredElement(substructure, shape, dtype)
+          for substructure in structure
+      ])
-- 
GitLab


From a04cd08ee7a8c5245d76a59849e1f7e8ba8a3f52 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Mon, 8 Oct 2018 10:20:52 -0700
Subject: [PATCH 526/570] Allow TensorSpec objects as arguments to defun's
 get_concrete_function

Will be helpful for specifying serving signatures when exporting SavedModels

PiperOrigin-RevId: 216207284
---
 tensorflow/python/eager/function.py      | 24 +++++----------
 tensorflow/python/eager/function_test.py | 37 ++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index bafe07de2b..93168826b1 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -855,20 +855,12 @@ class Function(object):
     return ret
 
 
-def _get_defun_inputs_from_signature(signature):
-  """Maps a signature to graph-construction inputs."""
-  function_inputs = [
-      graph_placeholder(spec.dtype, spec.shape)
-      for spec in nest.flatten(signature)
-  ]
-  return nest.pack_sequence_as(signature, function_inputs)
-
-
 def _get_defun_inputs_from_args(args):
   """Maps python function args to graph-construction inputs."""
   function_inputs = [
       graph_placeholder(arg.dtype, arg.shape)
-      if isinstance(arg, ops.Tensor) else arg for arg in nest.flatten(args)
+      if isinstance(arg, (ops.Tensor, tensor_spec.TensorSpec))
+      else arg for arg in nest.flatten(args)
   ]
   return nest.pack_sequence_as(args, function_inputs)
 
@@ -912,12 +904,12 @@ def func_graph_from_py_func(name,
   with func_graph.as_default(), AutomaticControlDependencies() as a:
     variable_scope.get_variable_scope().set_use_resource(True)
 
-    if signature is None:
-      func_args = _get_defun_inputs_from_args(args)
-      func_kwargs = _get_defun_inputs_from_args(kwargs)
-    else:
-      func_args = _get_defun_inputs_from_signature(signature)
-      func_kwargs = {}
+    if signature is not None:
+      args = signature
+      kwargs = {}
+
+    func_args = _get_defun_inputs_from_args(args)
+    func_kwargs = _get_defun_inputs_from_args(kwargs)
 
     # Note: `nest.flatten` sorts by keys, as does `_deterministic_dict_values`.
     # Variables to help check whether mutation happens in calling the function
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index a2cfb4b476..57e545be69 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -172,6 +172,43 @@ class FunctionTest(test.TestCase):
     out = sq_op(t)
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
+  def testInputSpecGraphFunction(self):
+    matmul = function.defun(math_ops.matmul)
+
+    @function.defun
+    def sq(a):
+      return matmul(a, a)
+
+    sq_op = sq.get_concrete_function(
+        tensor_spec.TensorSpec((None, None), dtypes.float32))
+    self.assertEqual([None, None], sq_op.output_shapes.as_list())
+
+    t1 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    out1 = sq_op(t1)
+    self.assertAllEqual(out1, math_ops.matmul(t1, t1).numpy())
+
+    t2 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    out2 = sq_op(t2)
+    self.assertAllEqual(out2, math_ops.matmul(t2, t2).numpy())
+
+  def testNestedInputSpecGraphFunction(self):
+    matmul = function.defun(math_ops.matmul)
+
+    @function.defun
+    def sq(mats):
+      ((a, b),) = mats
+      return matmul(a, b)
+
+    sq_op = sq.get_concrete_function(
+        [(tensor_spec.TensorSpec((None, None), dtypes.float32),
+          tensor_spec.TensorSpec((None, None), dtypes.float32))])
+    self.assertEqual([None, None], sq_op.output_shapes.as_list())
+
+    t1 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    t2 = constant_op.constant([[1.4, 2.4], [3.4, 4.4]])
+    out = sq_op(t1, t2)  # Flattened structure for inputs to the graph function
+    self.assertAllEqual(out, math_ops.matmul(t1, t2).numpy())
+
   def testExecutingStatelessDefunConcurrently(self):
 
     @function.defun
-- 
GitLab


From 049d98c84ca7474459175914ca49c1fa3c11581d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 10:28:59 -0700
Subject: [PATCH 527/570] Wait for shared resources to initialize before
 initializing local resources. shared resources are very similar to global
 variables functionally and they are initialized at the same time but since
 workers are only waiting for global variables being initialized, there is a
 race condition that sometimes the shared resource is not ready.

PiperOrigin-RevId: 216208679
---
 tensorflow/python/training/monitored_session.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 82f0e3be52..a479f38165 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -195,8 +195,12 @@ class Scaffold(object):
           default_ready_op)
     if self._ready_for_local_init_op is None:
       def default_ready_for_local_init_op():
-        return variables.report_uninitialized_variables(
-            variables.global_variables())
+        return array_ops.concat([
+            variables.report_uninitialized_variables(
+                variables.global_variables()),
+            resources.report_uninitialized_resources(
+                resources.shared_resources())
+        ], 0)
       self._ready_for_local_init_op = Scaffold.get_or_default(
           'ready_for_local_init_op', ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
           default_ready_for_local_init_op)
-- 
GitLab


From 153decedefc8da1fbd0717f4223b4b053e7aa517 Mon Sep 17 00:00:00 2001
From: Karmel Allison <karmel@google.com>
Date: Mon, 8 Oct 2018 10:36:38 -0700
Subject: [PATCH 528/570] Add support for SequenceExamples to
 sequence_feature_columns

PiperOrigin-RevId: 216210141
---
 .../contrib/estimator/python/estimator/rnn.py |  54 +-
 tensorflow/contrib/feature_column/BUILD       |  21 +
 .../feature_column/sequence_feature_column.py |  72 +-
 ...equence_feature_column_integration_test.py | 280 ++++++
 .../sequence_feature_column_test.py           | 912 ++++++++++++------
 .../python/feature_column/feature_column.py   |  53 +-
 tensorflow/python/ops/parsing_ops.py          |  13 +-
 7 files changed, 1018 insertions(+), 387 deletions(-)
 create mode 100644 tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py

diff --git a/tensorflow/contrib/estimator/python/estimator/rnn.py b/tensorflow/contrib/estimator/python/estimator/rnn.py
index 98660bb731..c595f47395 100644
--- a/tensorflow/contrib/estimator/python/estimator/rnn.py
+++ b/tensorflow/contrib/estimator/python/estimator/rnn.py
@@ -30,7 +30,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core as core_layers
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import partitioned_variables
@@ -92,55 +91,6 @@ def _make_rnn_cell_fn(num_units, cell_type='basic_rnn'):
   return rnn_cell_fn
 
 
-def _concatenate_context_input(sequence_input, context_input):
-  """Replicates `context_input` across all timesteps of `sequence_input`.
-
-  Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
-  This value is appended to `sequence_input` on dimension 2 and the result is
-  returned.
-
-  Args:
-    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
-      padded_length, d0]`.
-    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
-
-  Returns:
-    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
-    d0 + d1]`.
-
-  Raises:
-    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
-      not have rank 2.
-  """
-  seq_rank_check = check_ops.assert_rank(
-      sequence_input,
-      3,
-      message='sequence_input must have rank 3',
-      data=[array_ops.shape(sequence_input)])
-  seq_type_check = check_ops.assert_type(
-      sequence_input,
-      dtypes.float32,
-      message='sequence_input must have dtype float32; got {}.'.format(
-          sequence_input.dtype))
-  ctx_rank_check = check_ops.assert_rank(
-      context_input,
-      2,
-      message='context_input must have rank 2',
-      data=[array_ops.shape(context_input)])
-  ctx_type_check = check_ops.assert_type(
-      context_input,
-      dtypes.float32,
-      message='context_input must have dtype float32; got {}.'.format(
-          context_input.dtype))
-  with ops.control_dependencies(
-      [seq_rank_check, seq_type_check, ctx_rank_check, ctx_type_check]):
-    padded_length = array_ops.shape(sequence_input)[1]
-    tiled_context_input = array_ops.tile(
-        array_ops.expand_dims(context_input, 1),
-        array_ops.concat([[1], [padded_length], [1]], 0))
-  return array_ops.concat([sequence_input, tiled_context_input], 2)
-
-
 def _select_last_activations(activations, sequence_lengths):
   """Selects the nth set of activations for each n in `sequence_length`.
 
@@ -222,8 +172,8 @@ def _rnn_logit_fn_builder(output_units, rnn_cell_fn, sequence_feature_columns,
         context_input = feature_column_lib.input_layer(
             features=features,
             feature_columns=context_feature_columns)
-        sequence_input = _concatenate_context_input(sequence_input,
-                                                    context_input)
+        sequence_input = seq_fc.concatenate_context_input(
+            context_input, sequence_input)
 
     cell = rnn_cell_fn(mode)
     # Ignore output state.
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index aab7d0c9e8..a926ffd598 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -27,6 +27,7 @@ py_library(
         "//tensorflow/python:check_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:tensor_shape",
@@ -46,9 +47,29 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//tensorflow/python/feature_column",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "sequence_feature_column_integration_test",
+    srcs = ["python/feature_column/sequence_feature_column_integration_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":sequence_feature_column",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/keras:layers",
     ],
 )
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
index 05bcdac2ca..dd6da35ed0 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -33,7 +33,6 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variable_scope
 
 # pylint: disable=protected-access
-# TODO(b/73827486): Support SequenceExample.
 
 
 def sequence_input_layer(
@@ -110,6 +109,7 @@ def sequence_input_layer(
     output_tensors = []
     sequence_lengths = []
     ordered_columns = []
+
     for column in sorted(feature_columns, key=lambda x: x.name):
       ordered_columns.append(column)
       with variable_scope.variable_scope(
@@ -121,17 +121,67 @@ def sequence_input_layer(
         # Flattens the final dimension to produce a 3D Tensor.
         num_elements = column._variable_shape.num_elements()
         shape = array_ops.shape(dense_tensor)
+        target_shape = [shape[0], shape[1], num_elements]
         output_tensors.append(
-            array_ops.reshape(
-                dense_tensor,
-                shape=array_ops.concat([shape[:2], [num_elements]], axis=0)))
+            array_ops.reshape(dense_tensor, shape=target_shape))
         sequence_lengths.append(sequence_length)
+
     fc._verify_static_batch_size_equality(output_tensors, ordered_columns)
     fc._verify_static_batch_size_equality(sequence_lengths, ordered_columns)
     sequence_length = _assert_all_equal_and_return(sequence_lengths)
+
     return array_ops.concat(output_tensors, -1), sequence_length
 
 
+def concatenate_context_input(context_input, sequence_input):
+  """Replicates `context_input` across all timesteps of `sequence_input`.
+
+  Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
+  This value is appended to `sequence_input` on dimension 2 and the result is
+  returned.
+
+  Args:
+    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
+    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
+      padded_length, d0]`.
+
+  Returns:
+    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
+    d0 + d1]`.
+
+  Raises:
+    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
+      not have rank 2.
+  """
+  seq_rank_check = check_ops.assert_rank(
+      sequence_input,
+      3,
+      message='sequence_input must have rank 3',
+      data=[array_ops.shape(sequence_input)])
+  seq_type_check = check_ops.assert_type(
+      sequence_input,
+      dtypes.float32,
+      message='sequence_input must have dtype float32; got {}.'.format(
+          sequence_input.dtype))
+  ctx_rank_check = check_ops.assert_rank(
+      context_input,
+      2,
+      message='context_input must have rank 2',
+      data=[array_ops.shape(context_input)])
+  ctx_type_check = check_ops.assert_type(
+      context_input,
+      dtypes.float32,
+      message='context_input must have dtype float32; got {}.'.format(
+          context_input.dtype))
+  with ops.control_dependencies(
+      [seq_rank_check, seq_type_check, ctx_rank_check, ctx_type_check]):
+    padded_length = array_ops.shape(sequence_input)[1]
+    tiled_context_input = array_ops.tile(
+        array_ops.expand_dims(context_input, 1),
+        array_ops.concat([[1], [padded_length], [1]], 0))
+  return array_ops.concat([sequence_input, tiled_context_input], 2)
+
+
 def sequence_categorical_column_with_identity(
     key, num_buckets, default_value=None):
   """Returns a feature column that represents sequences of integers.
@@ -453,9 +503,17 @@ class _SequenceNumericColumn(
         [array_ops.shape(dense_tensor)[:1], [-1], self._variable_shape],
         axis=0)
     dense_tensor = array_ops.reshape(dense_tensor, shape=dense_shape)
-    sequence_length = fc._sequence_length_from_sparse_tensor(
-        sp_tensor, num_elements=self._variable_shape.num_elements())
+
+    # Get the number of timesteps per example
+    # For the 2D case, the raw values are grouped according to num_elements;
+    # for the 3D case, the grouping happens in the third dimension, and
+    # sequence length is not affected.
+    num_elements = (self._variable_shape.num_elements()
+                    if sp_tensor.shape.ndims == 2 else 1)
+    seq_length = fc._sequence_length_from_sparse_tensor(
+        sp_tensor, num_elements=num_elements)
+
     return fc._SequenceDenseColumn.TensorSequenceLengthPair(
-        dense_tensor=dense_tensor, sequence_length=sequence_length)
+        dense_tensor=dense_tensor, sequence_length=seq_length)
 
 # pylint: enable=protected-access
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py
new file mode 100644
index 0000000000..d8ca363627
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py
@@ -0,0 +1,280 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Integration test for sequence feature columns with SequenceExamples."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import string
+import tempfile
+
+from google.protobuf import text_format
+
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.keras.layers import recurrent
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class SequenceFeatureColumnIntegrationTest(test.TestCase):
+
+  def _make_sequence_example(self):
+    example = example_pb2.SequenceExample()
+    example.context.feature['int_ctx'].int64_list.value.extend([5])
+    example.context.feature['float_ctx'].float_list.value.extend([123.6])
+    for val in range(0, 10, 2):
+      feat = feature_pb2.Feature()
+      feat.int64_list.value.extend([val] * val)
+      example.feature_lists.feature_list['int_list'].feature.extend([feat])
+    for val in range(1, 11, 2):
+      feat = feature_pb2.Feature()
+      feat.bytes_list.value.extend([compat.as_bytes(str(val))] * val)
+      example.feature_lists.feature_list['str_list'].feature.extend([feat])
+
+    return example
+
+  def _build_feature_columns(self):
+    col = fc.categorical_column_with_identity(
+        'int_ctx', num_buckets=100)
+    ctx_cols = [
+        fc.embedding_column(col, dimension=10),
+        fc.numeric_column('float_ctx')]
+
+    identity_col = sfc.sequence_categorical_column_with_identity(
+        'int_list', num_buckets=10)
+    bucket_col = sfc.sequence_categorical_column_with_hash_bucket(
+        'bytes_list', hash_bucket_size=100)
+    seq_cols = [
+        fc.embedding_column(identity_col, dimension=10),
+        fc.embedding_column(bucket_col, dimension=20)]
+
+    return ctx_cols, seq_cols
+
+  def test_sequence_example_into_input_layer(self):
+    examples = [_make_sequence_example().SerializeToString()] * 100
+    ctx_cols, seq_cols = self._build_feature_columns()
+
+    def _parse_example(example):
+      ctx, seq = parsing_ops.parse_single_sequence_example(
+          example,
+          context_features=fc.make_parse_example_spec(ctx_cols),
+          sequence_features=fc.make_parse_example_spec(seq_cols))
+      ctx.update(seq)
+      return ctx
+
+    ds = dataset_ops.Dataset.from_tensor_slices(examples)
+    ds = ds.map(_parse_example)
+    ds = ds.batch(20)
+
+    # Test on a single batch
+    features = ds.make_one_shot_iterator().get_next()
+
+    # Tile the context features across the sequence features
+    seq_layer, _ = sfc.sequence_input_layer(features, seq_cols)
+    ctx_layer = fc.input_layer(features, ctx_cols)
+    input_layer = sfc.concatenate_context_input(ctx_layer, seq_layer)
+
+    rnn_layer = recurrent.RNN(recurrent.SimpleRNNCell(10))
+    output = rnn_layer(input_layer)
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      features_r = sess.run(features)
+      self.assertAllEqual(features_r['int_list'].dense_shape, [20, 3, 6])
+
+      output_r = sess.run(output)
+      self.assertAllEqual(output_r.shape, [20, 10])
+
+
+class SequenceExampleParsingTest(test.TestCase):
+
+  def test_seq_ex_in_sequence_categorical_column_with_identity(self):
+    self._test_parsed_sequence_example(
+        'int_list', sfc.sequence_categorical_column_with_identity,
+        10, [3, 6], [2, 4, 6])
+
+  def test_seq_ex_in_sequence_categorical_column_with_hash_bucket(self):
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_hash_bucket,
+        10, [3, 4], [compat.as_bytes(x) for x in 'acg'])
+
+  def test_seq_ex_in_sequence_categorical_column_with_vocabulary_list(self):
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_vocabulary_list,
+        list(string.ascii_lowercase), [3, 4],
+        [compat.as_bytes(x) for x in 'acg'])
+
+  def test_seq_ex_in_sequence_categorical_column_with_vocabulary_file(self):
+    _, fname = tempfile.mkstemp()
+    with open(fname, 'w') as f:
+      f.write(string.ascii_lowercase)
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_vocabulary_file,
+        fname, [3, 4], [compat.as_bytes(x) for x in 'acg'])
+
+  def _test_parsed_sequence_example(
+      self, col_name, col_fn, col_arg, shape, values):
+    """Helper function to check that each FeatureColumn parses correctly.
+
+    Args:
+      col_name: string, name to give to the feature column. Should match
+        the name that the column will parse out of the features dict.
+      col_fn: function used to create the feature column. For example,
+        sequence_numeric_column.
+      col_arg: second arg that the target feature column is expecting.
+      shape: the expected dense_shape of the feature after parsing into
+        a SparseTensor.
+      values: the expected values at index [0, 2, 6] of the feature
+        after parsing into a SparseTensor.
+    """
+    example = _make_sequence_example()
+    columns = [
+        fc.categorical_column_with_identity('int_ctx', num_buckets=100),
+        fc.numeric_column('float_ctx'),
+        col_fn(col_name, col_arg)
+    ]
+    context, seq_features = parsing_ops.parse_single_sequence_example(
+        example.SerializeToString(),
+        context_features=fc.make_parse_example_spec(columns[:2]),
+        sequence_features=fc.make_parse_example_spec(columns[2:]))
+
+    with self.cached_session() as sess:
+      ctx_result, seq_result = sess.run([context, seq_features])
+      self.assertEqual(list(seq_result[col_name].dense_shape), shape)
+      self.assertEqual(
+          list(seq_result[col_name].values[[0, 2, 6]]), values)
+      self.assertEqual(list(ctx_result['int_ctx'].dense_shape), [1])
+      self.assertEqual(ctx_result['int_ctx'].values[0], 5)
+      self.assertEqual(list(ctx_result['float_ctx'].shape), [1])
+      self.assertAlmostEqual(ctx_result['float_ctx'][0], 123.6, places=1)
+
+
+_SEQ_EX_PROTO = """
+context {
+  feature {
+    key: "float_ctx"
+    value {
+      float_list {
+        value: 123.6
+      }
+    }
+  }
+  feature {
+    key: "int_ctx"
+    value {
+      int64_list {
+        value: 5
+      }
+    }
+  }
+}
+feature_lists {
+  feature_list {
+    key: "bytes_list"
+    value {
+      feature {
+        bytes_list {
+          value: "a"
+        }
+      }
+      feature {
+        bytes_list {
+          value: "b"
+          value: "c"
+        }
+      }
+      feature {
+        bytes_list {
+          value: "d"
+          value: "e"
+          value: "f"
+          value: "g"
+        }
+      }
+    }
+  }
+  feature_list {
+    key: "float_list"
+    value {
+      feature {
+        float_list {
+          value: 1.0
+        }
+      }
+      feature {
+        float_list {
+          value: 3.0
+          value: 3.0
+          value: 3.0
+        }
+      }
+      feature {
+        float_list {
+          value: 5.0
+          value: 5.0
+          value: 5.0
+          value: 5.0
+          value: 5.0
+        }
+      }
+    }
+  }
+  feature_list {
+    key: "int_list"
+    value {
+      feature {
+        int64_list {
+          value: 2
+          value: 2
+        }
+      }
+      feature {
+        int64_list {
+          value: 4
+          value: 4
+          value: 4
+          value: 4
+        }
+      }
+      feature {
+        int64_list {
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+        }
+      }
+    }
+  }
+}
+"""
+
+
+def _make_sequence_example():
+  example = example_pb2.SequenceExample()
+  return text_format.Parse(_SEQ_EX_PROTO, example)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
index 45d7b74046..929e83523a 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc
@@ -28,28 +29,61 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import monitored_session
 
 
-class SequenceInputLayerTest(test.TestCase):
+class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_a': sparse_tensor.SparseTensorValue(
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           indices=((0, 0), (1, 0), (1, 1)),
+           values=(2, 0, 1),
+           dense_shape=(2, 2)),
+       'sparse_input_b': sparse_tensor.SparseTensorValue(
+           # example 0, ids [1]
+           # example 1, ids [2, 0]
+           indices=((0, 0), (1, 0), (1, 1)),
+           values=(1, 2, 0),
+           dense_shape=(2, 2)),
+       'expected_input_layer': [
+           # example 0, ids_a [2], ids_b [1]
+           [[5., 6., 14., 15., 16.], [0., 0., 0., 0., 0.]],
+           # example 1, ids_a [0, 1], ids_b [2, 0]
+           [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]],],
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'sparse_input_a': sparse_tensor.SparseTensorValue(
+           # feature 0, ids [[2], [0, 1]]
+           # feature 1, ids [[0, 0], [1]]
+           indices=(
+               (0, 0, 0), (0, 1, 0), (0, 1, 1),
+               (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           values=(2, 0, 1, 0, 0, 1),
+           dense_shape=(2, 2, 2)),
+       'sparse_input_b': sparse_tensor.SparseTensorValue(
+           # feature 0, ids [[1, 1], [1]]
+           # feature 1, ids [[2], [0]]
+           indices=((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           values=(1, 1, 1, 2, 0),
+           dense_shape=(2, 2, 2)),
+       'expected_input_layer': [
+           # feature 0, [a: 2, -, b: 1, 1], [a: 0, 1, b: 1, -]
+           [[5., 6., 14., 15., 16.], [2., 3., 14., 15., 16.]],
+           # feature 1, [a: 0, 0, b: 2, -], [a: 1, -, b: 0, -]
+           [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]]],
+       'expected_sequence_length': [2, 2]},
+      )
+  def test_embedding_column(
+      self, sparse_input_a, sparse_input_b, expected_input_layer,
+      expected_sequence_length):
 
-  def test_embedding_column(self):
     vocabulary_size = 3
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [1]
-        # example 1, ids [2, 0]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2))
-
     embedding_dimension_a = 2
     embedding_values_a = (
         (1., 2.),  # id 0
@@ -70,14 +104,6 @@ class SequenceInputLayerTest(test.TestCase):
         return embedding_values
       return _initializer
 
-    expected_input_layer = [
-        # example 0, ids_a [2], ids_b [1]
-        [[5., 6., 14., 15., 16.], [0., 0., 0., 0., 0.]],
-        # example 1, ids_a [0, 1], ids_b [2, 0]
-        [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]],
-    ]
-    expected_sequence_length = [1, 2]
-
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column_a = fc.embedding_column(
@@ -233,29 +259,53 @@ class SequenceInputLayerTest(test.TestCase):
           },
           feature_columns=shared_embedding_columns)
 
-  def test_indicator_column(self):
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_a': sparse_tensor.SparseTensorValue(
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           indices=((0, 0), (1, 0), (1, 1)),
+           values=(2, 0, 1),
+           dense_shape=(2, 2)),
+       'sparse_input_b': sparse_tensor.SparseTensorValue(
+           # example 0, ids [1]
+           # example 1, ids [1, 0]
+           indices=((0, 0), (1, 0), (1, 1)),
+           values=(1, 1, 0),
+           dense_shape=(2, 2)),
+       'expected_input_layer': [
+           # example 0, ids_a [2], ids_b [1]
+           [[0., 0., 1., 0., 1.], [0., 0., 0., 0., 0.]],
+           # example 1, ids_a [0, 1], ids_b [1, 0]
+           [[1., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]],
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'sparse_input_a': sparse_tensor.SparseTensorValue(
+           # feature 0, ids [[2], [0, 1]]
+           # feature 1, ids [[0, 0], [1]]
+           indices=(
+               (0, 0, 0), (0, 1, 0), (0, 1, 1),
+               (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           values=(2, 0, 1, 0, 0, 1),
+           dense_shape=(2, 2, 2)),
+       'sparse_input_b': sparse_tensor.SparseTensorValue(
+           # feature 0, ids [[1, 1], [1]]
+           # feature 1, ids [[1], [0]]
+           indices=((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           values=(1, 1, 1, 1, 0),
+           dense_shape=(2, 2, 2)),
+       'expected_input_layer': [
+           # feature 0, [a: 2, -, b: 1, 1], [a: 0, 1, b: 1, -]
+           [[0., 0., 1., 0., 2.], [1., 1., 0., 0., 1.]],
+           # feature 1, [a: 0, 0, b: 1, -], [a: 1, -, b: 0, -]
+           [[2., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]],
+       'expected_sequence_length': [2, 2]},
+      )
+  def test_indicator_column(
+      self, sparse_input_a, sparse_input_b, expected_input_layer,
+      expected_sequence_length):
     vocabulary_size_a = 3
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
     vocabulary_size_b = 2
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [1]
-        # example 1, ids [1, 0]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 1, 0),
-        dense_shape=(2, 2))
-
-    expected_input_layer = [
-        # example 0, ids_a [2], ids_b [1]
-        [[0., 0., 1., 0., 1.], [0., 0., 0., 0., 0.]],
-        # example 1, ids_a [0, 1], ids_b [1, 0]
-        [[1., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]],
-    ]
-    expected_sequence_length = [1, 2]
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size_a)
@@ -298,18 +348,32 @@ class SequenceInputLayerTest(test.TestCase):
           features={'aaa': sparse_input},
           feature_columns=[indicator_column_a])
 
-  def test_numeric_column(self):
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0.], [1]]
-        # example 1, [[10.]]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0., 1., 10.),
-        dense_shape=(2, 2))
-    expected_input_layer = [
-        [[0.], [1.]],
-        [[10.], [0.]],
-    ]
-    expected_sequence_length = [2, 1]
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input': sparse_tensor.SparseTensorValue(
+           # example 0, values [0., 1]
+           # example 1, [10.]
+           indices=((0, 0), (0, 1), (1, 0)),
+           values=(0., 1., 10.),
+           dense_shape=(2, 2)),
+       'expected_input_layer': [
+           [[0.], [1.]],
+           [[10.], [0.]]],
+       'expected_sequence_length': [2, 1]},
+      {'testcase_name': '3D',
+       'sparse_input': sparse_tensor.SparseTensorValue(
+           # feature 0, ids [[20, 3], [5]]
+           # feature 1, ids [[3], [8]]
+           indices=((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           values=(20, 3, 5., 3., 8.),
+           dense_shape=(2, 2, 2)),
+       'expected_input_layer': [
+           [[20.], [3.], [5.], [0.]],
+           [[3.], [0.], [8.], [0.]]],
+       'expected_sequence_length': [2, 2]},
+      )
+  def test_numeric_column(
+      self, sparse_input, expected_input_layer, expected_sequence_length):
     numeric_column = sfc.sequence_numeric_column('aaa')
 
     input_layer, sequence_length = sfc.sequence_input_layer(
@@ -321,21 +385,38 @@ class SequenceInputLayerTest(test.TestCase):
       self.assertAllEqual(
           expected_sequence_length, sequence_length.eval(session=sess))
 
-  def test_numeric_column_multi_dim(self):
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input': sparse_tensor.SparseTensorValue(
+           # example 0, values [0., 1.,  2., 3., 4., 5., 6., 7.]
+           # example 1, [10., 11., 12., 13.]
+           indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
+                    (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)),
+           values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           dense_shape=(2, 8)),
+       'expected_input_layer': [
+           # The output of numeric_column._get_dense_tensor should be flattened.
+           [[0., 1., 2., 3.], [4., 5., 6., 7.]],
+           [[10., 11., 12., 13.], [0., 0., 0., 0.]]],
+       'expected_sequence_length': [2, 1]},
+      {'testcase_name': '3D',
+       'sparse_input': sparse_tensor.SparseTensorValue(
+           # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]]
+           # example 1, [[10., 11., 12., 13.], []]
+           indices=((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3),
+                    (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 3),
+                    (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)),
+           values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           dense_shape=(2, 2, 4)),
+       'expected_input_layer': [
+           # The output of numeric_column._get_dense_tensor should be flattened.
+           [[0., 1., 2., 3.], [4., 5., 6., 7.]],
+           [[10., 11., 12., 13.], [0., 0., 0., 0.]]],
+       'expected_sequence_length': [2, 1]},
+      )
+  def test_numeric_column_multi_dim(
+      self, sparse_input, expected_input_layer, expected_sequence_length):
     """Tests sequence_input_layer for multi-dimensional numeric_column."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
-        # example 1, [[[10., 11.],  [12., 13.]]]
-        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
-                 (1, 0), (1, 1), (1, 2), (1, 3)),
-        values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
-        dense_shape=(2, 8))
-    # The output of numeric_column._get_dense_tensor should be flattened.
-    expected_input_layer = [
-        [[0., 1., 2., 3.], [4., 5., 6., 7.]],
-        [[10., 11., 12., 13.], [0., 0., 0., 0.]],
-    ]
-    expected_sequence_length = [2, 1]
     numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
 
     input_layer, sequence_length = sfc.sequence_input_layer(
@@ -377,6 +458,134 @@ class SequenceInputLayerTest(test.TestCase):
           r'\[y \(sequence_input_layer/bbb/sequence_length:0\) = \] \[1 1\]'):
         sess.run(sequence_length)
 
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input': sparse_tensor.SparseTensorValue(
+           # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
+           # example 1, [[[10., 11.],  [12., 13.]]]
+           indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
+                    (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)),
+           values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           dense_shape=(2, 8)),
+       'expected_shape': [2, 2, 4]},
+      {'testcase_name': '3D',
+       'sparse_input': sparse_tensor.SparseTensorValue(
+           # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]]
+           # example 1, [[10., 11., 12., 13.], []]
+           indices=((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3),
+                    (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 2),
+                    (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)),
+           values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           dense_shape=(2, 2, 4)),
+       'expected_shape': [2, 2, 4]},
+      )
+  def test_static_shape_from_tensors_numeric(
+      self, sparse_input, expected_shape):
+    """Tests that we return a known static shape when we have one."""
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
+
+    input_layer, _ = sfc.sequence_input_layer(
+        features={'aaa': sparse_input},
+        feature_columns=[numeric_column])
+    shape = input_layer.get_shape()
+    self.assertEqual(shape, expected_shape)
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input': sparse_tensor.SparseTensorValue(
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           # example 2, ids []
+           # example 3, ids [1]
+           indices=((0, 0), (1, 0), (1, 1), (3, 0)),
+           values=(2, 0, 1, 1),
+           dense_shape=(4, 2)),
+       'expected_shape': [4, 2, 3]},
+      {'testcase_name': '3D',
+       'sparse_input': sparse_tensor.SparseTensorValue(
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           # example 2, ids []
+           # example 3, ids [[1], [0, 2]]
+           indices=((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0),
+                    (3, 0, 0), (3, 1, 0), (3, 1, 1)),
+           values=(2, 0, 1, 2, 1, 0, 2),
+           dense_shape=(4, 2, 2)),
+       'expected_shape': [4, 2, 3]}
+      )
+  def test_static_shape_from_tensors_indicator(
+      self, sparse_input, expected_shape):
+    """Tests that we return a known static shape when we have one."""
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    indicator_column = fc.indicator_column(categorical_column)
+
+    input_layer, _ = sfc.sequence_input_layer(
+        features={'aaa': sparse_input}, feature_columns=[indicator_column])
+    shape = input_layer.get_shape()
+    self.assertEqual(shape, expected_shape)
+
+
+class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
+  """Tests the utility fn concatenate_context_input."""
+
+  def test_concatenate_context_input(self):
+    seq_input = ops.convert_to_tensor(np.arange(12).reshape(2, 3, 2))
+    context_input = ops.convert_to_tensor(np.arange(10).reshape(2, 5))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    input_layer = sfc.concatenate_context_input(context_input, seq_input)
+
+    expected = np.array([
+        [[0, 1, 0, 1, 2, 3, 4], [2, 3, 0, 1, 2, 3, 4], [4, 5, 0, 1, 2, 3, 4]],
+        [[6, 7, 5, 6, 7, 8, 9], [8, 9, 5, 6, 7, 8, 9], [10, 11, 5, 6, 7, 8, 9]]
+    ], dtype=np.float32)
+    with monitored_session.MonitoredSession() as sess:
+      output = sess.run(input_layer)
+      self.assertAllEqual(expected, output)
+
+  @parameterized.named_parameters(
+      {'testcase_name': 'rank_lt_3',
+       'seq_input': ops.convert_to_tensor(np.arange(100).reshape(10, 10))},
+      {'testcase_name': 'rank_gt_3',
+       'seq_input': ops.convert_to_tensor(np.arange(100).reshape(5, 5, 2, 2))}
+      )
+  def test_sequence_input_throws_error(self, seq_input):
+    context_input = ops.convert_to_tensor(np.arange(100).reshape(10, 10))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'sequence_input must have rank 3'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+  @parameterized.named_parameters(
+      {'testcase_name': 'rank_lt_2',
+       'context_input': ops.convert_to_tensor(np.arange(100))},
+      {'testcase_name': 'rank_gt_2',
+       'context_input': ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))}
+      )
+  def test_context_input_throws_error(self, context_input):
+    seq_input = ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'context_input must have rank 2'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+  def test_integer_seq_input_throws_error(self):
+    seq_input = ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))
+    context_input = ops.convert_to_tensor(np.arange(100).reshape(10, 10))
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(
+        TypeError, 'sequence_input must have dtype float32'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+  def test_integer_context_input_throws_error(self):
+    seq_input = ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))
+    context_input = ops.convert_to_tensor(np.arange(100).reshape(10, 10))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(
+        TypeError, 'context_input must have dtype float32'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
 
 class InputLayerTest(test.TestCase):
   """Tests input_layer with sequence feature columns."""
@@ -443,75 +652,79 @@ def _assert_sparse_tensor_indices_shape(test_case, expected, actual):
   test_case.assertAllEqual(expected.dense_shape, actual.dense_shape)
 
 
-class SequenceCategoricalColumnWithIdentityTest(test.TestCase):
-
-  def test_get_sparse_tensors(self):
-    column = sfc.sequence_categorical_column_with_identity(
-        'aaa', num_buckets=3)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2))
-    expected_sparse_ids = sparse_tensor.SparseTensorValue(
-        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
-        values=np.array((1, 2, 0), dtype=np.int64),
-        dense_shape=(2, 2, 1))
+class SequenceCategoricalColumnWithIdentityTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           indices=((0, 0), (1, 0), (1, 1)),
+           values=(1, 2, 0),
+           dense_shape=(2, 2)),
+       'expected': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           values=np.array((1, 2, 0), dtype=np.int64),
+           dense_shape=(2, 2, 1))},
+      {'testcase_name': '3D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           values=(6, 7, 8),
+           dense_shape=(2, 2, 2)),
+       'expected': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           values=(6, 7, 8),
+           dense_shape=(2, 2, 2))}
+      )
+  def test_get_sparse_tensors(self, inputs, expected):
+    column = sfc.sequence_categorical_column_with_identity('aaa', num_buckets=9)
 
     id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
       _assert_sparse_tensor_value(
-          self,
-          expected_sparse_ids,
-          id_weight_pair.id_tensor.eval(session=sess))
-
-  def test_get_sparse_tensors_inputs3d(self):
-    """Tests _get_sparse_tensors when the input is already 3D Tensor."""
-    column = sfc.sequence_categorical_column_with_identity(
-        'aaa', num_buckets=3)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2, 1))
-
-    with self.assertRaisesRegexp(
-        errors.InvalidArgumentError,
-        r'Column aaa expected ID tensor of rank 2\.\s*'
-        r'id_tensor shape:\s*\[2 2 1\]'):
-      id_weight_pair = column._get_sparse_tensors(
-          _LazyBuilder({'aaa': inputs}))
-      with monitored_session.MonitoredSession() as sess:
-        id_weight_pair.id_tensor.eval(session=sess)
-
-
-class SequenceCategoricalColumnWithHashBucketTest(test.TestCase):
-
-  def test_get_sparse_tensors(self):
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceCategoricalColumnWithHashBucketTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           indices=((0, 0), (1, 0), (1, 1)),
+           values=('omar', 'stringer', 'marlo'),
+           dense_shape=(2, 2)),
+       'expected': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           # Ignored to avoid hash dependence in test.
+           values=np.array((0, 0, 0), dtype=np.int64),
+           dense_shape=(2, 2, 1))},
+      {'testcase_name': '3D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           values=('omar', 'stringer', 'marlo'),
+           dense_shape=(2, 2, 2)),
+       'expected': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           # Ignored to avoid hash dependence in test.
+           values=np.array((0, 0, 0), dtype=np.int64),
+           dense_shape=(2, 2, 2))}
+      )
+  def test_get_sparse_tensors(self, inputs, expected):
     column = sfc.sequence_categorical_column_with_hash_bucket(
         'aaa', hash_bucket_size=10)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=('omar', 'stringer', 'marlo'),
-        dense_shape=(2, 2))
-
-    expected_sparse_ids = sparse_tensor.SparseTensorValue(
-        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
-        # Ignored to avoid hash dependence in test.
-        values=np.array((0, 0, 0), dtype=np.int64),
-        dense_shape=(2, 2, 1))
 
     id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
       _assert_sparse_tensor_indices_shape(
-          self,
-          expected_sparse_ids,
-          id_weight_pair.id_tensor.eval(session=sess))
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
 
 
-class SequenceCategoricalColumnWithVocabularyFileTest(test.TestCase):
+class SequenceCategoricalColumnWithVocabularyFileTest(
+    test.TestCase, parameterized.TestCase):
 
   def _write_vocab(self, vocab_strings, file_name):
     vocab_file = os.path.join(self.get_temp_dir(), file_name)
@@ -527,68 +740,120 @@ class SequenceCategoricalColumnWithVocabularyFileTest(test.TestCase):
                                                         'wire_vocabulary.txt')
     self._wire_vocabulary_size = 3
 
-  def test_get_sparse_tensors(self):
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           indices=((0, 0), (1, 0), (1, 1)),
+           values=('marlo', 'skywalker', 'omar'),
+           dense_shape=(2, 2)),
+       'expected': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           values=np.array((2, -1, 0), dtype=np.int64),
+           dense_shape=(2, 2, 1))},
+      {'testcase_name': '3D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           values=('omar', 'skywalker', 'marlo'),
+           dense_shape=(2, 2, 2)),
+       'expected': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           values=np.array((0, -1, 2), dtype=np.int64),
+           dense_shape=(2, 2, 2))}
+      )
+  def test_get_sparse_tensors(self, inputs, expected):
     column = sfc.sequence_categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=('marlo', 'skywalker', 'omar'),
-        dense_shape=(2, 2))
-    expected_sparse_ids = sparse_tensor.SparseTensorValue(
-        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
-        values=np.array((2, -1, 0), dtype=np.int64),
-        dense_shape=(2, 2, 1))
 
     id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
       _assert_sparse_tensor_value(
-          self,
-          expected_sparse_ids,
-          id_weight_pair.id_tensor.eval(session=sess))
-
-
-class SequenceCategoricalColumnWithVocabularyListTest(test.TestCase):
-
-  def test_get_sparse_tensors(self):
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceCategoricalColumnWithVocabularyListTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           indices=((0, 0), (1, 0), (1, 1)),
+           values=('marlo', 'skywalker', 'omar'),
+           dense_shape=(2, 2)),
+       'expected': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           values=np.array((2, -1, 0), dtype=np.int64),
+           dense_shape=(2, 2, 1))},
+      {'testcase_name': '3D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           values=('omar', 'skywalker', 'marlo'),
+           dense_shape=(2, 2, 2)),
+       'expected': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           values=np.array((0, -1, 2), dtype=np.int64),
+           dense_shape=(2, 2, 2))}
+      )
+  def test_get_sparse_tensors(self, inputs, expected):
     column = sfc.sequence_categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'))
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=('marlo', 'skywalker', 'omar'),
-        dense_shape=(2, 2))
-    expected_sparse_ids = sparse_tensor.SparseTensorValue(
-        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
-        values=np.array((2, -1, 0), dtype=np.int64),
-        dense_shape=(2, 2, 1))
 
     id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
       _assert_sparse_tensor_value(
-          self,
-          expected_sparse_ids,
-          id_weight_pair.id_tensor.eval(session=sess))
-
-
-class SequenceEmbeddingColumnTest(test.TestCase):
-
-  def test_get_sequence_dense_tensor(self):
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceEmbeddingColumnTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           # example 2, ids []
+           # example 3, ids [1]
+           indices=((0, 0), (1, 0), (1, 1), (3, 0)),
+           values=(2, 0, 1, 1),
+           dense_shape=(4, 2)),
+       'expected': [
+           # example 0, ids [2]
+           [[7., 11.], [0., 0.]],
+           # example 1, ids [0, 1]
+           [[1., 2.], [3., 5.]],
+           # example 2, ids []
+           [[0., 0.], [0., 0.]],
+           # example 3, ids [1]
+           [[3., 5.], [0., 0.]]]},
+      {'testcase_name': '3D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           # example 2, ids []
+           # example 3, ids [[1], [0, 2]]
+           indices=((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0),
+                    (3, 0, 0), (3, 1, 0), (3, 1, 1)),
+           values=(2, 0, 1, 2, 1, 0, 2),
+           dense_shape=(4, 2, 2)),
+       'expected': [
+           # example 0, ids [[2]]
+           [[7., 11.], [0., 0.]],
+           # example 1, ids [[0, 1], [2]]
+           [[2, 3.5], [7., 11.]],
+           # example 2, ids []
+           [[0., 0.], [0., 0.]],
+           # example 3, ids [[1], [0, 2]]
+           [[3., 5.], [4., 6.5]]]}
+      )
+  def test_get_sequence_dense_tensor(self, inputs, expected):
     vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(4, 2))
-
     embedding_dimension = 2
     embedding_values = (
         (1., 2.),  # id 0
@@ -601,17 +866,6 @@ class SequenceEmbeddingColumnTest(test.TestCase):
       self.assertIsNone(partition_info)
       return embedding_values
 
-    expected_lookups = [
-        # example 0, ids [2]
-        [[7., 11.], [0., 0.]],
-        # example 1, ids [0, 1]
-        [[1., 2.], [3., 5.]],
-        # example 2, ids []
-        [[0., 0.], [0., 0.]],
-        # example 3, ids [1]
-        [[3., 5.], [0., 0.]],
-    ]
-
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
@@ -619,24 +873,35 @@ class SequenceEmbeddingColumnTest(test.TestCase):
         initializer=_initializer)
 
     embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+        _LazyBuilder({'aaa': inputs}))
 
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(
         ('embedding_weights:0',), tuple([v.name for v in global_vars]))
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval(session=sess))
-
-  def test_sequence_length(self):
+      self.assertAllEqual(expected, embedding_lookup.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           indices=((0, 0), (1, 0), (1, 1)),
+           values=(2, 0, 1),
+           dense_shape=(2, 2)),
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           indices=((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           values=(2, 0, 1, 2),
+           dense_shape=(2, 2, 2)),
+       'expected_sequence_length': [1, 2]}
+      )
+  def test_sequence_length(self, inputs, expected_sequence_length):
     vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    expected_sequence_length = [1, 2]
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
@@ -644,7 +909,7 @@ class SequenceEmbeddingColumnTest(test.TestCase):
         categorical_column, dimension=2)
 
     _, sequence_length = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+        _LazyBuilder({'aaa': inputs}))
 
     with monitored_session.MonitoredSession() as sess:
       sequence_length = sess.run(sequence_length)
@@ -855,56 +1120,87 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
           expected_sequence_length_b, sequence_length_b.eval(session=sess))
 
 
-class SequenceIndicatorColumnTest(test.TestCase):
-
-  def test_get_sequence_dense_tensor(self):
+class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           # example 2, ids []
+           # example 3, ids [1]
+           indices=((0, 0), (1, 0), (1, 1), (3, 0)),
+           values=(2, 0, 1, 1),
+           dense_shape=(4, 2)),
+       'expected': [
+           # example 0, ids [2]
+           [[0., 0., 1.], [0., 0., 0.]],
+           # example 1, ids [0, 1]
+           [[1., 0., 0.], [0., 1., 0.]],
+           # example 2, ids []
+           [[0., 0., 0.], [0., 0., 0.]],
+           # example 3, ids [1]
+           [[0., 1., 0.], [0., 0., 0.]]]},
+      {'testcase_name': '3D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           # example 2, ids []
+           # example 3, ids [[1], [2, 2]]
+           indices=((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0),
+                    (3, 0, 0), (3, 1, 0), (3, 1, 1)),
+           values=(2, 0, 1, 2, 1, 2, 2),
+           dense_shape=(4, 2, 2)),
+       'expected': [
+           # example 0, ids [[2]]
+           [[0., 0., 1.], [0., 0., 0.]],
+           # example 1, ids [[0, 1], [2]]
+           [[1., 1., 0.], [0., 0., 1.]],
+           # example 2, ids []
+           [[0., 0., 0.], [0., 0., 0.]],
+           # example 3, ids [[1], [2, 2]]
+           [[0., 1., 0.], [0., 0., 2.]]]}
+      )
+  def test_get_sequence_dense_tensor(self, inputs, expected):
     vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(4, 2))
-
-    expected_lookups = [
-        # example 0, ids [2]
-        [[0., 0., 1.], [0., 0., 0.]],
-        # example 1, ids [0, 1]
-        [[1., 0., 0.], [0., 1., 0.]],
-        # example 2, ids []
-        [[0., 0., 0.], [0., 0., 0.]],
-        # example 3, ids [1]
-        [[0., 1., 0.], [0., 0., 0.]],
-    ]
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     indicator_column = fc.indicator_column(categorical_column)
 
     indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+        _LazyBuilder({'aaa': inputs}))
 
     with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_lookups, indicator_tensor.eval(session=sess))
-
-  def test_sequence_length(self):
+      self.assertAllEqual(expected, indicator_tensor.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           indices=((0, 0), (1, 0), (1, 1)),
+           values=(2, 0, 1),
+           dense_shape=(2, 2)),
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           indices=((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           values=(2, 0, 1, 2),
+           dense_shape=(2, 2, 2)),
+       'expected_sequence_length': [1, 2]}
+      )
+  def test_sequence_length(self, inputs, expected_sequence_length):
     vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    expected_sequence_length = [1, 2]
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     indicator_column = fc.indicator_column(categorical_column)
 
     _, sequence_length = indicator_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+        _LazyBuilder({'aaa': inputs}))
 
     with monitored_session.MonitoredSession() as sess:
       sequence_length = sess.run(sequence_length)
@@ -938,7 +1234,7 @@ class SequenceIndicatorColumnTest(test.TestCase):
           expected_sequence_length, sequence_length.eval(session=sess))
 
 
-class SequenceNumericColumnTest(test.TestCase):
+class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
 
   def test_defaults(self):
     a = sfc.sequence_numeric_column('aaa')
@@ -971,25 +1267,36 @@ class SequenceNumericColumnTest(test.TestCase):
     with self.assertRaisesRegexp(TypeError, 'must be a callable'):
       sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable')
 
-  def test_get_sequence_dense_tensor(self):
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0.], [1]]
-        # example 1, [[10.]]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0., 1., 10.),
-        dense_shape=(2, 2))
-    expected_dense_tensor = [
-        [[0.], [1.]],
-        [[10.], [0.]],
-    ]
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, values [0., 1]
+           # example 1, [10.]
+           indices=((0, 0), (0, 1), (1, 0)),
+           values=(0., 1., 10.),
+           dense_shape=(2, 2)),
+       'expected': [
+           [[0.], [1.]],
+           [[10.], [0.]]]},
+      {'testcase_name': '3D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # feature 0, ids [[20, 3], [5]]
+           # feature 1, ids [[3], [8]]
+           indices=((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           values=(20, 3, 5., 3., 8.),
+           dense_shape=(2, 2, 2)),
+       'expected': [
+           [[20.], [3.], [5.], [0.]],
+           [[3.], [0.], [8.], [0.]]]},
+      )
+  def test_get_sequence_dense_tensor(self, inputs, expected):
     numeric_column = sfc.sequence_numeric_column('aaa')
 
     dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+        _LazyBuilder({'aaa': inputs}))
 
     with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
+      self.assertAllEqual(expected, dense_tensor.eval(session=sess))
 
   def test_get_sequence_dense_tensor_with_normalizer_fn(self):
 
@@ -1026,41 +1333,34 @@ class SequenceNumericColumnTest(test.TestCase):
       self.assertAllEqual(
           expected_dense_tensor, dense_tensor.eval(session=sess))
 
-  def test_get_sequence_dense_tensor_with_shape(self):
-    """Tests get_sequence_dense_tensor with shape !=(1,)."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0., 1., 2.], [3., 4., 5.]]
-        # example 1, [[10., 11., 12.]]
-        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
-                 (1, 0), (1, 1), (1, 2)),
-        values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
-        dense_shape=(2, 6))
-    expected_dense_tensor = [
-        [[0., 1., 2.], [3., 4., 5.]],
-        [[10., 11., 12.], [0., 0., 0.]],
-    ]
-    numeric_column = sfc.sequence_numeric_column('aaa', shape=(3,))
-
-    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
-
-  def test_get_dense_tensor_multi_dim(self):
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input': sparse_tensor.SparseTensorValue(
+           # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
+           # example 1, [[[10., 11.],  [12., 13.]]]
+           indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
+                    (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)),
+           values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           dense_shape=(2, 8)),
+       'expected_dense_tensor': [
+           [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]],
+           [[[10., 11.], [12., 13.]], [[0., 0.], [0., 0.]]]]},
+      {'testcase_name': '3D',
+       'sparse_input': sparse_tensor.SparseTensorValue(
+           indices=((0, 0, 0), (0, 0, 2), (0, 0, 4), (0, 0, 6),
+                    (0, 1, 0), (0, 1, 2), (0, 1, 4), (0, 1, 6),
+                    (1, 0, 0), (1, 0, 2), (1, 0, 4), (1, 0, 6)),
+           values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           dense_shape=(2, 2, 8)),
+       'expected_dense_tensor': [
+           [[[0., 0.], [1., 0.]], [[2., 0.], [3., 0.]],
+            [[4., 0.], [5., 0.]], [[6., 0.], [7., 0.]]],
+           [[[10., 0.], [11., 0.]], [[12., 0.], [13., 0.]],
+            [[0., 0.], [0., 0.]], [[0., 0.], [0., 0.]]]]},
+      )
+  def test_get_dense_tensor_multi_dim(
+      self, sparse_input, expected_dense_tensor):
     """Tests get_sequence_dense_tensor for multi-dim numeric_column."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
-        # example 1, [[[10., 11.],  [12., 13.]]]
-        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
-                 (1, 0), (1, 1), (1, 2), (1, 3)),
-        values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
-        dense_shape=(2, 8))
-    expected_dense_tensor = [
-        [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]],
-        [[[10., 11.], [12., 13.]], [[0., 0.], [0., 0.]]],
-    ]
     numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
 
     dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
@@ -1070,43 +1370,55 @@ class SequenceNumericColumnTest(test.TestCase):
       self.assertAllEqual(
           expected_dense_tensor, dense_tensor.eval(session=sess))
 
-  def test_sequence_length(self):
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0., 1., 2.], [3., 4., 5.]]
-        # example 1, [[10., 11., 12.]]
-        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
-                 (1, 0), (1, 1), (1, 2)),
-        values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
-        dense_shape=(2, 6))
-    expected_sequence_length = [2, 1]
-    numeric_column = sfc.sequence_numeric_column('aaa', shape=(3,))
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           indices=((0, 0), (1, 0), (1, 1)),
+           values=(2., 0., 1.),
+           dense_shape=(2, 2)),
+       'expected_sequence_length': [1, 2],
+       'shape': (1,)},
+      {'testcase_name': '3D',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           indices=((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           values=(2., 0., 1., 2.),
+           dense_shape=(2, 2, 2)),
+       'expected_sequence_length': [1, 2],
+       'shape': (1,)},
+      {'testcase_name': '2D_with_shape',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           indices=((0, 0), (1, 0), (1, 1)),
+           values=(2., 0., 1.),
+           dense_shape=(2, 2)),
+       'expected_sequence_length': [1, 1],
+       'shape': (2,)},
+      {'testcase_name': '3D_with_shape',
+       'inputs': sparse_tensor.SparseTensorValue(
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           indices=((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           values=(2., 0., 1., 2.),
+           dense_shape=(2, 2, 2)),
+       'expected_sequence_length': [1, 2],
+       'shape': (2,)},
+      )
+  def test_sequence_length(self, inputs, expected_sequence_length, shape):
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=shape)
 
     _, sequence_length = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+        _LazyBuilder({'aaa': inputs}))
 
     with monitored_session.MonitoredSession() as sess:
       sequence_length = sess.run(sequence_length)
       self.assertAllEqual(expected_sequence_length, sequence_length)
       self.assertEqual(np.int64, sequence_length.dtype)
 
-  def test_sequence_length_with_shape(self):
-    """Tests _sequence_length with shape !=(1,)."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0.], [1]]
-        # example 1, [[10.]]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0., 1., 10.),
-        dense_shape=(2, 2))
-    expected_sequence_length = [2, 1]
-    numeric_column = sfc.sequence_numeric_column('aaa')
-
-    _, sequence_length = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
     sparse_input = sparse_tensor.SparseTensorValue(
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 5352796174..28a8286544 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -2660,6 +2660,7 @@ class _EmbeddingColumn(
         inputs=inputs,
         weight_collections=weight_collections,
         trainable=trainable)
+
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
     sequence_length = _sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
@@ -3383,6 +3384,16 @@ class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
 
 
 def _verify_static_batch_size_equality(tensors, columns):
+  """Validates that the first dim (batch size) of all tensors are equal or None.
+
+  Args:
+    tensors: list of tensors to check.
+    columns: list of feature columns matching tensors. Will be used for error
+      messaging.
+
+  Raises:
+    ValueError: if one of the tensors has a variant batch size
+  """
   # bath_size is a tf.Dimension object.
   expected_batch_size = None
   for i in range(0, len(tensors)):
@@ -3403,9 +3414,18 @@ def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
   with ops.name_scope(None, 'sequence_length') as name_scope:
     row_ids = sp_tensor.indices[:, 0]
     column_ids = sp_tensor.indices[:, 1]
+    # Add one to convert column indices to element length
     column_ids += array_ops.ones_like(column_ids)
-    seq_length = math_ops.to_int64(
-        math_ops.segment_max(column_ids, segment_ids=row_ids) / num_elements)
+    # Get the number of elements we will have per example/row
+    seq_length = math_ops.segment_max(column_ids, segment_ids=row_ids)
+
+    # The raw values are grouped according to num_elements;
+    # how many entities will we have after grouping?
+    # Example: orig tensor [[1, 2], [3]], col_ids = (0, 1, 1),
+    # row_ids = (0, 0, 1), seq_length = [2, 1]. If num_elements = 2,
+    # these will get grouped, and the final seq_length is [1, 1]
+    seq_length = math_ops.to_int64(math_ops.ceil(seq_length / num_elements))
+
     # If the last n rows do not have ids, seq_length will have shape
     # [batch_size - n]. Pad the remaining values with zeros.
     n_pad = array_ops.shape(sp_tensor)[:1] - array_ops.shape(seq_length)[:1]
@@ -3439,25 +3459,14 @@ class _SequenceCategoricalColumn(
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
     id_tensor = sparse_tensors.id_tensor
     weight_tensor = sparse_tensors.weight_tensor
-    # Expands final dimension, so that embeddings are not combined during
-    # embedding lookup.
-    check_id_rank = check_ops.assert_equal(
-        array_ops.rank(id_tensor), 2,
-        data=[
-            'Column {} expected ID tensor of rank 2. '.format(self.name),
-            'id_tensor shape: ', array_ops.shape(id_tensor)])
-    with ops.control_dependencies([check_id_rank]):
-      id_tensor = sparse_ops.sparse_reshape(
-          id_tensor,
-          shape=array_ops.concat([id_tensor.dense_shape, [1]], axis=0))
+
+    # Expands third dimension, if necessary so that embeddings are not
+    # combined during embedding lookup. If the tensor is already 3D, leave
+    # as-is.
+    shape = array_ops.shape(id_tensor)
+    target_shape = [shape[0], shape[1], -1]
+    id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
     if weight_tensor is not None:
-      check_weight_rank = check_ops.assert_equal(
-          array_ops.rank(weight_tensor), 2,
-          data=[
-              'Column {} expected weight tensor of rank 2.'.format(self.name),
-              'weight_tensor shape:', array_ops.shape(weight_tensor)])
-      with ops.control_dependencies([check_weight_rank]):
-        weight_tensor = sparse_ops.sparse_reshape(
-            weight_tensor,
-            shape=array_ops.concat([weight_tensor.dense_shape, [1]], axis=0))
+      weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)
+
     return _CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index ff50fe0d09..a2da6412ed 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -217,21 +217,21 @@ def _features_to_raw_params(features, types):
       feature = features[key]
       if isinstance(feature, VarLenFeature):
         if VarLenFeature not in types:
-          raise ValueError("Unsupported VarLenFeature %s." % feature)
+          raise ValueError("Unsupported VarLenFeature %s." % (feature,))
         if not feature.dtype:
           raise ValueError("Missing type for feature %s." % key)
         sparse_keys.append(key)
         sparse_types.append(feature.dtype)
       elif isinstance(feature, SparseFeature):
         if SparseFeature not in types:
-          raise ValueError("Unsupported SparseFeature %s." % feature)
+          raise ValueError("Unsupported SparseFeature %s." % (feature,))
 
         if not feature.index_key:
           raise ValueError(
-              "Missing index_key for SparseFeature %s." % feature)
+              "Missing index_key for SparseFeature %s." % (feature,))
         if not feature.value_key:
           raise ValueError(
-              "Missing value_key for SparseFeature %s." % feature)
+              "Missing value_key for SparseFeature %s." % (feature,))
         if not feature.dtype:
           raise ValueError("Missing type for feature %s." % key)
         index_keys = feature.index_key
@@ -260,7 +260,7 @@ def _features_to_raw_params(features, types):
           sparse_types.append(feature.dtype)
       elif isinstance(feature, FixedLenFeature):
         if FixedLenFeature not in types:
-          raise ValueError("Unsupported FixedLenFeature %s." % feature)
+          raise ValueError("Unsupported FixedLenFeature %s." % (feature,))
         if not feature.dtype:
           raise ValueError("Missing type for feature %s." % key)
         if feature.shape is None:
@@ -281,7 +281,8 @@ def _features_to_raw_params(features, types):
           dense_defaults[key] = feature.default_value
       elif isinstance(feature, FixedLenSequenceFeature):
         if FixedLenSequenceFeature not in types:
-          raise ValueError("Unsupported FixedLenSequenceFeature %s." % feature)
+          raise ValueError("Unsupported FixedLenSequenceFeature %s." % (
+              feature,))
         if not feature.dtype:
           raise ValueError("Missing type for feature %s." % key)
         if feature.shape is None:
-- 
GitLab


From 8ef3e7c8c053cb6dad530e13c478bbd406ea2c95 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Mon, 8 Oct 2018 10:43:01 -0700
Subject: [PATCH 529/570] Part 1/3 of the feature sync to the Keras 2.2.4 API.

PiperOrigin-RevId: 216211279
---
 tensorflow/python/keras/activations.py        |   5 +
 tensorflow/python/keras/activations_test.py   |  10 +
 tensorflow/python/keras/backend.py            |  81 ++++++--
 tensorflow/python/keras/backend_test.py       |  44 ++++-
 tensorflow/python/keras/callbacks.py          |   4 +
 tensorflow/python/keras/engine/network.py     |   9 +-
 .../python/keras/layers/convolutional.py      | 177 ++++++++++++-----
 .../python/keras/layers/convolutional_test.py |  31 +++
 tensorflow/python/keras/layers/pooling.py     | 185 +++++++++++++-----
 .../python/keras/layers/pooling_test.py       |  30 +++
 tensorflow/python/keras/layers/wrappers.py    |   3 +
 tensorflow/python/keras/testing_utils.py      |   5 +-
 tensorflow/python/keras/utils/conv_utils.py   |  45 +++--
 .../python/keras/utils/multi_gpu_utils.py     |  17 +-
 .../keras/utils/multi_gpu_utils_test.py       |  26 +++
 tensorflow/python/keras/utils/np_utils.py     |   5 +-
 .../v1/tensorflow.keras.activations.pbtxt     |   4 +
 .../golden/v1/tensorflow.keras.backend.pbtxt  |   4 +-
 ...low.keras.layers.-average-pooling1-d.pbtxt |   2 +-
 ...tensorflow.keras.layers.-avg-pool1-d.pbtxt |   2 +-
 ...flow.keras.layers.-conv2-d-transpose.pbtxt |   2 +-
 ...flow.keras.layers.-conv3-d-transpose.pbtxt |   2 +-
 ...ras.layers.-convolution2-d-transpose.pbtxt |   2 +-
 ...ras.layers.-convolution3-d-transpose.pbtxt |   2 +-
 ...as.layers.-global-average-pooling1-d.pbtxt |   4 +-
 ...low.keras.layers.-global-avg-pool1-d.pbtxt |   4 +-
 ...low.keras.layers.-global-max-pool1-d.pbtxt |   2 +-
 ....keras.layers.-global-max-pooling1-d.pbtxt |   2 +-
 ...tensorflow.keras.layers.-max-pool1-d.pbtxt |   2 +-
 ...sorflow.keras.layers.-max-pooling1-d.pbtxt |   2 +-
 .../golden/v1/tensorflow.keras.utils.pbtxt    |   2 +-
 .../v2/tensorflow.keras.activations.pbtxt     |   4 +
 .../golden/v2/tensorflow.keras.backend.pbtxt  |   4 +-
 ...low.keras.layers.-average-pooling1-d.pbtxt |   2 +-
 ...tensorflow.keras.layers.-avg-pool1-d.pbtxt |   2 +-
 ...flow.keras.layers.-conv2-d-transpose.pbtxt |   2 +-
 ...flow.keras.layers.-conv3-d-transpose.pbtxt |   2 +-
 ...ras.layers.-convolution2-d-transpose.pbtxt |   2 +-
 ...ras.layers.-convolution3-d-transpose.pbtxt |   2 +-
 ...as.layers.-global-average-pooling1-d.pbtxt |   4 +-
 ...low.keras.layers.-global-avg-pool1-d.pbtxt |   4 +-
 ...low.keras.layers.-global-max-pool1-d.pbtxt |   2 +-
 ....keras.layers.-global-max-pooling1-d.pbtxt |   2 +-
 ...tensorflow.keras.layers.-max-pool1-d.pbtxt |   2 +-
 ...sorflow.keras.layers.-max-pooling1-d.pbtxt |   2 +-
 .../golden/v2/tensorflow.keras.utils.pbtxt    |   2 +-
 46 files changed, 581 insertions(+), 172 deletions(-)

diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py
index 99645de736..d69791ce8d 100644
--- a/tensorflow/python/keras/activations.py
+++ b/tensorflow/python/keras/activations.py
@@ -160,6 +160,11 @@ def sigmoid(x):
   return nn.sigmoid(x)
 
 
+@tf_export('keras.activations.exponential')
+def exponential(x):
+  return math_ops.exp(x)
+
+
 @tf_export('keras.activations.hard_sigmoid')
 def hard_sigmoid(x):
   """Hard sigmoid activation function.
diff --git a/tensorflow/python/keras/activations_test.py b/tensorflow/python/keras/activations_test.py
index dd0bbcff39..ad238cb0a9 100644
--- a/tensorflow/python/keras/activations_test.py
+++ b/tensorflow/python/keras/activations_test.py
@@ -169,6 +169,16 @@ class KerasActivationsTest(test.TestCase):
     expected = np.tanh(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
+  def test_exponential(self):
+    with self.cached_session():
+      test_values = np.random.random((2, 5))
+      x = keras.backend.placeholder(ndim=2)
+      exp = keras.activations.exponential(x)
+      f = keras.backend.function([x], [exp])
+      result = f([test_values])[0]
+    expected = np.exp(test_values)
+    self.assertAllClose(result, expected, rtol=1e-05)
+
   def test_linear(self):
     x = np.random.random((10, 5))
     self.assertAllClose(x, keras.activations.linear(x))
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 63e776a06b..13f52fbae7 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -2223,7 +2223,7 @@ def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
 
 
 @tf_export('keras.backend.batch_normalization')
-def batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3):
+def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3):
   """Applies batch normalization on x given mean, var, beta and gamma.
 
   I.e. returns:
@@ -2235,11 +2235,49 @@ def batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3):
       var: Variance of batch.
       beta: Tensor with which to center the input.
       gamma: Tensor by which to scale the input.
+      axis: Integer, the axis that should be normalized.
+          (typically the features axis).
       epsilon: Fuzz factor.
 
   Returns:
       A tensor.
   """
+  if ndim(x) == 4:
+    # The CPU implementation of `fused_batch_norm` only supports NHWC
+    if axis == 1 or axis == -3:
+      tf_data_format = 'NCHW'
+    elif axis == 3 or axis == -1:
+      tf_data_format = 'NHWC'
+    else:
+      tf_data_format = None
+
+    if (tf_data_format == 'NHWC' or
+        tf_data_format == 'NCHW' and _has_nchw_support()):
+      # The mean / var / beta / gamma tensors may be broadcasted
+      # so they may have extra axes of size 1, which should be squeezed.
+      if ndim(mean) > 1:
+        mean = array_ops.reshape(mean, [-1])
+      if ndim(var) > 1:
+        var = array_ops.reshape(var, [-1])
+      if beta is None:
+        beta = zeros_like(mean)
+      elif ndim(beta) > 1:
+        beta = array_ops.reshape(beta, [-1])
+      if gamma is None:
+        gamma = ones_like(mean)
+      elif ndim(gamma) > 1:
+        gamma = array_ops.reshape(gamma, [-1])
+    y, _, _ = nn.fused_batch_norm(
+        x,
+        gamma,
+        beta,
+        epsilon=epsilon,
+        mean=mean,
+        variance=var,
+        data_format=tf_data_format,
+        is_training=False
+    )
+    return y
   return nn.batch_normalization(x, mean, var, beta, gamma, epsilon)
 
 
@@ -2880,7 +2918,7 @@ class Function(object):
 
     if session_kwargs:
       raise ValueError('Some keys in session_kwargs are not supported at this '
-                       'time: %s', session_kwargs.keys())
+                       'time: %s', (session_kwargs.keys(),))
 
     self._callable_fn = None
     self._feed_arrays = None
@@ -3798,19 +3836,23 @@ def _preprocess_conv1d_input(x, data_format):
   return x, tf_data_format
 
 
-def _preprocess_conv2d_input(x, data_format):
+def _preprocess_conv2d_input(x, data_format, force_transpose=False):
   """Transpose and cast the input before the conv2d.
 
   Arguments:
       x: input tensor.
       data_format: string, `"channels_last"` or `"channels_first"`.
+      force_transpose: Boolean. If True, the input will always be transposed
+          from NCHW to NHWC if `data_format` is `"channels_first"`.
+          If False, the transposition only occurs on CPU (GPU ops are
+          assumed to support NCHW).
 
   Returns:
       A tensor.
   """
   tf_data_format = 'NHWC'
   if data_format == 'channels_first':
-    if not _has_nchw_support():
+    if not _has_nchw_support() or force_transpose:
       x = array_ops.transpose(x, (0, 2, 3, 1))  # NCHW -> NHWC
     else:
       tf_data_format = 'NCHW'
@@ -3958,7 +4000,8 @@ def conv2d_transpose(x,
                      output_shape,
                      strides=(1, 1),
                      padding='valid',
-                     data_format=None):
+                     data_format=None,
+                     dilation_rate=(1, 1)):
   """2D deconvolution (i.e.
 
   transposed convolution).
@@ -3972,6 +4015,7 @@ def conv2d_transpose(x,
       data_format: string, `"channels_last"` or `"channels_first"`.
           Whether to use Theano or TensorFlow/CNTK data format
           for inputs/kernels/outputs.
+      dilation_rate: Tuple of 2 integers.
 
   Returns:
       A tensor, result of transposed 2D convolution.
@@ -3987,7 +4031,13 @@ def conv2d_transpose(x,
   if isinstance(output_shape, (tuple, list)):
     output_shape = array_ops.stack(output_shape)
 
-  x, tf_data_format = _preprocess_conv2d_input(x, data_format)
+  # `atrous_conv2d_transpose` only supports NHWC format, even on GPU.
+  if data_format == 'channels_first' and dilation_rate != (1, 1):
+    force_transpose = True
+  else:
+    force_transpose = False
+
+  x, tf_data_format = _preprocess_conv2d_input(x, data_format, force_transpose)
 
   if data_format == 'channels_first' and tf_data_format == 'NHWC':
     output_shape = (output_shape[0], output_shape[2], output_shape[3],
@@ -4002,13 +4052,18 @@ def conv2d_transpose(x,
   else:
     strides = (1, 1) + strides
 
-  x = nn.conv2d_transpose(
-      x,
-      kernel,
-      output_shape,
-      strides,
-      padding=padding,
-      data_format=tf_data_format)
+  if dilation_rate == (1, 1):
+    x = nn.conv2d_transpose(x, kernel, output_shape, strides,
+                            padding=padding,
+                            data_format=tf_data_format)
+  else:
+    assert dilation_rate[0] == dilation_rate[1]
+    x = nn.atrous_conv2d_transpose(
+        x,
+        kernel,
+        output_shape,
+        rate=dilation_rate[0],
+        padding=padding)
   if data_format == 'channels_first' and tf_data_format == 'NHWC':
     x = array_ops.transpose(x, (0, 3, 1, 2))  # NHWC -> NCHW
   return x
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index ab71589940..0834448699 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -26,6 +26,7 @@ from tensorflow.python import keras
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import nn
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_inspect
@@ -1381,6 +1382,36 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(mean.get_shape().as_list(), [3,])
     self.assertEqual(var.get_shape().as_list(), [3,])
 
+  def test_batch_normalization(self):
+    g_val = np.random.random((3,))
+    b_val = np.random.random((3,))
+    gamma = keras.backend.variable(g_val)
+    beta = keras.backend.variable(b_val)
+
+    # 3D NHC case
+    val = np.random.random((10, 5, 3))
+    x = keras.backend.variable(val)
+    mean, var = nn.moments(x, (0, 1), None, None, False)
+    normed = keras.backend.batch_normalization(
+        x, mean, var, beta, gamma, axis=-1, epsilon=1e-3)
+    self.assertEqual(normed.shape.as_list(), [10, 5, 3])
+
+    # 4D NHWC case
+    val = np.random.random((10, 5, 5, 3))
+    x = keras.backend.variable(val)
+    mean, var = nn.moments(x, (0, 1, 2), None, None, False)
+    normed = keras.backend.batch_normalization(
+        x, mean, var, beta, gamma, axis=-1, epsilon=1e-3)
+    self.assertEqual(normed.shape.as_list(), [10, 5, 5, 3])
+
+    # 4D NCHW case
+    val = np.random.random((10, 3, 5, 5))
+    x = keras.backend.variable(val)
+    mean, var = nn.moments(x, (0, 2, 3), None, None, False)
+    normed = keras.backend.batch_normalization(
+        x, mean, var, beta, gamma, axis=1, epsilon=1e-3)
+    self.assertEqual(normed.shape.as_list(), [10, 3, 5, 5])
+
 
 class TestCTC(test.TestCase):
 
@@ -1506,12 +1537,13 @@ class TestRandomOps(test.TestCase):
       self.assertAllClose(np.min(y), -2., atol=0.1)
 
   def test_string_input(self):
-    seq = keras.Sequential([
-        keras.layers.InputLayer(input_shape=(1,), dtype=dtypes.string),
-        keras.layers.Lambda(lambda x: x[0])
-    ])
-    preds = seq.predict([['tensorflow eager']])
-    self.assertEqual(preds.shape, (1,))
+    with self.cached_session():
+      seq = keras.Sequential([
+          keras.layers.InputLayer(input_shape=(1,), dtype=dtypes.string),
+          keras.layers.Lambda(lambda x: x[0])
+      ])
+      preds = seq.predict([['tensorflow eager']])
+      self.assertEqual(preds.shape, (1,))
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 6dfbbf3694..3d6000f223 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -781,6 +781,10 @@ class LearningRateScheduler(Callback):
       print('\nEpoch %05d: LearningRateScheduler reducing learning '
             'rate to %s.' % (epoch + 1, lr))
 
+  def on_epoch_end(self, epoch, logs=None):
+    logs = logs or {}
+    logs['lr'] = K.get_value(self.model.optimizer.lr)
+
 
 @tf_export('keras.callbacks.TensorBoard')
 class TensorBoard(Callback):
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 918488bd7a..5969fea2b2 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -1641,10 +1641,11 @@ class Network(base_layer.Layer):
         ValueError: if `summary()` is called before the model is built.
     """
     if not self.built:
-      raise ValueError('This model has never been called, thus its weights '
-                       'have not yet been created, so no summary can be '
-                       'displayed. Build the model first '
-                       '(e.g. by calling it on some data).')
+      raise ValueError('This model has not yet been built. '
+                       'Build the model first by calling `build()` or calling '
+                       '`fit()` with some data, or specify '
+                       'an `input_shape` argument in the first layer(s) for '
+                       'automatic build.')
     layer_utils.print_summary(self,
                               line_length=line_length,
                               positions=positions,
diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py
index d00def07bb..8f5872385c 100644
--- a/tensorflow/python/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/layers/convolutional.py
@@ -645,6 +645,14 @@ class Conv2DTranspose(Conv2D):
           Specifying any stride value != 1 is incompatible with specifying
           any `dilation_rate` value != 1.
       padding: one of `"valid"` or `"same"` (case-insensitive).
+      output_padding: An integer or tuple/list of 2 integers,
+          specifying the amount of padding along the height and width
+          of the output tensor.
+          Can be a single integer to specify the same value for all
+          spatial dimensions.
+          The amount of output padding along a given dimension must be
+          lower than the stride along that same dimension.
+          If set to `None` (default), the output shape is inferred.
       data_format: A string,
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
@@ -700,7 +708,9 @@ class Conv2DTranspose(Conv2D):
                kernel_size,
                strides=(1, 1),
                padding='valid',
+               output_padding=None,
                data_format=None,
+               dilation_rate=(1, 1),
                activation=None,
                use_bias=True,
                kernel_initializer='glorot_uniform',
@@ -717,6 +727,7 @@ class Conv2DTranspose(Conv2D):
         strides=strides,
         padding=padding,
         data_format=data_format,
+        dilation_rate=dilation_rate,
         activation=activations.get(activation),
         use_bias=use_bias,
         kernel_initializer=initializers.get(kernel_initializer),
@@ -728,6 +739,16 @@ class Conv2DTranspose(Conv2D):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
+    self.output_padding = output_padding
+    if self.output_padding is not None:
+      self.output_padding = conv_utils.normalize_tuple(
+          self.output_padding, 2, 'output_padding')
+      for stride, out_pad in zip(self.strides, self.output_padding):
+        if out_pad >= stride:
+          raise ValueError('Stride ' + str(self.strides) + ' must be '
+                           'greater than output padding ' +
+                           str(self.output_padding))
+
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     if len(input_shape) != 4:
@@ -769,51 +790,50 @@ class Conv2DTranspose(Conv2D):
     inputs_shape = array_ops.shape(inputs)
     batch_size = inputs_shape[0]
     if self.data_format == 'channels_first':
-      c_axis, h_axis, w_axis = 1, 2, 3
+      h_axis, w_axis = 2, 3
     else:
-      c_axis, h_axis, w_axis = 3, 1, 2
+      h_axis, w_axis = 1, 2
 
     height, width = inputs_shape[h_axis], inputs_shape[w_axis]
     kernel_h, kernel_w = self.kernel_size
     stride_h, stride_w = self.strides
 
+    if self.output_padding is None:
+      out_pad_h = out_pad_w = None
+    else:
+      out_pad_h, out_pad_w = self.output_padding
+
     # Infer the dynamic output shape:
     out_height = conv_utils.deconv_output_length(height,
                                                  kernel_h,
-                                                 self.padding,
-                                                 stride_h)
+                                                 padding=self.padding,
+                                                 output_padding=out_pad_h,
+                                                 stride=stride_h,
+                                                 dilation=self.dilation_rate[0])
     out_width = conv_utils.deconv_output_length(width,
                                                 kernel_w,
-                                                self.padding,
-                                                stride_w)
+                                                padding=self.padding,
+                                                output_padding=out_pad_w,
+                                                stride=stride_w,
+                                                dilation=self.dilation_rate[1])
     if self.data_format == 'channels_first':
       output_shape = (batch_size, self.filters, out_height, out_width)
-      strides = (1, 1, stride_h, stride_w)
     else:
       output_shape = (batch_size, out_height, out_width, self.filters)
-      strides = (1, stride_h, stride_w, 1)
 
     output_shape_tensor = array_ops.stack(output_shape)
-    outputs = nn.conv2d_transpose(
+    outputs = backend.conv2d_transpose(
         inputs,
         self.kernel,
         output_shape_tensor,
-        strides,
-        padding=self.padding.upper(),
-        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+        strides=self.strides,
+        padding=self.padding,
+        data_format=self.data_format,
+        dilation_rate=self.dilation_rate)
 
     if not context.executing_eagerly():
       # Infer the static output shape:
-      out_shape = inputs.get_shape().as_list()
-      out_shape[c_axis] = self.filters
-      out_shape[h_axis] = conv_utils.deconv_output_length(out_shape[h_axis],
-                                                          kernel_h,
-                                                          self.padding,
-                                                          stride_h)
-      out_shape[w_axis] = conv_utils.deconv_output_length(out_shape[w_axis],
-                                                          kernel_w,
-                                                          self.padding,
-                                                          stride_w)
+      out_shape = self.compute_output_shape(inputs.shape)
       outputs.set_shape(out_shape)
 
     if self.use_bias:
@@ -837,13 +857,33 @@ class Conv2DTranspose(Conv2D):
     kernel_h, kernel_w = self.kernel_size
     stride_h, stride_w = self.strides
 
+    if self.output_padding is None:
+      out_pad_h = out_pad_w = None
+    else:
+      out_pad_h, out_pad_w = self.output_padding
+
     output_shape[c_axis] = self.filters
     output_shape[h_axis] = conv_utils.deconv_output_length(
-        output_shape[h_axis], kernel_h, self.padding, stride_h)
+        output_shape[h_axis],
+        kernel_h,
+        padding=self.padding,
+        output_padding=out_pad_h,
+        stride=stride_h,
+        dilation=self.dilation_rate[0])
     output_shape[w_axis] = conv_utils.deconv_output_length(
-        output_shape[w_axis], kernel_w, self.padding, stride_w)
+        output_shape[w_axis],
+        kernel_w,
+        padding=self.padding,
+        output_padding=out_pad_w,
+        stride=stride_w,
+        dilation=self.dilation_rate[1])
     return tensor_shape.TensorShape(output_shape)
 
+  def get_config(self):
+    config = super(Conv2DTranspose, self).get_config()
+    config['output_padding'] = self.output_padding
+    return config
+
 
 @tf_export('keras.layers.Conv3DTranspose',
            'keras.layers.Convolution3DTranspose')
@@ -878,6 +918,14 @@ class Conv3DTranspose(Conv3D):
           Specifying any stride value != 1 is incompatible with specifying
           any `dilation_rate` value != 1.
       padding: one of `"valid"` or `"same"` (case-insensitive).
+      output_padding: An integer or tuple/list of 3 integers,
+          specifying the amount of padding along the depth, height, and
+          width.
+          Can be a single integer to specify the same value for all
+          spatial dimensions.
+          The amount of output padding along a given dimension must be
+          lower than the stride along that same dimension.
+          If set to `None` (default), the output shape is inferred.
       data_format: A string,
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
@@ -943,6 +991,7 @@ class Conv3DTranspose(Conv3D):
                kernel_size,
                strides=(1, 1, 1),
                padding='valid',
+               output_padding=None,
                data_format=None,
                activation=None,
                use_bias=True,
@@ -971,6 +1020,16 @@ class Conv3DTranspose(Conv3D):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
+    self.output_padding = output_padding
+    if self.output_padding is not None:
+      self.output_padding = conv_utils.normalize_tuple(
+          self.output_padding, 3, 'output_padding')
+      for stride, out_pad in zip(self.strides, self.output_padding):
+        if out_pad >= stride:
+          raise ValueError('Stride ' + str(self.strides) + ' must be '
+                           'greater than output padding ' +
+                           str(self.output_padding))
+
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     if len(input_shape) != 5:
@@ -1012,11 +1071,9 @@ class Conv3DTranspose(Conv3D):
     inputs_shape = array_ops.shape(inputs)
     batch_size = inputs_shape[0]
     if self.data_format == 'channels_first':
-      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
+      d_axis, h_axis, w_axis = 2, 3, 4
     else:
-      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
-
-    self.input_spec = InputSpec(ndim=5, axes={c_axis: inputs_shape[c_axis]})
+      d_axis, h_axis, w_axis = 1, 2, 3
 
     depth = inputs_shape[d_axis]
     height = inputs_shape[h_axis]
@@ -1025,19 +1082,27 @@ class Conv3DTranspose(Conv3D):
     kernel_d, kernel_h, kernel_w = self.kernel_size
     stride_d, stride_h, stride_w = self.strides
 
+    if self.output_padding is None:
+      out_pad_d = out_pad_h = out_pad_w = None
+    else:
+      out_pad_d, out_pad_h, out_pad_w = self.output_padding
+
     # Infer the dynamic output shape:
     out_depth = conv_utils.deconv_output_length(depth,
                                                 kernel_d,
-                                                self.padding,
-                                                stride_d)
+                                                padding=self.padding,
+                                                output_padding=out_pad_d,
+                                                stride=stride_d)
     out_height = conv_utils.deconv_output_length(height,
                                                  kernel_h,
-                                                 self.padding,
-                                                 stride_h)
+                                                 padding=self.padding,
+                                                 output_padding=out_pad_h,
+                                                 stride=stride_h)
     out_width = conv_utils.deconv_output_length(width,
                                                 kernel_w,
-                                                self.padding,
-                                                stride_w)
+                                                padding=self.padding,
+                                                output_padding=out_pad_w,
+                                                stride=stride_w)
     if self.data_format == 'channels_first':
       output_shape = (batch_size, self.filters, out_depth, out_height,
                       out_width)
@@ -1058,20 +1123,7 @@ class Conv3DTranspose(Conv3D):
 
     if not context.executing_eagerly():
       # Infer the static output shape:
-      out_shape = inputs.get_shape().as_list()
-      out_shape[c_axis] = self.filters
-      out_shape[d_axis] = conv_utils.deconv_output_length(out_shape[d_axis],
-                                                          kernel_d,
-                                                          self.padding,
-                                                          stride_d)
-      out_shape[h_axis] = conv_utils.deconv_output_length(out_shape[h_axis],
-                                                          kernel_h,
-                                                          self.padding,
-                                                          stride_h)
-      out_shape[w_axis] = conv_utils.deconv_output_length(out_shape[w_axis],
-                                                          kernel_w,
-                                                          self.padding,
-                                                          stride_w)
+      out_shape = self.compute_output_shape(inputs.shape)
       outputs.set_shape(out_shape)
 
     if self.use_bias:
@@ -1109,15 +1161,38 @@ class Conv3DTranspose(Conv3D):
     kernel_d, kernel_h, kernel_w = self.kernel_size
     stride_d, stride_h, stride_w = self.strides
 
+    if self.output_padding is None:
+      out_pad_d = out_pad_h = out_pad_w = None
+    else:
+      out_pad_d, out_pad_h, out_pad_w = self.output_padding
+
     output_shape[c_axis] = self.filters
     output_shape[d_axis] = conv_utils.deconv_output_length(
-        output_shape[d_axis], kernel_d, self.padding, stride_d)
+        output_shape[d_axis],
+        kernel_d,
+        padding=self.padding,
+        output_padding=out_pad_d,
+        stride=stride_d)
     output_shape[h_axis] = conv_utils.deconv_output_length(
-        output_shape[h_axis], kernel_h, self.padding, stride_h)
+        output_shape[h_axis],
+        kernel_h,
+        padding=self.padding,
+        output_padding=out_pad_h,
+        stride=stride_h)
     output_shape[w_axis] = conv_utils.deconv_output_length(
-        output_shape[w_axis], kernel_w, self.padding, stride_w)
+        output_shape[w_axis],
+        kernel_w,
+        padding=self.padding,
+        output_padding=out_pad_w,
+        stride=stride_w)
     return tensor_shape.TensorShape(output_shape)
 
+  def get_config(self):
+    config = super(Conv3DTranspose, self).get_config()
+    config.pop('dilation_rate')
+    config['output_padding'] = self.output_padding
+    return config
+
 
 class SeparableConv(Conv):
   """Abstract base layer for separable nD convolution.
diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py
index cad5e4c8bd..f88d632ab5 100644
--- a/tensorflow/python/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/layers/convolutional_test.py
@@ -204,6 +204,9 @@ class Conv2DTransposeTest(test.TestCase):
     if test.is_gpu_available(cuda_only=True):
       self._run_test(kwargs, 'data_format', ['channels_first'])
 
+    kwargs['strides'] = (2, 2)
+    self._run_test(kwargs, 'output_padding', [(1, 1)])
+
   def test_conv2dtranspose_regularizers(self):
     kwargs = {
         'filters': 3,
@@ -239,6 +242,31 @@ class Conv2DTransposeTest(test.TestCase):
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_conv2d_transpose_dilation(self):
+    testing_utils.layer_test(keras.layers.Conv2DTranspose,
+                             kwargs={'filters': 2,
+                                     'kernel_size': 3,
+                                     'padding': 'same',
+                                     'data_format': 'channels_last',
+                                     'dilation_rate': (2, 2)},
+                             input_shape=(2, 5, 6, 3))
+
+    input_data = np.arange(48).reshape((1, 4, 4, 3)).astype(np.float32)
+    expected_output = np.float32([[192, 228, 192, 228],
+                                  [336, 372, 336, 372],
+                                  [192, 228, 192, 228],
+                                  [336, 372, 336, 372]]).reshape((1, 4, 4, 1))
+    testing_utils.layer_test(keras.layers.Conv2DTranspose,
+                             input_data=input_data,
+                             kwargs={'filters': 1,
+                                     'kernel_size': 3,
+                                     'padding': 'same',
+                                     'data_format': 'channels_last',
+                                     'dilation_rate': (2, 2),
+                                     'kernel_initializer': 'ones'},
+                             expected_output=expected_output)
+
 
 class Conv3DTransposeTest(test.TestCase):
 
@@ -270,6 +298,9 @@ class Conv3DTransposeTest(test.TestCase):
     if test.is_gpu_available(cuda_only=True):
       self._run_test(kwargs, 'data_format', ['channels_first'])
 
+    kwargs['strides'] = (2, 2, 2)
+    self._run_test(kwargs, 'output_padding', [(1, 1, 1)])
+
   def test_conv3dtranspose_regularizers(self):
     kwargs = {
         'filters': 3,
diff --git a/tensorflow/python/keras/layers/pooling.py b/tensorflow/python/keras/layers/pooling.py
index 912e8bd619..72a9c1d629 100644
--- a/tensorflow/python/keras/layers/pooling.py
+++ b/tensorflow/python/keras/layers/pooling.py
@@ -18,12 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
@@ -41,16 +44,18 @@ class Pooling1D(Layer):
       strides of the pooling operation.
     padding: A string. The padding method, either 'valid' or 'same'.
       Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
       The ordering of the dimensions in the inputs.
       `channels_last` corresponds to inputs with shape
-      `(batch, length, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, length)`.
+      `(batch, steps, features)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, features, steps)`.
     name: A string, the name of the layer.
   """
 
   def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format=None,
+               padding='valid', data_format='channels_last',
                name=None, **kwargs):
     super(Pooling1D, self).__init__(name=name, **kwargs)
     if data_format is None:
@@ -65,45 +70,39 @@ class Pooling1D(Layer):
     self.input_spec = InputSpec(ndim=3)
 
   def call(self, inputs):
-    # There is no TF op for 1D pooling, hence we make the inputs 4D.
-    if self.data_format == 'channels_last':
-      # input is NWC, make it NHWC
-      inputs = array_ops.expand_dims(inputs, 1)
-      # pool on the W dim
-      pool_shape = (1, 1) + self.pool_size + (1,)
-      strides = (1, 1) + self.strides + (1,)
-      data_format = 'NHWC'
-    else:
-      # input is NCW, make it NCHW
-      inputs = array_ops.expand_dims(inputs, 2)
-      # pool on the W dim
-      pool_shape = (1, 1, 1) + self.pool_size
-      strides = (1, 1, 1) + self.strides
-      data_format = 'NCHW'
-
+    pad_axis = 2 if self.data_format == 'channels_last' else 3
+    inputs = array_ops.expand_dims(inputs, pad_axis)
     outputs = self.pool_function(
         inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper(),
-        data_format=data_format)
-
-    if self.data_format == 'channels_last':
-      return array_ops.squeeze(outputs, 1)
-    else:
-      return array_ops.squeeze(outputs, 2)
+        self.pool_size + (1,),
+        strides=self.strides + (1,),
+        padding=self.padding,
+        data_format=self.data_format)
+    return array_ops.squeeze(outputs, pad_axis)
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    length = conv_utils.conv_output_length(input_shape[1], self.pool_size[0],
-                                           self.padding, self.strides[0])
-    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
+    if self.data_format == 'channels_first':
+      steps = input_shape[2]
+      features = input_shape[1]
+    else:
+      steps = input_shape[1]
+      features = input_shape[2]
+    length = conv_utils.conv_output_length(steps,
+                                           self.pool_size[0],
+                                           self.padding,
+                                           self.strides[0])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape([input_shape[0], features, length])
+    else:
+      return tensor_shape.TensorShape([input_shape[0], length, features])
 
   def get_config(self):
     config = {
         'strides': self.strides,
         'pool_size': self.pool_size,
-        'padding': self.padding
+        'padding': self.padding,
+        'data_format': self.data_format,
     }
     base_config = super(Pooling1D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -119,19 +118,36 @@ class MaxPooling1D(Pooling1D):
           E.g. 2 will halve the input.
           If None, it will default to `pool_size`.
       padding: One of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, steps, features)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, features, steps)`.
 
   Input shape:
-      3D tensor with shape: `(batch_size, steps, features)`.
+      - If `data_format='channels_last'`:
+          3D tensor with shape:
+          `(batch_size, steps, features)`
+      - If `data_format='channels_first'`:
+          3D tensor with shape:
+          `(batch_size, features, steps)`
 
   Output shape:
-      3D tensor with shape: `(batch_size, downsampled_steps, features)`.
+      - If `data_format='channels_last'`:
+          3D tensor with shape:
+          `(batch_size, downsampled_steps, features)`
+      - If `data_format='channels_first'`:
+          3D tensor with shape:
+          `(batch_size, features, downsampled_steps)`
   """
 
   def __init__(self, pool_size=2, strides=None,
-               padding='valid', data_format=None, **kwargs):
+               padding='valid', data_format='channels_last', **kwargs):
 
     super(MaxPooling1D, self).__init__(
-        nn.max_pool,
+        functools.partial(backend.pool2d, pool_mode='max'),
         pool_size=pool_size,
         strides=strides,
         padding=padding,
@@ -149,18 +165,35 @@ class AveragePooling1D(Pooling1D):
           E.g. 2 will halve the input.
           If None, it will default to `pool_size`.
       padding: One of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, steps, features)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, features, steps)`.
 
   Input shape:
-      3D tensor with shape: `(batch_size, steps, features)`.
+      - If `data_format='channels_last'`:
+          3D tensor with shape:
+          `(batch_size, steps, features)`
+      - If `data_format='channels_first'`:
+          3D tensor with shape:
+          `(batch_size, features, steps)`
 
   Output shape:
-      3D tensor with shape: `(batch_size, downsampled_steps, features)`.
+      - If `data_format='channels_last'`:
+          3D tensor with shape:
+          `(batch_size, downsampled_steps, features)`
+      - If `data_format='channels_first'`:
+          3D tensor with shape:
+          `(batch_size, features, downsampled_steps)`
   """
 
   def __init__(self, pool_size=2, strides=None,
-               padding='valid', data_format=None, **kwargs):
+               padding='valid', data_format='channels_last', **kwargs):
     super(AveragePooling1D, self).__init__(
-        nn.avg_pool,
+        functools.partial(backend.pool2d, pool_mode='avg'),
         pool_size=pool_size,
         strides=strides,
         padding=padding,
@@ -561,41 +594,96 @@ class GlobalPooling1D(Layer):
   """Abstract class for different global pooling 1D layers.
   """
 
-  def __init__(self, **kwargs):
+  def __init__(self, data_format='channels_last', **kwargs):
     super(GlobalPooling1D, self).__init__(**kwargs)
     self.input_spec = InputSpec(ndim=3)
+    self.data_format = conv_utils.normalize_data_format(data_format)
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    return tensor_shape.TensorShape([input_shape[0], input_shape[2]])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape([input_shape[0], input_shape[1]])
+    else:
+      return tensor_shape.TensorShape([input_shape[0], input_shape[2]])
 
   def call(self, inputs):
     raise NotImplementedError
 
+  def get_config(self):
+    config = {'data_format': self.data_format}
+    base_config = super(GlobalPooling1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 @tf_export('keras.layers.GlobalAveragePooling1D',
            'keras.layers.GlobalAvgPool1D')
 class GlobalAveragePooling1D(GlobalPooling1D):
   """Global average pooling operation for temporal data.
 
+  Arguments:
+    data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, steps, features)` while `channels_first`
+        corresponds to inputs with shape
+        `(batch, features, steps)`.
+
   Input shape:
-      3D tensor with shape: `(batch_size, steps, features)`.
+      - If `data_format='channels_last'`:
+          3D tensor with shape:
+          `(batch_size, steps, features)`
+      - If `data_format='channels_first'`:
+          3D tensor with shape:
+          `(batch_size, features, steps)`
 
   Output shape:
       2D tensor with shape:
       `(batch_size, features)`
   """
 
-  def call(self, inputs):
-    return backend.mean(inputs, axis=1)
+  def __init__(self, data_format='channels_last', **kwargs):
+    super(GlobalAveragePooling1D, self).__init__(data_format=data_format,
+                                                 **kwargs)
+    self.supports_masking = True
+
+  def call(self, inputs, mask=None):
+    steps_axis = 1 if self.data_format == 'channels_last' else 2
+    if mask is not None:
+      mask = math_ops.cast(mask, backend.floatx())
+      input_shape = inputs.shape.as_list()
+      broadcast_shape = [-1, input_shape[steps_axis], 1]
+      mask = array_ops.reshape(mask, broadcast_shape)
+      inputs *= mask
+      return backend.sum(inputs, axis=steps_axis) / math_ops.reduce_sum(
+          mask, axis=steps_axis)
+    else:
+      return backend.mean(inputs, axis=steps_axis)
+
+  def compute_mask(self, inputs, mask=None):
+    return None
 
 
 @tf_export('keras.layers.GlobalMaxPool1D', 'keras.layers.GlobalMaxPooling1D')
 class GlobalMaxPooling1D(GlobalPooling1D):
   """Global max pooling operation for temporal data.
 
+  Arguments:
+    data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, steps, features)` while `channels_first`
+        corresponds to inputs with shape
+        `(batch, features, steps)`.
+
   Input shape:
-      3D tensor with shape: `(batch_size, steps, features)`.
+      - If `data_format='channels_last'`:
+          3D tensor with shape:
+          `(batch_size, steps, features)`
+      - If `data_format='channels_first'`:
+          3D tensor with shape:
+          `(batch_size, features, steps)`
 
   Output shape:
       2D tensor with shape:
@@ -603,7 +691,8 @@ class GlobalMaxPooling1D(GlobalPooling1D):
   """
 
   def call(self, inputs):
-    return backend.max(inputs, axis=1)
+    steps_axis = 1 if self.data_format == 'channels_last' else 2
+    return backend.max(inputs, axis=steps_axis)
 
 
 class GlobalPooling2D(Layer):
diff --git a/tensorflow/python/keras/layers/pooling_test.py b/tensorflow/python/keras/layers/pooling_test.py
index 2cd9939e66..936e73ecf9 100644
--- a/tensorflow/python/keras/layers/pooling_test.py
+++ b/tensorflow/python/keras/layers/pooling_test.py
@@ -18,11 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.training import rmsprop
 
 
 class GlobalPoolingTest(test.TestCase):
@@ -31,8 +34,26 @@ class GlobalPoolingTest(test.TestCase):
   def test_globalpooling_1d(self):
     testing_utils.layer_test(keras.layers.pooling.GlobalMaxPooling1D,
                              input_shape=(3, 4, 5))
+    testing_utils.layer_test(keras.layers.pooling.GlobalMaxPooling1D,
+                             kwargs={'data_format': 'channels_first'},
+                             input_shape=(3, 4, 5))
     testing_utils.layer_test(
         keras.layers.pooling.GlobalAveragePooling1D, input_shape=(3, 4, 5))
+    testing_utils.layer_test(keras.layers.pooling.GlobalAveragePooling1D,
+                             kwargs={'data_format': 'channels_first'},
+                             input_shape=(3, 4, 5))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_globalpooling_1d_masking_support(self):
+    model = keras.Sequential()
+    model.add(keras.layers.Masking(mask_value=0., input_shape=(3, 4)))
+    model.add(keras.layers.GlobalAveragePooling1D())
+    model.compile(loss='mae', optimizer=rmsprop.RMSPropOptimizer(0.001))
+
+    model_input = np.random.random((2, 3, 4))
+    model_input[0, 1:, :] = 0
+    output = model.predict(model_input)
+    self.assertAllClose(output[0], model_input[0, 0, :])
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_globalpooling_2d(self):
@@ -172,6 +193,10 @@ class Pooling1DTest(test.TestCase):
             kwargs={'strides': stride,
                     'padding': padding},
             input_shape=(3, 5, 4))
+    testing_utils.layer_test(
+        keras.layers.MaxPooling1D,
+        kwargs={'data_format': 'channels_first'},
+        input_shape=(3, 2, 6))
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_averagepooling_1d(self):
@@ -183,6 +208,11 @@ class Pooling1DTest(test.TestCase):
                     'padding': padding},
             input_shape=(3, 5, 4))
 
+    testing_utils.layer_test(
+        keras.layers.AveragePooling1D,
+        kwargs={'data_format': 'channels_first'},
+        input_shape=(3, 2, 6))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
index a1933c11b0..d19d0b5f8c 100644
--- a/tensorflow/python/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -587,6 +587,9 @@ class Bidirectional(Wrapper):
       output = y * y_rev
     elif self.merge_mode is None:
       output = [y, y_rev]
+    else:
+      raise ValueError(
+          'Unrecognized value for `merge_mode`: %s' % (self.merge_mode))
 
     # Properly set learning phase
     if (getattr(y, '_uses_learning_phase', False) or
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index 501b50ba5f..2fae094a1e 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -166,8 +166,9 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     if expected_dim is not None:
       if expected_dim != actual_dim:
         raise AssertionError(
-            'When testing layer %s, for input %s, found output_shape='
-            '%s but expected to find %s.\nFull kwargs: %s' %
+            'When testing layer %s **after deserialization**, '
+            'for input %s, found output_shape='
+            '%s but expected to find inferred shape %s.\nFull kwargs: %s' %
             (layer_cls.__name__,
              x,
              actual_output_shape,
diff --git a/tensorflow/python/keras/utils/conv_utils.py b/tensorflow/python/keras/utils/conv_utils.py
index 8ebca1418d..f486e631e5 100644
--- a/tensorflow/python/keras/utils/conv_utils.py
+++ b/tensorflow/python/keras/utils/conv_utils.py
@@ -137,26 +137,49 @@ def conv_input_length(output_length, filter_size, padding, stride):
   return (output_length - 1) * stride - 2 * pad + filter_size
 
 
-def deconv_output_length(input_length, filter_size, padding, stride):
+def deconv_output_length(input_length, filter_size, padding,
+                         output_padding=None, stride=0, dilation=1):
   """Determines output length of a transposed convolution given input length.
 
   Arguments:
-      input_length: integer.
-      filter_size: integer.
-      padding: one of "same", "valid", "full".
-      stride: integer.
+      input_length: Integer.
+      filter_size: Integer.
+      padding: one of `"same"`, `"valid"`, `"full"`.
+      output_padding: Integer, amount of padding along the output dimension.
+          Can be set to `None` in which case the output length is inferred.
+      stride: Integer.
+      dilation: Integer.
 
   Returns:
       The output length (integer).
   """
+  assert padding in {'same', 'valid', 'full'}
   if input_length is None:
     return None
-  input_length *= stride
-  if padding == 'valid':
-    input_length += max(filter_size - stride, 0)
-  elif padding == 'full':
-    input_length -= (stride + filter_size - 2)
-  return input_length
+
+  # Get the dilated kernel size
+  filter_size = filter_size + (filter_size - 1) * (dilation - 1)
+
+  # Infer length if output padding is None, else compute the exact length
+  if output_padding is None:
+    if padding == 'valid':
+      length = input_length * stride + max(filter_size - stride, 0)
+    elif padding == 'full':
+      length = input_length * stride - (stride + filter_size - 2)
+    elif padding == 'same':
+      length = input_length * stride
+
+  else:
+    if padding == 'same':
+      pad = filter_size // 2
+    elif padding == 'valid':
+      pad = 0
+    elif padding == 'full':
+      pad = filter_size - 1
+
+    length = ((input_length - 1) * stride + filter_size - 2 * pad +
+              output_padding)
+  return length
 
 
 def normalize_data_format(value):
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils.py b/tensorflow/python/keras/utils/multi_gpu_utils.py
index e1c49bc852..04b2ea8fe3 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils.py
@@ -244,9 +244,24 @@ def multi_gpu_model(model, gpus, cpu_merge=True, cpu_relocation=False):
         for o in range(len(outputs)):
           all_outputs[o].append(outputs[o])
 
+  # Deduplicate output names to handle Siamese networks.
+  occurrences = {}
+  for n in model.output_names:
+    if n not in occurrences:
+      occurrences[n] = 1
+    else:
+      occurrences[n] += 1
+  conflict_counter = {n: 0 for n, count in occurrences.items() if count > 1}
+  output_names = []
+  for n in model.output_names:
+    if n in conflict_counter:
+      conflict_counter[n] += 1
+      n += '_%d' % conflict_counter[n]
+    output_names.append(n)
+
   # Merge outputs under expected scope.
   with ops.device('/cpu:0' if cpu_merge else '/gpu:%d' % target_gpu_ids[0]):
     merged = []
-    for name, outputs in zip(model.output_names, all_outputs):
+    for name, outputs in zip(output_names, all_outputs):
       merged.append(concatenate(outputs, axis=0, name=name))
     return Model(model.inputs, merged)
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils_test.py b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
index 3d0351a11f..1780ab6587 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils_test.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
@@ -198,5 +198,31 @@ class TestMultiGPUModel(test.TestCase):
       parallel_model.compile(loss='mean_squared_error', optimizer='adam')
       parallel_model.train_on_batch(x, y)
 
+  def test_multi_gpu_with_siamese_network(self):
+    gpus = 2
+
+    if not check_if_compatible_devices(gpus=gpus):
+      return
+
+    with self.cached_session():
+      input_shape = (3,)
+      nested_model = keras.models.Sequential([
+          keras.layers.Dense(32, input_shape=input_shape),
+          keras.layers.Dense(1)
+      ], name='nested')
+
+      input1 = keras.Input(input_shape)
+      input2 = keras.Input(input_shape)
+      score1 = nested_model(input1)
+      score2 = nested_model(input2)
+      score_sum = keras.layers.Add(name='add')([score1, score2])
+
+      siamese = keras.models.Model(inputs=[input1, input2],
+                                   outputs=[score_sum, score1, score2],
+                                   name='siamese')
+      parallel_siamese = keras.utils.multi_gpu_model(siamese, gpus)
+      self.assertEqual(parallel_siamese.output_names,
+                       ['add', 'nested_1', 'nested_2'])
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/utils/np_utils.py b/tensorflow/python/keras/utils/np_utils.py
index c24e87308b..3763999bff 100644
--- a/tensorflow/python/keras/utils/np_utils.py
+++ b/tensorflow/python/keras/utils/np_utils.py
@@ -22,7 +22,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.utils.to_categorical')
-def to_categorical(y, num_classes=None):
+def to_categorical(y, num_classes=None, dtype='float32'):
   """Converts a class vector (integers) to binary class matrix.
 
   E.g. for use with categorical_crossentropy.
@@ -31,6 +31,7 @@ def to_categorical(y, num_classes=None):
       y: class vector to be converted into a matrix
           (integers from 0 to num_classes).
       num_classes: total number of classes.
+      dtype: The data type expected by the input. Default: `'float32'`.
 
   Returns:
       A binary matrix representation of the input. The classes axis is placed
@@ -44,7 +45,7 @@ def to_categorical(y, num_classes=None):
   if not num_classes:
     num_classes = np.max(y) + 1
   n = y.shape[0]
-  categorical = np.zeros((n, num_classes), dtype=np.float32)
+  categorical = np.zeros((n, num_classes), dtype=dtype)
   categorical[np.arange(n), y] = 1
   output_shape = input_shape + (num_classes,)
   categorical = np.reshape(categorical, output_shape)
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.activations.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.activations.pbtxt
index 2e9de9ebb2..eb315e356d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.activations.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.activations.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "elu"
     argspec: "args=[\'x\', \'alpha\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
   }
+  member_method {
+    name: "exponential"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get"
     argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
index a71a59e269..9feb7c09b8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
@@ -46,7 +46,7 @@ tf_module {
   }
   member_method {
     name: "batch_normalization"
-    argspec: "args=[\'x\', \'mean\', \'var\', \'beta\', \'gamma\', \'epsilon\'], varargs=None, keywords=None, defaults=[\'0.001\'], "
+    argspec: "args=[\'x\', \'mean\', \'var\', \'beta\', \'gamma\', \'axis\', \'epsilon\'], varargs=None, keywords=None, defaults=[\'-1\', \'0.001\'], "
   }
   member_method {
     name: "batch_set_value"
@@ -98,7 +98,7 @@ tf_module {
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'x\', \'kernel\', \'output_shape\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'None\'], "
+    argspec: "args=[\'x\', \'kernel\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\'], "
   }
   member_method {
     name: "conv3d"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index c3dd2ad046..014f5828fa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index c440604aae..a6e4856de9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 065bb4d35b..381839d6de 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -84,7 +84,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index c7ba6056f9..2933f9f4b3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -84,7 +84,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 8f4f7918ab..9c9c7461c8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -84,7 +84,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 93c442bd55..44ca598724 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -84,7 +84,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 5ea61d118d..a8094c0bde 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
@@ -111,7 +111,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 11dca17c6d..3ebe162f57 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
@@ -111,7 +111,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 278429af6f..c0a53b847b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 935a69ab2f..ff6c6f3ec4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 238d96cca6..d26da270e7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 4a45bf7997..524c5fd69e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.pbtxt
index 81b91d2780..138d97b11f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.pbtxt
@@ -70,6 +70,6 @@ tf_module {
   }
   member_method {
     name: "to_categorical"
-    argspec: "args=[\'y\', \'num_classes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'y\', \'num_classes\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'float32\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt
index 2e9de9ebb2..eb315e356d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "elu"
     argspec: "args=[\'x\', \'alpha\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
   }
+  member_method {
+    name: "exponential"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get"
     argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
index a71a59e269..9feb7c09b8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
@@ -46,7 +46,7 @@ tf_module {
   }
   member_method {
     name: "batch_normalization"
-    argspec: "args=[\'x\', \'mean\', \'var\', \'beta\', \'gamma\', \'epsilon\'], varargs=None, keywords=None, defaults=[\'0.001\'], "
+    argspec: "args=[\'x\', \'mean\', \'var\', \'beta\', \'gamma\', \'axis\', \'epsilon\'], varargs=None, keywords=None, defaults=[\'-1\', \'0.001\'], "
   }
   member_method {
     name: "batch_set_value"
@@ -98,7 +98,7 @@ tf_module {
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'x\', \'kernel\', \'output_shape\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'None\'], "
+    argspec: "args=[\'x\', \'kernel\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\'], "
   }
   member_method {
     name: "conv3d"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index c3dd2ad046..014f5828fa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index c440604aae..a6e4856de9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 065bb4d35b..381839d6de 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -84,7 +84,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index c7ba6056f9..2933f9f4b3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -84,7 +84,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 8f4f7918ab..9c9c7461c8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -84,7 +84,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 93c442bd55..44ca598724 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -84,7 +84,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 5ea61d118d..a8094c0bde 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
@@ -111,7 +111,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 11dca17c6d..3ebe162f57 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
@@ -111,7 +111,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 278429af6f..c0a53b847b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 935a69ab2f..ff6c6f3ec4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 238d96cca6..d26da270e7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 4a45bf7997..524c5fd69e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -83,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.pbtxt
index 81b91d2780..138d97b11f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.pbtxt
@@ -70,6 +70,6 @@ tf_module {
   }
   member_method {
     name: "to_categorical"
-    argspec: "args=[\'y\', \'num_classes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'y\', \'num_classes\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'float32\'], "
   }
 }
-- 
GitLab


From d1588d72a820423cab36977ca97221aba01be713 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Mon, 8 Oct 2018 10:43:03 -0700
Subject: [PATCH 530/570] Add a utility that allows finding a name for an
 entity, relative to an existing namespace.

PiperOrigin-RevId: 216211286
---
 .../python/autograph/pyct/inspect_utils.py    | 34 +++++++++++++++++++
 .../autograph/pyct/inspect_utils_test.py      | 19 +++++++++++
 2 files changed, 53 insertions(+)

diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py
index 1416988ea3..29c406c248 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils.py
@@ -67,6 +67,40 @@ def getnamespace(f):
   return namespace
 
 
+def getqualifiedname(namespace, object_, max_depth=2):
+  """Returns the name by which a value can be referred to in a given namespace.
+
+  This function will recurse inside modules, but it will not search objects for
+  attributes. The recursion depth is controlled by max_depth.
+
+  Args:
+    namespace: Dict[str, Any], the namespace to search into.
+    object_: Any, the value to search.
+    max_depth: Optional[int], a limit to the recursion depth when searching
+        inside modules.
+  Returns: Union[str, None], the fully-qualified name that resolves to the value
+      o, or None if it couldn't be found.
+  """
+  for name, value in namespace.items():
+    # The value may be referenced by more than one symbol, case in which
+    # any symbol will be fine. If the program contains symbol aliases that
+    # change over time, this may capture a symbol that will later point to
+    # something else.
+    # TODO(mdan): Prefer the symbol that matches the value type name.
+    if object_ is value:
+      return name
+
+  # TODO(mdan): Use breadth-first search and avoid visiting modules twice.
+  if max_depth:
+    for name, value in namespace.items():
+      if tf_inspect.ismodule(value):
+        name_in_module = getqualifiedname(value.__dict__, object_,
+                                          max_depth - 1)
+        if name_in_module is not None:
+          return '{}.{}'.format(name, name_in_module)
+  return None
+
+
 def _get_unbound_function(m):
   # TODO(mdan): Figure out why six.get_unbound_function fails in some cases.
   # The failure case is for tf.keras.Model.
diff --git a/tensorflow/python/autograph/pyct/inspect_utils_test.py b/tensorflow/python/autograph/pyct/inspect_utils_test.py
index f3eb027822..11074debfc 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from functools import wraps
+import imp
 
 import six
 
@@ -127,6 +128,24 @@ class InspectUtilsTest(test.TestCase):
     self.assertEqual(ns['closed_over_primitive'], closed_over_primitive)
     self.assertTrue('local_var' not in ns)
 
+  def test_getqualifiedname(self):
+    foo = object()
+    qux = imp.new_module('quxmodule')
+    bar = imp.new_module('barmodule')
+    baz = object()
+    bar.baz = baz
+
+    ns = {
+        'foo': foo,
+        'bar': bar,
+        'qux': qux,
+    }
+
+    self.assertIsNone(inspect_utils.getqualifiedname(ns, inspect_utils))
+    self.assertEqual(inspect_utils.getqualifiedname(ns, foo), 'foo')
+    self.assertEqual(inspect_utils.getqualifiedname(ns, bar), 'bar')
+    self.assertEqual(inspect_utils.getqualifiedname(ns, baz), 'bar.baz')
+
   def test_getmethodclass(self):
 
     self.assertEqual(
-- 
GitLab


From 0691d49fb6e15740b8ddf8019fea4edb91bca914 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 10:43:54 -0700
Subject: [PATCH 531/570] Convert TensorFlow's nasm dependency to new third
 party import method.

PiperOrigin-RevId: 216211467
---
 tensorflow/workspace.bzl                        | 15 ++-------------
 third_party/nasm/BUILD                          |  1 +
 third_party/{nasm.BUILD => nasm/BUILD.bazel}    | 12 ++++++------
 .../nasm.BUILD => nasm/BUILD.system}            |  0
 third_party/nasm/workspace.bzl                  | 17 +++++++++++++++++
 5 files changed, 26 insertions(+), 19 deletions(-)
 create mode 100644 third_party/nasm/BUILD
 rename third_party/{nasm.BUILD => nasm/BUILD.bazel} (100%)
 rename third_party/{systemlibs/nasm.BUILD => nasm/BUILD.system} (100%)
 create mode 100644 third_party/nasm/workspace.bzl

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 6f5aa85b01..adeac62e43 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -23,11 +23,13 @@ load(
 load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
 load("//third_party/icu:workspace.bzl", icu = "repo")
 load("//third_party/jpeg:workspace.bzl", jpeg = "repo")
+load("//third_party/nasm:workspace.bzl", nasm = "repo")
 
 def initialize_third_party():
     flatbuffers()
     icu()
     jpeg()
+    nasm()
 
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
@@ -235,19 +237,6 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         ],
     )
 
-    tf_http_archive(
-        name = "nasm",
-        build_file = clean_dep("//third_party:nasm.BUILD"),
-        sha256 = "63ec86477ad3f0f6292325fd89e1d93aea2e2fd490070863f17d48f7cd387011",
-        strip_prefix = "nasm-2.13.03",
-        system_build_file = clean_dep("//third_party/systemlibs:nasm.BUILD"),
-        urls = [
-            "https://mirror.bazel.build/www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2",
-            "http://pkgs.fedoraproject.org/repo/pkgs/nasm/nasm-2.13.03.tar.bz2/sha512/d7a6b4cee8dfd603d8d4c976e5287b5cc542fa0b466ff989b743276a6e28114e64289bf02a7819eca63142a5278aa6eed57773007e5f589e15768e6456a8919d/nasm-2.13.03.tar.bz2",
-            "http://www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2",
-        ],
-    )
-
     tf_http_archive(
         name = "png_archive",
         build_file = clean_dep("//third_party:png.BUILD"),
diff --git a/third_party/nasm/BUILD b/third_party/nasm/BUILD
new file mode 100644
index 0000000000..e3aec1fce9
--- /dev/null
+++ b/third_party/nasm/BUILD
@@ -0,0 +1 @@
+# Needed to make this a package.
diff --git a/third_party/nasm.BUILD b/third_party/nasm/BUILD.bazel
similarity index 100%
rename from third_party/nasm.BUILD
rename to third_party/nasm/BUILD.bazel
index d746a65e7e..c68d713946 100644
--- a/third_party/nasm.BUILD
+++ b/third_party/nasm/BUILD.bazel
@@ -137,12 +137,6 @@ cc_binary(
         ":windows": ["config/msvc.h"],
         "//conditions:default": [],
     }),
-    includes = [
-        "asm",
-        "include",
-        "output",
-        "x86",
-    ],
     copts = select({
         ":windows": [],
         "//conditions:default": [
@@ -157,6 +151,12 @@ cc_binary(
             "HAVE_SYS_TYPES_H",
         ],
     }),
+    includes = [
+        "asm",
+        "include",
+        "output",
+        "x86",
+    ],
     visibility = ["@jpeg//:__pkg__"],
 )
 
diff --git a/third_party/systemlibs/nasm.BUILD b/third_party/nasm/BUILD.system
similarity index 100%
rename from third_party/systemlibs/nasm.BUILD
rename to third_party/nasm/BUILD.system
diff --git a/third_party/nasm/workspace.bzl b/third_party/nasm/workspace.bzl
new file mode 100644
index 0000000000..6d50f6fcad
--- /dev/null
+++ b/third_party/nasm/workspace.bzl
@@ -0,0 +1,17 @@
+"""loads the nasm library, used by TF."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "nasm",
+        urls = [
+            "https://mirror.bazel.build/www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2",
+            "http://pkgs.fedoraproject.org/repo/pkgs/nasm/nasm-2.13.03.tar.bz2/sha512/d7a6b4cee8dfd603d8d4c976e5287b5cc542fa0b466ff989b743276a6e28114e64289bf02a7819eca63142a5278aa6eed57773007e5f589e15768e6456a8919d/nasm-2.13.03.tar.bz2",
+            "http://www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2",
+        ],
+        sha256 = "63ec86477ad3f0f6292325fd89e1d93aea2e2fd490070863f17d48f7cd387011",
+        strip_prefix = "nasm-2.13.03",
+        build_file = "//third_party/nasm:BUILD.bazel",
+        system_build_file = "//third_party/nasm:BUILD.system",
+    )
-- 
GitLab


From 3f0155133d668cf6cee1f1fb362d2a75c04836e3 Mon Sep 17 00:00:00 2001
From: Sourabh Bajaj <sourabhbajaj@google.com>
Date: Mon, 8 Oct 2018 10:52:15 -0700
Subject: [PATCH 532/570] Fix support for a single tensor to be passed to
 target_tensors

PiperOrigin-RevId: 216212953
---
 tensorflow/python/keras/engine/training.py             | 6 ++++--
 tensorflow/python/keras/engine/training_distributed.py | 4 ----
 tensorflow/python/keras/engine/training_test.py        | 4 ++++
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 2ebb4cf99f..ff2ae54ad4 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -563,9 +563,11 @@ class Model(Network):
         for name in self.output_names:
           tmp_target_tensors.append(target_tensors.get(name, None))
         target_tensors = tmp_target_tensors
+      elif tensor_util.is_tensor(target_tensors):
+        target_tensors = [target_tensors]
       else:
-        raise TypeError('Expected `target_tensors` to be '
-                        'a list or dict, but got:', target_tensors)
+        raise TypeError('Expected `target_tensors` to be a list or tuple or '
+                        'dict or a single tensor, but got:', target_tensors)
 
     for i in range(len(self.outputs)):
       if i in skip_target_indices:
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index 04e8d079c0..ac759ef3aa 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -820,10 +820,6 @@ def _clone_and_build_model(model, inputs=None, targets=None):
     optimizer_config = model.optimizer.get_config()
     optimizer = model.optimizer.__class__.from_config(optimizer_config)
 
-  # TODO(priyag): Is there a cleaner way to do this? The API doc suggests a
-  # single tensor should be OK but it throws an error in that case.
-  if targets is not None and not isinstance(targets, (list, dict, tuple)):
-    targets = [targets]
   if isinstance(targets, tuple):
     targets = nest.flatten(targets)
   cloned_model.compile(
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 54ad74c08b..868fd1dc69 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -1865,6 +1865,10 @@ class TestTrainingWithDataTensors(test.TestCase):
       model.compile(optimizer='rmsprop', loss='mse', target_tensors=[target])
       model.train_on_batch(input_val, None)
 
+      # single-output, as single tensor
+      model.compile(optimizer='rmsprop', loss='mse', target_tensors=target)
+      model.train_on_batch(input_val, None)
+
       # single-output, as dict
       model.compile(optimizer='rmsprop', loss='mse',
                     target_tensors={'dense': target})
-- 
GitLab


From 7d92890cb215f2f563fac96f1e3bde712a8749f8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 11:18:12 -0700
Subject: [PATCH 533/570] Update ops-related pbtxt files.

PiperOrigin-RevId: 216217887
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 0753316724..9df0ece69b 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -28980,6 +28980,74 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "LeakyRelu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LeakyReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "LearnedUnigramCandidateSampler"
   input_arg {
-- 
GitLab


From 1221a8e38a402513560ee71e6982b7cd8b6d901b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 11:54:12 -0700
Subject: [PATCH 534/570] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 216224026

---
 tensorflow/go/op/wrappers.go | 228 +++++++++++++++++------------------
 1 file changed, 114 insertions(+), 114 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 5d17605e37..fe99915a6c 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -7221,6 +7221,45 @@ func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.
 	return components
 }
 
+// Deprecated. Use TensorArrayGradV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
+func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayWriteV2",
+		Input: []tf.Input{
+			handle, index, value, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Writes the given dataset to the given file using the TFRecord format.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the dataset to write.
+//	filename: A scalar string tensor representing the filename to use.
+//	compression_type: A scalar string tensor containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//
+// Returns the created operation.
+func DatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DatasetToTFRecord",
+		Input: []tf.Input{
+			input_dataset, filename, compression_type,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Computes rectified linear 6: `min(max(features, 0), 6)`.
 func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
@@ -8251,44 +8290,6 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 	return op.Output(0)
 }
 
-// Bucketizes 'input' based on 'boundaries'.
-//
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
-//
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
-//
-// Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
-//
-// Returns Same shape with 'input', each value of input replaced with bucket index.
-//
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
-	opspec := tf.OpSpec{
-		Type: "Bucketize",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
 type FusedBatchNormV2Attr func(optionalAttr)
 
@@ -10980,6 +10981,44 @@ func Tan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Bucketizes 'input' based on 'boundaries'.
+//
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
+//
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
+//
+// Arguments:
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
+//
+// Returns Same shape with 'input', each value of input replaced with bucket index.
+//
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"boundaries": boundaries}
+	opspec := tf.OpSpec{
+		Type: "Bucketize",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // EncodeJpegAttr is an optional argument to EncodeJpeg.
 type EncodeJpegAttr func(optionalAttr)
 
@@ -21413,43 +21452,6 @@ func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the minimum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentMin",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
 type SdcaOptimizerAttr func(optionalAttr)
 
@@ -21924,6 +21926,43 @@ func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Outp
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// Computes the minimum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentMin",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the sum along segments of a tensor.
 //
 // Read
@@ -29878,28 +29917,6 @@ func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
 	return op.Output(0)
 }
 
-// Writes the given dataset to the given file using the TFRecord format.
-//
-// Arguments:
-//	input_dataset: A variant tensor representing the dataset to write.
-//	filename: A scalar string tensor representing the filename to use.
-//	compression_type: A scalar string tensor containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//
-// Returns the created operation.
-func DatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DatasetToTFRecord",
-		Input: []tf.Input{
-			input_dataset, filename, compression_type,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // AvgPool3DAttr is an optional argument to AvgPool3D.
 type AvgPool3DAttr func(optionalAttr)
 
@@ -31692,23 +31709,6 @@ func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
-func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV2",
-		Input: []tf.Input{
-			handle, index, value, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // SparseReduceMaxAttr is an optional argument to SparseReduceMax.
 type SparseReduceMaxAttr func(optionalAttr)
 
-- 
GitLab


From 723fd1245ed650ad07e5049faec021f4f0f6d408 Mon Sep 17 00:00:00 2001
From: Sourabh Bajaj <sourabhbajaj@google.com>
Date: Mon, 8 Oct 2018 12:03:09 -0700
Subject: [PATCH 535/570] Fix the steps_per_epoch when training on mnist

PiperOrigin-RevId: 216225505
---
 tensorflow/contrib/distribute/python/examples/keras_mnist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
index a84ef04196..da7f8c548f 100644
--- a/tensorflow/contrib/distribute/python/examples/keras_mnist.py
+++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
@@ -113,7 +113,7 @@ def main(_):
                 distribute=strategy)
 
   # Train the model with the train dataset.
-  model.fit(x=train_ds, epochs=20, steps_per_epoch=310)
+  model.fit(x=train_ds, epochs=20, steps_per_epoch=468)
 
   # Evaluate the model with the eval dataset.
   score = model.evaluate(eval_ds, steps=10, verbose=0)
-- 
GitLab


From dcd3b4307a3095e3f18aef53f5034787e3cc3af6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 12:09:54 -0700
Subject: [PATCH 536/570] Remove the restrictions that constant resolution of
 reduce_sum operators must be on axis 0, and can only be on 1 or 2-d inputs.

PiperOrigin-RevId: 216226776
---
 .../resolve_constant_unary.cc                 |  93 +++++++++---
 .../toco/graph_transformations/tests/BUILD    |  13 ++
 .../tests/resolve_constant_unary_test.cc      | 140 ++++++++++++++++++
 3 files changed, 229 insertions(+), 17 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_unary_test.cc

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index c698a9567a..5364eebbc9 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -27,6 +27,73 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
+namespace {
+
+// Using the function reducer, reduce input along all axes in axes.
+// Put the reduced data in output, which should aleady be appropriately sized.
+// check_output_shape is set to what this code computes the final shape
+// to be, so it can be cross checked with the shape computation logic.
+void ReduceGeneric(bool keep_dims, const std::vector<int>& axes,
+                   const Shape& input_shape, const std::vector<float>& input,
+                   Shape* check_output_shape, std::vector<float>* output,
+                   const std::function<float(float, float)>& reducer) {
+  if (!IsNonEmpty(input_shape)) {
+    // Zero-dimensions will break the NextIndices() logic, so just early out if
+    // we have an empty shape.
+    return;
+  }
+
+  // Set up output_shape to be the same length as input_shape, with
+  // appropriate dimensions squashed to 1.  If keep_dims is false, we'll strip
+  // out the one dimensions at the end, but it's convenient to leave them for
+  // now.  We recompute the shape because we need the output shape to have
+  // 1-dims in all the squashed dimensions; the shape from shape computation may
+  // remove those squashed dimensions, depending on the options used.
+  Shape output_shape = input_shape;
+
+  // Reduction mask will be elementwise multiplied against the input
+  // indices to figure out the output index for the element.
+  std::vector<int> reduction_mask(input_shape.dimensions_count(), 1);
+  for (int axis : axes) {
+    CHECK_GE(axis, 0);
+    CHECK_LT(axis, input_shape.dimensions_count());
+    reduction_mask[axis] = 0;
+    output_shape.mutable_dims()->at(axis) = 1;
+  }
+
+  std::vector<int> output_indices(input_shape.dimensions_count());
+  for (int input_offset = 0; input_offset < input.size(); ++input_offset) {
+    std::vector<int> input_indices = ReverseOffset(input_shape, input_offset);
+    // Calculate the output location by squashing input indices to 0
+    // in reduced axes.
+    for (int i = 0; i < input_shape.dimensions_count(); ++i) {
+      output_indices[i] = input_indices[i] * reduction_mask[i];
+    }
+    int output_offset = Offset(output_shape, output_indices);
+    if (input_indices == output_indices) {
+      // Base element for the reduced axes
+      output->at(output_offset) = input.at(input_offset);
+    } else {
+      // Reduce with existing element.
+      output->at(output_offset) =
+          reducer(output->at(output_offset), input.at(input_offset));
+    }
+  }
+
+  if (!keep_dims) {
+    // Strip out the dims from output_shape.
+    std::vector<int> new_dims;
+    for (int i = 0; i < output_shape.dimensions_count(); ++i) {
+      if (reduction_mask[i]) {
+        new_dims.push_back(output_shape.dims(i));
+      }
+    }
+    output_shape.mutable_dims()->swap(new_dims);
+  }
+  *check_output_shape = output_shape;
+}
+
+}  // namespace
 
 bool CopyMinMaxFromFirstInput(const Operator& op, Model* model) {
   auto& output_array = model->GetArray(op.outputs[0]);
@@ -176,27 +243,19 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     }
     auto& axis_array = model->GetArray(unary_op->inputs[1]);
     CHECK(axis_array.data_type == ArrayDataType::kInt32);
-    int axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
-    CHECK_LT(axis, input_shape.dimensions_count()) << "Axis out of bounds";
 
-    // We currently only handle reduction on axis 0.
-    CHECK_EQ(axis, 0) << "Only reduction along axis 0 is supported";
-    // We currently only handle 1-D and 2-D input tensors.
-    CHECK_LE(input_shape.dimensions_count(), 2) << "Rank >2 not yet supported";
     // We only support keep_dims=true; shape prop will need to change otherwise.
     auto sum_op = static_cast<const TensorFlowSumOperator*>(unary_op);
-    CHECK(sum_op->keep_dims) << "Only keep_dims=true is supported";
+    Shape check_output_shape;
 
-    std::vector<int> indices(input_shape.dimensions_count());
-    for (int i = 0; i < input_shape.dims(1); ++i) {
-      indices[1] = i;
-      float sum = 0.f;
-      for (int j = 0; j < input_shape.dims(0); ++j) {
-        indices[0] = j;
-        sum += (*input_float_data)[Offset(input_shape, indices)];
-      }
-      output_float_data[i] = sum;
-    }
+    ReduceGeneric(
+        sum_op->keep_dims, axis_array.GetBuffer<ArrayDataType::kInt32>().data,
+        input_shape, *input_float_data, &check_output_shape, &output_float_data,
+        [](float existing, float current) -> float {
+          return existing + current;
+        });
+    CHECK(check_output_shape == output_shape)
+        << "Shape propagation output shape doesn't match output shape from op";
   } else if (unary_op->type == OperatorType::kReduceMin) {
     // At the moment only full reduction across all dimensions is supported.
     // TODO(starka): Output should not be padded.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
index acf1e3ede5..6f1be298ca 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
@@ -30,3 +30,16 @@ tf_cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
+
+tf_cc_test(
+    name = "resolve_constant_unary_test",
+    srcs = ["resolve_constant_unary_test.cc"],
+    tags = ["no_oss"],
+    deps = [
+        "//tensorflow/contrib/lite/toco:graph_transformations",
+        "//tensorflow/contrib/lite/toco:model",
+        "//tensorflow/contrib/lite/toco:tooling_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_unary_test.cc b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_unary_test.cc
new file mode 100644
index 0000000000..a53abc9941
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_unary_test.cc
@@ -0,0 +1,140 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <tuple>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "absl/memory/memory.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+namespace {
+
+void RunResolveSum(const std::vector<float>& input,
+                   const std::vector<int>& input_shape,
+                   const std::vector<int>& axis,
+                   const std::vector<int>& output_shape,
+                   const std::vector<float>& expected_output) {
+  Model model;
+  Array& input0 = model.GetOrCreateArray("input0");
+  Array& input1 = model.GetOrCreateArray("input1");
+  Array& output = model.GetOrCreateArray("output");
+
+  *input0.mutable_shape()->mutable_dims() = input_shape;
+  input0.data_type = ArrayDataType::kFloat;
+  input0.GetMutableBuffer<ArrayDataType::kFloat>().data = input;
+
+  *input1.mutable_shape()->mutable_dims() = {static_cast<int>(axis.size())};
+  input1.GetMutableBuffer<ArrayDataType::kInt32>().data = axis;
+  input1.data_type = ArrayDataType::kInt32;
+
+  *output.mutable_shape()->mutable_dims() = output_shape;
+
+  auto sum_op = absl::make_unique<TensorFlowSumOperator>();
+  sum_op->keep_dims = true;
+  sum_op->inputs = {"input0", "input1"};
+  sum_op->outputs = {"output"};
+  model.operators.push_back(std::move(sum_op));
+  ResolveConstantUnaryOperator().Run(&model, 0);
+  EXPECT_EQ(model.GetArray("output").GetBuffer<ArrayDataType::kFloat>().data,
+            expected_output);
+  EXPECT_EQ(model.GetArray("output").shape().dims(), output_shape);
+}
+
+// Reduce a 2d array across axis 0
+TEST(ResolveConstantUnary, ResolveSumAxis0_2D) {
+  // clang-format off
+  RunResolveSum(
+      // Input data
+      {3, 1, 4, 1,
+       5, 9, 2, 6,
+       5, 3, 5, 8},
+
+      // Input shape
+      {3, 4},
+
+      // Axes
+      {0},
+
+      // Expected output shape,
+      {1, 4},
+
+      // Expected output
+      {13, 13, 11, 15});
+  // clang-format on
+}
+
+// Reduce a 2d array across axis 1
+TEST(ResolveConstantUnary, ResolveSumAxis1_2D) {
+  // clang-format off
+  RunResolveSum(
+      // Input data
+      {3, 1, 4, 1,
+       5, 9, 2, 6,
+       5, 3, 5, 8},
+
+      // Input shape
+      {3, 4},
+
+      // Axes
+      {1},
+
+      // Expected output shape,
+      {3, 1},
+
+      // Expected output
+      {9, 22, 21});
+  // clang-format on
+}
+
+// Reduce a 3d tensor across axes 0 and 2.
+TEST(ResolveConstantUnary, ResolveSumAxis0_2_3D) {
+  // clang-format off
+  RunResolveSum(
+      // Input data
+      {  0,   1,   2,
+         3,  10,  11,
+        12,  13,  20,
+        21,  22,  23,
+
+       100, 101, 102,
+       103, 110, 111,
+       112, 113, 120,
+       121, 122, 123,
+
+       200, 201, 202,
+       203, 210, 211,
+       212, 213, 220,
+       221, 222, 223 },
+
+      // Input shape
+      {3, 4, 3},
+
+      // Axes
+      {0, 2},
+
+      // Expected output shape,
+      {1, 4, 1},
+
+      // Expected output, generated using octave.
+      { 909, 972, 1035, 1098});
+  // clang-format on
+}
+
+}  // namespace
+}  // namespace toco
-- 
GitLab


From d3595b1534a855f3d0da35d3f1dd8b5d464b1b70 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 12:34:00 -0700
Subject: [PATCH 537/570] Fix a couple of reference leaks

PiperOrigin-RevId: 216230391
---
 tensorflow/python/pywrap_tfe.i | 1 +
 tensorflow/python/util/util.cc | 8 +++-----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 61e0abbfcb..adbce95c6f 100755
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -209,6 +209,7 @@ limitations under the License.
     SWIG_fail;
   } else {
     int num_outputs = $1->size();
+    Py_CLEAR($result);
     $result = PyList_New(num_outputs);
     for (int i = 0; i < num_outputs; ++i) {
       PyObject *output;
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index 7b3e618e84..11eb9ce947 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -825,18 +825,16 @@ PyObject* IsNamedtuple(PyObject* o, bool strict) {
 }
 
 PyObject* SameNamedtuples(PyObject* o1, PyObject* o2) {
-  PyObject* f1 = PyObject_GetAttrString(o1, "_fields");
-  PyObject* f2 = PyObject_GetAttrString(o2, "_fields");
+  Safe_PyObjectPtr f1 = make_safe(PyObject_GetAttrString(o1, "_fields"));
+  Safe_PyObjectPtr f2 = make_safe(PyObject_GetAttrString(o2, "_fields"));
   if (f1 == nullptr || f2 == nullptr) {
-    Py_XDECREF(f1);
-    Py_XDECREF(f2);
     PyErr_SetString(
         PyExc_RuntimeError,
         "Expected namedtuple-like objects (that have _fields attr)");
     return nullptr;
   }
 
-  if (PyObject_RichCompareBool(f1, f2, Py_NE)) {
+  if (PyObject_RichCompareBool(f1.get(), f2.get(), Py_NE)) {
     Py_RETURN_FALSE;
   }
 
-- 
GitLab


From 9b558126e31d25ec4e82cb4f50033d6eca44349a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 12:58:29 -0700
Subject: [PATCH 538/570] Add timeout mechanism to Grappler meta optimizer.
 This is only a best-effort mechanism, since the meta optimizer only checks if
 it has been cancelled before running each sub-optimizer. We can add
 cancellation to each sub-optimizer if necessary.

PiperOrigin-RevId: 216234262
---
 .../grappler/optimizers/graph_optimizer.h     | 21 ++++++
 .../grappler/optimizers/meta_optimizer.cc     | 68 ++++++++++++++++++-
 .../core/grappler/optimizers/meta_optimizer.h | 15 +++-
 .../optimizers/meta_optimizer_test.cc         | 62 +++++++++++++++++
 .../core/protobuf/rewriter_config.proto       |  4 ++
 5 files changed, 165 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer.h b/tensorflow/core/grappler/optimizers/graph_optimizer.h
index 765dd13263..bd6bf9f860 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer.h
@@ -16,8 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_
 
+#include <atomic>
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -29,6 +32,7 @@ struct GrapplerItem;
 // optimization of a GrapplerItem for running on a cluster.
 class GraphOptimizer {
  public:
+  GraphOptimizer() : is_cancelled_(false) {}
   virtual ~GraphOptimizer() {}
 
   virtual string name() const = 0;
@@ -45,8 +49,25 @@ class GraphOptimizer {
   // call to Optimize) performed.  Lower "result" scores are better.
   virtual void Feedback(Cluster* cluster, const GrapplerItem& item,
                         const GraphDef& optimized_graph, double result) = 0;
+
+  // Best effort cancellation. Sets is_cancelled to true and requests that the
+  // optimizer returns as soon as possible from active calls to Optimize() or
+  // FeedBack().
+  void Cancel() { is_cancelled_ = true; }
+
+  bool is_cancelled() const { return is_cancelled_; }
+
+ private:
+  std::atomic<bool> is_cancelled_;
 };
 
+#define GRAPPLER_RETURN_IF_CANCELLED()                                  \
+  do {                                                                  \
+    if (is_cancelled()) {                                               \
+      return errors::DeadlineExceeded(this->name(), " was cancelled."); \
+    }                                                                   \
+  } while (0)
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 3f33b16ba8..7488cedec5 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+
+#include <memory>
+
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
@@ -37,7 +40,11 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/notification.h"
+#include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
@@ -115,6 +122,21 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
 
 #undef MK_OPT
 
+MetaOptimizer::MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg)
+    : cpu_device_(cpu_device), cfg_(cfg) {
+  // TODO(rmlarsen): Increase kNumThreads to, say, port::NumSchedulableCPUs()
+  // if we want to the threadpool for parallelizing Grappler
+  const int kNumThreads = 1;
+  thread_pool_ = absl::make_unique<thread::ThreadPool>(
+      Env::Default(), "MetaOptimizerThreadPool", kNumThreads);
+}
+
+MetaOptimizer::~MetaOptimizer() {
+  // The ThreadPool destructor waits for threads to finish, so we don't
+  // pull the rug out from under them.
+  thread_pool_.reset();
+}
+
 Status MetaOptimizer::InitializeOptimizers(
     std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
   if (cfg_.disable_meta_optimizer()) {
@@ -310,6 +332,7 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
 
     VLOG(4) << "Starting optimization iteration " << iteration;
     for (const auto& optimizer : optimizers) {
+      GRAPPLER_RETURN_IF_CANCELLED();
       // Some optimizers can run only once.
       if (iteration > 0 && IsRunOnceOptimizer(optimizer->name())) continue;
       // Some must run only on the last iteration.
@@ -368,6 +391,7 @@ Status MetaOptimizer::RunOptimizer(
   // resets optimized_graph to an empty graph.
   optimized_graph->Swap(&optimized_item->graph);
   *optimized_graph = GraphDef();
+  // TODO(rmlarsen): Add timeout for individual optimizers.
   Status status =
       optimizer->Optimize(cluster, *optimized_item, optimized_graph);
   uint64 end_us = Env::Default()->NowMicros();
@@ -389,14 +413,15 @@ Status MetaOptimizer::RunOptimizer(
   return status;
 }
 
-Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-                               GraphDef* optimized_graph) {
+Status MetaOptimizer::OptimizeMainGraphAndFunctionLibrary(
+    Cluster* cluster, const GrapplerItem& item, GraphDef* optimized_graph) {
   VLOG(1) << "Starting optimization for grappler item: " << item.id;
   optimization_results_.clear();
 
   // 1. Optimize main graph
   TF_RETURN_IF_ERROR(OptimizeGraph(cluster, item, optimized_graph));
   VLOG(1) << "Optimized main graph.";
+  GRAPPLER_RETURN_IF_CANCELLED();
 
   // Skip optimizing functions if this is a TPU graph. Currently, Grappler
   // passes do not handle TPU functions correctly in a variety of ways (Note
@@ -432,6 +457,8 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     optimize_function_library = false;
 
     for (const FunctionDef& func : optimized_graph->library().function()) {
+      GRAPPLER_RETURN_IF_CANCELLED();
+
       const string& func_name = func.signature().name();
 
       // Skip already optimized functions.
@@ -506,6 +533,43 @@ void MetaOptimizer::PrintResult() {
   }
 }
 
+Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* optimized_graph) {
+  const int64 kFiveMinutesInUsec = 5 * 60 * 1000 * 1000;
+  const int64 timeout_usec = (cfg_.meta_optimizer_timeout_ms() == 0
+                                  ? kFiveMinutesInUsec
+                                  : cfg_.meta_optimizer_timeout_ms() * 1000);
+  if (timeout_usec < 0) {
+    return OptimizeMainGraphAndFunctionLibrary(cluster, item, optimized_graph);
+  }
+
+  GraphDef optimized_with_timeout;
+  Status status;
+  Notification done;
+  thread_pool_->Schedule(
+      [this, cluster, &done, &optimized_with_timeout, &item, &status]() {
+        status = this->OptimizeMainGraphAndFunctionLibrary(
+            cluster, item, &optimized_with_timeout);
+        done.Notify();
+      });
+
+  const bool notified = WaitForNotificationWithTimeout(&done, timeout_usec);
+  if (notified && status.ok()) {
+    optimized_graph->Swap(&optimized_with_timeout);
+  } else {
+    *optimized_graph = item.graph;
+    if (!notified) {
+      this->Cancel();
+      done.WaitForNotification();
+      status = errors::DeadlineExceeded(
+          "Grappler MetaOptimizer timed out after ",
+          static_cast<float>(timeout_usec) / (1000 * 1000), " seconds");
+      LOG(WARNING) << status.error_message();
+    }
+  }
+  return status;
+}
+
 void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
                              const GraphDef& pruned_graph, double result) {
   // Nothing to do for MetaOptimizer.
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index 99a0a33ffa..35d6a4559b 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -28,9 +29,8 @@ namespace grappler {
 // Run the other grappler optimizers based on the specified rewriter config.
 class MetaOptimizer : public GraphOptimizer {
  public:
-  MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg)
-      : cpu_device_(cpu_device), cfg_(cfg) {}
-  ~MetaOptimizer() override = default;
+  MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg);
+  ~MetaOptimizer();
 
   string name() const override { return "meta_optimizer"; };
 
@@ -65,9 +65,18 @@ class MetaOptimizer : public GraphOptimizer {
   Status OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
                        GraphDef* optimized_graph);
 
+  // Run optimization passes over the main graph and for functions in the
+  // function library.
+  Status OptimizeMainGraphAndFunctionLibrary(Cluster* cluster,
+                                             const GrapplerItem& item,
+                                             GraphDef* optimized_graph);
+
   DeviceBase* const cpu_device_;  // may be NULL
   RewriterConfig cfg_;
 
+  // Thread pool used for launching optimizers asynchronously.
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+
   struct OptimizerResult {
     string optimizer_name;
     string result;
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index 3f3f43382f..7f1dd91f09 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -461,6 +461,68 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
   EXPECT_FALSE(allowed_optimizations_my_mul_2->non_differentiable_rewrites);
 }
 
+class SleepingOptimizer : public CustomGraphOptimizer {
+ public:
+  SleepingOptimizer() {}
+  string name() const override { return "test_optimizer"; }
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+    return Status::OK();
+  }
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* optimized_graph) override {
+    *optimized_graph = item.graph;
+    optimized_graph->add_node();
+    sleep(1);
+    return Status::OK();
+  }
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimized_graph, double result) override {}
+};
+
+REGISTER_GRAPH_OPTIMIZER(SleepingOptimizer);
+
+TEST_F(MetaOptimizerTest, OptimizerTimesOut) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  RewriterConfig rewriter_config;
+  rewriter_config.add_optimizers("SleepingOptimizer");
+  rewriter_config.set_min_graph_nodes(-1);
+  rewriter_config.set_meta_optimizer_timeout_ms(1500);
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+
+  MetaOptimizer optimizer(nullptr, rewriter_config);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  EXPECT_EQ(status.error_message(),
+            "Grappler MetaOptimizer timed out after 1.5 seconds");
+  // Make sure the graph was reverted to the original regardless of when the
+  // optimizer timed out.
+  CompareGraphs(item.graph, output);
+}
+
+TEST_F(MetaOptimizerTest, OptimizerDoesNotTimeOut) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  RewriterConfig rewriter_config;
+  rewriter_config.add_optimizers("SleepingOptimizer");
+  rewriter_config.set_min_graph_nodes(-1);
+  rewriter_config.set_meta_optimizer_timeout_ms(1500);
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::ONE);
+  MetaOptimizer optimizer(nullptr, rewriter_config);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(item.graph.node_size() + 1, output.node_size());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 8c31468ff5..7ccd54b818 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -83,6 +83,10 @@ message RewriterConfig {
   // Controls how many times we run the optimizers in meta optimizer (default
   // is once).
   NumIterationsType meta_optimizer_iterations = 12;
+  // Maximum number of milliseconds to spend optimizing a single graph before
+  // timing out. If equal to 0 the system picks a default (currently 5 minutes).
+  // If less than 0 the optimizer will never time out.
+  int64 meta_optimizer_timeout_ms = 20;
 
   // The minimum number of nodes in a graph to optimizer. For smaller graphs,
   // optimization is skipped.
-- 
GitLab


From 76ab96c8a5b2d77dfc191c94ff54fd5e52c561f2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 13:31:52 -0700
Subject: [PATCH 539/570] Changed Adam algorithm variant formula from
 sqrt(max(v, epsilon**2)) to sqrt(v + epsilon**2) and changed flag name
 accordingly.

PiperOrigin-RevId: 216240045
---
 tensorflow/contrib/tpu/proto/optimization_parameters.proto | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
index 8529b48c15..c2e3be03db 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
@@ -62,9 +62,9 @@ message FtrlParameters {
 // (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/AdamOptimizer). If
 // use_non_lazy_adam is enabled, use_gradient_accumulation is also required in
 // order to get correct results; a warning will be printed otherwise (which may
-// change to an error in the future). If use_max_with_epsilon is set, the Adam
+// change to an error in the future). If use_sum_inside_sqrt is set, the Adam
 // variable update formula will be changed from m / (sqrt(v) + epsilon) to
-// m / max(sqrt(v), abs(epsilon)); this option improves the performance of TPU
+// m / sqrt(v + epsilon**2); this option improves the performance of TPU
 // training and is not expected to harm model quality.
 message AdamParameters {
   float beta1 = 3;
@@ -73,7 +73,7 @@ message AdamParameters {
   float initial_m = 6;
   float initial_v = 7;
   bool use_non_lazy_adam = 8;
-  bool use_max_with_epsilon = 9;
+  bool use_sum_inside_sqrt = 10;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer
-- 
GitLab


From b052c51374f558c25a29c70918d79205dfec808b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 13:46:12 -0700
Subject: [PATCH 540/570] Add tf.BenchmarkConfig that returns a session config
 appropriate for benchmarking. At the moment, it returns a default config with
 only Grappler dependency optimizer disabled. Many benchmarks wrap the
 subgraph they want to time in control_flow_ops.group() to avoid including the
 overhead of copying the output back to the Python client in the measurement.
 In the graph, this only adds a control dependency between the subgraph output
 and the fetch node, which in turn (often) causes the dependency optimizer to
 turn all nodes in the graph into no-ops.

PiperOrigin-RevId: 216242463
---
 .../python/kernel_tests/benchmark_test.py     |  2 +-
 .../python/kernel_tests/cholesky_op_test.py   |  7 ++-
 .../kernel_tests/determinant_op_test.py       |  9 +--
 .../kernel_tests/matrix_band_part_op_test.py  |  5 +-
 .../matrix_exponential_op_test.py             |  5 +-
 .../kernel_tests/matrix_inverse_op_test.py    |  5 +-
 .../kernel_tests/matrix_logarithm_op_test.py  |  3 +-
 .../kernel_tests/matrix_solve_ls_op_test.py   |  5 +-
 .../kernel_tests/matrix_solve_op_test.py      |  5 +-
 .../sparse_tensors_map_ops_test.py            |  3 +-
 .../python/kernel_tests/where_op_test.py      |  5 +-
 tensorflow/python/ops/image_ops_test.py       | 62 +++++++++----------
 tensorflow/python/platform/benchmark.py       | 14 +++++
 .../tools/api/golden/v1/tensorflow.test.pbtxt |  4 ++
 .../tools/api/golden/v2/tensorflow.test.pbtxt |  4 ++
 15 files changed, 84 insertions(+), 54 deletions(-)

diff --git a/tensorflow/python/kernel_tests/benchmark_test.py b/tensorflow/python/kernel_tests/benchmark_test.py
index 78b6e38d94..5777a5d097 100644
--- a/tensorflow/python/kernel_tests/benchmark_test.py
+++ b/tensorflow/python/kernel_tests/benchmark_test.py
@@ -64,7 +64,7 @@ class TestReportingBenchmark(test.Benchmark):
                 "other_key": "string"})
 
   def benchmark_times_an_op(self):
-    with session.Session() as sess:
+    with session.Session(config=benchmark.benchmark_config()) as sess:
       a = constant_op.constant(0.0)
       a_plus_a = a + a
       return self.run_op_benchmark(
diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py
index 782e6b5068..2ebf74a4d7 100644
--- a/tensorflow/python/kernel_tests/cholesky_op_test.py
+++ b/tensorflow/python/kernel_tests/cholesky_op_test.py
@@ -36,6 +36,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.linalg import linalg
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
@@ -327,7 +328,7 @@ class CholeskyBenchmark(test.Benchmark):
   def benchmarkCholeskyOp(self):
     for shape in self.shapes:
       with ops.Graph().as_default(), \
-          session.Session() as sess, \
+          session.Session(config=benchmark.benchmark_config()) as sess, \
           ops.device("/cpu:0"):
         matrix = variables.Variable(self._GenerateMatrix(shape))
         l = linalg_ops.cholesky(matrix)
@@ -341,7 +342,7 @@ class CholeskyBenchmark(test.Benchmark):
 
       if test.is_gpu_available(True):
         with ops.Graph().as_default(), \
-            session.Session() as sess, \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device("/device:GPU:0"):
           matrix = variables.Variable(self._GenerateMatrix(shape))
           l = linalg_ops.cholesky(matrix)
@@ -359,7 +360,7 @@ class CholeskyBenchmark(test.Benchmark):
       for shape in self.shapes:
         matrix = self._GenerateMatrix(shape)
         with ops.Graph().as_default(), \
-            session.Session() as sess, \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device(device):
           l = variables.Variable(np.linalg.cholesky(matrix))
           grad_matrix = variables.Variable(
diff --git a/tensorflow/python/kernel_tests/determinant_op_test.py b/tensorflow/python/kernel_tests/determinant_op_test.py
index a52b2c0dc3..fb114f9f24 100644
--- a/tensorflow/python/kernel_tests/determinant_op_test.py
+++ b/tensorflow/python/kernel_tests/determinant_op_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
@@ -185,8 +186,8 @@ class MatrixDeterminantBenchmark(test.Benchmark):
 
   def benchmarkMatrixDeterminantOp(self):
     for shape in self.shapes:
-      with ops.Graph().as_default(), session.Session() as sess, ops.device(
-          "/cpu:0"):
+      with ops.Graph().as_default(), session.Session(
+          config=benchmark.benchmark_config()) as sess, ops.device("/cpu:0"):
         matrix = self._GenerateMatrix(shape)
         d = linalg_ops.matrix_determinant(matrix)
         variables.global_variables_initializer().run()
@@ -198,8 +199,8 @@ class MatrixDeterminantBenchmark(test.Benchmark):
             name="matrix_determinant_cpu_{shape}".format(shape=shape))
 
       if test.is_gpu_available(True):
-        with ops.Graph().as_default(), session.Session() as sess, ops.device(
-            "/gpu:0"):
+        with ops.Graph().as_default(), session.Session(
+            config=benchmark.benchmark_config()) as sess, ops.device("/gpu:0"):
           matrix = self._GenerateMatrix(shape)
           d = linalg_ops.matrix_determinant(matrix)
           variables.global_variables_initializer().run()
diff --git a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
index 68d626de2c..a0ef3a607e 100644
--- a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test as test_lib
 
 
@@ -109,7 +110,7 @@ class MatrixBandPartBenchmark(test_lib.Benchmark):
     for shape_ in self.shapes:
       for limits in (-1, -1), (-1, 0), (0, -1), (2, 2):
         with ops.Graph().as_default(), \
-            session.Session() as sess, \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device("/cpu:0"):
           matrix = variables.Variable(array_ops.ones(shape_))
           band = array_ops.matrix_band_part(matrix, limits[0], limits[1])
@@ -123,7 +124,7 @@ class MatrixBandPartBenchmark(test_lib.Benchmark):
 
         if test_lib.is_gpu_available(True):
           with ops.Graph().as_default(), \
-              session.Session() as sess, \
+              session.Session(config=benchmark.benchmark_config()) as sess, \
               ops.device("/gpu:0"):
             matrix = variables.Variable(array_ops.ones(shape_))
             band = array_ops.matrix_band_part(matrix, limits[0], limits[1])
diff --git a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
index 0386e91276..9630c052b8 100644
--- a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.linalg import linalg_impl
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
@@ -181,7 +182,7 @@ class MatrixExponentialBenchmark(test.Benchmark):
   def benchmarkMatrixExponentialOp(self):
     for shape in self.shapes:
       with ops.Graph().as_default(), \
-          session.Session() as sess, \
+          session.Session(config=benchmark.benchmark_config()) as sess, \
           ops.device("/cpu:0"):
         matrix = self._GenerateMatrix(shape)
         expm = linalg_impl.matrix_exponential(matrix)
@@ -195,7 +196,7 @@ class MatrixExponentialBenchmark(test.Benchmark):
 
       if test.is_gpu_available(True):
         with ops.Graph().as_default(), \
-            session.Session() as sess, \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device("/gpu:0"):
           matrix = self._GenerateMatrix(shape)
           expm = linalg_impl.matrix_exponential(matrix)
diff --git a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
index 720ba806e9..8bda04b53d 100644
--- a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
@@ -179,7 +180,7 @@ class MatrixInverseBenchmark(test.Benchmark):
     for adjoint in False, True:
       for shape in self.shapes:
         with ops.Graph().as_default(), \
-            session.Session() as sess, \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device("/cpu:0"):
           matrix = self._GenerateMatrix(shape)
           inv = linalg_ops.matrix_inverse(matrix, adjoint=adjoint)
@@ -193,7 +194,7 @@ class MatrixInverseBenchmark(test.Benchmark):
 
         if test.is_gpu_available(True):
           with ops.Graph().as_default(), \
-              session.Session() as sess, \
+              session.Session(config=benchmark.benchmark_config()) as sess, \
               ops.device("/gpu:0"):
             matrix = self._GenerateMatrix(shape)
             inv = linalg_ops.matrix_inverse(matrix, adjoint=adjoint)
diff --git a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
index 723a15fbd1..3205e211d9 100644
--- a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.linalg import linalg_impl
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
@@ -159,7 +160,7 @@ class MatrixLogarithmBenchmark(test.Benchmark):
   def benchmarkMatrixLogarithmOp(self):
     for shape in self.shapes:
       with ops.Graph().as_default(), \
-          session.Session() as sess, \
+          session.Session(config=benchmark.benchmark_config()) as sess, \
           ops.device("/cpu:0"):
         matrix = self._GenerateMatrix(shape)
         logm = gen_linalg_ops.matrix_logarithm(matrix)
diff --git a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py
index de495968a7..225a10e117 100644
--- a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test as test_lib
 
 
@@ -313,7 +314,7 @@ class MatrixSolveLsBenchmark(test_lib.Benchmark):
       for num_rhs in 1, 2, matrix_shape[-1]:
 
         with ops.Graph().as_default(), \
-            session.Session() as sess, \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device("/cpu:0"):
           matrix, rhs = _GenerateTestData(matrix_shape, num_rhs)
           x = linalg_ops.matrix_solve_ls(matrix, rhs, regularizer)
@@ -328,7 +329,7 @@ class MatrixSolveLsBenchmark(test_lib.Benchmark):
 
         if run_gpu_test and (len(matrix_shape) < 3 or matrix_shape[0] < 513):
           with ops.Graph().as_default(), \
-                session.Session() as sess, \
+                session.Session(config=benchmark.benchmark_config()) as sess, \
                 ops.device("/gpu:0"):
             matrix, rhs = _GenerateTestData(matrix_shape, num_rhs)
             x = linalg_ops.matrix_solve_ls(matrix, rhs, regularizer)
diff --git a/tensorflow/python/kernel_tests/matrix_solve_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_op_test.py
index b8f2736b7b..264df2565c 100644
--- a/tensorflow/python/kernel_tests/matrix_solve_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_solve_op_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
@@ -167,7 +168,7 @@ class MatrixSolveBenchmark(test.Benchmark):
         for num_rhs in 1, 2, matrix_shape[-1]:
 
           with ops.Graph().as_default(), \
-              session.Session() as sess, \
+              session.Session(config=benchmark.benchmark_config()) as sess, \
               ops.device("/cpu:0"):
             matrix, rhs = self._GenerateTestData(matrix_shape, num_rhs)
             x = linalg_ops.matrix_solve(matrix, rhs, adjoint=adjoint)
@@ -185,7 +186,7 @@ class MatrixSolveBenchmark(test.Benchmark):
 
           if run_gpu_test:
             with ops.Graph().as_default(), \
-                session.Session() as sess, \
+                session.Session(config=benchmark.benchmark_config()) as sess, \
                 ops.device("/gpu:0"):
               matrix, rhs = self._GenerateTestData(matrix_shape, num_rhs)
               x = linalg_ops.matrix_solve(matrix, rhs, adjoint=adjoint)
diff --git a/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py b/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py
index 31e84341ae..fdfe1001b8 100644
--- a/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 # pylint: disable=protected-access
@@ -192,7 +193,7 @@ class BenchmarkSparseTensorsMapVsSerialization(test.Benchmark):
         sorted(zip(indices_batch, indices_value)), dtype=np.int64)
     values = ["feature_value_for_embedding_lookup"] * num_elements
     shape = np.asarray([batch_size, num_elements], dtype=np.int64)
-    with session.Session() as sess:
+    with session.Session(config=benchmark.benchmark_config()) as sess:
       with ops.device("/cpu:0"):
         indices = variables.Variable(indices)
         values = variables.Variable(values)
diff --git a/tensorflow/python/kernel_tests/where_op_test.py b/tensorflow/python/kernel_tests/where_op_test.py
index 29fb002ef4..04ac589432 100644
--- a/tensorflow/python/kernel_tests/where_op_test.py
+++ b/tensorflow/python/kernel_tests/where_op_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
@@ -160,7 +161,7 @@ class WhereBenchmark(test.Benchmark):
           x = random_ops.random_uniform((m, n), dtype=dtypes.float32) <= p
           v = resource_variable_ops.ResourceVariable(x)
           op = array_ops.where(v)
-        with session.Session() as sess:
+        with session.Session(config=benchmark.benchmark_config()) as sess:
           v.initializer.run()
           r = self.run_op_benchmark(sess, op, min_iters=100, name=name)
           gb_processed_input = m * n / 1.0e9
@@ -186,7 +187,7 @@ class WhereBenchmark(test.Benchmark):
           y = resource_variable_ops.ResourceVariable(y_gen)
           c = resource_variable_ops.ResourceVariable(c_gen)
           op = array_ops.where(c, x, y)
-        with session.Session() as sess:
+        with session.Session(config=benchmark.benchmark_config()) as sess:
           x.initializer.run()
           y.initializer.run()
           c.initializer.run()
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 35fdee4fad..ff86df6346 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -602,20 +602,19 @@ class AdjustHueBenchmark(test.Benchmark):
     if cpu_count is not None:
       config.inter_op_parallelism_threads = 1
       config.intra_op_parallelism_threads = cpu_count
-    with session.Session("", graph=ops.Graph(), config=config) as sess:
-      with ops.device(device):
-        inputs = variables.Variable(
-            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
-            trainable=False,
-            dtype=dtypes.float32)
-        delta = constant_op.constant(0.1, dtype=dtypes.float32)
-        outputs = image_ops.adjust_hue(inputs, delta)
-        run_op = control_flow_ops.group(outputs)
-        sess.run(variables.global_variables_initializer())
-        for i in xrange(warmup_rounds + benchmark_rounds):
-          if i == warmup_rounds:
-            start = time.time()
-          sess.run(run_op)
+    with self.benchmark_session(config=config, device=device) as sess:
+      inputs = variables.Variable(
+          random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
+          trainable=False,
+          dtype=dtypes.float32)
+      delta = constant_op.constant(0.1, dtype=dtypes.float32)
+      outputs = image_ops.adjust_hue(inputs, delta)
+      run_op = control_flow_ops.group(outputs)
+      sess.run(variables.global_variables_initializer())
+      for i in xrange(warmup_rounds + benchmark_rounds):
+        if i == warmup_rounds:
+          start = time.time()
+        sess.run(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -646,21 +645,20 @@ class AdjustSaturationBenchmark(test.Benchmark):
     if cpu_count is not None:
       config.inter_op_parallelism_threads = 1
       config.intra_op_parallelism_threads = cpu_count
-    with session.Session("", graph=ops.Graph(), config=config) as sess:
-      with ops.device(device):
-        inputs = variables.Variable(
-            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
-            trainable=False,
-            dtype=dtypes.float32)
-        delta = constant_op.constant(0.1, dtype=dtypes.float32)
-        outputs = image_ops.adjust_saturation(inputs, delta)
-        run_op = control_flow_ops.group(outputs)
-        sess.run(variables.global_variables_initializer())
-        for _ in xrange(warmup_rounds):
-          sess.run(run_op)
-        start = time.time()
-        for _ in xrange(benchmark_rounds):
-          sess.run(run_op)
+    with self.benchmark_session(config=config, device=device) as sess:
+      inputs = variables.Variable(
+          random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
+          trainable=False,
+          dtype=dtypes.float32)
+      delta = constant_op.constant(0.1, dtype=dtypes.float32)
+      outputs = image_ops.adjust_saturation(inputs, delta)
+      run_op = control_flow_ops.group(outputs)
+      sess.run(variables.global_variables_initializer())
+      for _ in xrange(warmup_rounds):
+        sess.run(run_op)
+      start = time.time()
+      for _ in xrange(benchmark_rounds):
+        sess.run(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -699,7 +697,7 @@ class ResizeBilinearBenchmark(test.Benchmark):
         deps = [resize_op]
       benchmark_op = control_flow_ops.group(*deps)
 
-    with session.Session() as sess:
+    with self.benchmark_session() as sess:
       sess.run(variables.global_variables_initializer())
       results = self.run_op_benchmark(
           sess,
@@ -747,7 +745,7 @@ class ResizeBicubicBenchmark(test.Benchmark):
         deps = [resize_op]
       benchmark_op = control_flow_ops.group(*deps)
 
-    with session.Session() as sess:
+    with self.benchmark_session() as sess:
       sess.run(variables.global_variables_initializer())
       results = self.run_op_benchmark(
           sess,
@@ -804,7 +802,7 @@ class ResizeAreaBenchmark(test.Benchmark):
         deps = [resize_op]
       benchmark_op = control_flow_ops.group(*deps)
 
-    with session.Session() as sess:
+    with self.benchmark_session() as sess:
       sess.run(variables.global_variables_initializer())
       results = self.run_op_benchmark(
           sess,
diff --git a/tensorflow/python/platform/benchmark.py b/tensorflow/python/platform/benchmark.py
index fa17b17d10..4f7abb311a 100644
--- a/tensorflow/python/platform/benchmark.py
+++ b/tensorflow/python/platform/benchmark.py
@@ -27,6 +27,7 @@ import time
 import six
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.core.util import test_log_pb2
 from tensorflow.python.client import timeline
 from tensorflow.python.platform import app
@@ -182,6 +183,19 @@ class Benchmark(six.with_metaclass(_BenchmarkRegistrar, object)):
         throughput=throughput, extras=extras)
 
 
+@tf_export("test.benchmark_config")
+def benchmark_config():
+  """Returns a tf.ConfigProto for disabling the dependency optimizer.
+
+    Returns:
+      A TensorFlow ConfigProto object.
+  """
+  config = config_pb2.ConfigProto()
+  config.graph_options.rewrite_options.dependency_optimization = (
+      rewriter_config_pb2.RewriterConfig.OFF)
+  return config
+
+
 @tf_export("test.Benchmark")
 class TensorFlowBenchmark(Benchmark):
   """Abstract class that provides helpers for TensorFlow benchmarks."""
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.test.pbtxt
index abe9b068ae..984c584c9e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.test.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.test.pbtxt
@@ -20,6 +20,10 @@ tf_module {
     name: "assert_equal_graph_def"
     argspec: "args=[\'actual\', \'expected\', \'checkpoint_v2\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "benchmark_config"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "compute_gradient"
     argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'init_targets\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
index abe9b068ae..984c584c9e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
@@ -20,6 +20,10 @@ tf_module {
     name: "assert_equal_graph_def"
     argspec: "args=[\'actual\', \'expected\', \'checkpoint_v2\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "benchmark_config"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "compute_gradient"
     argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'init_targets\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\', \'None\'], "
-- 
GitLab


From 494bbdfced3fd8596721d12e73676c4967f452e4 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Mon, 8 Oct 2018 13:48:19 -0700
Subject: [PATCH 541/570] Allow using more than one converter in the testing
 harness.

PiperOrigin-RevId: 216242862
---
 tensorflow/python/autograph/core/converter_testing.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/autograph/core/converter_testing.py b/tensorflow/python/autograph/core/converter_testing.py
index dc2d419d34..fcdbd0a82c 100644
--- a/tensorflow/python/autograph/core/converter_testing.py
+++ b/tensorflow/python/autograph/core/converter_testing.py
@@ -128,7 +128,13 @@ class TestCase(test.TestCase):
   @contextlib.contextmanager
   def converted(self, entity, converter_module, namespace, *tf_symbols):
     node, ctx = self.prepare(entity, namespace)
-    node = converter_module.transform(node, ctx)
+
+    if not isinstance(converter_module, (list, tuple)):
+      converter_module = (converter_module,)
+    for m in converter_module:
+      node = m.transform(node, ctx)
+      node = converter.standard_analysis(node, ctx, is_initial=True)
+
     with self.compiled(node, namespace, *tf_symbols) as result:
       yield result
 
-- 
GitLab


From eec9ca8f0baccd249a49046fe31b460903e44850 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 8 Oct 2018 13:50:12 -0700
Subject: [PATCH 542/570] Partial support tfe.defun in tf.gradients.

Doesn't attempt to deal with cases where we might have already generated
the functiondef for the parent function as in that case we cannot easily
modify the forward pass.

PiperOrigin-RevId: 216243224
---
 .../core/common_runtime/shape_refiner.cc      |  5 ++
 tensorflow/core/framework/shape_inference.cc  |  9 ++
 tensorflow/core/framework/shape_inference.h   |  9 +-
 tensorflow/core/graph/graph.cc                | 13 +++
 tensorflow/core/graph/graph.h                 |  5 ++
 tensorflow/core/graph/node_builder.cc         |  8 +-
 tensorflow/core/ops/resource_variable_ops.cc  |  3 +-
 tensorflow/python/eager/function.py           | 87 ++++++++++---------
 tensorflow/python/eager/function_test.py      | 18 +++-
 tensorflow/python/framework/op_def_library.py |  3 +-
 .../python/kernel_tests/cond_v2_test.py       |  1 +
 tensorflow/python/ops/custom_gradient.py      | 44 ++++++++++
 tensorflow/python/ops/gradients_impl.py       | 30 +++----
 tensorflow/python/ops/while_v2.py             |  3 +-
 14 files changed, 169 insertions(+), 69 deletions(-)

diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index fa4d1eda62..9488a44778 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -288,6 +288,11 @@ Status ShapeRefiner::SetShape(const Node* node, int output_port,
         "output_port '", output_port, "' is out of range, ", "node '",
         node->name(), "' has ", node->num_outputs(), " outputs");
   }
+  // Note: it's possible, if the node's been updated, that the shape inference
+  // context doesn't have the right number of outputs.
+  if (node->num_outputs() > c->num_outputs()) {
+    TF_RETURN_IF_ERROR(c->ExpandOutputs(node->num_outputs()));
+  }
 
   // Check compatibility, and merge the shapes.
   ShapeHandle existing_shape = c->output(output_port);
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 3e77028a5f..4dcc80680f 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -239,6 +239,15 @@ void InferenceContext::PreInputInit(
   output_handle_shapes_and_types_.resize(num_outputs);
 }
 
+Status InferenceContext::ExpandOutputs(int new_output_size) {
+  if (new_output_size < outputs_.size()) {
+    return errors::InvalidArgument("Trying to reduce number of outputs of op.");
+  }
+  outputs_.resize(new_output_size, nullptr);
+  output_handle_shapes_and_types_.resize(new_output_size);
+  return Status::OK();
+}
+
 void InferenceContext::PostInputInit(
     std::vector<std::unique_ptr<std::vector<ShapeAndType>>> input_handle_data) {
   int num_inputs_from_node_def = 0;
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index 81258b55b3..e3885b7d9e 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -323,13 +323,13 @@ class InferenceContext {
     return input_tensors_as_shapes_;
   }
 
-  ShapeHandle output(int64 idx) const { return outputs_[idx]; }
-  void set_output(int idx, ShapeHandle shape) { outputs_[idx] = shape; }
+  ShapeHandle output(int64 idx) const { return outputs_.at(idx); }
+  void set_output(int idx, ShapeHandle shape) { outputs_.at(idx) = shape; }
   Status set_output(StringPiece output_name,
                     const std::vector<ShapeHandle>& shapes);
 
   int num_outputs() const { return outputs_.size(); }
-  ShapeHandle output(int idx) const { return outputs_[idx]; }
+  ShapeHandle output(int idx) const { return outputs_.at(idx); }
   Status output(StringPiece output_name,
                 std::vector<ShapeHandle>* output) const;
 
@@ -645,6 +645,9 @@ class InferenceContext {
     return merged_dims_;
   }
 
+  // Adds new outputs; useful when mutating the graph.
+  Status ExpandOutputs(int new_output_size);
+
  private:
   // Creates and stores shapes for use in InferenceContext.
   class ShapeManager {
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 7a4a0096fa..6f068546d2 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -142,6 +142,19 @@ void Node::Clear() {
   assigned_device_name_index_ = 0;
 }
 
+void Node::UpdateProperties() {
+  DataTypeVector inputs;
+  DataTypeVector outputs;
+  Status status =
+      InOutTypesForNode(props_->node_def, *(props_->op_def), &inputs, &outputs);
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed at updating node: " << status;
+    return;
+  }
+  props_ = std::make_shared<NodeProperties>(props_->op_def, props_->node_def,
+                                            inputs, outputs);
+}
+
 const string& Node::name() const { return props_->node_def.name(); }
 const string& Node::type_string() const { return props_->node_def.op(); }
 const NodeDef& Node::def() const { return props_->node_def; }
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 2944951f82..228b1331d9 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -171,6 +171,7 @@ class Node {
   template <typename T>
   void AddAttr(const string& name, const T& val) {
     SetAttrValue(val, AddAttrHelper(name));
+    UpdateProperties();
   }
 
   void ClearAttr(const string& name);
@@ -211,6 +212,10 @@ class Node {
   // e.g. in AddAttr.
   void MaybeCopyOnWrite();
 
+  // Called after an attr has changed. Decides whether we need to update some
+  // property of the node (stored in props_).
+  void UpdateProperties();
+
   AttrValue* AddAttrHelper(const string& name);
 
   // A set of mutually exclusive classes for different kinds of nodes,
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index d92874909f..68a20fcc5f 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -140,10 +140,10 @@ void NodeBuilder::AddIndexError(const Node* node, int i) {
         strings::StrCat("Attempt to add nullptr Node to node with type ",
                         def_builder_.op_def().name()));
   } else {
-    errors_.emplace_back(
-        strings::StrCat("Attempt to add output ", i, " of ", node->name(),
-                        " not in range [0, ", node->num_outputs(),
-                        ") to node with type ", def_builder_.op_def().name()));
+    errors_.emplace_back(strings::StrCat(
+        "Attempt to add output ", i, " of ", node->name(), " not in range [0, ",
+        node->num_outputs(), ") to node with type ",
+        def_builder_.op_def().name(), ". Node: ", node->DebugString()));
   }
 }
 
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index adc9cd1486..65bdde375b 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -216,7 +216,8 @@ REGISTER_OP("VarIsInitializedOp")
 Status VariableShapeShapeFn(InferenceContext* c) {
   auto* handle_data = c->input_handle_shapes_and_types(0);
   if (handle_data == nullptr || handle_data->empty()) {
-    return errors::InvalidArgument("Handle doesn't have shape information.");
+    c->set_output(0, c->Vector(c->UnknownDim()));
+    return Status::OK();
   }
   ShapeHandle var_shape = (*handle_data)[0].shape;
   int64 rank = c->RankKnown(var_shape) ? c->Rank(var_shape)
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 93168826b1..99bf375ea7 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -46,6 +46,7 @@ from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import cond_v2_impl
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
@@ -81,49 +82,10 @@ def _create_substitute_placeholder(value, name=None, dtype=None):
   with ops.control_dependencies(None):
     placeholder = graph_placeholder(
         dtype=dtype or value.dtype, shape=value.shape, name=name)
-  _copy_handle_data(value, placeholder)
+  custom_gradient.copy_handle_data(value, placeholder)
   return placeholder
 
 
-def _copy_handle_data(source_t, target_t):
-  """Copies HandleData for variant and resource type tensors if available.
-
-  The CppShapeInferenceResult::HandleData proto contains information about the
-  shapes and types of the element tensors of resource/variant type tensors.
-  We need to copy this across function boundaries, i.e., when capturing a
-  placeholder or when returning a function tensor as output. If we don't do this
-  the element tensors will have unknown shapes, e.g., if a TensorList variant
-  tensor is captured as a placeholder, elements popped from that list would have
-  unknown shape.
-
-  Args:
-    source_t: The tensor to copy HandleData from.
-    target_t: The tensor to copy HandleData to.
-  """
-  if (target_t.dtype == dtypes_module.resource or
-      target_t.dtype == dtypes_module.variant):
-    if isinstance(source_t, ops.EagerTensor):
-      handle_data = source_t._handle_data  # pylint: disable=protected-access
-    else:
-      handle_data = resource_variable_ops.get_resource_handle_data(source_t)
-    if handle_data is not None and handle_data.is_set:
-      # pylint: disable=protected-access
-      pywrap_tensorflow.SetHandleShapeAndType(target_t.graph._c_graph,
-                                              target_t._as_tf_output(),
-                                              handle_data.SerializeToString())
-      # pylint: enable=protected-access
-      # Ensure that shapes and dtypes are propagated.
-      shapes, types = zip(*[(pair.shape, pair.dtype)
-                            for pair in handle_data.shape_and_type])
-      ranks = [len(s.dim) if not s.unknown_rank else -1 for s in shapes]
-      shapes = [[d.size for d in s.dim]
-                if not s.unknown_rank else None for s in shapes]
-      pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
-          target_t._op._graph._c_graph,  # pylint: disable=protected-access
-          target_t._as_tf_output(),  # pylint: disable=protected-access
-          shapes, ranks, types)
-
-
 def _get_device_functions(ctx, graph):
   """Returns a tuple of device functions representing the device stack."""
   if ctx.executing_eagerly():
@@ -547,7 +509,7 @@ class _EagerDefinedFunction(object):
       for i, shape in enumerate(self._output_shapes):
         outputs[i].set_shape(shape)
       for i, func_graph_output in enumerate(self._func_graph_outputs):
-        _copy_handle_data(func_graph_output, outputs[i])
+        custom_gradient.copy_handle_data(func_graph_output, outputs[i])
       return outputs
 
 
@@ -658,7 +620,48 @@ class Function(object):
     if tape.should_record(tensor_inputs) or tape.should_record(captures):
       return self._backprop_call(args)
 
-    outputs = self._inference_function.call(ctx, args)
+    # Only need to override the gradient in graph mode and when we have outputs.
+    if context.executing_eagerly() or not self.outputs:
+      outputs = self._inference_function.call(ctx, args)
+    else:
+      name = "PartitionedCall-%s" % ops.uid()
+
+      @ops.RegisterGradient(name)
+      def grad_fn(op, *doutputs):  # pylint: disable=unused-variable
+        """Gradients of this function."""
+        if op.graph is not ops.get_default_graph():
+          # TODO(apassos) this will still emit SymbolicGradient ops when
+          # nested defuns are being differentiated. We need to somehow figure
+          # out a way to update the FunctionDef corresponding to the calling
+          # function when mutating a call to the forward pass.
+          return gradients_impl._SymGrad(op, list(doutputs))  # pylint: disable=protected-access
+        if self._backward_graph_function is None:
+          self._construct_backprop_function()
+        self._forward_function.add_to_graph(op.graph)
+        func = attr_value_pb2.AttrValue(
+            func=attr_value_pb2.NameAttrList(
+                name=self._forward_function.name))
+        # pylint: disable=protected-access
+        op._set_attr("f", func)
+        types = attr_value_pb2.AttrValue.ListValue(
+            type=self._forward_function._output_types)
+        op._set_attr("Tout", attr_value_pb2.AttrValue(list=types))
+        for i in range(
+            len(outputs), len(self._forward_function._output_types)):
+          t = ops.Tensor(op, i, self._forward_function._output_types[i])
+          t.set_shape(self._forward_function._output_shapes[i])
+          func_graph_output = self._forward_function._func_graph_outputs[i]
+          custom_gradient.copy_handle_data(func_graph_output, t)
+          op._outputs.append(t)
+        # pylint: enable=protected-access
+        side_outputs = op.outputs[len(outputs):]
+        return self._backward_graph_function(
+            *(list(doutputs) + list(side_outputs)))
+
+      with ops.get_default_graph().gradient_override_map(
+          {"PartitionedCall": name}):
+        outputs = self._inference_function.call(ctx, args)
+
     return self._build_call_outputs(outputs)
 
   @property
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 57e545be69..e46bde098b 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -286,7 +286,23 @@ class FunctionTest(test.TestCase):
       c = constant_op.constant([[2.]])
       f_c = f(c)
       g, = gradients_impl.gradients(f_c, c)
-      self.assertAllEqual(sess.run(g), [[1.0]])
+      self.assertAllEqual(sess.run(g).values, [[1.0]])
+
+  def testNoSymGradNestedDefun(self):
+
+    @function.defun
+    def outer():
+
+      @function.defun
+      def f(x):
+        return array_ops.gather_nd(x, [[0]])
+
+      c = constant_op.constant([[2.]])
+      f_c = f(c)
+      g, = gradients_impl.gradients(f_c, c)
+      self.assertTrue(isinstance(g, ops.IndexedSlices))
+
+    outer()
 
   def testNestedInputsGraphFunction(self):
     matmul = function.defun(math_ops.matmul)
diff --git a/tensorflow/python/framework/op_def_library.py b/tensorflow/python/framework/op_def_library.py
index e85bba11cd..9955a9a2cd 100644
--- a/tensorflow/python/framework/op_def_library.py
+++ b/tensorflow/python/framework/op_def_library.py
@@ -482,7 +482,8 @@ class OpDefLibrary(object):
               else:
                 raise TypeError("%s that don't all match." % prefix)
             else:
-              raise TypeError("%s that are invalid." % prefix)
+              raise TypeError(
+                  "%s that are invalid. Tensors: %s" % (prefix, values))
 
           types = [x.dtype for x in values]
           inputs.extend(values)
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index ec875aae59..a424a0f219 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -153,6 +153,7 @@ class CondV2Test(test.TestCase):
         self.assertIn("foo_cond_1_false", ops.get_default_graph()._functions)
 
   def testDefunInCond(self):
+    self.skipTest("b/117293122")
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
 
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index d7834ba350..bfe23834b7 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -18,9 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape as tape_lib
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
@@ -33,6 +35,45 @@ from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
+def copy_handle_data(source_t, target_t):
+  """Copies HandleData for variant and resource type tensors if available.
+
+  The CppShapeInferenceResult::HandleData proto contains information about the
+  shapes and types of the element tensors of resource/variant type tensors.
+  We need to copy this across function boundaries, i.e., when capturing a
+  placeholder or when returning a function tensor as output. If we don't do this
+  the element tensors will have unknown shapes, e.g., if a TensorList variant
+  tensor is captured as a placeholder, elements popped from that list would have
+  unknown shape.
+
+  Args:
+    source_t: The tensor to copy HandleData from.
+    target_t: The tensor to copy HandleData to.
+  """
+  if (target_t.dtype == dtypes.resource or
+      target_t.dtype == dtypes.variant):
+    if isinstance(source_t, ops.EagerTensor):
+      handle_data = source_t._handle_data  # pylint: disable=protected-access
+    else:
+      handle_data = resource_variable_ops.get_resource_handle_data(source_t)
+    if handle_data is not None and handle_data.is_set:
+      # pylint: disable=protected-access
+      pywrap_tensorflow.SetHandleShapeAndType(target_t.graph._c_graph,
+                                              target_t._as_tf_output(),
+                                              handle_data.SerializeToString())
+      # pylint: enable=protected-access
+      # Ensure that shapes and dtypes are propagated.
+      shapes, types = zip(*[(pair.shape, pair.dtype)
+                            for pair in handle_data.shape_and_type])
+      ranks = [len(s.dim) if not s.unknown_rank else -1 for s in shapes]
+      shapes = [[d.size for d in s.dim]
+                if not s.unknown_rank else None for s in shapes]
+      pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
+          target_t._op._graph._c_graph,  # pylint: disable=protected-access
+          target_t._as_tf_output(),  # pylint: disable=protected-access
+          shapes, ranks, types)
+
+
 @tf_export("custom_gradient")
 def custom_gradient(f):
   """Decorator to define a function with a custom gradient.
@@ -180,8 +221,11 @@ def _graph_mode_decorator(f, *args, **kwargs):
     input_grads = nest.flatten(input_grads)
     return ([None] * len(flat_result)) + input_grads + variable_grads
 
+  original_tensors = all_tensors
   with ops.get_default_graph().gradient_override_map({"IdentityN": name}):
     all_tensors = array_ops.identity_n(all_tensors)
+  for ot, t in zip(original_tensors, all_tensors):
+    copy_handle_data(ot, t)
   return nest.pack_sequence_as(
       structure=result, flat_sequence=all_tensors[:len(flat_result)])
 
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index aac95037dc..6909fcaed5 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -800,23 +800,21 @@ def _GradientsHelper(ys,
         # pylint: enable=protected-access
         has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
         if has_out_grads and (op not in stop_ops):
-          if is_func_call:
-            if is_partitioned_call:
-              func_call = src_graph._get_function(  # pylint: disable=protected-access
-                  compat.as_bytes(op.get_attr("f").name))
+          try:
+            grad_fn = ops.get_gradient_function(op)
+          except LookupError:
+            if is_func_call:
+              if is_partitioned_call:
+                func_call = src_graph._get_function(  # pylint: disable=protected-access
+                    compat.as_bytes(op.get_attr("f").name))
+              else:
+                func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
+              # Note that __defun is not set if the graph is
+              # imported. If it's set, we prefer to access the original
+              # defun.
+              func_call = getattr(op, "__defun", func_call)
+              grad_fn = func_call.python_grad_func
             else:
-              func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
-            # Note that __defun is not set if the graph is
-            # imported. If it's set, we prefer to access the original
-            # defun.
-            func_call = getattr(op, "__defun", func_call)
-            grad_fn = func_call.python_grad_func
-          else:
-            # A grad_fn must be defined, either as a function or as None
-            # for ops that do not have gradients.
-            try:
-              grad_fn = ops.get_gradient_function(op)
-            except LookupError:
               raise LookupError(
                   "No gradient defined for operation '%s' (op type: %s)" %
                   (op.name, op.type))
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index 8e88a84d60..0419656143 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -37,6 +37,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import cond_v2_impl as cond_v2
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
@@ -580,7 +581,7 @@ def _check_shapes_compat(output_tensors, shape_invariants, input_tensors):
 
 def _copy_handle_data(src_tensors, tgt_tensors):
   for src_t, tgt_t in zip(src_tensors, tgt_tensors):
-    function._copy_handle_data(src_t, tgt_t)
+    custom_gradient.copy_handle_data(src_t, tgt_t)
 
 
 # TODO(srbs): Move to common utils for cond_v2 and while_v2.
-- 
GitLab


From 13b47e6c4f9d7b295948b1057139bf676e394b6f Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 8 Oct 2018 14:16:55 -0700
Subject: [PATCH 543/570] Automated rollback of commit
 295b3c80555cc82d8d70faf96a47681e1d904b9c

PiperOrigin-RevId: 216247929
---
 tensorflow/core/kernels/data/iterator_ops.cc  |  4 ---
 .../kernels/data/map_and_batch_dataset_op.cc  |  9 ++++---
 .../core/kernels/data/model_dataset_op.cc     | 10 ++++---
 .../data/parallel_interleave_dataset_op.cc    | 27 +++++++++++--------
 .../kernels/data/parallel_map_iterator.cc     |  9 ++++---
 .../core/kernels/data/prefetch_dataset_op.cc  | 10 ++++---
 tensorflow/core/kernels/data/writer_ops.cc    | 12 ++++-----
 7 files changed, 44 insertions(+), 37 deletions(-)

diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 7a833668ac..8acd6cc724 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -16,10 +16,8 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/common_runtime/renamed_device.h"
-#include "tensorflow/core/common_runtime/threadpool_device.h"
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/resource_op_kernel.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
@@ -27,13 +25,11 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/data/optional_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 namespace data {
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index f45a239793..0fb721cd7c 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -445,9 +445,10 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
           auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
-          runner_thread_.reset(ctx->env()->StartThread(
-              {}, "runner_thread",
-              std::bind(&Iterator::RunnerThread, this, ctx_copy)));
+          runner_thread_ =
+              MakeUnique<BackgroundWorker>(ctx->env(), "runner_thread");
+          runner_thread_->Schedule(
+              std::bind(&Iterator::RunnerThread, this, ctx_copy));
         }
       }
 
@@ -703,7 +704,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_;
       // Buffer for storing the (intermediate) batch results.
       std::deque<std::shared_ptr<BatchResult>> batch_results_ GUARDED_BY(*mu_);
-      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+      std::unique_ptr<BackgroundWorker> runner_thread_ GUARDED_BY(*mu_);
       bool cancelled_ GUARDED_BY(*mu_) = false;
     };
 
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc
index 9aa505f4f1..859df57962 100644
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -126,9 +127,10 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         if (!optimize_thread_) {
           std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-          optimize_thread_.reset(ctx->env()->StartThread(
-              {}, "optimize_thread",
-              [this, new_ctx]() { OptimizeThread(new_ctx); }));
+          optimize_thread_ =
+              MakeUnique<BackgroundWorker>(ctx->env(), "optimize_thread");
+          optimize_thread_->Schedule(
+              [this, new_ctx]() { OptimizeThread(new_ctx); });
         }
         return Status::OK();
       }
@@ -167,7 +169,7 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
       mutex mu_;
       condition_variable cond_var_;
       std::shared_ptr<model::Model> model_;
-      std::unique_ptr<Thread> optimize_thread_ GUARDED_BY(mu_);
+      std::unique_ptr<BackgroundWorker> optimize_thread_ GUARDED_BY(mu_);
       bool cancelled_ GUARDED_BY(mu_) = false;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
     };
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 6b6b3d6ab9..9c836b836e 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -481,9 +482,10 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
           worker_threads_.reserve(dataset()->num_threads());
           for (size_t i = 0; i < dataset()->num_threads(); ++i) {
             std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, "worker_thread",
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
+            worker_threads_.emplace_back(
+                MakeUnique<BackgroundWorker>(ctx->env(), "worker_thread"));
+            worker_threads_.back()->Schedule(
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); });
           }
         }
         return Status::OK();
@@ -580,9 +582,10 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
             }
             workers_[i].SetInputs(s, std::move(args));
             std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, "worker_thread",
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
+            worker_threads_.emplace_back(
+                MakeUnique<BackgroundWorker>(ctx->env(), "worker_thread"));
+            worker_threads_.back()->Schedule(
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); });
             if (i < dataset()->cycle_length_) {
               interleave_indices_.push_back(i);
             } else {
@@ -1047,7 +1050,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       // The worker threads. This must be last to ensure the
       // threads have exited before any other members are deallocated.
       // TODO(b/65178177): Avoid allocating additional threads.
-      std::vector<std::unique_ptr<Thread>> worker_threads_ GUARDED_BY(mu_);
+      std::vector<std::unique_ptr<BackgroundWorker>> worker_threads_
+          GUARDED_BY(mu_);
     };
 
     const DatasetBase* const input_;
@@ -1389,9 +1393,10 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
           std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-          runner_thread_.reset(ctx->env()->StartThread(
-              {}, "runner_thread",
-              [this, new_ctx]() { RunnerThread(new_ctx); }));
+          runner_thread_ =
+              MakeUnique<BackgroundWorker>(ctx->env(), "runner_thread");
+          runner_thread_->Schedule(
+              [this, new_ctx]() { RunnerThread(new_ctx); });
         }
       }
 
@@ -1645,7 +1650,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       int64 num_calls_ GUARDED_BY(*mu_) = 0;
 
       std::unique_ptr<thread::ThreadPool> thread_pool_;
-      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+      std::unique_ptr<BackgroundWorker> runner_thread_ GUARDED_BY(*mu_);
 
       // Identifies whether background activity should be cancelled.
       bool cancelled_ GUARDED_BY(*mu_) = false;
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index ebf41925c9..e69274e4f2 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -181,9 +181,10 @@ class ParallelMapIterator : public DatasetBaseIterator {
       EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
     if (!runner_thread_) {
       auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
-      runner_thread_.reset(ctx->env()->StartThread(
-          {}, "runner_thread",
-          std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy)));
+      runner_thread_ =
+          MakeUnique<BackgroundWorker>(ctx->env(), "runner_thread");
+      runner_thread_->Schedule(
+          std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy));
     }
   }
 
@@ -331,7 +332,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   // Buffer for storing the invocation results.
   std::deque<std::shared_ptr<InvocationResult>> invocation_results_
       GUARDED_BY(*mu_);
-  std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+  std::unique_ptr<BackgroundWorker> runner_thread_ GUARDED_BY(*mu_);
   bool cancelled_ GUARDED_BY(*mu_) = false;
 };
 
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 754ed772db..e9c38eb8a0 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -256,10 +257,11 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     Status EnsurePrefetchThreadStarted(IteratorContext* ctx)
         EXCLUSIVE_LOCKS_REQUIRED(mu_) {
       if (!prefetch_thread_) {
+        prefetch_thread_ =
+            MakeUnique<BackgroundWorker>(ctx->env(), "prefetch_thread");
         std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-        prefetch_thread_.reset(ctx->env()->StartThread(
-            {}, "prefetch_thread",
-            [this, new_ctx]() { PrefetchThread(new_ctx); }));
+        prefetch_thread_->Schedule(
+            [this, new_ctx]() { PrefetchThread(new_ctx); });
       }
       return Status::OK();
     }
@@ -363,7 +365,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     string prefix_end_;
     PrefetchAutotuner auto_tuner_ GUARDED_BY(mu_);
     std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
-    std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
+    std::unique_ptr<BackgroundWorker> prefetch_thread_ GUARDED_BY(mu_);
     bool cancelled_ GUARDED_BY(mu_) = false;
     bool prefetch_thread_finished_ GUARDED_BY(mu_) = false;
   };
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc
index 3f76695bb1..7bb2077b62 100644
--- a/tensorflow/core/kernels/data/writer_ops.cc
+++ b/tensorflow/core/kernels/data/writer_ops.cc
@@ -29,10 +29,10 @@ class ToTFRecordOp : public AsyncOpKernel {
  public:
   explicit ToTFRecordOp(OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx),
-        thread_pool_(new thread::ThreadPool(
-            ctx->env(), ThreadOptions(),
-            strings::StrCat("to_tf_record__op_", SanitizeThreadSuffix(name())),
-            1 /* num_threads */, false /* low_latency_hint */)) {}
+        background_worker_(
+            ctx->env(),
+            strings::StrCat("to_tf_record_op_", SanitizeThreadSuffix(name()))) {
+  }
 
   template <typename T>
   Status ParseScalarArgument(OpKernelContext* ctx,
@@ -50,7 +50,7 @@ class ToTFRecordOp : public AsyncOpKernel {
     // The call to `iterator->GetNext()` may block and depend on an
     // inter-op thread pool thread, so we issue the call from the
     // owned thread pool.
-    thread_pool_->Schedule([this, ctx, done]() {
+    background_worker_.Schedule([this, ctx, done]() {
       string filename;
       OP_REQUIRES_OK_ASYNC(
           ctx, ParseScalarArgument<string>(ctx, "filename", &filename), done);
@@ -97,7 +97,7 @@ class ToTFRecordOp : public AsyncOpKernel {
   }
 
  private:
-  std::unique_ptr<thread::ThreadPool> thread_pool_;
+  BackgroundWorker background_worker_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("DatasetToTFRecord").Device(DEVICE_CPU),
-- 
GitLab


From 09b0fc199129e0f487a39741bdf674cf09035cbc Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 8 Oct 2018 14:17:24 -0700
Subject: [PATCH 544/570] [tf.data] Choose non-deterministic seed once per
 Python-level `Dataset` object.

This changes the behavior of randomness-introducing datasets (`tf.data.Dataset.shuffle()`, `tf.data.experimental.shuffle_and_repeat()`, and `tf.data.experimental.RandomDataset`). Previously, when you used the same `tf.data.Dataset` object multiple times in a pipeline (e.g. by zipping two datasets derived from the same randomness-introducing dataset) *and* you did not specify an explicit `seed`, the implementation would choose different non-deterministic seeds for each use of the `Dataset` object.

With this change, the seed will be chosen once per `Dataset` (technically, once per `Dataset`-`Graph` combination, due to the vagaries of capturing state in `Dataset.make_one_shot_iterator()`), which means that all uses of the same dataset object will observe the same sequence of values.

This change also revealed a small bug in how `Dataset.shuffle(..., reshuffle_each_iteration=False)` is serialized when an explicit seed is specified. The op-level seed was dropped, which could lead to non-deterministic behavior. This change fixes that issue by forwarding the op-level seed to the appropriate place.

PiperOrigin-RevId: 216248013
---
 .../core/kernels/data/shuffle_dataset_op.cc   |  2 +-
 .../data/experimental/kernel_tests/BUILD      | 13 ++++++
 .../kernel_tests/random_dataset_test.py       | 45 +++++++++++++++++++
 .../kernel_tests/shuffle_and_repeat_test.py   | 21 ++++++++-
 .../data/experimental/ops/random_ops.py       | 21 +++++++--
 .../data/experimental/ops/shuffle_ops.py      | 21 +++++++--
 tensorflow/python/data/kernel_tests/BUILD     |  1 +
 .../kernel_tests/shuffle_dataset_op_test.py   | 25 ++++++++++-
 tensorflow/python/data/ops/dataset_ops.py     | 22 +++++++--
 tensorflow/python/data/util/BUILD             |  1 +
 tensorflow/python/data/util/random_seed.py    |  5 ++-
 .../python/data/util/random_seed_test.py      | 13 +++++-
 12 files changed, 174 insertions(+), 16 deletions(-)
 create mode 100644 tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py

diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index 66466d6a36..9f54c381a9 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -485,7 +485,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
                      int64 buffer_size, int64 seed, int64 seed2, int64 count)
         : ShuffleDatasetBase(ctx, input, buffer_size, count),
           seed_(seed),
-          seed2_(seed) {}
+          seed2_(seed2) {}
 
     string DebugString() const override {
       return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index 4eef9580ad..a67f6ff031 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -453,6 +453,18 @@ cuda_py_test(
     tags = ["no_windows_gpu"],
 )
 
+py_test(
+    name = "random_dataset_test",
+    srcs = ["random_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/data/experimental/ops:random_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_library(
     name = "reader_dataset_ops_test_base",
     testonly = 1,
@@ -562,6 +574,7 @@ py_test(
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py
new file mode 100644
index 0000000000..d403a575ec
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py
@@ -0,0 +1,45 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.RandomDataset()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import random_ops
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+
+
+class RandomDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ("NoSeed", None),
+      ("WithSeed", 42),
+  )
+  def testZipRandomDataset(self, seed):
+    dataset = random_ops.RandomDataset(seed=seed).take(30)
+    dataset = dataset_ops.Dataset.zip((dataset, dataset))
+    iterator = dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for _ in range(30):
+        x, y = sess.run(next_element)
+        self.assertEqual(x, y)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
index c208963a86..883169495f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import shuffle_ops
@@ -27,7 +28,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
 
 
-class ShuffleAndRepeatTest(test_base.DatasetTestBase):
+class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _build_ds(self, seed, count=5, num_elements=20):
     return dataset_ops.Dataset.range(num_elements).apply(
@@ -110,6 +111,24 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
       with self.session(graph=g) as sess:
         sess.run(get_next_op)
 
+  @parameterized.named_parameters(
+      ("NoSeed", None),
+      ("WithSeed", 42),
+  )
+  def testShuffleAndRepeatAndZipDataset(self, seed):
+    dataset = dataset_ops.Dataset.range(10).apply(
+        shuffle_ops.shuffle_and_repeat(10, count=3, seed=seed))
+    dataset = dataset_ops.Dataset.zip((dataset, dataset))
+    iterator = dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for _ in range(30):
+        x, y = sess.run(next_element)
+        self.assertEqual(x, y)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/ops/random_ops.py b/tensorflow/python/data/experimental/ops/random_ops.py
index e3a2aeab31..25d7fbf691 100644
--- a/tensorflow/python/data/experimental/ops/random_ops.py
+++ b/tensorflow/python/data/experimental/ops/random_ops.py
@@ -33,13 +33,26 @@ class RandomDataset(dataset_ops.DatasetSource):
   def __init__(self, seed=None):
     """A `Dataset` of pseudorandom values."""
     super(RandomDataset, self).__init__()
-    self._seed, self._seed2 = random_seed.get_seed(seed)
+
+    # NOTE(mrry): We generate the seed-pair once per graph in which the dataset
+    # is iterated over, and cache it in `self._graph_seed_map`. This supports
+    # two features: iterating over the same `ShuffleDataset` twice in the same
+    # pipeline and observing the same order (by tying the seeds together with
+    # a randomly-generated seed), and using `Dataset.make_one_shot_iterator()`,
+    # which requires the stateful RNG op to be created inside the same graph as
+    # the dataset.
+    self._original_seed = seed
+    self._graph_seed_map = {}
 
   def _as_variant_tensor(self):
+    try:
+      seed, seed2 = self._graph_seed_map[ops.get_default_graph()]
+    except KeyError:
+      seed, seed2 = random_seed.get_seed(self._original_seed)
+      self._graph_seed_map[ops.get_default_graph()] = (seed, seed2)
+
     return gen_dataset_ops.random_dataset(
-        seed=self._seed,
-        seed2=self._seed2,
-        **dataset_ops.flat_structure(self))
+        seed=seed, seed2=seed2, **dataset_ops.flat_structure(self))
 
   @property
   def output_classes(self):
diff --git a/tensorflow/python/data/experimental/ops/shuffle_ops.py b/tensorflow/python/data/experimental/ops/shuffle_ops.py
index a4307212da..a82e4b7d09 100644
--- a/tensorflow/python/data/experimental/ops/shuffle_ops.py
+++ b/tensorflow/python/data/experimental/ops/shuffle_ops.py
@@ -39,17 +39,32 @@ class _ShuffleAndRepeatDataset(dataset_ops.UnaryDataset):
     else:
       self._count = ops.convert_to_tensor(
           count, dtype=dtypes.int64, name="count")
-    self._seed, self._seed2 = random_seed.get_seed(seed)
+
+    # NOTE(mrry): We generate the seed-pair once per graph in which the dataset
+    # is iterated over, and cache it in `self._graph_seed_map`. This supports
+    # two features: iterating over the same `ShuffleDataset` twice in the same
+    # pipeline and observing the same order (by tying the seeds together with
+    # a randomly-generated seed), and using `Dataset.make_one_shot_iterator()`,
+    # which requires the stateful RNG op to be created inside the same graph as
+    # the dataset.
+    self._original_seed = seed
+    self._graph_seed_map = {}
 
   def _as_variant_tensor(self):
+    try:
+      seed, seed2 = self._graph_seed_map[ops.get_default_graph()]
+    except KeyError:
+      seed, seed2 = random_seed.get_seed(self._original_seed)
+      self._graph_seed_map[ops.get_default_graph()] = (seed, seed2)
+
     # pylint: disable=protected-access
     input_resource = self._input_dataset._as_variant_tensor()
     return gen_dataset_ops.shuffle_and_repeat_dataset(
         input_resource,
         buffer_size=self._buffer_size,
         count=self._count,
-        seed=self._seed,
-        seed2=self._seed2,
+        seed=seed,
+        seed2=seed2,
         **dataset_ops.flat_structure(self))
     # pylint: enable=protected-access
 
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index c7295d6e69..ecb24103b3 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -443,6 +443,7 @@ tf_py_test(
     srcs = ["shuffle_dataset_op_test.py"],
     additional_deps = [
         ":test_base",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
index 347af18576..6001721726 100644
--- a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import collections
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.data.kernel_tests import test_base
@@ -31,7 +32,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class ShuffleDatasetTest(test_base.DatasetTestBase):
+class ShuffleDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def testShuffleDataset(self):
     components = (
@@ -209,5 +210,27 @@ class ShuffleDatasetTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  @parameterized.named_parameters(
+      ("ReshuffleEachIterationNoSeed", None, True),
+      ("ReshuffleEachIterationWithSeed", 42, True),
+      ("NoReshuffleEachIterationNoSeed", None, False),
+      ("NoReshuffleEachIterationWithSeed", 42, False),
+  )
+  def testShuffleAndZipDataset(self, seed, reshuffle):
+    dataset = (dataset_ops.Dataset.range(10)
+               .shuffle(10, seed=seed, reshuffle_each_iteration=reshuffle)
+               .repeat(3))
+    dataset = dataset_ops.Dataset.zip((dataset, dataset))
+    iterator = dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for _ in range(30):
+        x, y = sess.run(next_element)
+        self.assertEqual(x, y)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index b7e19055f2..2d036fd0d6 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -2254,18 +2254,34 @@ class ShuffleDataset(UnaryDataset):
     self._input_dataset = input_dataset
     self._buffer_size = ops.convert_to_tensor(
         buffer_size, dtype=dtypes.int64, name="buffer_size")
-    self._seed, self._seed2 = random_seed.get_seed(seed)
+
+    # NOTE(mrry): We generate the seed-pair once per graph in which the dataset
+    # is iterated over, and cache it in `self._graph_seed_map`. This supports
+    # two features: iterating over the same `ShuffleDataset` twice in the same
+    # pipeline and observing the same order (by tying the seeds together with
+    # a randomly-generated seed), and using `Dataset.make_one_shot_iterator()`,
+    # which requires the stateful RNG op to be created inside the same graph as
+    # the dataset.
+    self._original_seed = seed
+    self._graph_seed_map = {}
+
     if reshuffle_each_iteration is None:
       self._reshuffle_each_iteration = True
     else:
       self._reshuffle_each_iteration = reshuffle_each_iteration
 
   def _as_variant_tensor(self):
+    try:
+      seed, seed2 = self._graph_seed_map[ops.get_default_graph()]
+    except KeyError:
+      seed, seed2 = random_seed.get_seed(self._original_seed)
+      self._graph_seed_map[ops.get_default_graph()] = (seed, seed2)
+
     return gen_dataset_ops.shuffle_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         buffer_size=self._buffer_size,
-        seed=self._seed,
-        seed2=self._seed2,
+        seed=seed,
+        seed2=seed2,
         reshuffle_each_iteration=self._reshuffle_each_iteration,
         **flat_structure(self))
 
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index 39082ce370..95bf3209d7 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -142,6 +142,7 @@ py_test(
         ":random_seed",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:random_ops",
         "//tensorflow/python:util",
     ],
 )
diff --git a/tensorflow/python/data/util/random_seed.py b/tensorflow/python/data/util/random_seed.py
index d5169f7a53..d24df6d957 100644
--- a/tensorflow/python/data/util/random_seed.py
+++ b/tensorflow/python/data/util/random_seed.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 
 
 def get_seed(seed):
@@ -37,7 +38,7 @@ def get_seed(seed):
 
   Returns:
     A tuple of two `tf.int64` scalar tensors that should be used for the local
-    seed of the calling dataset.
+    seeds of the calling dataset.
   """
   seed, seed2 = random_seed.get_seed(seed)
   if seed is None:
@@ -45,7 +46,7 @@ def get_seed(seed):
   else:
     seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
   if seed2 is None:
-    seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
+    seed2 = random_ops.random_uniform([], 1, 2**63 - 1, dtype=dtypes.int64)
   else:
     with ops.name_scope("seed2") as scope:
       seed2 = ops.convert_to_tensor(seed2, dtype=dtypes.int64)
diff --git a/tensorflow/python/data/util/random_seed_test.py b/tensorflow/python/data/util/random_seed_test.py
index a809151e6e..5df2e38c62 100644
--- a/tensorflow/python/data/util/random_seed_test.py
+++ b/tensorflow/python/data/util/random_seed_test.py
@@ -41,7 +41,6 @@ class RandomSeedTest(test.TestCase):
         # (input_graph_seed, input_op_seed)
         # and output from get_seed:
         # (output_graph_seed, output_op_seed)
-        ((None, None), (0, 0)),
         ((None, 1), (random_seed.DEFAULT_GRAPH_SEED, 1)),
         ((1, 1), (1, 1)),
         ((0, 0), (0, 2**31 - 1)),  # Avoid nondeterministic (0, 0) output
@@ -78,6 +77,18 @@ class RandomSeedTest(test.TestCase):
       self.assertEqual((g_seed, op_seed), toutput, msg=msg)
       random_seed.set_random_seed(None)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testNondeterministicRandomSeed(self):
+    random_seed.set_random_seed(None)
+    op_seeds = []
+    for _ in range(50):
+      g_seed, op_seed = data_random_seed.get_seed(None)
+      g_seed = self.evaluate(g_seed)
+      op_seed = self.evaluate(op_seed)
+      self.assertEqual(0, g_seed)
+      self.assertNotEqual(0, op_seed)
+      op_seeds.append(op_seed)
+    self.assertGreater(len(set(op_seeds)), 1)
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From bc5635dc3ac78007caee88fabd81d23ad945b637 Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Mon, 8 Oct 2018 14:19:49 -0700
Subject: [PATCH 545/570] Update performance documentation.

PiperOrigin-RevId: 216248418
---
 .../performance/model_size_vs_accuracy.png    | Bin 0 -> 18946 bytes
 .../performance/model_size_vs_latency.png     | Bin 0 -> 21380 bytes
 tensorflow/contrib/lite/g3doc/performance.md  |  21 ++++++++++++------
 3 files changed, 14 insertions(+), 7 deletions(-)
 create mode 100644 tensorflow/contrib/lite/g3doc/images/performance/model_size_vs_accuracy.png
 create mode 100644 tensorflow/contrib/lite/g3doc/images/performance/model_size_vs_latency.png

diff --git a/tensorflow/contrib/lite/g3doc/images/performance/model_size_vs_accuracy.png b/tensorflow/contrib/lite/g3doc/images/performance/model_size_vs_accuracy.png
new file mode 100644
index 0000000000000000000000000000000000000000..44d0ccd3128dea1c947e57ccbc4e18b2d34cef88
GIT binary patch
literal 18946
zcmeAS@N?(olHy`uVBq!ia0y~yU`l6TV0gyC#=yWZ;h>T$0|Ns~v6E*A2L}g74M$1`
z0|NtRfk$L90|U1Z2s2)~Tla^7fx)uGHKHUqKdq!Zu_%=xH?gE3C%+^oGfAN=wWv5V
zKTp9(&q&W$M<Ju6q`*pFAE7`mzbIW_S;U2vfkA=6)5S5QV$Pepl`)}LfBpD)J~4=e
zaj%7fmR}3AOg~45vBmV@S;BqhzQOy`?#_NT;n@tSn^P}cG7&XpJ+o)gGE3EE&txnf
zwlpz62=#CjNV&D&{>R2{3l6>mg7dz=KI;8FuKNAgIqk|e>$ZNkl5!0cX%TSZP;7BD
z(0a)LW*(Wp#6lP(WS|f<hf}ddz)3<t^Bh=;r$ZziBqPA)(q#;i;b2xeI#a-D{fZSD
zCQ`hTl9H@)uiwA-pKX>qZ}t3*$;XZI@9lYR70#jPrL1|bMT7O#m6gpiXG#_p7Vi9J
zo_&p{*UdR3B;+K^xBqM7_RebKl@8jN<hm+!^)c!EJqH)N_cKInOky=oKc{o?$2>I^
z6_#GN!|C&D!!9l4P&~x}GEw!_wYA<34h_3%|NW_yv#*P35=^x&c_DClxxc!CLW6t1
zT;a=0s%(5R5tkB0Z|y35y&`6((d+B$^}p@oHqO#)(WqTwA-rgcgjr6+I;Nz%yGnWF
z?QE_aa&mI=$lJ})3SIT!!Gi;TettfA>J*cLvGHWj$!dw&+1`7rzW#W*eEz37)>&6H
z-n@CUVf*&<V?B~7X=!2-*H;EF&$zis_3SKD<^@GhPxVL`Hg$<=GkrK;|8KISaT>#g
z&FTI+y1G~Om6etM{Qb+gqvog4T<h{>MNdyzSXyqZ{G7J;FDonS!)MQ?W#6Bq>iy#C
zYH^0$Wp5W1K0X$*CPJ{fx>`&>ZchH2lP6E++}zZ<e*eE&+1J+<{`&GV=B<05%)-#s
zVK;8v2v{AaYkF<U6cM{WADm@mWHxNuw#;|7*^(to1cZc~&dxF|j$Q0G*Xqsv{qhYr
zH>Y1-A8+5rC)+i3s_4Ie|K`|KPMW40os^v1Ja?|Fu$qs8uCA}`|3^o=v*JMU)5OZX
zqx!p^s8)!Aon4&myYKhwmwQdsTC#lk^WRmi0&R|6#)@8?{{H?#!ooXCUJA{eIg^Er
z&8XzXg_z@ezTc}>QBf&)e2kY**6PTwudgF_m+3OB4qKb_{M_7@wzi^QUoyAl+?@3K
zPW`{n^JmPMAs{H|n317T{r%n3-S78l8>|!4joOfYUM}tItfLnfyH|dC;<@kmyBiyu
zmn>14G-=Y0hwbt!;`Um}T9>I<TFyK-*V?$^L&CeeyWJBK6doV%SGTdTId59^?aj%l
z+TlCO-p<;Ue|ecNkDQG}L*Qb!k8ii%|McRbvR=#%hju>MPrd1{udR)Uin_GFzJ6EP
z+f%zrUtifa+dTi%uh;88->ZIK`2Sz+p+ko*yewhxnPt*xoPO@Z?fm^8kIUCTc>TJ2
zV&KENJ8bOipx~9Wt2r@2(b>SvEbZhZ)gwob7QVS*$R}fQ!2Es<^MS?g{V%Sp6u!Hw
zR9Q`}t)qj(s_c!&*H>3Re)@DO^g!N?4ULY?Y>8P}Q7bH31XMi)o%Vc^k&&5F`}>>b
z=Vxbwm-)?I_2%B*=$BVkPM$Px-Z}=al$0ftCQe*9W7aG!as9ZE^U)iVS}*OZt#%C*
zIePncbmYF8owv57pO<3|xODmQ)ZOLpwbIhkPCY)}fAyR1Y%|TcxVWxG8mDe;&7OK~
zZS-^L`|DzNPm!~&(ztf*T32gp>!<bgf4xthI1wP?8tB#|v9RXP$KyNu_7^?v3R>zV
z`t-xY!(EFsyb=-?Tw3NkJLY(gq_Nvf+v;ui{<ZN)E;=*Q*ga6hb!XMrRY{qdD^=9h
zwYwH+Y|XkF6t*T}p^A#illd3^uTMVSck1{1{p<I|MMp=g>gxIiinvB@NND`@Y<B)L
zOLb%8V9oe_dxXTqrY(Q_;9zst-o3T!YAx?uy9Q2)+?=MFdwW}LmXG%|oxtCBw`5LU
zbou2MIdKVz6<e~ddIgHOg516;boH`17KM*iUKewcV3V{IUL+!EmLoCSEcej;`}XIj
zq{YO@ynOl6!op(3Zy6bxj=nxN1vRy{sI6H~vrZR&e&&1W^5u=2Hy=KBj7?fv`d;n#
z*mrk#hp*TXSO2%P?9GjX^7Vfl3knSW{rmT`Z<nfy%7xwK`OnVJUvI})_4%2v-yDmD
z-*S_a4<GB5-dX*9-MwrcIh%;R|MM;{^WF3F+3b)N0gMgu^?xSD?k;=y;DN*MZ*MKj
z-psg{eP>_o>}Ga;qm&Z@3{~IX`DSIUvbtOS`&;g->+9z~-?L+fMNn|?*KMN5;`g&N
zgs+cV>fSFGvOZ2$TU&dX|NM1!KPJZWSbl74Zx@!7Jh`v-_l9lTjvYSycxRH3lZZa3
zTy>R^ky#PFT@O_3<lo=7^Hxr7uA!;v(`RRAzuFhQJufjopTA-5T-ojU_s?COwm58U
z)D-RTz8yO(KxzDNJHMxo&yz<-yU&ZvpEyxa*1Bwob30$mjsivh>aZ}eW_JE#w{Ooj
zOg{GH)&APw-?HxW%iF1FYIZ(7J^l6D<;Bm>y$VToaA>ev%K6~kJHLVggYWO|K0a%H
zKjlCJW7YbNiHF%XZQAtW=4SP)tHYPa?X6m}Y}u;d*;})&KKk|bb>+FuTemK)`T42p
zwvLWYk?oR+GiRRs`}=#b?Ba_VMY;Xb=6!9v(nW7?nF<LDM{Y=9>{_(K@Xn4x=lnlU
zPft%j(7;&p|L^yl8ygxE54XLv_2Wpa0M`+bd#g<M|M|o%CMLG?o3ymFal!!xyMI5D
z<!q~toSkj{^L+ik#rLM0B^+Sb_v@ARyE{7%H?#9cY{{5dwQlZQ+1Onr4-fO(s~8$i
zT(d@JUHtxY3!U4!deki}W*j)+AZJt2ut)<`=sLIaJ$(6c=JOl<_Wx!)d6Kg4&!^Kd
zzpKBzICyn+_{|N8%r9TQT(W$5@vAGEda=8fZ1)ci4*vE1{rM*+C;xohZ~x-XPGg45
z%*>oSJ0`ZYu=x1;Zrrr#)c3dzrLV6UW?$1ebm-8P^>YHZ|M*eS#v^&?-{0Ri_f#57
zN=hC%eq2~gtZm{%L1p*8C-?vVGdDCeOi4>CdUL~Yo=xSU?Dcz>_3c}_e7SM`zdy_G
zHM8IRJ~c8jGBGpr<kQpBi$#~%R((-0GZVA7w+AU-?A~ut_^9QUUCobzKY#z;+*i9>
zB;9F`6F4MS#_zA2w_LPXCw7+zH#hgItE=5-o8?N_R2V3`_pu~=d2zAi?X9WHd}lv;
z^=el7lia_*zAjn1^x}@fWUbIuF8k~LcJ%h@8qD*Ws<koes+Nqb?8Z%-l3rX`=rhkI
za<1N=)eQ{{ee(8xOTDIEQ8zQsyW?@ROZ4XMa{b1}#w#lVn_pdBZJc^)%5pKYW_JFx
z|Ns7ed4FGCSXj99e;cp#is0pZVPRowA~rVlN}E5k{Ql$P<2!pQC;$Jw|Nr8uudiN(
zuaDcCabW=?BO~LLb+NN!cb9!UDjpxOvnaL1N;ducJl2GdkB;uxu>+KHpPrgJ(<qfI
zGBPqEI{Ncj^ZOspS-(%n&+l(-b)C1|{D0oZM@MsRZRzY1)i$d7@*?K+gpf566Kj8e
z+n9WuZ&mpEzQv1`+4<!{Zbxs=>zz1JaNW9fJD06&6=?Ha{G_L!Td_r>)T-pgfd>bh
zE1%CT_p%aEbW!vyRBI7fdSzwsa!^{>o_{~=`cz>j3A>rsM4UQ;GLCczmb|*cX>V_@
zd8Vb}=!p|E!OcI$i4&0<bR3E;j1wREa45D29BS|Yndrok=s-lf@W_$}&R+A@uGKAl
zbw%@~0gqErk&&2~n5t~s!HIibs8^nx|HRGyWOv;p@xLnizg%5i1;xd`-+8`Y=xF3z
zt5S>FUnQRHi2_beP9Hvf(t2sY)Az*H{-pOm74u)7abG5bOt^Ud?Af=i=cZ@+f*PPA
zCoWyubmLdlh6KlTZ^PC`iF)rBR`=7nc73KixXs#Oq@(Sf7@c8aJ6-Nz?mxRKJ=;Uu
zE@W)l<n?ax_WyBn{(ifi?|Sd~%gf6vKRj@p_&rzx(&kMK37dB4#{2UIfA>GPl%MMm
z6D0L~vG9y-Uu}~2+b`bkaC+vcH`mw8A8zMQKQTd3!|Pk&a=*EYLRW{G<lYkD=jWHR
zt#V0AQ=6>rf9&pE*|lM(x8JW@9k)DocUj5%d$Q{4>c{$Iy~D!9f|vWLs;aWi^*cU6
z(Ye9i{jG>=;-@DkEv&5{KYHX85D+kLhLl;(4DDqy)Ai$z9X{-Qx38<~(9xr;4!swz
zU;nO}{^O^}OWSnsX*v&gzu)J)_~L^{j}G11n%&vkn|fh^<B{XXFYl==Hp{uuFmq;P
zU>hjNIBd7g?|tauBGZ4_sQ9hF=lLfewpx_U>0Y?~oZb1#g$ozP?5UWzI(+@7kH_VU
zzq|;vv9Xb`C{S>8blkCf_v9HfI(F`~Tphmt*sWW$f|vV!d~#AaYI~mVbiG&xi^@+b
z4<A0Xu&}tWHrjk!-rZA+T)QK-<wzzbCRYBsyQ@??F)?wDZS}MrJ1p+py_<P`UF^CU
zSyxs#zP`5Bvg(UQ?yW6>=W}mtP|VECba8VF+MXBN)YJsBoZ<8|-NoVS<8Iu%xp&K*
zeH}{Dmfd}Q&sMMB_vp*Z%P~6&8qb{ZVPa+m<+lI7uJ7;Y=vWZ6)N4)L-d!u4KneUv
z@ESXznad4MPn<OGWO=*t*KL&*6#<*me0zF&F1##padj<xdP?-on>PjJ<?IJeo#GM}
z7XJ3`?&)7&UtfG#vL)kU(~ln&>tc5=iv!g+64KI-A3r`kQQ3W-)cc9b?jJsWJb2^A
zjI67xF7B(X_M2-pHQ%S+JmCOC=@#AP7Tukl58LJI9_;;oPdnsXm-9rw?6j1W36mx{
zxw*0R%iAA2cdjqK?q_S=-(QRdNk=%g<=zgPq6o@+O7a^yHP2s-TNh*b_5HD9$5K*K
zo>cR?21e{E(M(KCY-w#3l$2cAHz(6%R?P8#wZ9`p)Rr|evu~>WoVG4*Z<pKRgBLGO
zyl^2vNl6J*WR<?U^6=y1<0csw9_)NvA=$OYV)ORx>6eyxURf6_ZEbD6X6@R{dwVR+
z%*<xYNIAdk+M3A8FJ8P@#N`Nbi+^Hz?mwru^-rHXammS9V>RjPi;K!eMn*4gZPh+K
zU0>bSR<>bv`1(n6=J2HWXS=z%O`1B@H6UPu6>nx%)+)LG;kmcBscL9+oH^t3?*9Js
zpiawcyG>13*2l{yCnrBUJA1ibX4(6Db9a}&7ZMe94G9qebuN^Y7G1cm!{u6b{$?{f
zKL;P5nvs!^w6rv+$~$lWKjy{f@O3dKPfgWk+)@0TPf=0PRpjMrRZUILqJ{0qMThp<
zO3AAVb6?b_Oj`P6@5lQdix;LX75Q44o|YygB;*tkA@T9!$Cj3s2~(!Hl$4lEn>Ovw
zx7+zOUoN_Xx^L?K^8(hzSnm7rh+E1eqoK8x)z8mQqpPX6m-q9tv&u?JP1ob9dFAEh
zlai7yyetV>71B9jg21lQ*KW;hyagpCDaU#wv+hsT3SAVm)XOB}0z<Fc;SNFN9W_6T
z*3DorRja*kY;JBIwkE>S!J*;aw>kT~Cb|9m{g_ApB)|Qi1Ly7kuelezt?qACpRDyU
zkjUNL<%Wia4{vNtJ~PjDwr%w{o3BxMd3l>S-u_qT{`XS7@}mEzN%kjuLB*_Bc3jcC
zYc@5yeN(>Jzqd6uH(j=Nv95NT+3^z^|5qLpKk4n^!O<<QpZ5IR-1DXT6ss@pT6$uc
zY4-JXXFonZUis&TVb`L92L~9X%=3D-Z#R$Lmcy8EcUS3`cXxNgvX+EoW`yAu%kFzc
z2an#9iuN{+d9yb1%QWxC)|nbsu4Zd*E^_7m_4Rf8>ebq6YHB9g*Ln<-k6qYXUCzcU
zb)-kqIOEzH$;EEHTB`&=qYQxz%F4WZOTSf3KcnFl7Z<m*$mCP1#--A8&(6#|Jjb%Q
z=<Vw8^>XU|^K?$-wSw|?T;wNCMr^|pWY=P-qZ{-!W`dITxpgr+oBWms?_aZK&4<sQ
zm34G@{Qdn)tYq`@^5)o7ZUTk-l)`D#rX9U@y@{2(D0h3-)m0w9rA@O$d}o`vuD*I{
zeZ2joNt3#^*ekkx)ZFyw%*m6T8#WjmI&{dQ_!*C=R>%VX`F5bheP)(v_l6Ay&(6&~
z?Ji#%5~9eV=(Tc<PHXPMi(g`@ShoNC{bJ`Aj@wR)Po4_$(pk1&JbnM)Z_yLKZBIEV
zbZ(yQ>7B*TSH$i%J3UQT`_jJ_fuqYKmYMR)SUCLs^|i2|z+v&l2{UGJI5;^mnPy$l
z=o4rWSn7hmUghTGJb3zacirD#7dIq2xA90eElQCue|1GOYHOD3<z>DfKYVy_aj|<!
zTAESir<6N)?_SK>TJ!tu_E)91wr0D3e|NXAs3_^pjg2X(sfMMmLV|*VF1#$^ku+lY
zQGCTOP(;mtUeD&u#(H{sEUc`e62X0^r|Um{_;6vKyQ}NNM~|2qrcRw&^>@k?5fiDi
zr+qdHEMl1{;;!D<*!bhe4~7TNpSxdQ7dz7=Qz$Yr()HhdZ$+1voBrnTxGO!Xm@{o=
zcD+}XTKoE=8z0B;>@EFPd5)Wx_vxdf-Jmw-wYAa34-c_MZO=P<ppluAi|f$cyK|2m
zaXEALET~^`?b@|DR;69){_`&ED$U+i@{;Mm$;s-R{QSoc9&F6JyK7_V>o76hs3V(F
zPjhf{pT7M5+S+JNZtll#Zf-tu^ytHP@BDU`z5Q~m<jae|TU#<0&%3=L(fQJ)OC3Eu
zOY-jSVqs-9EO_9s_3zv5_tUPgi#5r+Gedvhk4H<NgmVOb>e^%B?d_fX``cTcm>mKQ
zPft(ZoN`j=Ue#+|v;2E!Rs=4-vA@24-sdp1a#r@0wj!*Y-G6Du^c~fnJ;kTaEH`cb
z9q!CoS;5<I=8TV%lM@Fw_u-2dCw6pjC@Cqeh}x<Z7#IlZV1=v*XngYINlfLf9fiu~
zd3O%nzc1hL<Vni5oSRK6S8DqC`Q6!7+8w<;@8X(BV<u+ija#>d{ysHvv#f-K$L6%N
z7BxQ%Cd3x3joi$}&E0Kd`1ao3**!fyG3!s9J=@#LEpAltAt7vi+*$ScHH*F(CmrEv
z<C8rV9$$NOj%9Jh->=u>E%q<{X#3g71JtY(=~dD{SN8s1@9NdsS678Hg9^BspHHVV
ze0hI=e)+vh_xF{_$9fXa%rJEL#;>lgzkU(l*QpbXMU{#s?|Bg}_1P%%pWUNLmp<ig
z-)i#s3xhsGPjBzl>-+2f*L{6&**b0aZtH8;u2mMLWM`lL^78V_TU)c&{Z&*{ym8~k
zisScp6gGePQnF{yo}#Z`ukUMaZqB^1Vd1T9RbR7WVq!LI+H^{I^5VtG^CGgYu5$hV
z?=L9%iE4+XytuHiGH&zAm74qZ?W_5AGyUhYr5n5+S3gUg0BR*`O1ajZkA(E00~Wi1
z2HIE-=<om2w0LoHV7o$#hU=R(XX97=T|Z~eoQ~e!r&m@6GuTvp>FDX<0gZ8edV1Q^
z-=F=#tE;P3)YX}9UY<2^;>48HR6!AuC4SDcX3bi4{D0`}ZMmFWTuR2qlYf7I|M~U$
z{m*vwuTl8=`ug!3H)d3Rezr1rR@Wi{LBWOpYJY!wYm#$g!-eZ2P7>Rcbk9wiJo)DK
zeEHhn-$K8yjoRA9Z~te)GT+%owZBSAUj^AnM=CLF%e{T<&d%a)lWF_8xVe+BuZw;4
z?c3Yiky|n@a>rjkeY*Sda)0IvD}&YLY$`q+x~_B9ZR^VJohvuqGb;{GzLy!j{lax4
zV`F6vjgIg4s{5Du&c3v>I6Wvh7}QhBxU<7>-MV!a*4Cfj?S8-Na__r$?|fz$G`jUl
z1#Zua{q^M~b4vfW?5$B>8yjS5e|>RWef83o%wVgM7Y*yz>t9_RUcc<RO~C_)egFSe
zv++nUq==^|yY(D6dX!aMTs$c``R9|#{_|eWa^g6=H1OZQiV2KG#l^w(l~c<iBe`y$
zKepkQhss7y%^Q)kjaT``R++ACoH)@<)@j?8Ejrsud%HI4Hyk<Q^3`)o{r}(hL)J!3
zT^YRmndQ3B)nPgJ_sK>_-%gT@`u_d<^Di$i>(`!S(%07)(~s+!JzM(qwY87W&9w#%
z#OXi1wY2b*;`7Y2hp#{K@cjSzo^f$-@-3@do02k$dwRI)iau5r2@44wdUUk==FZ~h
zo7?`L(&&9ZbGqZFMf0Dy?iXJh{Y03Fr$a|D;D;*Hy12bwJByxf*t(Tf!OU#hG~MV&
zj~*TBku<&{;q2t};^N}=s(US*!dGVRd46tg_wL=+=gyr2jra9lJbk)bGkDp9hYt^C
zuity@;o){qZ|}#G{p}PrG&tt^xd#LY{QLLMXNCcz!u6)p&z?P#u`20!@+8HrPo}fI
zo&D@AQ)LSaiH0dtMA&$xTo$|azPP-cUsF>vt7dQ2S0x372buYkjn1F&INY`HY~+_|
z+<M=x-H4j?Fe(4+qU|wUu`@UoTNJgYEj4j@D3aF6kZt??Wwp%x!-p4#=nEEnc@Y>B
z8+-L_>gj2Qd3P+@_~p-CkFSqiQR6?uz;P~@lv$31b@{s`>F4J$q@<)2e0t(p_VyO2
z__(pZ-oE(xxwEE4(}JHcX4RFwxzV_Oz5d#$tx4C`M3%g`&}f={?ZbzM&NenSpb_2s
zb-#5*wL%_TTIxN=x?J!2>lGm@h5YB)DB9W0tN-&@zO_m=b>hT{ZTGCI_j;#^sOszM
zOR5SuiC9Wn8Y`^scX_6HrfA8M^iOvmdhE@Wlasr$CQ?{gS-HeYHZU+SW@izrwYBw~
zyLTJkT-2bT*YfpPR;}URyA6!YS1xYbFP@%#eVwYBTAN$1)XL+3LUVF-LRW<ZZr6|5
zvB0jvDCfq8<;UheesHjPOU6YeadGjwpHId0;`XfA_VkYW%{`UHdpEu=etXMQ&41pR
zXJ=;{nwqA*zqeN>Y756)zxL_V#aFIe2`b&=>;Hz<KJr_Ab<x$V;@IU`S63Z5aztd6
zzhaBPQvXKf^vK9aP<L_042d&m&lZ-IEz7#P>ZoA!OySuRf;!4B+x$K~ReSlryEkrF
zoVW6<U#h|7JzekWZkGf1m-EM`9Q}Cch)KZ%hhN{{``_PJdojbLa^4+NGqbb{3mkv^
z`t|AC?fmCi#^&bgii!&lRQUS%oH%tVXnSyQ@KyWc{qpBuUS7U3bhX&#^z&i2|4G$5
z{yZ~r%9JAwjLZ>Rvqa0v${fnp)&KeEK4;DxE}Q@BgMxz%&Gr8Xw>xntdQGo!KL;A&
zn_-v?>UM$p_zDUQ`TKsh`OUR@`10k+TU)b>|6UT-_nXF{@kfZUSIRV~pShr{OwGVR
z;6$8*NqciM^TUS^Z7M&twDZdsJvhMl;Jnh=YuBbNS)#Hn=VnmC9#QSECl?pHSKW5&
zlR0@if4}eTZMj!#V|SOOzPz;b)!v+i3l}nM$-ds_JKK!$rgGi$b8~~w2a2@rE`J}i
z+;1+^V?ig5K&z+GbI#5%Y+mTxUi9}@>9;pG4}ZU3e}0;7v_<(lncLg*)fE*Rd8N%n
z($_4V2Pp_ZjjW7}6;VGxrRDnl|D?pl#TPAJeDyIXVj37hen>mez-W?vjfX*6TH38&
zu6Ofh<JjG0q08&%gT{P6d?)~wspk3jQch3PT@$(4tybf~mzS5%%(0yO<x9zyC|%!K
zCW+6^%w*`1Ht$=qL`BLp>xlaN8mG_C&R)%)Ygx?Z@9*#9^Q0_5u|;EbiY6~}&<s_k
zy1!ooCf=FwM@vue!@u9}FK<rww<>)#<$2+|M{)lir&j*|`~BtB)#9?&Wi^`DCm+A_
zS$*e4e{ewu8jCu*ZHk+V3yX6*-_y0*?>)M)G5P%6g(v3g2!6;}=0E@3<Kz9_Rz1^C
z-Msm8sc4!Lhhm)X&vSE^EL#?|ucorJ`q`P8KYsmU0!`}e_15kXSNCQSiQAfW^~KfI
z)9-ByFTBD78qidgZ4z+m5SsS>MCAE-wvqem_U_8f%JK>c5i!rdcjQLI|Jp!s59ZO)
zC((2M{`q`fMNRF}+Gz8-zhA?}^yALVv#tK|zW)F0*O?aJF`SQ|o>;HV%*i>^!YM3a
zmLu``+1ccupPrtXZ$JOn*Vp3dHDJ?Hz4jfw*(t34<Im^whQ`LpzrMVjF?;sskH_Vc
z^YhQ2nQ6Rn<3`PM7T{jgsg|1V&E@xNr^oIt1Euua`TNgaT<p%t!}H|ALT4TsiwU;X
z-$c^qKn!-;cXVcFCud+_po+SBa$4H5clr1C*}lEKU4PmGP$y3D)U2BB&AzkEKEB<4
zKPfG3+Uu3z@j8ycl25wlWMpN3KI+z=SG8%8hW?j_H^62pZLgVr>BF~gX)iA=y|b%S
z`})5vTTDRZid&Dw!Cj@VyKEdlUhmkXa)09H`St&1nr2^n@c6NFN{Wh^nc0$M%O=g7
z={a35_Q!)}{ww?w#lVK@yYD+1`Sa6LN&7k*(9EB$?c5zZEY6%go65-xHce?te&M(9
zWy_Y`xOr2wR0pgnHE!Q`$jo=gBX!XHHmC=vxxpfOSBWOSoK3|u?)aTWtaJUA*F{D~
zK79UsdEDm2!)+#+ms<Mme!aN47;Lr1Qm#GE-_-yAE&uM_yB)iB85KS8*j4s6YQ@&G
zbLPz9;O1slFg6yhuCD&|<>ldn&Fq|fe9zw8+zgsVss8?M&cX6|u(betf2;;|b6Yf2
zxnwM7+t=G&Ul*HvWkukR-@hNfcyZ#(%ga0K{#Grtwk&wS0BU_5JlOd3^mOCWS0SgS
zYO_0BT@^atz6$Kwpk)F6*;yu1tHRg&eSLKmG=^}nnf=S_>;32ET0egMI{Lv+XkVw`
z>#I;cdAl>m<?DTZetH_Yr^4{vyLXl6B6k+4YKO05d2nv7bxTW&&EeBpdnZkrq#|Ag
zZb(1zFIqewH15(}H)*=kB9Q~<&-)jamWJ*sdATU(-kwNLZ|`6c*FYm<<Ed3&Ux}9N
z`t;<a@KgE!KiZ4Fzl(JZ6j|yyS?yE{r|^_DYjmC-=@j;wYgIaB&6+jK;}%zbe#ZJO
zI84T-V#5+=0}t>B^3Tb4UZ__>^D?N>!X;z*`*FYhja{YMe0+R6ze!6<ZY+L&j;VY9
z-QDGnU%m3$nst?7MdW6++TY(?7hinv@L}V`iGqcNg&`}0o?4#v1_#rn#Rsa-foAZu
z!q=_IQ&&?fdUS-7LCQ2MB!Qigk@3|OwF9&B_nkc0%-;A$^W2s#CONse^DPS?dRH8H
zlyhfCVMJ8as@v11P3w_1Ul$<m@8_3xe_!pFx3|OZnioCs$h)(nar0*5W;R~XnuFG=
zU%{XuwnaJ`1yR9)Q(}%zo;XoZQc|*Y*N)a!){h@Q?))}$=FE?Ozu&h{o&ias&W+0H
zUte8)yn6jUC0pCMU%r&=`}4{B)z#JEG8@5*Hl|pzilu9Zt$DEhe%;}F_vXzq%?5d+
zw38d;PL7X7q84i-w`PShF*A27i-E*CjMg7`q+@D2_3G;I;*XD9gMx#v{>{F&M)T4E
zNS0p2wA{3vSNho9-Q_nor}MwPz5V&Qxz-_TA~sqifme)Z6m#jEQ&LizVV*B1CME_N
zI-6hj%QN;lcqYMVPYkPA`m$xq7;fCSQSk5(tB{b;j`H_$6WP^3<_C(2R-C^%!!Wtx
z!vn_8&(ALpUmvHUs%n^jZ_lF+UvT)Gsmv;zHg#(2)TyGN!L-=jWp{QIF5dA-2oxq7
zljm8j%}hyYS-(EN@Td~l$Y-Jz=OdT<&5hVoVYog2{<D7je*ybJGKxn|hd;1QcXV`Q
zsQLT#`YXF$H|1bR+2*?7z#|<aqfHi%z`0I9WRXSm{<^*Q{vK}UuYY+E<PfJRrL1D<
z%l+mC{hv2$)~QBj_JY#VsgJ<ET*M&~F-JFcm&x^ShuirjO)>%|vZ{cr44fwVW6QJq
zkr5FmYBN9%7I0m1VB+R^w$+O+W;DEU%{+Ygu%tnP10pyg=J?Jsd3nCs*QcPMU_sj1
zS*}u3KyGr03{5Bu+g+CX=FJ<8_<c4Enwpv}uC9U0{pK#}gNBsmhk2X-{r#Ps^5uEr
zv17-+yuZKSqZtxTtGRT}85tQF7#jy~KYQkkhqt%(r3D}tDxO-!)V*0Nbk&C1-({uN
z78VlG(YKlOYo4AGEwP&WD>XH>b^iSMxBeX!k6&;xqocdK`rHL@-QhK#se7}icG!l5
zgG^c1*ZID^we{zl&F6o7yPeOyXT#*l!inI)>cVGdB(*|TILx=JW!O>s+pPNgyQO7s
zZ|VQzIt3aLZI`cGk*6QGXU5IV>Cdg)mS27vS^w_G$H$;1=3GDb#csV<ivRrmYiMNj
z=+4gKR~yfrJLlr+YHP}8Z*MPWU-!ppz3co569lThyjZyXZLj&g2M-UoGeks1?fbG1
z<Vxq@gu<}>b+w?W!HNothYufuhIm1P;WMoeMJHoJ{omK|KYsmM<y|r*6x3gHTKsb7
z7mmEo-!B%w6x`P8HhJpOCsylXb{6H_-DP?|@R;{>J<tp@sBh!v7xukh!mw$ocDRt3
z*s)e_@gm*Epv`H%c7MNIPVd^jqHFW=n_IKR&CJY>-dZl;#1S}&OXnP@k#V@4f8&-d
zM=o8O#Lh2gkbJBs{gsM=fq=Zcyh3%<m3#N*E%Tji7k>or@Y&?m4#tLtpiv5)$W1Qq
zw=yv?ftm)fyGj%-EF$jKzrSbu=FOXiH|&;D=6P$jeSLAUxm#Sn=;^7cHrH?OF4xzM
z-ge~b)v5pf{+>K#%8~H+T2~(*o_qK1Ra93$w_NTw*UD$M8Sh-b_Sv(g_y7NA9lN`1
zY2DvnYa%u>F}McKu&FdUckbMd9XkX>M3&5(KVyam%4pom>sQ04)cyPUe8=wHi?3#Z
zD(~Q8-4+3-JziWfUdi9z-PMWM;Ba?c$V#EzWp9)I{rTDYN<mk*_wDWNn+qN~$=TPP
zX=3HRvAaCK@%h@Aos*_&hcgvCKPT(t<ivIL{_01GURJ)oK?Q|D|E^7HoT%0&TP>H6
zo5$slmZlc9c3R+Kw}n@;cGms10(Bn!{`~#>^Uvq=7c)$BqPO)(nPxG3cs4sfXn$R8
ziIwcTckfDGU*k14Hr9*Z=aZA8V_W?#<<XH&pV?-;PoAXo$=jd1xj7xw#gEvMA^7as
zGZq$>5BI9y8zvp$&<bC7=FiX18@Fvc_xZ1fudk}9si?iZ{l0&{vOzNzmzH{adV427
zIWf^B=f(u*cD}+74;(?|wvdpJgn6EfnwlDDWyj;={g2<j_urazb%s$Y*XL(v7uWv&
zwq)6|qAxE3*Q{N8^2`|(6_u9Y<$i@PE-2R2)&`2S_S^rHC@%i2^sjI4tVxq5dEMPV
zce*Q5UA!nTp>W#FnI|W!`)h=+lbNvh+|9}A{#<ttzPr1-u(Wh()Yhyy_Vx2lobV{U
z^5)!J>m9px6@7Z*xhj19yw`gSlaCpsosnR;bLS3dTDD~V`lfl_{RMVb|Ch(bzBzmI
zW1O@?^_A7t-{18}8n@Y2f4i`wFnQ9XNgqCa0!`0dSryvd$jm-z=FF4R<Lf%ZRxd4j
zda9zb(lGfL&$oAXy%Q1?K<P<QQPIWK71Z?HQ~5dL(h^QSKE8;EhzBoUo^0h7@967G
zyR{|Lp{(%krd01ed-i||zO~WYul;uL`1$E6XjJUmw{LfL6f%R>;Z!{4Wl-~*)8V&V
z*}`JR$H&JnZ%lS)`1kw%|D>#}F2Ci6FJGQ~u$i6Npzcq_xw+QUm-{CtC-2#_=f=)r
zb#c9z2l4-ZiBI@@SkcaI-<7@YacvVO2%Pk`Lz;!wlwuW2cW&cZn0&l%&);vi8P>$^
zp7!u?yZEl=h6aXGuggb|wpM?Cw=x$r=l130<<_cSo40iOdQ?99#&7qd;aIQq>hJFy
z%j)#w_pQ<6nlod@iGKTkCth4!-1w%N?Rc;B@fQ~tuMAqs_3`7!S3A|z)M|b_Y_Iut
zGyRpFYanP|pOKOA{GVAxToua7%3t4G-j<Y<^tb)~{=2;0+d3E5z!_$_Qt$WuUboBt
z-*dGVfkj(1KFr&!8NAFO`52F0?5-nAy{FHxELIC%=A(J#=w$d{T<t>#Z)xk-%l!|(
z%swy48@;#6bi&`r<+IK6pZ)py*~QgWP*l{lpkPB(#NRKM{WZM4ZF#vh`?`Ud*}3B4
zzxMxs`hR<W|NP(I-x)uA`SRq&#l<czE(|yCA6^~4o+)5slB=9;)t5W#cjn&S#=^=9
zDl`B7{{Hjp_4tn;J}^9Za&oc@Y|>C5_tut^->pF1*|W1uRa8|O1!QH<Hna23v(^JG
zNthrKF^93_^|juk-Qt@oJ|^kt==|9K|Mz~U*gJ;~fo5vgtX=zco4c#))RTe64S~6@
zlJc_FY(IPF%8c?`*RI|Mt<m|)_VaK%e{z2Q`g_T27uLmEA3A&(G--agjn~4`a^dBd
zprTXSJnzU2i}LwtX=y%lECOXOt_fPo_3z)mA3uI9xSExDbCc?H{dl%b%V+H_eZA~^
zW5XfP@Kn^=Y0G?PUs~wQUQ=6p^<_?ZdAXX;42Q*Ty(^-(uiN#e%Pw|TN#chG2e;(h
z6cQH~|Mm5C_#6L;T#79MuC9xp@T|3Q&B)MTWMs^_w`b<A($|-^=f~fxe!uro2mgtE
z&)<A`dHLwI_`(yFQJ+6-wFtUoeD&|MWyiLZZb?0Q;hBWw)48`M|EvWquJDZegwoB`
zkJ|pE`uxqStHU`tI205V7^0)2+4y8S7HP=XRD9TJEnIJ8bZJ9UMseAk?uWY%PduWw
zBq2TboOSN&OTp33`H7%LTE`{*`;XQn9&X!`dwbek>+)6I=bmjpH?6|X%}uK6^_!P#
zH|xI)05v5N6+Z2Kp7Z+J+8Z}-K79N3?2C(wL3M2F3`S9K(??(7{-ZZ%&F`xi7)$`o
z!MuIjJAc0X_jh-b@9ZdSjdBSCH?wSZGtIxYWy6Mq4-XE$xv{a?y8PXRHIbWRUMGQ@
zxE<4%2Y6>bSrM{QDEU~=!vl@XuA=?m2GY!LKG1%fMigX0hZDg?%%DylS!-U1nJx6`
zdUYs#T}<MG1C6h~^-7!1urAlTc=6(!8ylI8jg4PjUw{7A)@)F3e3bzx_!Q$bUCoM~
zJ$d315g`FuG+|lX#wDtCVY$D&Rr$L!t=!_EA)57jze#b6>*+w|O%>xTj~cD5d~ksA
z)%ErL&CSfQyGjloQK>#~?(*fy*5&U++&PZAExu?_@*<$rT3NYSD|FR`#qRyf=b2Z3
z%K<GYPOtaz;raXPE3<-%N=xSDWqa>5FhZJEF)tp=+t>Mge|I<Y#s)>u2+gZ2EA?&F
z6k7xqt<jqHNC(_hnQ!y^`kKhYuh;L7lTqBW0mn3Ak!*Tm;=xvKaSu;V#)SL(YH!@W
zZC}3MH0w&j>1n$9BGr!bwpuXHFv%2}t{<;vVlrj1d;g~D@9&hl{5TX_G@`rS%-bvj
zTH$topY8cCPnXZH>+0;}bZ+ChSbgu*RBccPm7)CI9YqzDB`ch3SIz#|?^!j;ar!B)
z5Vi`RFH7xj6g<}9`@dp=-1M0<JtwRAe)#%zY3=WCI+2@N#B`$y-rccua&q$U@i}t+
z`t%z&B2G=!KHe^0=kfQ~*UqjkB{j9St=ZQnO_*?Cfn)QFi;LOI-``8UwIvf&{BPJ`
zU}R*(!O5wps@l4K-><F}D>RIZj3f+`SaNS|dH8&O{j=@&>y%ARr_PumacfKF;Ts#1
z87)d*iTwNbZ^`oI!HeB^m-);*v@UjcM_=Eu!-tu(v$Ln^L@wGlZL+%mrtIr_Y3Jwh
zIz&WBI669lrs{g#7T5W&3R?^6e;AmX>o0rn7U${dsrpaO&}Gu}Kc0Ee1ylZ?f?TF3
zZOgrFwzT;9xt-tE#_kUL_~lE;q{)-HOM9Q4oh|N~(faNF{r4KZY45^hmc2Ri`D($j
zywvCyvR@;(xw)Udxw-jNbpGC@OO`Cr@SSZI$|q;DqU8O(*gz52nFfhXM%CYPTmwa(
z9%y9ty1dL+6uSC0YHe8MpC22G{{O38ns#=US4qhhAxX)VCYhI90!3UcD?TiE^z!9O
zjjlyLv(0>OZb)oiq|x>1!9nJu7cT~SdU*w1TNk_f%G&7g${!yVPMS1n(Uo<v(K}0D
zPXim6dwZK#P|&1B8eN+*E-Iy-n`7CvNTaK{xw+{7zrQx?t!-^%SB9>hR(0;i#^kGZ
z#Wocm99BkeUsv?^*VpNj-#jn*@Zg}(+xr1n%1Du)Cn6mj9M)-`onQY?GCn?@Pu9w%
zl}mJ9(XxjTwJMX2XU?p;9dya~=eq)xy`}Yju~}QA7A;!D!NKvM*Zkgz(mxZG-G6*M
zF3<Sl!b0YcA3v(7sWE|8{hm2{*3jJi`0d-XC#(B|mLh-KUH(39TmJoZ_m)qaHS5&c
z=<N(IZfsP(cI_GiXo42B!u-QS=R=1NXWrdqI>C0)%}uGNm+#9H78c%B`ubR(to4iQ
z>*ekLei7ccZ(mlmn;V;*ot@7di-}cVU)g?*0+q5Fa}ZN)_v?PI-Iaf+g%dQWd~U8a
z^G$x%H4z&ZX&kOd1T8YUb8mIHzMAhW7Y`4Pj~_p_zH(oD(WdnGdJ)%$$K~sfoII&{
z{hzS9U%<K;%hKC7Zp`TE;Sq6NSp57P3k!>g)-)N=P{z$osUhoPB+bpuwZhh{*w!m)
z%;xUyZujFsbNa4jpmO<B_!54Wh=>RQdHMByj~HBBTnc`F%PpPB>#To<;mw-)b^25H
zeVeH4Zj^mZCnhGQ=xzS(ZMxFtc`Tde@7uC<>(NuExEdZjNC*lJ?(FYZS5aXpFaKWi
z`^~Mb+TPRkTmv`BdvNgb)@Jq0n>SC!wrWa82ZzG$f_+tAwHRRI<>#~S@3RH9HbJ`$
zE-Y|-^-t0`ZNh{J4fE&U5B?3Rn~rRr=+B|ig4Sp_yLo@w*;zhwEDX>8ySX{N_}LlB
zEm69`%l%UC?kdfi9k%-D?c37+{{Cs_=6Lo>nSScce0FB$lI6=6U(L$Aw1m?*{anqf
z>=jsB5AFwcmA-D<zP)_j=bql)mlqea?=F8IHl2BA@pHcUb-y$pK76=i*RDmEU;cQv
z`~AFI`#}N8<-Yg{&t3@wb8~aQ`F62Mf74Snd;b38X<vVI;-r7i_okyX3G7*a8nw2x
zJb3yvRHt^Ymx}%`tm&SaacSekiEX0Jvo}AE;khr{r(&kaYMottFEe`jyjxO7!$8w%
z0*k&(ezaynPmjr$Yd4=)X@Li>9;HsMJ_oM1aP1I6sa&VLdgOyz5v}N6`7Hh0o13rF
z!~TUrYoSfo%QRnDoj-j3eE;!&dGTLAj`z!7-dkNRWtQVHS<QEbRcY3~<9WBY_3kWw
zzOToCTUAG=Cw6z)%G<ZM=civ><f>m2tFNzL`gexkT&s@{4mOML1Es-4jkuGKaI6U}
zTE9FSRyUpe_vUo%?{A6u|NnR=f>v<oMsM2_@P@VW+9RHXOG`Xs_SMY1a3P>i&i2&1
zySu;2J%9cB^pB5^<K1Soe)|4sP5Amae_grx$2TM%{_$*fe%ve5RspAuNncKhrdzK5
z>i_?J<vDkCf$39@{d~c)EZ6q@jrZrDXkNHi@tC*n*GqK<CMKp2A3p}({`ccC|Gk>e
zzFVSn=gpfp$GY4v*Sh+f4ydPo@#4hL)nSYRqN1&vHy6*FF=_H-Ny{P?IXOAd0;>lP
z5*Ysc{rmFvcKy4%N<j-S`t5#s*rva~w-?mx3=a<nO^XEu3B}ib6*V+81oiQMYAgi}
ztN;7^d-ZKg<#JK2kbr(uW8=eDuTHftfA^&G<EyKyr)Y=sMMp<7C@Cw0n!kE_dS_-B
zGJAWUj=6r;F!2xzXg?FEZS(HkI}2-T<FqppUyE(`?%4zC$jZsd?fG^qTgI+t#+EH6
zc0Zp8H#Id~cv-^2%KGsA`|~ZF!VDK)mb{w2bEjoHpX@2~eoakH9x0O!w_d4>%l+lI
z-e`N%^gh!5Oe}|Di$Tizn8Iy;v8}!FS@*Y@jaSLQV8WFvA?M~=yLXG}UfCA8ISsUs
z<lJ2A<4vsG5_UB^Zk4^eV+opH0IjOKv$MFf^JMMa{lC6k_J4U{A+x&wyeAJ1Hv7!8
znfc^NibC}l4GoQs&(F^bi-;U~dwY9lcX#r?KR@5x-#`C;{eM|=bMt?n=l?e-eB_dK
zWyQgR&Fq#H9}?EtS(Uyz(kZNd<&cVohQ@XFiOe=NKPJdpmtEMHeB8!R&bCS<JUqPg
z_uudL)9>skoMT@vXKlT^CjNH3?+k;+;N^ZR7f-Y<e;0Dw$jAt^xVEmY?#xVMc4K2>
z(26b4LLzSN<Oc^DcdncI`QZ1@&(0n`d$u=hZPe4*$Df{_9=y<r71Y_AV_9rb{H&*H
zuGds8&~jZNAtA1>QlRYW{QZ<@I?4?5iJ0G8qI3fT13`lt$E5S0WLXzI;W*sJ%e=t3
zo$tu;<C8%Py58U03#!YmT@%aC&-a^WGc(_(`un@oi;G-elvwTAv&X~N_vy8@(F`ST
zZU|;)XP=s`@BjE%@6O84YRCKK)opC%Tw5Ex`0~q$?Rm21`S)sGy*4v6To}DQPsX~e
zr>BPplm)83zgxR4JtgJA<Hv`ObPBHwUmrK`b8>R>%AloPyK29@2rLybRA*<ktoWdy
z8@1)Y(W9-PCA8h*`m1tle}7|LQ2F_pi>s^Xt18fB$)_hLbMEbt3=Y0r^ZCc?_4`+S
z=gZm}wIwQd-D}YLiXT6IIDORwZFJlf{_gSYB;lZWJNJgKj{|MznlNF4(|@7s87BUB
z|7p7hf@=1{qM}n<{~zfTo}wM@ch}6!Oifvt8Qd-ZZ0vuWSIVU0=H~Rv%Y3Ep+_|$N
zV&kG)W@%?6KvP~1n)$!X%g)Q2mtNv|KikOAP*Gdk`>vm_Z|Hv+`#PKD{_~fGt&KV}
z&o(-j`O5W~pQX4b!Ka+3S`|%?m*bu^Yt}67^G+O!M^?X?Ds8!WvswK!>^0{5t(yqd
zn4#|va-r9kzvpd5>aH*OzUSN9+r?jBg@&z-nwsrZe5^-OMMdSo^XJp^=YyuLe=}#?
z-DTR?*vOE@#~>pk!@<x0{N!YHP%>v`=VPcB%iOkYn~JKc=&OSr%lA&3sO%1!cfY(X
zH@Y;wt}^HPI@#G~xl9iZwQ}!##~UI7o}k`a^%Xoptr*9;KBmxYm2d1B-`=m=X3m!S
z?00)t<+tl=A}4nWt3SJGU;QoT+#Jj1u+^!ruB==Wz1=VOa@N+0>gvZYU!FWO(|Bdz
zVz+&UXVIsY{cCwcSA`tBe_vi+UVclIu5R?UC!e04cKF5-u|4nZCmR8Vn~Q^Vqqn7e
ze|Ptl@9*+=cOG8$w^y~XkqHkEFR_xnU;90FUh(a%+3w(}aL_`rn!jJKgN8n3W!K7u
zuMX2?<B@3CyxEwUjb}mR<}}cJhe6esj5%}XzPz>db#V7pnS1x{wY0ad=4W}bqw=#_
z^tPPDv$ITJIWsQzpMTE0|M9V2Wlhaaw_d4_C)MX)*i%{jH7fnwoP(Dy3nwT4eGlsV
zJ_=nNSX37+4Qc|E#;%^T+>}4QTUqt*`Rem$e0wurXPoX@c-GM9R6W<HP;nulrs>nK
z7tj6u4>38u{OqMmla?%5BHpv^$KL<{et&s&l{-24@$+-unctM&Pru}c7(EpK`!8y{
zO@DaalQSo<FSYTiP5icKng9HMXR=pFTBWC_@AY2}8qiq3D*Rm|!wJ0$;K70Ff2^se
z7lBsEuMXGWo4fwe3dsX6E-v=AVo_`nc(k<YsT-sb;bjcoyTPHzb3*&0Ljz(r$sy1-
zE?qV4P2kNS$dirYfA!+`{c+mPy;tb_oTEpNPA%bJd+a)&L31sZ)=pkto_@(bpKOo|
zk;er-zPY(MeR_$&qnm$9TWdb5gZ8PQ^rIHt?|$-hYK4LW2e{`u0kQ`Lya}3dckOSp
zd-v{vn(k`8vpzhX9?x`B{zhJY{`$I%AM0Xwds$q16u#U~HYO$pRLcb$*8lm)KG(YZ
z*u{$z&&{>|{NP}-P~WDC*queLwf-+ktRQQDwI{!yK2g~{XlqvJoVjza*8bdUFXEc`
z=tw7M75Un*)9?5Hj|)l^x2&z4X;W#$D{bbo`s$_S{_=8mH5?8hAtKM7J$v;#?ahsi
zs-k5RFHfJo{O|6_9R-TKQYHsZp6s-({x-$s#8IdCI5s|+70;IV&9!<_VznS=Z~Uj<
zcmD*{AJaILaP-I#jr4PK0!3T{g@lDg-QC?mYg$jOiQGKpy#4<@EsM1y`S|!wfnw(E
zY@5nWn+hH}xdw`)K0MTV3bdwJXrW|FPY(}hgxM=3BxGjIei^G056j|bJ(rev3YW&$
zz3&Ij5x%^(cJ`#HQ<utEm-&4B@IhnZjPB-rd*0mK?C$CBA1>k=X!q-dvZ?;Wi4%AB
zX`lV_RCRy7IcQhwmZ-IpX3bg!+P}9leEqyh6DJ1Fc_=1bTlsS}X!+c!IhMsDuA0&B
zr~j!l-+yMlp;1xV0nj#&(o)t1Nk_X(vaWQzc=1B0?^FRRJ3DBdC<AD@!M5DnUKWcc
z6~hNpoITrvlt8Oro}QXo{Py(wzk)I{XFfbUd@;kMqN?iChlkEd$;sWe&mJw%*N2X~
zob>kh_kVY1XRy%1sT#GFC;bZw44T<^7sc!>y0SKUIvcN4!KWuDmHIZM*y!qaE!t4|
zIjyFqrpt2Gr0MtHKb!v3*sW`GxJyWgh)3$>vkVNJ?x1r47A{-}nhtA7Jw0vV)htQN
zqLi7l_c33{Fe!O?srA{}*_(@=dNKU_zW;w>dOEwq*H>46-v9qEe3Ig{6$cI+h_N?0
z{p&aDr}}yqclY4cVY**;%recMcJn{W+Q`jqpv7-@`+9noL~c&ol7D~R;dcJ%2`Q@1
z_xDRz?oY9<`C)K%Rp{Z17X$Bxi{00+|9kh3QT;KG!f#w2US5yZZohZv{CR$DZS7;d
z(#OBPzRt<befZqDK7RW@8<sfhl)3lGJp6vYKK=W<yP&f2&#%|(ciu9U;+5)M*0M-D
z5;C4M_ut>&%qM=k_HX+0@5Fh;$PVKsclNxyyFkOD|9_sZ2W=GW?CeyPN&Psn-rQeY
zKw!da`-aqye?NV{f42VXBKzbe7RCiHU%uS)^;$HKq*2Sh+TTn8YoknidV39xjFP^*
zxVW^;<8S7hw{IC&L~Yfo{q^PHlarG}R)_h@YW4N@F3q~SN>wtc^XA@a^L6Xig{%(a
zo$%ee@6SE?|LgZwb9b)Fe$4PsJot3hnIN7mliQY@>xoJe%v`DXNK;#h^`tYq=ZXby
zmMQqQ8F6wcy;9Or^}ej5s_h%7qx|7OMwvtK=1m8-_~;s!UaMBjn|1y0_ivT6zWm=^
zbnRC5&ofD9@10*`Tb=Li?Y*b|f84Eihuirt{h6h=E_QcY_4jv*s;aJ53nU~Z6Aw1A
zGW>dfJtaB0dDg61CwYpUc)#y_Si@LVT|IH;%uZSBGLO{Dx%+s^bL3Xn@=P|CRckr0
zE_Qc9dV0HaJD=y%{qtAvD}LU0ZK}!TgWuoZzj*PY;@`Q`r?c;j={s=10W{*yFlo}H
z4V9mto#ZLL^yjzq+miYE%x$yf8yg!v{QTMkn9k3$ZSL&sEXhi;s`&9iQAk*L;@r8s
zpcdMn&(F`lc>S80;mY;vi??kPyLRo`WYK3W((j6eB_ukoTnQ;DFJHWHA>-DqTMJ4{
z=dLkOcde_l3tJPxsI9HNGRNxd^oM<ClXvw?PWP6~@Uf};6Tu+B)#_wnWyR(0?mlty
zWMwC3=hhfKajsS;78aI{etCPpvK7Ls)~)l?nCi7KM5}f0-n{`UL!3AiqoSf(mM>o(
zv9D(5hfkk`)~;PUS?1-D*Kgj4e13jjJU2Ht=w`_r{euS&COtgVnsls3^3bJAK{5Mk
zDsTK;crnAJq-4v6ZQI0(cE(JZHqEc-%j5JFg&Q|+6buUsyJV-PrRBw;=;G$qwqg5r
z@xH#kLpN_mrlhC)L)1yWEM6S6vSsVmtsAy(75)DH{`?~+j$aRWtkfcqn3(wJ(b4Xx
z+kVS`8r|7gsq@{i#)&~tP;esO!%SZ#B_$=<Lr;1-IyyYs4J?(Fl$0(h`kd?N=;$cn
zm~0G}J~nsWJUJ(4=cU*8?%iu(Zr;9s|9|<fn@e5>ZP~J=r)oCi@qT&ty1IYzM-8Uz
zE`P6NXD7!XATG{sYiqlw^0V72`|@`(U%q?^2n$m?x*|a1!ILK}txk<=*61+&^)D|d
zFL%$+f4?GGY0-Z1$;L`j<(t#byOoxj?%K7BfuUc{_EP8WRmYDXPrkg&ckw%QLqoxf
z7cXAQZgpzBawX(tX;qb#jEu~dTOwj&PHV%Im6e&<`D7RvB&4Jcoj%Q-nVGq`ue0M{
zoX@$A45uHzekrM|vmbb{T%zXZCss#C#{jLV5xdKDSFKvbz_51h+7I8pwKX?0gVyKY
z&p%!9_gCqTUAqpQJPBHTcIocP_Dk2Vw}1FhaN=o^h6vXj+iEce0Xez8o*telQ>Pw0
zdD1iLq*O&qU~H^xO-;>)tgBi(cJ6%m_4W1UU+Y$`1g-u#bLPyFO$!z%WSXv7w~kLu
zO)cu{{_Onx_LVDFhTWbrWy(bM$sNl~)Y4r{6q}lxC(fGHHQ&B|(W+Hk+1J)ge3g`(
z{P4}2o>i-~=GasUS(m?KVVGl4$n^U9`s3Z=`WsSDi|wiY&d2cK)m7~!OP4NOvV<jT
ztJJSwzm(NXvbIY3`ueU|x9-^I=jR0l1s8_CfAolHYZUL+D9~oIBS(%fFnoJ=xBK(+
z^NaWFkx4%{=VFwsbU;uLXn{f2)RQSjpcS=OuU_@=^}Slf_K<;}pZ~<uqMREW7`3K)
zX-z-<R6|$yZ~S3Tm*5*cC;1vWma#7l>fD}x|IxFvvp;<OdUaLxrzf0w_xJHKeE9Ok
zB|BUD*6rJYF)=-7&iF7i%%4Ah%b}2ffQC<>iq^#Km14MX^{VNk=67q?t__Tj@BjPz
z`{DEF`S0Dk=bSfv+O)Q=F0Lm}pDx_IS=fKR-O+DvZ!i7z?%g}bT)Wy|EXBp29iK9v
z%*e~r<Kp51tu0&o>(C*m88c@tT(^!dE-voIjzZ;UXJ#_j*49c$NiABun7R1*IoWTv
zf)}qGebUpR=97@1kd%}Z5E9aI?b@~fyX6&G4jnpF@aoFS|6BI1T&bz0tvzw}>~8Pr
zdJnIyjowl8wCkGL)vH%Ky1ShV3k|<~{krgChC%*4n-y1I3I3jEnk}}k_P5!JJNNI)
zyZ6a7c6W297~S0PHa90HAu+LWx_-PD|Alw&-W@wL)A;}8YUM>M+n1YmW*l;Hb5k-i
z6XR-iT7Bp2SzjyN$dC{gV`JkJDMmYX?|yu&SDK-LonP+3>+9<o81(h^Q&LhE?A$5o
z@IU{Wl(cmF-~GC}x)(A`Zfwhy4$%_T($d=U>h`T$0zyJfPoAV?iu$NcPQ1A()x+PP
z{la{Ku&^+`SGUid^Lu^j4CAq5$1G}pne@q8n?0IgVr~v9KE0>uFkbj?ZEb99EFmG$
zbEB_gncliNf)`shZrU_yozI`xm>3x)W#xq-T8TM1Jxi9TF#O%`%FW5y*xbx~ZEbY>
z(Qa{OhBt5DE(}`v;O$%9UN>fDX6Be31&&??|JJTt$tfr(cp<}t!64(PuC_LJZ*T97
z&FTFi%fePqoHVJa_V+i(r9lgqFK0h^@L)kn$(5|v@80ztZs&LAP>l63P_VG5xUuoW
zhYt<=_SsccRZW;Mfq|Qwo8j+r>%V{gwB@?H1ka!M<L@VRC8b}<?EG>r4h{!q+?kgA
z`r29#508emYu`$^xVh9T&%36gB(-$0d;g>3{ql?7J-@y_o`Z`kDap;`Y~fQ#30VN8
i4RDu~$OY+t1_pnZqV0LozZe)87(8A5T-G@yGywoWIUg1P

literal 0
HcmV?d00001

diff --git a/tensorflow/contrib/lite/g3doc/images/performance/model_size_vs_latency.png b/tensorflow/contrib/lite/g3doc/images/performance/model_size_vs_latency.png
new file mode 100644
index 0000000000000000000000000000000000000000..94a6310612828db2370d19a094795341478e90f8
GIT binary patch
literal 21380
zcmeAS@N?(olHy`uVBq!ia0y~yV9H})V0gyC#=yXkbA_{?fq{Xg*vT`5gM)*kh9jke
zfq{Xuz$3Dlfq`2Hgc&d0t^32kz+hS88c`CQpH@<ySd_|;n^;nilV6gPnWRvbT2!2w
zpQm7?XQXGWqmWTjQedU8k5HhOUzDz|EaJk-z@Wh3>EaktG3U+Q%9zlrzkYl?|8PND
zXP2eo1Yr)12Z3y+>L&z@56dJKED8wBPn#AvD|1%bjkU9v#r1Al5_B-T?&v8--xFKr
z9OYzx)1)~;v#aS;#kKeR_3yM9o75g$eEzrUW7pnma`X0vKf4@Ny?$x1mu8EA6NloF
z+Vg96F@snw0;0;Dv%##+2}gW66k7xixp+KM11U>XQYjR6;!tcc5S+AwQ?W(BX$FVq
zofd(qN~WfwbN$>C6BBp-){EZOlXrL5-K(X?`eaY9iQN3|RfJ-TfhkBw#>GXfrrFn?
z9BSo0ddj-wMT6gRWmD6wRZPGCPt%L-nmk!}+O%mVnU`Aj)&8C|cP{UNNvhs6%yOf)
z9t)qf+t*E4NNB^>t!rC4S_Jr+Ko&D>$+<acs&;tR?RI|oXScRyUzNDXYybC)aQ@z}
zV%p*BRIIIMCm-(<l#qzHo5-o?vK8bb#seQ79{%}i_4+MuSFF%*>yZe|nSXk^zH!nK
z4k^<tm(S16&a|yIb6YIfuxOD|T>altDYG1p)nRLYJZzWulCds&;<0A!+Kn4GK770V
z{<0M-R@~X~?afW+>3Xp*E-Ykb;FU5_P*Gv=_V#9ot9ZzIukLp)!@SyWk*}_-T>N*&
z%$XZ=ZW=N4$=mm>U$4(AWpd!+#fhP-!=_g4oiRhg#>S>=YVDto$AA3(&Ag%RZ`G}B
zx!p^bs+yUZ)%<wa{`B7A_}ZUOr(0NCCue4Qf}Hf_vOnXFs;^r9_J2)YzI=JhcFFSP
z$tF^r9UTiwUthENb|aaEg{9!*Bi9G@&X)oM18x5Ocx;k$W5V}))&9G~0s{q$pPgA)
z`ubW#RMew)@6OFMPOo^mbovaF%%F`s{`~#>^T*@<_dn`d1VTMH73HeV&NdI9ZJr;N
zduvOer>Ezt4I2V(-ng;i%9Rk-6;q~&ES(Z`xSc=z-@kvaPEFO;QdM=ew6Iul-u{1#
z=F%&Vj&@I7ym;}izu)hNrll?W^zB<%gO-LyfS2aiJH_W$ZrETjb@JrJ=jK{3x3RUo
zI!V=gN{|=Gv#YYNuUmR-TkdVMuP2oISG>HueD&Mg+gIPYb0_JmqM~Bft1By2`=a;P
z?cKC(TiE{p|8}1`c`|U@wr#Jv_4fs^va)9V|M&OR#^Z9q?)`GQg?Fy54iBAcReI&*
zWc95^{QUe|i=KKdof70b+bs0<w%lMZ&C-vLT({=m-xu_F-(B0<|9`)~y0S9(REEi_
zmKK)O#KeVj)$Q!!Zr->da=ZTCp2}6h%l%%RnQ5$LZXVv++PZSV0)?ehf;2TXxdIlu
z^#)yE7aQE!*|{q1>?|!!O;63GSJ>ri7R1EFT>1F;_|%ITA)TF_uMV|xXI)<xd+OG$
zC{a;S&81VU>i_N8v|&R){hyESD-F37-|>U;&Xyy&*VaU4eR^^->u?+ItCh>=1+lWS
zMm-9Ph**(zWksOQ&B#qDov&_gULNG7DK9U-wc=ya+Purle4`$HJ1$?pVwPz(C@eF-
zYKN@}xOeZK*UH=KmR@%~4*mN2dikrXtC!2wehK{gYW4b6hYmR{4SCslyHi*_WU*WC
zmGk!h*Gvi0va*UQeSK~14ZFm|#H?dIl1t<AUtCz2_3X?{FU_TX^K4dvlK8r~y{o1K
zojP~UPd9RtOYFXyop0Eq8mo7O{{Q#)>YmEaQ-YSN=lS{hWu2a;d#kLfYL}6@c{tb~
zWzWt?uDy5pa<I3_?Jb#$udR!XepNbm&YX~#m^ndSD_Ob4RxF=ix6A1xm*T2^o~c}l
zCWqVk(~owE>O^l_bHw0fE4R3YhQ@-*&(ESFKR-L`>Fv#|U~4ODZM{3G)O@a8?X0FI
zrZsEVu8iMr_x1Jl^fNOI|NMMDU$pD>%jNT*?S8-S@uQ>NM~)ralylR_?*E_9LErb*
z{51Oe`+NGgH#ct`XI$<-KkfRu*jrm)UtfRz)6>&GANSjP*&d&u=)C3o->@|i2aC_!
zy1%`(mGQ^7+xe0E>uk@?Hea58e%_y7uh)z2YLCvnv7zzYT<gmV9GPETUw{7B*Vo=P
z5=E;oFZVZ2IKZG=mY$yerONjCxw)6WW!~Dd@>})WTU$?_JjwXr%uM4gd3SeR`P|mh
z^5LxceFhL=Q1j!1^*(V$m#dSG_;M`Fytl_vH+oyj-(O#OWGpsB#2#+rT^YETZMpya
zYp3e||1HnS$zj;AZQHXmGmVeF%G<Pg^Ww0zQ87CT8rQ7R*|cfXj-sbr?)`Fm7ymeO
z=FE$$tHu58eyX_l$!G@Vn@C9-rEu8U*|oH`itesEKhJh+c6w^+#L1HnKRVj&y%$uD
zJwHER-OzC2RPAu1loJAL&t1Me`NoZie}6vv&#QiC$;8C;?uFc^PoE@AG6dG||7Z35
z++1aC?e2ZGzZXqma#((Os>QK$=lXc1%|yEG&CSEBzim%WPd}c&|8H2<-3^I{J9>Mc
zJ~-GcV_nwsd|vgtzIU!1fnPxtk;vwfmqFX|?w(rQZx^*DX8m<x0f7ToRtBFuckbDj
zmzV#%j{l!@V?!bX3k%DKlj`#ezP*Xmi{0ho;?h#KX6shdz182-E-mql*;&N8_~MED
z{eO@7+y4#8v)Q<Dql>$H@yko9Q-a>y+dF%H-LKAbbFHV}D1UWjWz62HsXsqI7uSA&
zeZ9Q8|GX!!*Y97pU_nDu6VtahHxF-2KJK+Qtyjvl;@{8boc#RHA06%f@#`1Ug!%J9
ziDqZbPb1T;D+}`O?wT=k>fiZ+OGS3>+<CO_)9248PoG{KxY%vR?Ae<W53@ab^k_@+
zER#&9v^2Hp`tjFJf$GvLD}%Q#`T6JP=Of3CP5b?Z!L3*7=$6dO9)5mqnnxEpx1Tt3
zhULJuYhwQX{%!p7=N33NZ`iP5#Rbb&fvfI9=87(cYooX8eGGl-%Ppq!;G(<y($Ljm
zC9kjbDmu3bNJ()WcyV#@$IqV^*Z%%i^5(`ym;Srq@wKjXb#}S8w;eros_S4g`{aoe
z1M_Uc!oq64-AuQzun5?m7u&`w?N(7?;oQ!ryYc<0soFPp7ORJah209Dd3w73@k^H`
z?bu<lXz}9AYilHZXPZ4eYkprvUw{3X*L;#jExOU$KK%K7J~=OM-PHQOU$1Y;xY)F-
z^!28clS0$<<Kv!L*8O}cuA`&l^{3|3Np%($mW0&Qqg|rfTlB*i=ggVI!N<2ObamL3
z_3`t+d?|^md@B0q@86eqcbiX}Hm&5{otd$_%clN*!w^^f)|82f>BZgM=B2N%r9L?^
zF=l^V>}q~a&W9f!9yapPYU#YNa`xZPP8^DIR=2n1Cg0eQILEGb*OkO5#Up=RtM0T2
zJW>n|2srTP=VwXNtSKtF(00!w#gaES3}0Pe@9*ZuCamta=3POHfT)V|QcxR7K+OrH
z(}_dTr$r?XDk5YKYBg~PE5e#sE4CcbH8&U6*Vn(bJ>P$K+1rZebITbj9=3{S+}xzf
z*38&^&><;FsUW%SBd4%hLVkY#?Ag-4zrVkJPE=I1wYiy@kB`sA)ipHi1lWe}rk_8R
z@3h{Ubv5brwY67P2DdjgG3_dSeeANoJ?{bg|3CfN_~l}9{_{$kO_(#sr>MwiciG#d
zJ39*B+}hgREv|1={LJUuySv?f%MahZD_dV*zh=!EE$tGJqdR^FE&aOVWr@|RD=V27
ztmUoP%_*0hoGfXW#9}qK@7!GL$y2Agy1B8<Hp@M9{rYq^Ua5rC)YkRu^`*`8)~wmk
z3Tjfmi=O(cFFG(#kgd7#Sg-Ws*RQ>o`^{~T=62k@bLYobtJh~`vo2e@G%-DWxomcJ
zws%xi)Xb+~kMYm;lGb9q@cOHYn%bnPQ;#+<GFucq;dsCQzn$-FGgW*0`5QJEJUch{
z^p}^H|9{)6q^#W8)5D^mskt&@le*UQiy24G`0Oh-6ulI;%>CAxFCMdJ&U=-!`OFy~
zy_g*V3v0Da3wN)q`}^z1j~^QfAG?8CeGiUV7wGT%p(J1TLy^JE%xsQjv77a?H#axm
zxN##PIoa9C$!TLze*Sx*>m1zN$~rnc4E+53-`?Ggt~6Ka^7Qb~*y?-aujj4PAiLx;
zjV}k!&3*RpVc^4BZQZRl_5W-lBO@7N_SellJKOy6lP4~2ZfpgQwJ&YWzW(UZBc_J^
z`|aavzlwTkmc6^<xwq=;hHcx9oj>2dV~2%a?5-nAy{C6{cOO1~e*LCHVxaKZkslh>
zAMNJGrl_bWA}X3_BE`wUu^@hboz2fDljqpi%PkdQYi?Bap2l!tdw%@2_3{0C_u6jF
zzW(gA{(hDPj?HX7v(0$@{rwr1`Oo)jX5-b;&Vj^v*izNJyu3Uai-H9%Kd-Hg_P!gx
z{PM<(i;IF@J2^XZ^78Io{t?m>kg6;WLhdbeWR(`*yS~aG{oI_S+x|8mU8csAA8O(J
z{=`ld+=)2i{CIumZT<a!f?i&~v%6e>x?b!q_2;1KOtD4ad+Y4rA|5##33YY#Ha^*|
zAg%`6>Te&82>V~yS6jVj&z=+K&MjN9qT|Dd0vlV~!s6n`m;LRpp1Zu*oqt~CGf5d)
zSxM_M9k<03Pfk`Z{`;$xhmF~}osae3_1nv*DB0M^h>MHI#Kv}Zc50e0n+<A>PoF-0
z+uT{Rx>m2&ZV-FSE?-mNKQD0VnaALMPoN+>pUi_77Z;mkU+Y;HyL(gealSo!_Hgj<
zsHmy2`TP5?i`m)K+RExZU2kdJ-l{h@H@ENEW3#L5t=Ic|dl_%+tv2`c^fbx8H^;8_
z*M_vSQc6ln5jzSLJ3Bj7*X_=|Z6+ivY+3O^L0m89z~#%6<!q~_NCt1uySpWM?wmO%
z&YxFTRc)Q78!aR(?3|FGz`!l8cjW$k`Rwf1)t_pX90zyNzVqtq>*wU=8X6lPPB54-
zYu2ewVf78$wq0AswmN#d-m_=VjvPJu@oIQ{qJhMVD=UR(&YY>DuAcoo^Xe+ms=f0b
zJV@A<f4}bbYdOoJmIn_KzEs)z&N5k;etzDYo12%Pnek6cLBZkm(%mz6EM30*^ZfsR
z+Na0Y{hVhj59)+DaTMNP5xRQXx7~5;mv7jx;Y{t+Af1Q}4M&c+2nh>&dU{^0x>I5m
zyRr8FuG;m<Ts%A}S67A3v8fcA>(}1g%nS<JD=UL<+`fHzkt_GQ*xhY=_x_DN&IoGr
zwg^13d;a2uhnrj5HRfq^=lUilD%RE2>BQ}s(bUAm;5X0a<ki*Tjnd9bH8nLQdfk$e
zlC~^7D`lFM@%#R(XVtl`uC8<J>*wX(-X>a|zt(1L^me!P*NwBU>EzD%GBbVN!+!gJ
z3Z|x0uUrYqy1wr0%HZYyu5F%Beh1XoEsS3sxmnH7&~VS6Pp4z{*U55ob8pMJd1yu8
zVh>;6rOC(pG?r#uSs|ExeH|+();>NyUi|8cW=&0vgh2vBL2_=fxw*Mj*_#ueo}S*C
zUGo0k+&6D>Zf(!+_gntBncr?f=H+EF_H}y>6|%9h3CYR%d3l}MTg$Nh!pi-1QXd&6
zxISAwO=nq7)RMhE{aa?foM^T>j+2uU)C;Wn|Mz=&iBGAXQT)X{u5Di6g4L%5)FN&X
zc+~jl?c2WxzEybjqxa!Mq8?3~J-d7Qbn&;hw=a*~UAE`*IctU`OO{+%8La;1?ORSB
z9u<3g`G!4vY?zptH*Vj)Jb1a^QMK*W|Nnm9xPANaix($?dP%T~W~-O?ovf>?j$T_E
z4NAM6ot-&%c1)aWUH<9W?EFu!R<AF5bwzW{nl%x-N;Da~r|W@Q5W&m+K!rR=1=EBn
zQ(Q_)Ox*kBQXe1d{qtD<|AG}ORzQlgBhKeCLe|($IJTL4+hfCO<MVnq&fQ|pl<2)`
zFX+BhHP33UU-XWxkXBcTm2C9(ywnd54l=Z~v}Al<d*s+LAt52A0}l>1|NMAdKF~{0
zTG~4xKwxXu)kUeNr`@=Hdvo>occ-|bL4_5+;o?BUZ?kLne64S{oKrTF+w!N3-|jP0
zOxq=YpWT<`e^2YH?Dj>Alz7;fd!@~fUB5p4+S=&g{dKix&gR#g$pTkVQVf}ynKy3U
zWK77*^V?hX^~9MoI${5=CxcRv;=84#?3ZuWy<)olsk{GOtiHyh=TEq6$~E@RQrf*R
zXgTvm<QnZ=#<z#oqLPxHDJd!p>F4LEDk?6#v`)3_^vRPGpXWNk(}J<B`RzA{=3Dj@
z>%5-2#_8OA8waPbqN&HqR%^{Xz7|rAeR#co|D>r?T@w=(pFMjv$F^EbT3VVRCnv|E
z_!-Z(ZQDe|#EO1?N|mdAW4QLuvgrPup%D=hiHV5^dL=vLul$UD=WgV@BXi<6Bil=9
z;41S@&gSds`OlpE>Sq2nT@yTcFXO#X@f-0Ww*p@N{QUgnsZ&f3etmtNb8nAidU|?I
zO-;n!DpQ8Qz`!r>@5?jP*Vivvw1@#D2I^W^&E*4$EM9!_^l4>P)z&9Zo>)hH2?fQ-
zqD5<7NB&v+>r_~#ZJwm0WJ^m+KuAc-#S9e%g@(Gnzb<aejc#gcs`&kO`^yUpnU&r9
zSQ4I`n8+h<CsSQrEoGA7P*P&T@b=bLWd((XZMnCT^776ta_zqG`s-BDZJ;)YT$1wc
zttqEw9gfl0u#CwHbUwf0o$hMIRsUCRIdb;w*<NOLz9|~{S679ujBmC3bVB*hbNm0A
zUt@xo`=!3Rvhur)8@OVO{<%H>*ssHqf9BLIjQErm=<HwK-e&Xq+8U|Flea?hq)A$u
z)ij>s=ewO>*8D5xlAOLiIre;0;N9p?Yod+MoGG~~H6Pk8+-_ie`QrTq=X&Rp{fet+
zuL;!xr5#+OZ3J4Bpdl_|2JJw`xGYu8OFuW~;PK<_+j4JTS{G~m?EHNG13i+)8#Zn{
zm|&1#AQ7>@Zg1I{-|zQ>S`2-CeL1(bcxvC@QK;P4*LUOgZDB#dz(_@IP?+4gen$K4
zk!R0pzFc%Kd2?gpn>RUk?%w_Q^(*UxtKsoazg~~m-&P+KBy{oOMH$<wDPO*n%&U5(
z={L`Y^MJSh-XqV>&fd6n>(R@Xh3o6<Z*9wsj$8;acDj~s;pT1Ij@`N?#o+DjJ!kG*
zPj7GS;{P{pL^L%utq5H$rW?KO$h~{>?(QyUPRP&qUmdoVfrE=n$;>Pa(%!ZyS@lGA
z`;{vpd@>de%l+mufNI~oyu6mSwxXApR2eQ^y0l^Q=F5E>w{BI{)m?k02;`lPcHgC6
zcif#adGg@}j?FX7^W{LDhdVn8K~1H(e#f_DUY=o+DHLDx@#y`1ekmy`6DLk&m}6T#
zt)-=9f307jp{1qer#-$MTc*F9Xl4w`MaM&KtqBs|-qggz#w&H?!a`?Bivk4(-&rOX
zU%B_o`R=d#%dlbN#)oHS8b{r|xjB8~#*K^`oBxC*CMLGDwlY3=_^@&N_V2d)<pX!M
zw6Ij(Q*%KyE|ax#!!n(hdTF}#$#ibsY`lK&w^?EHCQe-V_h0@0@AW%&>|l8C{=I)m
ziAiN;<&q^!Cd`@BGkv=Fym|9f)YT_Xn&ebhXLs@9#UH<ahlkw&H7EtXul0Iecza_q
zdwl(0({=0CN${{SJUcV<@TsZVo<2S<%P(JC<jQUL_e(Itt*zPa4h{^UUgDiQb6mT{
z7G8hdDE<D(&!0a7Lqc3KGBn=a+PWIv19+;I=eBraGdusJdGq>OTUqt=^i1;Z?6@Ob
z{`wm4(vZ$i`)Yqrt2}dMW$;X^(yW!o@4T~sM2F+d&fB0~`kkG{H#enn^YQVSWL@dF
zb0@}co{i_*TU#ZK(|8mW6=%#m`G_0rh0uj(j=X)l{eIe=9fdW&UM_!8vMZ^y{@+h^
z?XWcuUa#M;rlPXsQnLtH?RL*QVe4WdpE>qQo2NZG(m7RV0;sTP(TLsoc+UTy=j%_L
zJI9uAZA~PPjKze%zrTl9C4luU+EFCDJ#uqeVrJ&aZ*OmdY8*++B9+I-dRKSd0OexE
zBf6)2@8tQN{CJq(UO-IjnDKcV=eoMOrR?CQrDB)1dEw@b8xzmWFg$YP2&i@b>C-0z
zsoqdQNCm#bb!X@8RiUdteEyt#d71B(_3`t&#q~d(um87MD+xRpFR)#soax@Jt=W$s
zKR!IuIDJdMkd#?Y#LGr-yS(GkDc?KGd}klqlzKYj?yjw7^Jn`lPkw!E?NqrrU?2XN
zUnIOebamLr+xhzgH>aIl)!_~*hn=P@yi<NBCMM>@sZ*=EA|PJLopSrJS?VbftCAND
z-QC<<v#wq`b$xAg_(~+ZoSd90KA$x&EG?bN`V`zaSryp2PiwA0qEqM*C9pYpi|^=d
zd3SfWaq20Ny1!q;zrDY|{_m9MXJ#7z`0?Yyw%q7jTeFY<`T3b~N5w~_ef#!VSXexG
z{rdD$@97KyUWSQ>Sl+ySTUcDIUDmr|g@%lbOhG|`f|k~+y!YT5<j|6-w;z{1IKaqI
z_U_Kgsi7%W0>#C}cE4UITa~@>C@V92_wL<~U%x;DeN|sy9er|ga%V?}f|63x&f@2r
z5)ZRYnmpOl&kr=%*CTD-7rVQx@Z%%bv~zP#-rHM!@ny-5-Mb&Zf8RfK>ec&uN<k@;
z<8n~mBQbgT`f&Aa=*5(>nc1|K7M2Iceiz@nckc{n6zN8UTaQGe-}1%9&(Ec#rcRtV
z(b36?Y2UtmCK(qP7~<pO&GPPe6cl`jJ*o-{ti)BPWVHT1o1LGxTkRz%`G0$RTf1Z;
zqA;^4e>Z2^1=HMHQ;MITW1KK;THD;YbIW?xuGP)V%uJmIE<<f{PQ58Sedf#(=XO4j
z``^9uTkO_*;_TU{S5^l9`EuDGG$+u}(NUy70j0Rp4O@Qs<(8`w|JI#2;i06g%xF>l
zZOygS>RR!BetkCi!b)#btCpMvw@?kInrRysRzIB@E@5A118R~T?~|3WD&gSf=AJWm
z?$2MZ*H`}ec>J_&rK2NbGaGNxiwg^RWGn<4)O=@6m^iU<>sC`qNlBBuI}!}i($eSV
zST2^W{`Mv^Cnsma#*Lca6wc4H?cTn<yfeGCwN+3|jIE))-F@Znj#CpSOi(y~yTzk=
zCa9CGoSRnVv-E7<@4ne_>yKZ*E*>9$-`Z|tP}f9d_eXEuoXK9lciGh6zkdshih?7)
zt7}pH|9??G54Cc0a&aYHUl)62bvS=^cJ`k?e=1(D-M(eL|KmrGD*pfd{@y|k9NEF)
z?~ZepXCLpAt@-=)`i@<@k_;pmW*DdQO;-0;Q&4DF>OK9^p335|wNb1N_xII;MuNY6
z`*vkr>}>u0e}o#?c%=+-Z<*NG+A_|ttKIc&VG}459yz?^lF;_NySo-n(c1d%xO}}%
z;IhjvBerA&uKeBJ)+Qt+RptL!*oouggO$O{`7A9hLyOj07C-xNEjmB*w!h6smi>P|
zc`uF4e{y2to4dQkmx|=x+LCy4Q|gtK!OJhLQ-A0G)bBuaNJxliAt-CBrJkE(d32k$
zaVGPRACLP%&G&Xb*+(Cb%cnm))H=1wwy<8yys)^SK*7dl&XOfdASJ?&+j;vk>?%Ga
z%$YMs!m4D28P}Hd^K$bl9&v(tHPfey-@JKKL{wBzPOfkHa`n8tJW!E)tFr3Li-X4J
zZJHM^R!%<NcQnJqsPa?Fy7>Kl2NM!rTv*5u^?rI#m#X))2ag^#tzNBdXlVHKSjpv&
zkB@)+^y$*J;NalG(o)t7X=wpog@1p2tv1tcHqd>_H&xE2V#13T8E(B&trs&?OiZRM
zSg=4SxEkcW?~|F?`3k<g2%Ke_?RI@#?9GkI>~V2%ENpB-!otnVm#c@bi#d4cP}5TH
z=|St`Y(bNEadC37yUUiAzrV)-8u4ASL}iwFzTbSi+KA0*yn=#)Ix#yY9654C=z8qt
zG~VTYb00l8*c`K?fYIOIzv|12z%-w`yGk$jl@=8#+1t;bGDRe6TaKrX4^LxbW7OYn
z{e35XeSMvIdz-F&?Uz7Rd&fe1t?kFpUb-~t;$rvDr}g(6l)sBfJ2$6u>sC`mMa4N*
zrK_}(K(*4l8PlhWZ`-zQ#>|<C1`-~=zE6*I3P(goXUp%ox;9$8xVZSyqenkptzK^+
z!Bg_;O6Qp~K9eR--uih<)>W>(asCAb1v@97&%eKKuKN5Mp}leblhu4*+}x~Q{q2pW
z@v{9~92^&_UjCc412oIFWcl*PZ{DnVt1PD{T6H}4?ygcE34?}CVRfU-ODg|<-~a#V
z&F1r$w&&kp)e*>{*s>#eYu43Eb=O~i6%iAgwsU<@7b~~egc&nDl9H4(H8rF9Z^yT*
z{68m<>Y9Y!-*?_vwKvW`AVA=L-S4-t#}$3TTkjQi_x7sV*vw&O=lgIgd;P^Fp2BSW
zaz69zYF8<Nn%j@&6_k{yC@VX^W;}iF+_DWD1nlkY#q{HR932@M)cxl@d3AO5f!>9e
zrc4p(aZ@%lG`w{IGzP!rZ~oQ2)#Z;KKi1UJdi3Y#=bYQy*50VjewulAmuc1Bd6Apb
zGWX_sY35Gx);zka^mRc|(XQX@iaudG-|vvMDp_&ne6Rh_CzGq?Kuv+b$gFE?K#{7y
z|Buo2_3_6K9ct2z-e!<>MI$IU_~ng_%3hjMrddaBZOvx<@a0R&pC2FRSe0_g%geLz
zNHE-q*m(Tx+1@@`>rFL3i{45xv+*QcT@|_||GwN@zvJxkH4c}T`C8Wfsc3!7!^5M}
z_h!lz*jfM^n>kaah&*uHBqk(uXsUL2fY-%+wbf$!aW(w&dkjRaLa(J{|Bv_Z;Q0Lf
z{PMcLzZhb6m2`SeR{MBVJU+y;q5SaS!_9lEzlW7e@bd;os=3@z02kV;{GFVbKx3Qt
z>;M0il~>PAi(0k$>^qfw&GHprE-ZA8y2J(=Y;R;{e`jX}F8K<>Px;>Iaa+92PvgMR
zZt?uzP0IzGI6f};@a@~a$M?(>wSsrP-{CvUq*E_;*M~o!&o7>m^!3$MpSf0B5e*gd
zLTSS!mUr*o9XWoS*<4L5=k~T-cuQ#Y1*Ok^etr>=k(tg*_W%E9-EaS|VksM<ofN$x
zfzdeq+!D{pYA?Q4O$mBYV#OzG6(R~Af&4DJIB?^;1S{|D%O@26w7T|HN8;vM)qihi
zC#I=ocXXW!es`Sn^Yio1k8}!yx*yK%e1$(gJmgZ%UAS<eN!}d^adGjrImdb=LBqQS
z5(W}HYvT9Et*%BH{m6VOZJu}J;zhyg>S`q=rFUQcwFo$MRG-QS>G_)`pa1uMBH!}Z
zXHI^0?U(h8Q_m(ItJ}=K_~W+CIh-H2Pno^r=FOXd>*MxX+kNEL-*eztueA4+{l8wV
zzBNh7eEJzLFE1r4t67WP`!^LncKi3Szdj%e9LP=D_iA!)N51x?M)soLR_rc5b0
zz7br`o!Zs>GRMBOG5e?M(NpWiMLrgPV%=vwCC+zZXS?H0=cQ|-wjR2Ddv^M~%4IjC
zve)nZb}P64^~J^RE4!ahQ!_K0wqSum*40(6r>E&|+`c{i`@6dgph37*q1p^hO-&uW
zy;J*@TLhd!W-q-R?4^=x4jR1pJU@8Glc=MPo$YQrZS(r=|5@zYx9`dG=ac8oUHkXW
z+E#AyN6((QWn`={vyX|2k}4}JyJfd++qNZ3mrjk;0~t2G=k)deVS5GY-FMpNz2Ebh
zkKy{d*u`aUZ`Hp}dVH)m`PP=q7bRB9d}o*WPe0ey)1zW!B-F4u{rsc}6AJv#X`h-t
zd9pIBy5Ic!^bNDh7=4Ys$9WGdYEx2I_rG=K$c$6-Hx~(SkFWnbb)vHSq{)*Hzu*7=
z+`(q{pI<Kf|NOrHf9-Mc>`O~Jzx{r3aq-LR>*ZmC`>U<1zv-BnnK7hJ`wt%sG2Nfi
zyJMAd_NSR*nfjNNbIa;aT1_}M#kAeq_nuDHi%y9rUUS!^WEu-6Yt^4SC95^pqEKnJ
zS?;5a$K{w8)c*dKb9>v{b$Zj_J+WKj%m4lR=i=@@dHQsBFE6f>Cr?&XRz7_B(zB?@
zNK;eu$@AyJva-GV_y3QL1&vVc5dU-AeDQ8)XPvAU-BY8lT)2F9q2l>nrmL5oIC6Nw
zog=&&-{wg+#I4s)KL4MSk8hdxbiFwig-p@W(bM!|Pf6$RX<WTpJ1;Nq$noQ!KOUE7
zzOXLVnu&=i<MuY)+FxH1FD`O@@wMvA*|U*53P2;P7S`6n0s;=Z%ie0$6kE;hOFZ0`
zdAqf>)g<eR$Lp8cr+Xxge|$L1?>%Mt^5w=A9}*t8rN!N=e0OY5<>wt`Z=+hj?=O2R
zb>8MPkDs64mh9_&Cnu{<-}(O5*6hmfcgx?q&Jdk6V@5>Isp+M!uic%j8M|li-kZD2
z^%?wazlLnwTGA@8Xw%G@FC&c3%<XVox8v58#ffug?YMPCIcb8nciYa>HhGJJrdo$&
zrKca)-~Xq{Z~5W{3mSfYe*XB)o0xZ<;p<`|7v(%TF){U>ed((!LDwDL-rm06Ece9R
zsxKGaEsLITK#T3Ov(2YlUWpPsJIgfr>#M8fwKcA1W}EjX9&WpP@plVohT6`~?#c7#
zmsf}Do8{a%aM|BJHfcY<-H!&4vriY~Ya1F)+*|!U>+SRR@9WQ3`?T}RpWFZcZ+@hG
zI7i^pGiOS^nBTZ?zu<fQa*jgtQ*R5SH>GfPi|H!q>helUOA84Lzkl_2+pA04^W*D&
zJY;|1W+W*iv*v2-o{EXj=T-B8#*NG0#pEPCY-Z=bwA5Q%Mn>kxo6YALJbZn1%@%&}
z4PPIpYGNXyudi=a^1>k}M~C6`G~MI}2O7V;yv)wd&VH}rG4I<MJ9b#?DtYO2yid0B
z-_PeKPo8`@+22ml)>c+tUcRQLrYko2@v&Z;|9?K;`gT@s|E;ar*}t!?iJaUgYhCp1
z%}pW0c{Y`kqPAvT+>q$Z5EK;j;?h#?b?esc`TOlQs8xBRGGEHR&gS$qUFHXO%kP7R
zZN$aJ4;?xb@amY#WMOr`Cuhy?A31%x+bs9ihtKEjC(oaM{>#hDot>Qr6AT1|g%58^
zJq_w8G?;O2KA3Re(4nUJb-z}gd0z4NS7}pIQ$bl-+JOefnEn6u>u>w>_pkTg`7DYp
z0*@R+@*b5vI>I?kKmOd#;^#j;pSS<{Y<9jy*_(*aBW3<9tgM2vvc7qFdhL9&i-MQ?
zMcuW2zsI?qPu8O3g+TGMGl@q!1f%XQbZ)<~t5kdczhBz#_x;v;`7+gi#gBW{?>|17
z?7yk{d)}kRk3n<y;8sQ4@)<KE_~h+;e0+G)&&_#wsFnN7Jlom2(c3oE|F5&LvH5X*
z|G%RxoWdE`)<}N*`0>c`<KnxI1^e3`T^GB1%jaW9jtGc|gyhU$vqop%?|0T!UtfhP
z9h0>#yRb3YJ@@z2@VLY~I|^m&Y9z9=Uvt@>{`mO#=JfM&YsEkRe!u_vD=u#Cn|rIx
zKR-L0{OrukD{CSrm*20Qe&B$EkgzZ(506gSy4jNr&d)OC=I7_1SMjLRXg+8e%gWf@
zX2<(vL0aD2-tKSRKXs~TWo2bbT3Xl(IZ&rEI4tkcw@;__!&khid;08|TUM5qcKA9S
z!za6Hf0woK%kv$W>~H6pmZlcIE{1VI{r`Uv5fK+QBs$mB)NJ8w^cEHucXx4NVQXg0
z&(AL^F8=xV`+de2S67RFeRcKmM0dG`xwp5?v8$am*SdVtlqoLP*Tou2@VL0T7XJG3
z^7g&<ojWaSe|>ql`~AMf8#YXkulv#X@9*#IudS`EEbQ#YiHBHn4?T)bO-Z@1B2YQE
z`gosg)ceP0(*x%H_)%e1`l@C1YVGLlc~4(lTpV@xKqE6_L3#Q54@XX)JbALEy?t}m
zRV~nJsAt7@w&l+Lwr<hv+0uD=c}LG(-ck7Yw%Rq&TF2%^XUgw@#vMSb3(n2AKfnLq
zSN*krPEFMo5uIvVfA;T*`}gNxkFTHG($d29{mhG!UAOb@dublse!s4J@#5r-pJI2H
ziALu-t-l@|5z!IE1?pqUJ>ur#N_u#x71WF8=jYea)!n*l!v+Hx8JQUtg-I)qM@2*w
z{QUH^=>5q%cVre{T#@_y*|V(we~#P#J9uwz^~TMclaKXCM#(O?`}}ORj=!HDA0s1U
z&9|HBQT_3e=QrBy+_^I%BI3?PL(r&`rq<@;y7u<-=gg7e;^yvNJ0oCw-d(SH$NzEM
zyu456mft&gb?Xl)mAkvk(|>+?dZ4#_x0G3q$K73}jnbzJo<4gvY358%Cnu(P^X8?_
zyMO+CKP$Ib#`_vsi-HH+K7!Uw$Xb^byt<<K;Fxuhy8k?u1D7ssIx|(X>f4)>|Nj2|
z`DXL^F5b!e{(ie1v#(}m<>zNpD}B?`)8AGx#O^LrwX%|OsAgk0JKJ1cQ?ql=9-B>@
zHvRZ`T)y$YY5}jbnTopla;N`%cizQ0IXhQ=JSzV3-rn6|Vb_Y^TwKhq9kzzy!HJ2=
zNAJ0YczrxBUw>hdYj@FOLC{cj>`dD{+v;yi($CM^^WhM;N&dY(TFJYf=j7(jG)Q#n
z{3L0dCLkr%HG8)7*6izQT3TMKi;Ig7A3xsCEv^UZanCZ%4tpl>05ocAQK*!cmnYHd
z7PT!W^3C7%`?IdB5Ij9iSJ}d1#-2Sk3~A@*`OY@W<>2OC>_6Y`*6zIRmtKGU^7?vz
z>gj2j8|T?pPy2qanqOUA{jKb_kB^TVXI)v5bo|?ki_M;s)fP^1T7KEE{$EWSkK~~{
zJBzz{ciy^rQ&2zvG+w3~y$v)1!vGq@ySUi>@uNphmzVkG&pvVQ$>Yb1d29V<8o6rQ
z*ZwkD9lrkAlarHgosyhi|F3f9`NBIp3iteY)NNsH{rJtBp5425m#wS){q5q;;`B3T
zPycm@;!tccsI)5GT)mj*`T6<JoB8b?ytugdXxp)Ed3U2;9=xl|&(EJbPyeI+`aPex
z&fERg0oAyjyR1rIO_)5nIQ;qAb2pd8L)OoD|5b3FaxNr|UHm_2G-b(IAOF&AH|IXl
zeOfoipyWlsoH=tmJUt)IEx)(WwOdT|Rj%Oo`P=VRz3x-{Zpx~sr#EQ}r(#P*#+2DR
zQq$6mik^66U0LCHexB{mN8S4GUf(kd+@|I`Yl<b~ySv}Eg>JvN*gd?Q!-+%DWvb@)
zqYI==vltR`bI<bI|8dxyc6LX_$0T_7K-;`<_s^q`+~sSJoYvoeW~z30-tVuAK*J|3
zD`cLk<(W^nUlqQ7-nXfXRr0RpUtZ=Lv%9R9k(n)|TmzJ8RwzB)_c&};wE7d#P<><L
z$>MPp3lDjRf(A(+9V`g%jJ7oH=<xb>D|@}5goMZQb8}Z)i-7t)Aq#hV-O1bj=dv$S
zN8t@*UDu-r4;t352Q6J-%2R0)R`WS<@1C5wxjAS+Q&t$hK<B91iQ0(VJ3A)k-ri<d
z{q4;wK37l!K6K5QBcNe`U8S$5{n)r=%bE*K_w@ICV3Ib^(+Lw36m(o4x7WkZZ&~>I
zI2RWeh7ItsNi`#*NrjJ(F`5hQooSrDDf_x!+53BE&&)L5n({tz%G|kUZ){8kwRTqq
zFZU~L2Q}h5w8GLJO`ALS>{RV=uRkg2>ERiIALVSTE-Y|tPD<UlabrtM3xj}=(4kvf
zv$uW{RCZf%`Q?i{JB=eFBU4gRHe_B_tEsKcys&^VEG+EDuV2$V*yPnf)5FJ)x8K`a
z9S#j)wsO;us_mdPyn6%gZ^;ZkH`kio0oo0ygLMP`8T@)WJw9S@l__XAq_kbBMW8-9
zv`^~HOylJ@z6tx=1g^X;ZCRwE?A~|e?AhLf&Fs@H5B`KUA+=4XMXRc*ZL0fQm0Nvx
zSLv-2*Y7`k`*yAF^)FX~{cW!(Cx<S)7541f%M#J5@O6uz%YgE9_RO0wq2T<v*t?%T
zf9B-oPChfkP{yic#jeAeit6gm8>{Z|TV?8B{;aSqGB7Z3>fOt0A~$!bW*0v@v+_zd
zzl_BMMrO8v<$khax=}~6*Y7<xLDAXk&G&D&^FQCKejj?X_{NSxWp%$f3(DT!Vz{z4
zdiuG!*8ktWPESqUn1A1{sHo_}hYz5!G*An(xnuRTlYcCOr)sFGx^B(B?sq%y^0Hnp
zfB$eV&8hR|t-G})bMcfQEgc=7wGkT^9jODY)=f-IjQSb>UAuet?%#WNwf;V|CG+x?
z;`6r4d)$&s+k;m9`0?WvpW_`yh!<`y?zdYtWmjRoZS}X1W;WiSzP`S!6B86~)xR!#
ze{b!sl9!9_+_?j4!S5=1+NGtV6S98)zg?d`d<a;#Ze7;NNvgLhca^_i_v_o+)gZ?l
z>ycdC)zx)ncK*JVXU?1n`Tgzf)d`BuU9q<_Ojccwua5<JV&{$>D_*>KQIx#v<&BNW
zr!HR(zU}Af8K|eHcdM+XW{;6nZ`PX|8?}^_79IIJ-=ff|vbs9^_W!@{>qDQPn>)3;
zyL*;ZY1YQ)pU)Vd&-lIf%O&qur}g)TcxjpzKl6F|@ZrK+TeFwD^~<e2dSi3C|JP&E
z`5ONI{!bq~Sg<Yk_Oef(K24cBcW%~=4GXjG@7vqe*B2IF_j760%caw&1bJ=EzaRIz
z^2tQ^5GSW5(Bg3YeLog``tl`&t=VyH?CvnlrB}9QUk^!0SWr|{6m;J9`<zdoJ_S{O
zdlLv=41P4J{{8Gpul^L5saJYW<(vBJ_wVfZHaAz-i~H;A*Zx_(exH-((Q|XHSB9^b
zyLRo`kDosU#l?>=^`5??<fTwyVd0OTKSNVbTzz+8p|eiprWV(3vAdV$ycAn@#LHL|
zG*o_mHgU?7BlZ7($Ctgi;aFE^*Vx#2X1@LWW_JEfsi(zsbaX%qD89bBntXnqEofoU
zdAr{|Q>KV8fO32jE4PQ2*Q4$C>yppTGIeorNk~j|OiEHZJzZbF?5=6{wFxt3te9(W
zV>73xr^hIH*VM_Ag@6A1d24HOQW6Ib&y`ot&&~CQ3_Mxr@BdTOS$%I~GP|sG*_Cae
z%HUY9^zny>+qG@npa1yy_~zzxe)m3^hrhnQ1}#B}icL&Z++F@Y?cScsGjl8_&oa&a
z^ltZiAqfeOq@<*c5f6_UWF;jjZB9FT=<wlY^?4Od-|toTtIw~Qba!{Tu)Mr~MTJH9
z`nc4OkB)){$CHou8G_axzP-JC?(tKny7ug`F)}vRF5!Q4E;Q<qn2e0ibiG)M!bdFI
zwr$gi-8JRpWcAIJpVMSyWNvIq<({nW&$b}{{yxyiXLWV8Tff{{P=Aa;$|U2!mdwi^
zzI-`y`SRqKmzPif@!+9jSVY8$6BCtZ8mIG}on^ZET|tY$qM(Y(%EavK?wK<s*_s=#
zTnX`;V*#3zm^W|Uf!;)0!egxlx3}l-e_Oge|Gr;AfkD;Yd1}71E^JQs|Muo)aGFoO
zsDOZjj}K3PQDODxv*w=u{>N|LjJ*9k3^eDFb;aX0DBuqsJjlQ!Z@0(kysn+yJiobC
z4`01Hb!~05W!0AzVde!792oB0z3bh(*94`h>lhOwBPb{+!Nay^@7|MV&p!Qrzdrr;
zw%l6*D`q3LeYwusDZcYty7%$6qNk^VPS^kWcw8veYHpuvx7gHO^%)r&3l}c@ZekV;
zYP;y@>PlJ`rIdoo=BdlV)<)fWU$yt&He+#(r4btv96K*gZ)s@}S}atbK4p=TAuAi(
zr$3+18y7rqD6x{|<KsK}>T8v)qoZTZhlA{(R?E>>IyyQ#YJL{&{5Wg&>|_(Et#9w#
ziFrH!|BvJLH+Glnmz9;BnPKR>dbjDnPxGGpT3T9yR&3u)pPy-4=cT#t@3&|fS=rOJ
z|H9yn;6;yCm8=K}xdWOH`~80Z`Skg<YR*fXI@<ddFIJY6l#JrPadzd3KY#v2?5Qwh
zWo2cUV^cY4b@+NF^ER*GpdbUOUeH|Y+GulztSc)PzO7&V?(S}3VPWUMK*8td=YvA8
zqOvm7;@Gvd(TlIYo@tsr?b_{SQyyJe8GP&g>uYNdpFFwp$i{Q$&fThmwRsoZn2-nB
zb>QWtq_5w<W{u9;&yUk4UE7f8TvSvPvA3#p=f|k1s7Z6@#%9H4WN3gEmfqc6e*F~4
zjT;ggw?$oQWM*%WzFfODVk1+7vU{J%?+ic9qw@8C9J|GIPxZRrk@dXu&e==z(4j*!
zj8eJQMsHsSPPnJ8u8EvHV}?Xr-Op6eM9lj2`a5^-?2$6<0<GxSnCu=C6C+`q)^ly~
zlb0_~{`mN~u&iuZ>gj1b5(W$n8#Wl+xpPOtsw87$Hb}tS-2B$|{PUCj?M_0+q~B|t
zJ8-~Z-_K{#OiWA-(*OVRadIk}n23ajhZ`9g&9JMj>U<2E;%Q`N-?Aj*-=Cj4QCm2|
z!>=b@j=KqO_cAgvmb|#Y*zoJ?YvZaf88I<2A3lG++%_fu`0?ZIOO~kcusLshc>D5l
z|IJljv-bUX#LdIT%+}nvxB9zL?ky88F0O*&;>EGM%Y0^=@ir7bJ~nak<mSzrjV~|v
zKYsUato8MMwZB1)m1obcnq?ksem}2Xs!-nG>v}Ky|9^@>wFeg$*DC8NT$KX*E2<}W
zR?n2n-2Cb9*Vm_Cg-tY@ZIIY>YfokI%2M<FTBfGLYCZwc+j3UEGkDik6RP#jtE$Q>
zc$rV)%S%f?)$2HMd}OG8y>>fjtgE!W(0}KS9T!%IuYbk&xKVB9!CDv1qo9>6GYpfx
zYL+Uu2rSCEckf<Bb#-{TL^){C`p56z)1$(#WpRJs4QcoC+j!o2r+mNex9;(N`Rj23
zqM(NAj;Shn-#bBzu@eumgsqQ@ePaw+fIa((?DpSBmUvE<Fv$p5s$ZuA>dIeMwaxq9
zcy_kAu)O^Fhlkt2t=5VK61`oe?3W{V*|zqazP@o?`oR-<x*zlJ^B5b?R1TgY=XqyU
zq@N#OVq&6B<R%tx@6)gN+&L8GB-|D!y)plN1F~L?VaJ5Y=ckyqN7h?3s~*0iS}CmN
z-sZLX=9bLMFYfKNuKo38<+}jTn!`=+5~?;XdAsa2tLvXRHI482PFBr2^u?n;+Vs}W
zVs+4ZAIqX8yPO4`I@r%$daJ2AY1`~|;vye^p4a?hej}tX#OaUirN?2upe-V%St9cF
ze+>I=zeSX`E3azsTb_JriRZ0v?ecXedL)gz-X<GJaBy*Hg>5T-?gttTIGAwY=FLd!
zw40k!XUv=_8m)fe=+V|ISFYSXwz2B#E3cVfj1?6d>;C?l8g+epe*C6Qn;2FEFF$9y
zf1AM4lRIku=3nJ5w4Zcj?W{fj|NY*Qb=50+Mug3etgEYz&N9v3QSi`d<=5kVvWr)&
z=$K#sFEZLhT0upn<-`dO(30JvCmq|=lp5Bq)qTJB`#ssCfsviJ->!+@KkwJq*WoWs
zTQ{sr|G52C>kfI(M`65uGM0xHxpx2fzW;x(pSu1rJDoFU&vJ5eKYrYAueY`C>8Yta
z@^*9X?X4D;mR>#e|Ig?1Bch|37hDHT9Ig$!zCPZ*PtF#!s@H3dzMfv6UhFQ=2-3Xz
zf0ac=ML(WQ_CImv%#q{A+2!Tu3mpgTqtFaqmS7<9;lqc7|Ns8}`1$kVuF~vjI+09w
zA|5KkS3q~`hSfejB{~mz&2*r3ekSK@&|U%P((1Y8_g0=r7Fl%2tt`9KmA7_fshH`P
zo>%uabtwxydcS(wk+rVfVmo&2%Br_K(94-oTU%RLP;lV-_37Tz^%%|nO`2hl$n@>o
zw;eloZmj?R?-j?-d74^UNmo_`o|$JW9UXo96`y;^#_4fYolj3sFaG)IsZgn*p&@7}
zPe$g<sj1pqKZVEFin_bIv++nASU$hbD=%+dSbAF8G3oq02j^H8cg=l`b5O40&Bo&$
z{r&28cKfbK$k+cV6cQGGd2g>ZsGGGqe0|#Ub91ATwx*qx0>x*=2ZiWuIf~%s5)YOJ
zR;pe1jvW@0CQk+p$#r&izPY`<Klk>w&_9V>fqehJoqh4{!Uh-QwScpfgde>NjqZ=G
zsj&g6zq?z!p`(MN>gy}lq9UWcRbQ2ql$aV2dod<XoY*66zHXP(K6Npjhy%*~HcGE|
z+Aeu@ZSCm}L1j=RcTIhNclY!*Ug=Li9`}RBr}<<oK$CHx#ROK>-`_1Qe}8WcsNW!d
z!nutn@%g#AOO`CjNVP71r=qFZxh{5hl7U1=SJ$I2FE5LBeSCJSbKSaiC9ke<dV70=
z_P(x-+Ir&DsifcE-m<W>FTVad^U@Md+v;yA=jYinR8&@8TotPQrONi;!Gk&X_w7CN
z`pD6vg5u)t?R>H?ZfsOO-Y>6iX}QzMkSkCSvh-0sX@d4Tv#%Ku%hlGIeeIBVq7~f8
zF}d`9O{kQ#w6~KJQ^Di8a{2lB&(6*D&dJeHcI$Ciei<~Vv1!w$6DLkwXzTOw^z!1`
zoPPe<_4xXuXMEgN%7+Q7`z=X6-Un(}MBO|!Rok-ekA-g37KS_?%evohw?{SI-&L9o
zYP!C-xH$953c;B(XWrWQ?AbG)Stgx3i=StudTG|w)^>lr6ME&W>h&{c&UESSPfJU4
zOH6#2+yD3Vh5v8kSOdHaQ%{K;>ya!zZpalFn3iVsOFHMt>%fl(R!n>2xcu_P-R1dr
z?%oA$F?n!jXK|F#HMHLDMo4crwDv;%<e;t%8w@~W@-i|qtE^=Ld!<Y=92^`#i=tT%
zWSDHqx~c^lU(U+XTDfxNkz>cQJg@WH|7i%~YIyP_<x5p<r}*FaiifPA#pgdiKW7Bh
zbk=2W9$Z-&%mCVrHd)=DDPVV5?xxL~-&_5|){tRDYxQh5I{*0j^W)<2HI9CMd<Bos
z{{oFNT`%NRJaTwJRMnkr`S<&JdwD@C<v@$CEG<P>Uw>Ir{^wF%O3IRHEc0xu!(Q**
z9iMY!L*wP;{*T|itGj>nTlv!`PizXlhe&pHb!~ZmWoL1^-One&LBYX4pG@}Ouw~1W
zCnqOI6{VU;dHVP~xt+g1_2MGe9$D*YQCqVXUVmNrxYzv0&*$?&k(rs9dCPKs-LFhB
zy_gf{=2~ao+M)@H^!WXCA|fIT0#Z_5*Vn~<{Q7li`uTaFEzYWTQ%;;cn|i!Y_R8w;
z^+$f@UyF>4OiWE}-MiP;YHr_z2?Blc_I^o8O1ZbTC|X#|`0}NMhmAQbEKEg3Ma263
z+2;Y9(|q3^W?|sw=Dt_;S~n+WN&1uZPtMN(FWg*qDb1?%bN$oRCwnA~R|GHTn_v4)
zl9Q7&B075V#EFgDx0|c`%}F>j!?2>dT3AMgr(yBp#oOW|ZES3|ls|vRUjOGKJ81sx
z=+Q(2iB;Bfs`V5V8T0eszv6QanXm=EHgvhw?Kgg(rfJD*?XCK{%FL(u>#NX5j~*2i
z7N&pND9XSfbP{wfz`^_X=dW0y!LVr2q8In~+ZQ}`eYt+$FRrB`*5&V(#O<w8`(AhZ
z$H&K#rdd~*XKn!vZ7q5g@-yJ&&f@2wT?q`Wt*tzgMlE)=zaG4NxpMFK%X=UzmF~<a
zto{4z>y6vDgLjwZ?)&rU^eVoe`H`E`c-#NSSAUlixmsyg@Mm{XV1S~bfR;j_a<6C^
ztIH9;3kuT&J~(xl7!(#cdf!SjdCMX>`-(&F_K4`2^F2#?e0|usO=ir`;O$yumLwfm
zxU=P!1d~=mvP<JJ*6FREe>clVhv_jomQA;rKl9JV=Wq5sPn`ee$C>JLzwMSTU9x=n
zY{&>)#7@Qeg|lrcjgB5YDk3ia`RDWbpKoQaXZ&!0nIE*V@@TiXWz`pr$H#h^Z_O{S
z{{43Q$`vaXY}s;U$NSl)J1Re`X@{*zxV|p7N6Pfoy5oT>Lv&(yUHNe<!QjEGSGSU@
z7xJxKxiaVGrlqGp-ntb9sv^JKTb7}4>C&ZNTdnu|Emzjk>RK7R{L$mbhhJS?&2Fm(
zT3vnX<UEn@7gmL??&#}NQ&CBAc-$jrJ8RP>qgk_V{eC5DS(M_yU$A@k?#XlK+Af%}
z&%2)6ykW@_6}^}p2Y!BjZdv+DWarME-!(uhD-KzCJ}m18O`=bqe*9RkG<)iv6DK_Q
z<ZL*$y#IN1V+8Z9-F`0a?u##GaPaY&Wm?6>#GH9^bMwb<-=5tqzt6g0)hf^ogx>WX
zrLV;}IXPKaSs%WCpP#dPOZE4>q@<*dj*bJjZ_j@6B;|4Xr%#_i`}=oo30<{z?b~f@
zFD-O#@96C{-S=+B$KsL_lYQ6k-J93Yz_4odYW81SLFwmkmq^}s&@l~$h7;f3-v0Q>
z6BCW(V@HopoH^68xY$@#RrSm))6*4ts;aE@_4Qs}UN<%*GM~5mz2;+#rzhv}KH10e
z|9`N9_V91pw(WQ}sC88N`B`8@#EFlOk4J9Jn!54k&al<0+S=WhE(Jk$B6n?Eduf_(
z^sX(TMdju1x2?VO=jUh3+Fx4=@AR4Z&%Jv6dhvq;jE4>#V*9hVR(|VN%{SKZhY#hl
zvaGW{bSRhgjdi?p^z2CS3EYY;5-ehuEfXy&|9-vB$;YRrrImHz*p&>EkkC-js>_s=
zB`0?_H8FjAcbEIXp32WFR<B;Xaid_v(Qfg_4<0l$H#09>xKKn)?AwlAruzE*e}8|M
zp5L9Fm-p<`)6)zst*sB=yx{?jE`SDlzJC2WW%~5ZNk_Sk^+-0ytp{xy-}_H?+qP{k
zE-nQR4lqvBi9B?$nf>OT%FQ45986HKvzs?nJKU)9)05plmhHQL>y}8}yt=+7{q>74
zZ)Ym_8*RJ)=Z-1<Uh6j{zvMA?*Ju$)Qdm+TosxcTj-svY+)I}(Im%aGT;!^vs#^Hy
z2<PN!?t5S6b1&Lk{r%Gk<^CBm_O-vZc)nJamX_vWYu>mv=y+Upt*E&8@#y@$M|TuH
z=HTH;d2?gqvEt6o&Wh^l&#UAAt~&p?^Z%{*nCze}r5Rb7PoErpEu*pW=uMxEoQf?H
z7CS?&I=)odo}Q-rxWE37^15~Fn(gYGoR~B<HCJ5Cn*3N%QSr^q&FmTVX=jceJzCLi
zrmo)p>{;5YSFb>WHpc1a-W2{hE?++<_x3i)+uC#O>-Sx+u)3vHx<%pjov_R|H#XXC
z3!P<MuIK9Ny65-1-D=xC>wn+Af9b*nhp4Ei_18she+N6JPG7XvIoMl!w_jLUwI7FK
zOX-9|xeZa#(I-!zW`1xMJl}je<>7&ZPhY;IJUulvwr}~eWkv7qSei)jt`1*smY2fw
zc6)@gvoj|jU)tGOraN};{%-MamT7iIW~Srfiw_<@K0M1byQ=s2)vHtQ?k@j)OgewT
z#f%xY)n?*)F$}lT&nYP>?b7jU_$Ij<wA^j>?Cd*TOLpy=_4W1j;J7$H2M30_y1F;F
zwyyTS-~D6#YF=MvEBzLMLz`yqeR_4T!4%!~US3`m&t|5dICaYCoQm%ZgG7&YxtdG1
zc0Y~Sv9{sTtTwsxt8M30y5>%(iIn`=8!3Ns=GTeS!tV=*uZuZ&_;B;n)6+MXy^Sid
zl3ly@?)_DpN;XMd{H0r6BE(l_c6VO!hRunUCzdGod2lEmNtkpfw_(-l)tNatOSW#E
z+SJ7K?EL)u&s&+NElD<gwEBOvSEeuX_5D|aqb?uFx6^z2{N?i{6`O^+m+cmQ^!C!y
zisB8M6_4C*ZJjkQe=4_P%Z8dwtG3L)bxY)%`1McAo4nSUnFN+U;h63({^*Ixw4a}!
zZ!UiB=hiQm`_AC^TK(06;=k|KZf7$zaq9`#nEaB5Z_1TTuTy^iU;KEkY=oRqXRB1X
zThEKi(2W949H-S=qwPz!yI;FLeczF?IqVNlX72guwdLx+!#Ad1?FiZt*)FIYqVw(Z
zgc?r8BL*rU#pc>i#kR{eOmdT6J#Xrys~w<D^V7-ccYY;Wyq{=#?`ERI^Hu>Tj>7km
zz7NY@Tu_v&d?L8aXXc@0zO&B=9WFrV+1zz#?bfYRL0ykKcVbSRK7I1Ui3NN1)SP%M
zac*<aCMnQT#|<@MfjdOS#Eeo;iKzL{>)E^4wy?19$&)8Ml`@Ab!&W7mK3cZ?$_m>#
zg_6nrw+_$r^<Q~&V@capy<MQ0sW#!&5px29gOe}yf;I_WUhdCdDbu}d_h-+Sp?-0D
z{DM!{&o7nMgIksp7P#XpXgC?Pi=IJTFGj(@;KRJw)vni_{%+r%nVEa|TpG)^WVfIv
z8SWSU-`VUPwIzSwPqy&z@ZxsGUml-QYgN*(xfi9?o|(CM)v8r?`>$(pne+4WuZ!84
zR1+H;+uPp$a!o)|l2T`9r?c#=gpBILU5%4#AW3abq<r?tnV~k1&5ED-=*8{vNKI9(
z+B@&e86Pp-sFZgbK*J4+ZBs(e?6`UB)~9RH`5!@@kXyIBoSc~K?ChTLd4p!Pk~|_^
zAF>I6cd;$md061?gy<{RToEo?qnfVe8YoiC4w_u!P~0~=)aEf{iQkd};lCf&*Z<|V
zx3_;>@8{>YWa-k4Sy#1|`OWQeTm0~(`urvP_T9U)u<QTz;B!`g9`e^aEDb9B`>WL0
zP1EYNE@;>;`PG$`KYsj}FlSEAGY{AHc}2^vW^wZI9Xo%%|G)tUE^h9#va4RCyuP;f
z$B!QmUcdIPsj)eG_H0O4m{ImM9Z(v%>~G&2x8A&bMy#gO9oFW93wG?>sTmp?dg|ff
z_NmjRiMa-fs9IV^diwd96_+fso?E<3qpRrWr&LuNo0yNEJ_UhB%8cFBcU`tjKi(%B
zeCW`jDbJpzxdw_neZ77^RLi%?^Z&Sjn#Cfnfm52<`L&9lpEJ!bIJ9EduBe$7g^Lzl
zehHGfaU<gC-|zRslaiCC_Vn<8%s46@A9AnqxomOUP2HJ=!sVcS@9pw+Guq{<KwF`6
za&KSX$->Sa92=Ya?1f9Y)pMQ)*Z2Q3oo$})wscZi+O$>0=eFhFKlkeDYDvo?6$XC0
z9}8wK7QXv(j`374*W_en23}s?9lN8;%jZ5kmSAw<%9WOb2@cCI8`k})=##ZRHc8c+
zq2k@n=Qp<J&!4&YjTe`BVYrd8ac6)3`=UR4zu$Y@sXouZ+2zshl5?QZB{@0J8s58S
z0z2=OoVzI=Uvu!}Wc8V**<!`d&pkae(>SfquVi`gIV(BUnFfhWo}QjLcXnJnbFq_8
zPEJlmRrTT1r$-^<m1)O%B=3~puRYeu;qULCm6dfyCNn%}M`zvT*I(b<-#`DxjflyQ
zUteE;{Mxl?>GLX&`AqDbS8{Hq-}2<cZM<t@cVD}c*co@{r?s&J&z&8G&7dii`~Uxa
z|MO+Jz3W6@Mx9f0EpP9uwFa#!IW<)~=kBhlsi&vSv~;@o@Y&hfPo6vph>Y~i&DGtW
zclXhSh0f3TR{MjJ>a^Ljy&W7F^78VICGReO|7?BzU+wB|Zze|Pi>)X=r=_K}V$GVA
zUteDO%(0mG>QxpK6BB6gC-<F!t9tprb2hfNot>SIOM?VOM2>U_DyIZ?w(c}em#_K2
zXlQ82aOBvrQ;%4<i@v-FRP&t$TJus?R(9mb5zxAfSFc|iO7J**e`~7SD3Z76zLAlU
zPu^rXtCEhhv&}cx|F4sgl|3sl^M&X0IV-Pbt&83578)uV8X8(rRTUHxa)iI`1GBNa
zrq%Ax<zlk3XFonZ-uUhIvZ}pu;^N}Pt?Ii9)6d<xd-vz<`~TPuY)n4>;oCQ}@Qg=%
zadB~=^(Ze&tVCQD?d|7hUS1|BCFNCCW_BU2LO*WLjWZL=RHZ+3)$J^OEe2Bj^=i0<
zwKcOr>M4=D`}@u^^V=|N(GR$>F}dCHu*lq!>6_1~`OP_Tb94GqjtvTTpZcu7K6%1~
zhOpH~@7<fH8@<gS|DKJpyOveS^vz98OrYUy9yaF1ZoQA@mfuS})+3qL7gFNk>8Yr#
z%^e*b{pauBn_IKRO-)UY96eh2_!#e>zkfab{mu0_Hi-m5&MJ%G+zH)Ssk)><7*uI&
z5Q0l-f;W9Du4xg09Muu##AUA7aul_*aH@%w`<LmtBb~zgiznaOw`cj`*X#G6ySUi>
zx2SXZx3{;`-^5IuHLJ^O>7&#7`%UKgdwV-C4J!QhCNjAvYpayKz5SIeQ%g%rhu>m~
z#m~+d=1FL^Nc;o2C1TFS#qPmjVQ#^}!VK<xG8;Q&&s<*@d-~7M&+Inmo}Hcj^QnG)
zX3g5L)hExKG1>R;#p3=CA3qxYJoEAK@rzkoQ_|Bn7d`c20PW)D=2q6$&OTtssko*Y
z<d<#{Ny(E3o7o%JSzo%pqp<nKiwqMf-h&4ZPMI==As{^5y`sXx&dyFJe&3xX)6=V}
ztQ;L3fBgLU@lm(_rCYadJ>D7}9SvGQ`0?vk*R^4rOJ9dEfCe2mZQg8I{Y|H?uI`xR
zhCZ`=QBhGB7Z-!%V?0hyPM~qq=^OX|c*O12CzHA9VPQ$hmO`<$ixx4Zr$3i7cgf4i
zdGhb~d-I=f=Gj(P{WoeAINSzttGKv(TAJF~vuCGFnPQNAP3Q5>DN{uL{Qaw`shN1V
zjo0S)o6QV=-j(lve9>M0>F4wI&rhn)e*$Wc|M@)szlyeYckpt*g;%pSZru2C>y3+9
zTd!QX5)c;VmYJ!kzvn~K$)5)fIBbd1jaoacql078q)7}OK0Yb8x8=^UF7MNg-ZtaR
zs#1&6S0apzjB3Z{%$WlkxJ*q=O-W1p^yRXD`Olb)42^yJ_JOu>mA<|v+5h|3ucBvX
zX1@Pr)hckfW$jL80jG|4875MzR;@a5;)H^c(WNIbj*g7g-`}w&+}l%m<?7XoH*TC*
zKELkNart^4hmepdg=t#$_VbIMp94*PGR&GatK!2!_Ub*~o}8Tg^VRD05gU_O0|NtF
z+S|*2?_0A*r>m>$#Oc$IZ|CoS`o8|Z^~Q}GkA2>rfB)U5nXALsKYI1**3%02g;hU3
zB!2q*x$^(t@4n~c?dw2?U1(`(ad2}#-gsONwDbYA#63U%eEq-A^2hpQZ$G}T(jp-T
zPKRq;yTw2g(w)NUD^{=G{kl-Z_2JIv^FT{QYpeeM{eIuSu+XrbUq0=4pX{Hf`t=X@
ze!pj2=^q&>xpL)7(1xEIH*Q?GcI{a0_q*+3tKXL9u(PvENlERJiQkaGxH@dDl7WGM
zy83cC`~M#f^FMy}%&oH0($Ue;p}Nr`>&gndzqw8vuhq7DwF-!7D=IclnIdu}YwO96
z6D9~eUVBbh-4C=$X!2y?;%8?bp0EFB%&>Fk&O3W5g+cRJn>KIWS@u@y!oQw7OTDL8
z?Roe0_4Ul`?8hsY&r7<yt5if(bmpA(>({H>+0Em(|8v2j`q`5wpt+PEFBbPVd`sH6
zW5<ruH|&31?Dq+qP*zY<0$RWNWb5^~*I%tc=1!eD)#l3uXNTX#wR(DbH*Vhi`pZ)B
z$Oi@EJSUFsr!QY-#yH-L%*xW@<mAjSk&=;>{rRAozu{Z@{YQ@<fBg9I;c@$ahH-Im
z3_i2XdP`ql6O@zli;k8qD=Pyns6F=Rxu2=2>7EaVxVLQG%2+aU<>`|rA6{DO{pU@(
zy|LWotgWD#|M)+T#MeY`@0&JF?9rn~pxt99)1RN2d02b>o~A__$6R@3A3b_h^+(Ui
z*?Hr}jSH79ZS|dP_VC#=wk`Ladb_$_eJpnBc<8#cB(6-gMc~oA>Yq=iNA4(ixFAPY
zNLcvrGT+%h9`e^)a7^S>)R}unOVp`D@!PA_>la;q$!60Rw_e@KDyqv&ai4q2G4P2a
xhrCYoKz683op1;$;_(pNkW0i<z8?9{*qp7ev-Q5CBm)BjgQu&X%Q~loCIDfAeCYrH

literal 0
HcmV?d00001

diff --git a/tensorflow/contrib/lite/g3doc/performance.md b/tensorflow/contrib/lite/g3doc/performance.md
index 6b7943caf8..ed11452716 100644
--- a/tensorflow/contrib/lite/g3doc/performance.md
+++ b/tensorflow/contrib/lite/g3doc/performance.md
@@ -3,8 +3,15 @@
 
 Mobile and embedded devices have limited computational resources and it is important to keep your application resource efficient. We have compiled a list of best practices and strategies you can use to optimize your model and application when using Tensorflow Lite.
 
-## Choose the most efficient model for the problem
-Some models may be too large to run on embedded devices. Instead of large models it is better to use a slightly less precise but smaller model for embedded devices. Smaller models not only use less disk space and memory but are generally faster and more energy efficient. One example of models optimized for mobile devices are [MobileNets](https://arxiv.org/abs/1704.04861), which are optimized for mobile vision applications. Tensorflow Lite [models page](models.md) lists several other models that have been optimized specifically for mobile and embedded devices.
+## Choose the best model for the task
+Depending on the task you will need to make a tradeoff between model complexity and size. If your task requires high accuracy then you may need a large and complex model. Some tasks may work with a less precise model, for these tasks it is better to use a smaller but less precise model. Smaller models not only use less disk space and memory but are generally faster and more energy efficient. For example, graphs below show accuracy and latency tradeoff for some common image classification models.
+
+![accuracy vs model size](images/performance/model_size_vs_accuracy.png "Accuracy vs Model size")
+
+
+![latency vs model size](images/performance/model_size_vs_latency.png "Latency vs Model size")
+
+One example of models optimized for mobile devices are [MobileNets](https://arxiv.org/abs/1704.04861), which are optimized for mobile vision applications. Tensorflow Lite [models page](models.md) lists several other models that have been optimized specifically for mobile and embedded devices.
 
 You can retrain the listed models on your own dataset by using transfer learning. Check out our transfer learning tutorial for
 [image classification](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0) and
@@ -12,25 +19,25 @@ You can retrain the listed models on your own dataset by using transfer learning
 
 
 ## Profile your model
-Before starting any optimization, it is a good practice to profile and benchmark your model. Tensorflow Lite [benchmarking tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark) has a built-in profiler that shows per operator profiling statistics. This can help in understanding performance bottlenecks and which operators dominate the computation time.
+Once you have selected a candidate model that is right for your task, it is a good practice to profile and benchmark your model. Tensorflow Lite [benchmarking tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark) has a built-in profiler that shows per operator profiling statistics. This can help in understanding performance bottlenecks and which operators dominate the computation time.
 
 ## Profile and optimize operators in the graph
 If a particular operator appears frequently in the model and based on profiling you find the operator consuming the most amount of time, you can look into optimizing the operator.
  This scenario should be rare as Tensorflow Lite has optimized versions for most ops. However you may be able to write a faster version of a custom op, if you know the constraints in which the operator is executed. Check out our [custom operator documentation](custom_operators.md).
 
 ## Quantize your model
-If your model uses floating point weights or activations then it may be possible to reduce the size of model up to ~4x by using quantization and other model optimizations. Check out our [model optimization toolkit](https://www.tensorflow.org/performance/model_optimization) for details about optimizing your model. Fully quantized models can be remarkably power efficient as well.
+If your model uses floating point weights or activations then it may be possible to reduce the size of model up to ~4x by using quantization and other model optimizations. Check out our [model optimization toolkit](https://www.tensorflow.org/performance/model_optimization) for details about optimizing your model. 
 
 ## Tweak the number of threads
-Tensorflow Lite supports multi-threaded kernels for many operators. You can increase the number of threads and speed up execution of operators. Increasing the number of threads will however make your model use more resources and power. For some applications latency may be more important than energy efficiency. You can increase the number of threads by setting the number of [interpreter](https://github.com/tensorflow/tensorflow/blob/1084594657a5d139102ac794f84d1427a710e39a/tensorflow/contrib/lite/interpreter.h#L337) threads.
+Tensorflow Lite supports multi-threaded kernels for many operators. You can increase the number of threads and speed up execution of operators. Increasing the number of threads will however make your model use more resources and power. For some applications latency may be more important than energy efficiency. You can increase the number of threads by setting the number of [interpreter](https://github.com/tensorflow/tensorflow/blob/1084594657a5d139102ac794f84d1427a710e39a/tensorflow/contrib/lite/interpreter.h#L337) threads. Multi-threaded execution however comes at the cost of increased performance variability depending on what else is been executed concurrently. This is particularly the case for mobile apps. For example, isolated tests may show 2x speed up vs single-threaded but if another app is executing at the same time may result in worst performance than single-threaded.
 
 ## Eliminate redundant copies
-Tensorflow Lite is optimized to reduce redundant copies. The APIs allow user to [mmap a model file](https://github.com/tensorflow/tensorflow/blob/9982fd6c8831cbd2f58954f79ea71f26660393bc/tensorflow/contrib/lite/model.h#L152) and avoid copies. If your application is not careful, there can be redundant copies when feeding the input to the model and reading output from the model. Make sure to eliminate redundant copies. If you are using higher level APIs like Java API, make sure to carefully check the documentation for performance caveats. For example, the Java API is a lot faster if ByteBuffers are used as [inputs](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java#L151).
+If your application is not careful, there can be redundant copies when feeding the input to the model and reading output from the model. Make sure to eliminate redundant copies. If you are using higher level APIs like Java API, make sure to carefully check the documentation for performance caveats. For example, the Java API is a lot faster if ByteBuffers are used as [inputs](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java#L151).
 
 ## Profile your application with platform specific tools
 Platform specific tools like [Android profiler](https://developer.android.com/studio/profile/android-profiler) and [Instruments](https://help.apple.com/instruments/mac/current/) provide a wealth of profiling information that can be used to debug your app. Sometimes the performance bug may be not in the model but in parts of application code that interact with the model. Make sure to familiarize yourself with platform specific profiling tools and best practices for your platform.
 
-## Use hardware accelerators available on the device
+## Evaluate whether your model benefits from using hardware accelerators available on the device
 Tensorflow Lite is working on adding support for accelerators like GPU and provides acceleration through [Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/) on Android.
 You can utilize these hardware accelerator backends to improve the speed and efficiency of your model. To enable Neural Networks API call [UseNNAPI](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/contrib/lite/interpreter.h#L334) on the interpreter instance.
 
-- 
GitLab


From 396a8a4105edd409d0821c4d5d0b920b315ffb72 Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Mon, 8 Oct 2018 14:26:43 -0700
Subject: [PATCH 546/570] Add custom call with layout constraints. Add a
 variant of CustomCall which specifies arbitrary layout constraints on the
 operands and result. The existing non-layout-constrained CustomCall is
 changed to have no layout preference and can now be assigned arbitrary
 layouts by layout assignment.

PiperOrigin-RevId: 216249615
---
 .../compiler/tf2xla/kernels/index_ops_cpu.cc  |  22 +-
 tensorflow/compiler/xla/client/xla_builder.cc |  43 +++-
 tensorflow/compiler/xla/client/xla_builder.h  |  22 +-
 tensorflow/compiler/xla/layout_util.cc        |   6 +
 tensorflow/compiler/xla/layout_util.h         |   4 +
 .../xla/service/gpu/gpu_layout_assignment.cc  |  10 -
 .../xla/service/gpu/gpu_layout_assignment.h   |   2 -
 tensorflow/compiler/xla/service/hlo.proto     |   9 +-
 .../compiler/xla/service/hlo_instruction.cc   |  28 ++-
 .../compiler/xla/service/hlo_instruction.h    |  10 +
 .../compiler/xla/service/hlo_instructions.cc  |  33 ++-
 .../compiler/xla/service/hlo_instructions.h   |  32 ++-
 tensorflow/compiler/xla/service/hlo_parser.cc | 101 ++++++++--
 .../compiler/xla/service/hlo_parser_test.cc   |  67 ++++++
 .../compiler/xla/service/hlo_verifier.cc      |  22 +-
 .../compiler/xla/service/layout_assignment.cc | 108 +++++-----
 .../compiler/xla/service/layout_assignment.h  |  13 --
 .../xla/service/layout_assignment_test.cc     | 190 ++++++++++++++++++
 tensorflow/compiler/xla/shape_util.cc         |   2 +-
 .../compiler/xla/tests/custom_call_test.cc    |  50 ++++-
 20 files changed, 650 insertions(+), 124 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
index 3d81ae9eb8..f210bfbd88 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
@@ -88,20 +88,30 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
           xla::ConstantLiteral(&b, xla::LiteralUtil::CreateR0<int32>(dim)));
     }
 
-    xla::Shape xla_shape =
-        xla::ShapeUtil::MakeShape(xla::S64, output_shape.dim_sizes());
+    // The argmax function expects row-major layout.
+    xla::Shape xla_shape = xla::ShapeUtil::MakeShapeWithDescendingLayout(
+        xla::S64, output_shape.dim_sizes());
+    std::vector<xla::Shape> arg_shapes;
+    for (const xla::XlaOp& arg : args) {
+      auto shape_status = b.GetShape(arg);
+      OP_REQUIRES_OK(ctx, shape_status.status());
+      xla::Shape arg_shape = shape_status.ConsumeValueOrDie();
+      *arg_shape.mutable_layout() = xla::LayoutUtil::MakeDescendingLayout(
+          xla::ShapeUtil::Rank(arg_shape));
+      arg_shapes.push_back(std::move(arg_shape));
+    }
 
     // Tell XLA to call the custom code, defined in
     // index_ops_kernel_argmax_float_1d.cc.
     xla::XlaOp output;
     switch (input_shape.dims()) {
       case 1:
-        output =
-            xla::CustomCall(&b, "argmax_float_1d_xla_impl", args, xla_shape);
+        output = xla::CustomCallWithLayout(&b, "argmax_float_1d_xla_impl", args,
+                                           xla_shape, arg_shapes);
         break;
       case 2:
-        output =
-            xla::CustomCall(&b, "argmax_float_2d_xla_impl", args, xla_shape);
+        output = xla::CustomCallWithLayout(&b, "argmax_float_2d_xla_impl", args,
+                                           xla_shape, arg_shapes);
         break;
       default:
         OP_REQUIRES(ctx, false,
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index 6b31831010..e7cf9ae363 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -1279,9 +1279,10 @@ XlaOp XlaBuilder::AfterAll(absl::Span<const XlaOp> tokens) {
   });
 }
 
-XlaOp XlaBuilder::CustomCall(const string& call_target_name,
-                             absl::Span<const XlaOp> operands,
-                             const Shape& shape, const string& opaque) {
+XlaOp XlaBuilder::CustomCall(
+    const string& call_target_name, absl::Span<const XlaOp> operands,
+    const Shape& shape, const string& opaque,
+    absl::optional<absl::Span<const Shape>> operand_shapes_with_layout) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     if (absl::StartsWith(call_target_name, "$")) {
@@ -1293,6 +1294,31 @@ XlaOp XlaBuilder::CustomCall(const string& call_target_name,
     *instr.mutable_shape() = shape;
     instr.set_custom_call_target(call_target_name);
     instr.set_custom_call_opaque(opaque);
+    if (operand_shapes_with_layout.has_value()) {
+      if (!LayoutUtil::HasLayout(shape)) {
+        return InvalidArgument(
+            "Result shape must have layout for custom call with constrained "
+            "layout.");
+      }
+      if (operands.size() != operand_shapes_with_layout->size()) {
+        return InvalidArgument(
+            "Must specify a shape with layout for each operand for custom call "
+            "with constrained layout; given %d shapes, expected %d",
+            operand_shapes_with_layout->size(), operands.size());
+      }
+      instr.set_constrain_layout(true);
+      int64 operand_num = 0;
+      for (const Shape& operand_shape : *operand_shapes_with_layout) {
+        if (!LayoutUtil::HasLayout(operand_shape)) {
+          return InvalidArgument(
+              "No layout specified for operand %d for custom call with "
+              "constrained layout.",
+              operand_num);
+        }
+        *instr.add_operand_shapes_with_layout() = operand_shape;
+        ++operand_num;
+      }
+    }
     return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands);
   });
 }
@@ -2690,7 +2716,16 @@ XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
 XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
                  absl::Span<const XlaOp> operands, const Shape& shape,
                  const string& opaque) {
-  return builder->CustomCall(call_target_name, operands, shape, opaque);
+  return builder->CustomCall(call_target_name, operands, shape, opaque,
+                             /*operand_shapes_with_layout=*/absl::nullopt);
+}
+
+XlaOp CustomCallWithLayout(XlaBuilder* builder, const string& call_target_name,
+                           absl::Span<const XlaOp> operands, const Shape& shape,
+                           absl::Span<const Shape> operand_shapes_with_layout,
+                           const string& opaque) {
+  return builder->CustomCall(call_target_name, operands, shape, opaque,
+                             operand_shapes_with_layout);
 }
 
 XlaOp Complex(const XlaOp& real, const XlaOp& imag,
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index 2e14e47a35..9ceede7a79 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -577,9 +577,10 @@ class XlaBuilder {
              absl::Span<const XlaOp> operands);
 
   // Enqueues a custom call instruction onto the computation.
-  XlaOp CustomCall(const string& call_target_name,
-                   absl::Span<const XlaOp> operands, const Shape& shape,
-                   const string& opaque);
+  XlaOp CustomCall(
+      const string& call_target_name, absl::Span<const XlaOp> operands,
+      const Shape& shape_with_layout, const string& opaque,
+      absl::optional<absl::Span<const Shape>> operand_shapes_with_layout);
 
   // The following methods enqueue element-wise binary arithmetic operations
   // onto the computation. The shapes of the operands have to match unless one
@@ -1197,6 +1198,10 @@ class XlaBuilder {
   friend XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
                           absl::Span<const XlaOp> operands, const Shape& shape,
                           const string& opaque);
+  friend XlaOp CustomCallWithLayout(
+      XlaBuilder* builder, const string& call_target_name,
+      absl::Span<const XlaOp> operands, const Shape& shape_with_layout,
+      absl::Span<const Shape> operand_shapes_with_layout, const string& opaque);
   friend XlaOp Complex(const XlaOp& real, const XlaOp& imag,
                        absl::Span<const int64> broadcast_dimensions);
   friend XlaOp Conj(const XlaOp& operand);
@@ -1732,6 +1737,17 @@ XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
                  absl::Span<const XlaOp> operands, const Shape& shape,
                  const string& opaque = "");
 
+// Overload which constructs a custom call with fixed layouts. The operands will
+// have the layouts specified by |operand_shapes_with_layout| when provided to
+// external code, and the external code is expected to produce a result with the
+// layout specified by |shape_with_layout|. All shapes in |shape_with_layout|
+// and |operand_shapes_with_layout| must have layouts.
+XlaOp CustomCallWithLayout(XlaBuilder* builder, const string& call_target_name,
+                           absl::Span<const XlaOp> operands,
+                           const Shape& shape_with_layout,
+                           absl::Span<const Shape> operand_shapes_with_layout,
+                           const string& opaque = "");
+
 // The following methods enqueue element-wise binary arithmetic operations
 // onto the computation. The shapes of the operands have to match unless one
 // of the operands is a scalar, or an explicit broadcast dimension is given
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index d310335618..3c8db9aa45 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -65,6 +65,12 @@ void SetDefaultLayoutToContainer(
   return layout;
 }
 
+/* static */ Layout LayoutUtil::MakeDescendingLayout(int64 rank) {
+  std::vector<int64> layout(rank);
+  std::iota(layout.rbegin(), layout.rend(), static_cast<int64>(0));
+  return MakeLayout(layout);
+}
+
 /* static */ Layout LayoutUtil::MakeLayoutFromMajorToMinor(
     absl::Span<const int64> major_to_minor) {
   Layout layout;
diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h
index b78883c2d8..af032b1cae 100644
--- a/tensorflow/compiler/xla/layout_util.h
+++ b/tensorflow/compiler/xla/layout_util.h
@@ -40,6 +40,10 @@ class LayoutUtil {
   static Layout MakeLayoutFromMajorToMinor(
       absl::Span<const int64> major_to_minor);
 
+  // Returns a layout with descending ((i.e. {n, n-1, ..., 0}) minor-to-major
+  // dimensions.
+  static Layout MakeDescendingLayout(int64 rank);
+
   // Creates a sparse layout with the given maximum number of elements. (This is
   // a convenience function for protobuf construction.)
   static Layout MakeSparseLayout(int64 max_sparse_elements);
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index 1ffe855750..8c9a8adc61 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -213,16 +213,6 @@ Status GpuLayoutAssignment::AddBackendConstraints(
   return Status::OK();
 }
 
-bool GpuLayoutAssignment::CustomCallRequiresMajorFirstLayout(
-    const HloInstruction* instruction) {
-  // - Inputs to cudnn batchnorm custom calls don't need the major-first layout
-  //   (i.e. {n, n-1, ...0}) -- we can handle any layout.
-  // - Inputs to cudnn convolution require custom layouts handled in
-  //   AddBackendConstraints.
-  return !IsCustomCallToDnnBatchNorm(*instruction) &&
-         !IsCustomCallToDnnConvolution(*instruction);
-}
-
 Status GpuLayoutAssignment::PropagateOperandConstraint(
     const OperandLayoutConstraint& layout_constraint,
     LayoutConstraints* constraints) {
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
index 4ba7989e9c..6a48e55fd2 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
@@ -46,8 +46,6 @@ class GpuLayoutAssignment : public LayoutAssignment {
   Status PropagateBufferConstraint(
       const BufferLayoutConstraint& buffer_constraint,
       LayoutConstraints* constraints) override;
-  bool CustomCallRequiresMajorFirstLayout(
-      const HloInstruction* instruction) override;
 
  private:
   Status AddBackendConstraintsToDnnConvCustomCall(
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 1ea26ddd5b..a0eb9e6ddc 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -34,7 +34,7 @@ import "tensorflow/compiler/xla/xla_data.proto";
 option cc_enable_arenas = true;
 
 // Serialization of HloInstruction.
-// Next ID: 56
+// Next ID: 58
 message HloInstructionProto {
   reserved 10;
   reserved "parameter_name";
@@ -184,6 +184,13 @@ message HloInstructionProto {
   // Sharding for kDomain instructions.
   xla.OpSharding domain_entry_sharding = 54;
   xla.OpSharding domain_exit_sharding = 55;
+
+  // For custom call this indicates that the layouts are constrained. If
+  // constrain_layout is true then the 'shape' field must contain a layout, and
+  // 'operand_shapes_with_layout' must contain a shape with layout for each
+  // operand.
+  bool constrain_layout = 56;
+  repeated Shape operand_shapes_with_layout = 57;
 }
 
 // Serialization of HloComputation.
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 2f6db7cd7c..5c3908a9a4 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -396,9 +396,22 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           operands(1), operands(2), computations(1));
       break;
     case HloOpcode::kCustomCall:
-      instruction = CreateCustomCall(proto.shape(), all_operands(),
-                                     proto.custom_call_target(),
-                                     proto.custom_call_opaque());
+      if (proto.constrain_layout()) {
+        // A proto RepeatedPtrField cannot be converted to a Span (it is a
+        // vector of pointers essentially) so create a vector of shapes to pass
+        // in.
+        std::vector<Shape> operand_shapes;
+        for (const Shape& shape : proto.operand_shapes_with_layout()) {
+          operand_shapes.push_back(shape);
+        }
+        instruction = CreateCustomCall(
+            proto.shape(), all_operands(), proto.custom_call_target(),
+            operand_shapes, proto.custom_call_opaque());
+      } else {
+        instruction = CreateCustomCall(proto.shape(), all_operands(),
+                                       proto.custom_call_target(),
+                                       proto.custom_call_opaque());
+      }
       if (proto.has_window()) {
         static_cast<HloCustomCallInstruction*>(instruction.get())
             ->set_window(proto.window());
@@ -1142,6 +1155,15 @@ bool HloInstruction::HasSideEffect() const {
       shape, operands, custom_call_target, opaque);
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateCustomCall(
+    const Shape& shape, absl::Span<HloInstruction* const> operands,
+    absl::string_view custom_call_target,
+    absl::Span<const Shape> operand_shapes_with_layout,
+    absl::string_view opaque) {
+  return absl::make_unique<HloCustomCallInstruction>(
+      shape, operands, custom_call_target, opaque, operand_shapes_with_layout);
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateTuple(
     absl::Span<HloInstruction* const> elements) {
   std::vector<Shape> element_shapes;
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 374862c4b6..44f776ebac 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -734,6 +734,16 @@ class HloInstruction {
       const Shape& shape, absl::Span<HloInstruction* const> operands,
       absl::string_view custom_call_target, absl::string_view opaque = "");
 
+  // Overload which constrains the layouts of the operand and result. 'shape'
+  // and 'operand_shapes_with_layout' must have layouts.
+  // 'operand_shapes_with_layout' must have a compatible element for each
+  // operand.
+  static std::unique_ptr<HloInstruction> CreateCustomCall(
+      const Shape& shape, absl::Span<HloInstruction* const> operands,
+      absl::string_view custom_call_target,
+      absl::Span<const Shape> operand_shapes_with_layout,
+      absl::string_view opaque = "");
+
   // Creates a tuple instruction with the given elements. This is a convenience
   // wrapper around CreateVariadic.
   static std::unique_ptr<HloInstruction> CreateTuple(
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 152d8eacdb..2ec233eaec 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -1825,7 +1825,24 @@ HloCustomCallInstruction::HloCustomCallInstruction(
     : HloInstruction(HloOpcode::kCustomCall, shape),
       custom_call_target_(custom_call_target.begin(), custom_call_target.end()),
       opaque_(opaque.begin(), opaque.end()),
-      feature_group_count_(1) {
+      feature_group_count_(1),
+      layout_constrained_(false) {
+  for (auto operand : operands) {
+    AppendOperand(operand);
+  }
+}
+
+HloCustomCallInstruction::HloCustomCallInstruction(
+    const Shape& shape, absl::Span<HloInstruction* const> operands,
+    absl::string_view custom_call_target, absl::string_view opaque,
+    absl::Span<const Shape> operand_shapes_with_layout)
+    : HloInstruction(HloOpcode::kCustomCall, shape),
+      custom_call_target_(custom_call_target.begin(), custom_call_target.end()),
+      opaque_(opaque.begin(), opaque.end()),
+      feature_group_count_(1),
+      layout_constrained_(true),
+      operand_shapes_with_layout_(operand_shapes_with_layout.begin(),
+                                  operand_shapes_with_layout.end()) {
   for (auto operand : operands) {
     AppendOperand(operand);
   }
@@ -1843,6 +1860,12 @@ HloInstructionProto HloCustomCallInstruction::ToProto() const {
   proto.set_custom_call_target(custom_call_target_);
   proto.set_custom_call_opaque(opaque_);
   proto.set_feature_group_count(feature_group_count_);
+  if (layout_constrained()) {
+    proto.set_constrain_layout(true);
+    for (const Shape& shape : operand_shapes_with_layout_) {
+      *proto.add_operand_shapes_with_layout() = shape;
+    }
+  }
   return proto;
 }
 
@@ -1870,6 +1893,14 @@ std::vector<string> HloCustomCallInstruction::ExtraAttributesToStringImpl(
   if (!opaque_.empty()) {
     extra.push_back(StrCat("opaque=\"", CEscape(opaque_), "\""));
   }
+  if (layout_constrained()) {
+    std::vector<string> shape_strings;
+    for (const Shape& shape : operand_shapes_with_layout_) {
+      shape_strings.push_back(ShapeUtil::HumanStringWithLayout(shape));
+    }
+    extra.push_back(StrCat("operand_layout_constraints={",
+                           StrJoin(shape_strings, ", "), "}"));
+  }
   return extra;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index e169604072..4c5fc759a3 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -1053,10 +1053,19 @@ class HloSelectAndScatterInstruction : public HloInstruction {
 
 class HloCustomCallInstruction : public HloInstruction {
  public:
-  explicit HloCustomCallInstruction(const Shape& shape,
-                                    absl::Span<HloInstruction* const> operands,
-                                    absl::string_view custom_call_target,
-                                    absl::string_view opaque);
+  HloCustomCallInstruction(const Shape& shape,
+                           absl::Span<HloInstruction* const> operands,
+                           absl::string_view custom_call_target,
+                           absl::string_view opaque);
+
+  // Constructor for a custom call with constrained layout. 'shape' and
+  // 'operands_with_layout' must all have layouts.
+  HloCustomCallInstruction(const Shape& shape,
+                           absl::Span<HloInstruction* const> operands,
+                           absl::string_view custom_call_target,
+                           absl::string_view opaque,
+                           absl::Span<const Shape> operand_shapes_with_layout);
+
   const Window& window() const override {
     CHECK(window_ != nullptr);
     return *window_;
@@ -1085,6 +1094,16 @@ class HloCustomCallInstruction : public HloInstruction {
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
+  // Returns whether the result and operand layouts are constrained.
+  bool layout_constrained() const { return layout_constrained_; }
+
+  // Returns the shapes (with layout) of the operands. CHECKs if this custom
+  // call does not have constrained layouts.
+  const std::vector<Shape>& operand_shapes_with_layout() const {
+    CHECK(layout_constrained());
+    return operand_shapes_with_layout_;
+  }
+
  private:
   std::vector<string> ExtraAttributesToStringImpl(
       const HloPrintOptions& options) const override;
@@ -1106,6 +1125,11 @@ class HloCustomCallInstruction : public HloInstruction {
   std::unique_ptr<ConvolutionDimensionNumbers> convolution_dimension_numbers_;
   // The number of feature groups. This is used for grouped convolutions.
   int64 feature_group_count_;
+  // Whether the result and operand layouts are constrained.
+  bool layout_constrained_;
+  // For layout-constrained custom calls, this vector holds the shape with
+  // layout for each operand.
+  std::vector<Shape> operand_shapes_with_layout_;
 };
 
 class HloPadInstruction : public HloInstruction {
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index dd62988bcc..96f9ff6654 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -174,6 +174,7 @@ class HloParser {
     kDistribution,
     kDomain,
     kPrecisionList,
+    kShapeList
   };
 
   struct AttrConfig {
@@ -240,6 +241,7 @@ class HloParser {
 
   bool ParseSliceRanges(SliceRanges* result);
   bool ParsePrecisionList(std::vector<PrecisionConfig::Precision>* result);
+  bool ParseShapeList(std::vector<Shape>* result);
   bool ParseInt64List(const TokKind start, const TokKind end,
                       const TokKind delim,
                       std::vector<tensorflow::int64>* result);
@@ -1341,6 +1343,7 @@ bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder,
       optional<Window> window;
       optional<ConvolutionDimensionNumbers> dnums;
       optional<int64> feature_group_count;
+      optional<std::vector<Shape>> operand_layout_constraints;
       attrs["custom_call_target"] = {/*required=*/true, AttrTy::kString,
                                      &custom_call_target};
       attrs["opaque"] = {/*required=*/false, AttrTy::kString, &opaque};
@@ -1349,12 +1352,52 @@ bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder,
                              AttrTy::kConvolutionDimensionNumbers, &dnums};
       attrs["feature_group_count"] = {/*required=*/false, AttrTy::kInt64,
                                       &feature_group_count};
+      attrs["operand_layout_constraints"] = {
+          /*required=*/false, AttrTy::kShapeList, &operand_layout_constraints};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
-      instruction = builder->AddInstruction(
-          HloInstruction::CreateCustomCall(shape, operands, *custom_call_target,
-                                           opaque.has_value() ? *opaque : ""));
+      if (operand_layout_constraints.has_value()) {
+        if (!LayoutUtil::HasLayout(shape)) {
+          return Error(lexer_.GetLoc(),
+                       "Layout must be set on layout-constrained custom call");
+        }
+        if (operands.size() != operand_layout_constraints->size()) {
+          return Error(lexer_.GetLoc(),
+                       StrCat("Expected ", operands.size(),
+                              " operand layout constraints, ",
+                              operand_layout_constraints->size(), " given"));
+        }
+        for (int64 i = 0; i < operands.size(); ++i) {
+          const Shape& operand_shape_with_layout =
+              (*operand_layout_constraints)[i];
+          if (!LayoutUtil::HasLayout(operand_shape_with_layout)) {
+            return Error(lexer_.GetLoc(),
+                         StrCat("Operand layout constraint shape ",
+                                ShapeUtil::HumanStringWithLayout(
+                                    operand_shape_with_layout),
+                                " for operand ", i, " does not have a layout"));
+          }
+          if (!ShapeUtil::Compatible(operand_shape_with_layout,
+                                     operands[i]->shape())) {
+            return Error(
+                lexer_.GetLoc(),
+                StrCat(
+                    "Operand layout constraint shape ",
+                    ShapeUtil::HumanStringWithLayout(operand_shape_with_layout),
+                    " for operand ", i,
+                    " is not compatible with operand shape ",
+                    ShapeUtil::HumanStringWithLayout(operands[i]->shape())));
+          }
+        }
+        instruction = builder->AddInstruction(HloInstruction::CreateCustomCall(
+            shape, operands, *custom_call_target, *operand_layout_constraints,
+            opaque.has_value() ? *opaque : ""));
+      } else {
+        instruction = builder->AddInstruction(HloInstruction::CreateCustomCall(
+            shape, operands, *custom_call_target,
+            opaque.has_value() ? *opaque : ""));
+      }
       if (window.has_value()) {
         instruction->set_window(*window);
       }
@@ -2533,6 +2576,15 @@ bool HloParser::ParseAttributeHelper(
             ->emplace(result);
         return true;
       }
+      case AttrTy::kShapeList: {
+        std::vector<Shape> result;
+        if (!ParseShapeList(&result)) {
+          return false;
+        }
+        static_cast<optional<std::vector<Shape>>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
     }
   }();
   if (!success) {
@@ -2825,6 +2877,23 @@ bool HloParser::ParsePrecisionList(
                    parse_and_add_item);
 }
 
+// shapelist ::= '{' shapes '}'
+// precision_elements
+//   ::= /*empty*/
+//   ::= shape (',' shape)*
+bool HloParser::ParseShapeList(std::vector<Shape>* result) {
+  auto parse_and_add_item = [&]() {
+    Shape shape;
+    if (!ParseShape(&shape)) {
+      return false;
+    }
+    result->push_back(std::move(shape));
+    return true;
+  };
+  return ParseList(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
+                   parse_and_add_item);
+}
+
 // int64list ::= start int64_elements end
 // int64_elements
 //   ::= /*empty*/
@@ -2832,23 +2901,15 @@ bool HloParser::ParsePrecisionList(
 bool HloParser::ParseInt64List(const TokKind start, const TokKind end,
                                const TokKind delim,
                                std::vector<tensorflow::int64>* result) {
-  if (!ParseToken(start, StrCat("expects an int64 list starting with ",
-                                TokKindToString(start)))) {
-    return false;
-  }
-  if (lexer_.GetKind() == end) {
-    // empty
-  } else {
-    do {
-      tensorflow::int64 i;
-      if (!ParseInt64(&i)) {
-        return false;
-      }
-      result->push_back(i);
-    } while (EatIfPresent(delim));
-  }
-  return ParseToken(
-      end, StrCat("expects an int64 list to end with ", TokKindToString(end)));
+  auto parse_and_add_item = [&]() {
+    tensorflow::int64 i;
+    if (!ParseInt64(&i)) {
+      return false;
+    }
+    result->push_back(i);
+    return true;
+  };
+  return ParseList(start, end, delim, parse_and_add_item);
 }
 
 bool HloParser::ParseList(const TokKind start, const TokKind end,
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index 255123d331..17538c05bc 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -802,6 +802,43 @@ ENTRY %ConstantUnsignedNoOverflow () -> u64[] {
   ROOT %constant = u64[] constant(9223372036854775807)
 }
 
+)"
+},
+// CustomCallWithLayoutConstraints
+{
+"CustomCallWithLayoutConstraints",
+R"(HloModule CustomCallWithLayoutConstraints
+
+ENTRY %CustomCallWithLayoutConstraints (p0: f32[42,2,3], p1: f32[123,4]) -> f32[1,2,3] {
+  %p0 = f32[42,2,3]{0,1,2} parameter(0)
+  %p1 = f32[123,4]{0,1} parameter(1)
+  ROOT %custom-call = f32[1,2,3]{0,2,1} custom-call(f32[42,2,3]{0,1,2} %p0, f32[123,4]{0,1} %p1), custom_call_target="baz", operand_layout_constraints={f32[42,2,3]{0,1,2}, f32[123,4]{1,0}}
+}
+
+)"
+},
+// CustomCallWithLayoutConstraintsNoOperands
+{
+"CustomCallWithLayoutConstraintsNoOperands",
+R"(HloModule CustomCallWithLayoutConstraintsNoOperands
+
+ENTRY %CustomCallWithLayoutConstraints () -> f32[1,2,3] {
+  ROOT %custom-call = f32[1,2,3]{0,2,1} custom-call(), custom_call_target="baz", operand_layout_constraints={}
+}
+
+)"
+},
+// CustomCallWithLayoutConstraintsTupleShapes
+{
+"CustomCallWithLayoutConstraintsTupleShapes",
+R"(HloModule CustomCallWithLayoutConstraintsTupleShapes
+
+ENTRY %CustomCallWithLayoutConstraints (p0: (f32[2,2], f32[42,2,3]), p1: f32[123,4]) -> (f32[1,2,3], f32[1,2,3]) {
+  %p0 = (f32[2,2]{0,1}, f32[42,2,3]{0,1,2}) parameter(0)
+  %p1 = f32[123,4]{0,1} parameter(1)
+  ROOT %custom-call = (f32[1,2,3]{0,2,1}, f32[1,2,3]{1,2,0}) custom-call((f32[2,2]{0,1}, f32[42,2,3]{0,1,2}) %p0, f32[123,4]{0,1} %p1), custom_call_target="baz", operand_layout_constraints={(f32[2,2]{1,0}, f32[42,2,3]{2,0,1}), f32[123,4]{1,0}}
+}
+
 )"
 },
   });
@@ -2069,5 +2106,35 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
                              op::Broadcast(), op::Multiply(), op::Add()));
 }
 
+TEST_F(HloParserTest, CustomCallWrongNumberofOperandConstraints) {
+  const string original = R"(HloModule CustomCallWrongNumberofOperandConstraints
+
+ENTRY %CustomCallWrongNumberofOperandConstraints (p0: f32[42,2,3], p1: f32[123,4]) -> f32[1,2,3] {
+  %p0 = f32[42,2,3]{0,1,2} parameter(0)
+  %p1 = f32[123,4]{0,1} parameter(1)
+  ROOT %custom-call = f32[1,2,3]{0,1,2} custom-call(f32[42,2,3]{0,1,2} %p0, f32[123,4]{0,1} %p1), custom_call_target="baz", operand_layout_constraints={f32[42,2,3]{0,1,2}}
+}
+
+)";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "Expected 2 operand layout constraints, 1 given");
+}
+
+TEST_F(HloParserTest, CustomCallIncompatibleOperandConstraints) {
+  const string original = R"(HloModule CustomCallIncompatibleOperandConstraints
+
+ENTRY %CustomCallIncompatibleOperandConstraints (p0: f32[42,2,3], p1: f32[123,4]) -> f32[1,2,3] {
+  %p0 = f32[42,2,3]{0,1,2} parameter(0)
+  %p1 = f32[123,4]{0,1} parameter(1)
+  ROOT %custom-call = f32[1,2,3]{0,1,2} custom-call(f32[42,2,3]{0,1,2} %p0, f32[123,4]{0,1} %p1), custom_call_target="baz", operand_layout_constraints={f32[42,2,3]{0,1,2}, f32[555,5]{1,0}}
+}
+
+)";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "operand 1 is not compatible with operand shape");
+}
+
+// custom call incompatible shape.
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 496fe1795d..be3bee5975 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -360,7 +360,27 @@ Status ShapeVerifier::HandleCall(HloInstruction* call) {
   return CheckShape(call, call->to_apply()->root_instruction()->shape());
 }
 
-Status ShapeVerifier::HandleCustomCall(HloInstruction*) { return Status::OK(); }
+Status ShapeVerifier::HandleCustomCall(HloInstruction* instruction) {
+  const HloCustomCallInstruction* custom_call =
+      DynCast<const HloCustomCallInstruction>(instruction);
+  TF_RET_CHECK(custom_call != nullptr);
+  if (custom_call->layout_constrained()) {
+    // If the layout is constrained, verify all the respective shapes have
+    // layouts and that the constrained operand shapes match the shapes of the
+    // operands.
+    TF_RET_CHECK(LayoutUtil::HasLayout(custom_call->shape()));
+    TF_RET_CHECK(custom_call->operand_count() ==
+                 custom_call->operand_shapes_with_layout().size());
+    for (int64 i = 0; i < custom_call->operand_count(); ++i) {
+      const Shape& operand_shape_with_layout =
+          custom_call->operand_shapes_with_layout()[i];
+      TF_RET_CHECK(ShapeUtil::Compatible(custom_call->operand(i)->shape(),
+                                         operand_shape_with_layout));
+      TF_RET_CHECK(LayoutUtil::HasLayout(operand_shape_with_layout));
+    }
+  }
+  return Status::OK();
+}
 
 Status ShapeVerifier::HandleSlice(HloInstruction* slice) {
   return CheckShape(slice,
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index cc4a342e9d..ad65b147c1 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -419,6 +419,16 @@ Status LayoutAssignment::BuildHostChannelConstraints(
   return Status::OK();
 }
 
+namespace {
+
+bool IsLayoutConstrainedCustomCall(HloInstruction* instruction) {
+  const HloCustomCallInstruction* custom_call =
+      DynCast<HloCustomCallInstruction>(instruction);
+  return custom_call != nullptr && custom_call->layout_constrained();
+}
+
+}  // namespace
+
 Status LayoutAssignment::AddMandatoryConstraints(
     const ComputationLayout* computation_layout,
     ChannelLayoutConstraints* channel_constraints, HloComputation* computation,
@@ -434,7 +444,6 @@ Status LayoutAssignment::AddMandatoryConstraints(
   // Constrain layouts of instructions which define values with pre-existing
   // layouts.
   for (auto* instruction : computation->instructions()) {
-    Shape const* shape_with_layout = nullptr;
     if (instruction->opcode() == HloOpcode::kInfeed) {
       // Infeed layouts must match the layout of the original inserted
       // instruction.
@@ -456,17 +465,21 @@ Status LayoutAssignment::AddMandatoryConstraints(
         if (parameter_layout.LayoutIsSet()) {
           // Parameter layouts must match the respective layout in
           // ComputationLayout, if there is one.
-          shape_with_layout = &parameter_layout.shape();
+          TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
+              parameter_layout.shape(), instruction));
         }
       }
-    }
-    if (shape_with_layout != nullptr) {
+    } else if (IsLayoutConstrainedCustomCall(instruction)) {
+      const HloCustomCallInstruction* custom_call =
+          DynCast<HloCustomCallInstruction>(instruction);
       TF_RETURN_IF_ERROR(
-          constraints->SetInstructionLayout(*shape_with_layout, instruction));
-    }
-
-    if (instruction->opcode() == HloOpcode::kSend ||
-        instruction->opcode() == HloOpcode::kRecv) {
+          constraints->SetInstructionLayout(custom_call->shape(), custom_call));
+      for (int64 i = 0; i < custom_call->operand_count(); ++i) {
+        TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+            custom_call->operand_shapes_with_layout()[i], custom_call, i));
+      }
+    } else if (instruction->opcode() == HloOpcode::kSend ||
+               instruction->opcode() == HloOpcode::kRecv) {
       CHECK(get_channel_constraints(instruction))
           << "Multi-module layout assignment requires ChannelLayoutConstraints";
       int64 channel_id = instruction->channel_id();
@@ -621,31 +634,6 @@ Status LayoutAssignment::AddMandatoryConstraints(
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
           false_computation_layout.parameter_shape(0), instruction, 2,
           /*mandatory=*/true));
-    } else if (instruction->opcode() == HloOpcode::kCustomCall) {
-      if (!CustomCallRequiresMajorFirstLayout(instruction)) {
-        continue;
-      }
-      // Add constraints for kCustomCall instruction operands and instructions.
-      // For now we only support major-first layouts for all inputs and outputs.
-      Shape result_shape = ShapeUtil::MakeShapeWithDescendingLayout(
-          instruction->shape().element_type(),
-          AsInt64Slice(instruction->shape().dimensions()));
-      TF_RETURN_IF_ERROR(
-          constraints->SetInstructionLayout(result_shape, instruction));
-      for (int64 i = 0; i < instruction->operand_count(); ++i) {
-        const Shape& operand_shape = instruction->operand(i)->shape();
-        // Opaque operands don't get a layout constraint.
-        if (ShapeUtil::IsOpaque(operand_shape)) {
-          continue;
-        }
-
-        Shape row_major_operand_shape =
-            ShapeUtil::MakeShapeWithDescendingLayout(
-                operand_shape.element_type(),
-                AsInt64Slice(operand_shape.dimensions()));
-        TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-            row_major_operand_shape, instruction, i));
-      }
     }
   }
   // Finally set the result layout to match ComputationLayout, if there is one.
@@ -676,16 +664,18 @@ Status CheckCallLayout(HloInstruction* call,
   return Status::OK();
 }
 
-// Custom calls have fixed input and output layouts.
-Status CheckCustomCallLayout(HloInstruction* custom_call) {
-  for (const HloInstruction* operand : custom_call->operands()) {
-    TF_RET_CHECK(
-        ShapeUtil::IsOpaque(operand->shape()) ||
-        LayoutUtil::IsMonotonicWithDim0Major(operand->shape().layout()));
+// Operands of layout-constrained custom calls must match the expected
+// constrained layouts.
+Status CheckCustomCallLayout(HloInstruction* instruction) {
+  if (IsLayoutConstrainedCustomCall(instruction)) {
+    const HloCustomCallInstruction* custom_call =
+        DynCast<HloCustomCallInstruction>(instruction);
+    for (int64 i = 0; i < custom_call->operand_count(); ++i) {
+      TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual(
+          custom_call->operand(i)->shape(),
+          custom_call->operand_shapes_with_layout()[i]));
+    }
   }
-  TF_RET_CHECK(
-      ShapeUtil::IsOpaque(custom_call->shape()) ||
-      LayoutUtil::IsMonotonicWithDim0Major(custom_call->shape().layout()));
   return Status::OK();
 }
 
@@ -932,9 +922,7 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) {
               FindOrDie(computation_layouts_, instruction->to_apply())));
           break;
         case HloOpcode::kCustomCall:
-          if (CustomCallRequiresMajorFirstLayout(instruction)) {
-            TF_RETURN_IF_ERROR(CheckCustomCallLayout(instruction));
-          }
+          TF_RETURN_IF_ERROR(CheckCustomCallLayout(instruction));
           break;
         case HloOpcode::kFusion:
           TF_RETURN_IF_ERROR(CheckFusionLayout(instruction));
@@ -1554,11 +1542,11 @@ Status LayoutAssignment::CalculateComputationLayout(
 
 Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) {
   // Clear existing layouts of the instructions.  All layouts must be assigned
-  // by the LayoutAssignment pass, except for those on infeeds, parameters,
-  // and the computation result. The latter two are specified in
-  // computation_layout, so we only need to keep the existing layouts for
-  // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
-  // layout assignment pass that may accidentally use the existing layout.
+  // by the LayoutAssignment pass, except for those on parameters, the
+  // computation result, and a couple special cases. The former two are
+  // specified in computation_layout.  Clearing the layouts here avoids hiding
+  // potential bugs in the layout assignment pass that may accidentally use the
+  // existing layout.
   for (HloInstruction* instruction : computation->instructions()) {
     if (instruction->opcode() == HloOpcode::kBitcast) {
       // bitcasts are inherently layout sensitive and so a bitcast instruction
@@ -1567,7 +1555,9 @@ Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) {
           "Unexpected bitcast operation seen during layout assignment: %s.",
           instruction->ToString());
     }
-    if (instruction->opcode() != HloOpcode::kInfeed) {
+    // Some instructions carry mandatory layouts in their shape.
+    if (instruction->opcode() != HloOpcode::kInfeed &&
+        !IsLayoutConstrainedCustomCall(instruction)) {
       LayoutUtil::ClearLayout(instruction->mutable_shape());
     }
   }
@@ -1802,6 +1792,18 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   }
   TF_RETURN_IF_ERROR(Init());
 
+  // Verify computation layout is sane.
+  const HloComputation* entry = module->entry_computation();
+  TF_RET_CHECK(entry_computation_layout_->parameter_count() ==
+               entry->num_parameters());
+  for (int64 i = 0; i < entry->num_parameters(); ++i) {
+    TF_RET_CHECK(
+        ShapeUtil::Compatible(entry_computation_layout_->parameter_shape(i),
+                              entry->parameter_instruction(i)->shape()));
+  }
+  TF_RET_CHECK(ShapeUtil::Compatible(entry_computation_layout_->result_shape(),
+                                     entry->root_instruction()->shape()));
+
   // We do two passes. The first one we pass a nullptr ComputationLayout to
   // the RunOnComputation() calls (for non entry computations), and we register
   // the ComputationLayout which are naturally flowing in DFS fashion to the
@@ -1873,7 +1875,6 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kAllToAll:
     case HloOpcode::kCollectivePermute:
-    case HloOpcode::kCustomCall:
     case HloOpcode::kDivide:
     case HloOpcode::kDynamicSlice:
     case HloOpcode::kDynamicUpdateSlice:
@@ -1930,6 +1931,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kConstant:
     case HloOpcode::kConvolution:
     case HloOpcode::kCopy:
+    case HloOpcode::kCustomCall:
     case HloOpcode::kDomain:
     case HloOpcode::kDot:
     case HloOpcode::kFusion:
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 2d48e12263..cb56f4cd19 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -333,19 +333,6 @@ class LayoutAssignment : public HloModulePass {
       const ResultLayoutConstraint& layout_constraint,
       LayoutConstraints* constraints);
 
-  // By default LayoutAssignment ensures that inputs and outputs of CustomCalls
-  // have the "major-first" layout (i.e.  {n, n-1, ..., 0}).
-  //
-  // If this function returns true, LayoutAssignment does not set a layout for
-  // the given CustomCall.  It's up to the backend to set one in
-  // AddBackendConstraints, if necessary.
-  //
-  // Precondition: instruction->opcode() == HloOpcode::kCustomCall.
-  virtual bool CustomCallRequiresMajorFirstLayout(
-      const HloInstruction* /*instruction*/) {
-    return true;
-  }
-
   // Called after layouts of an instruction have been finalized to allow
   // subclasses to check for platform specific assumptions.
   virtual Status Verify(const HloInstruction* instruction) {
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 2c549cd872..ff6fdb5e4a 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -65,6 +65,27 @@ class LayoutAssignmentTest : public HloVerifiedTestBase {
         FindInstruction(module, name)->shape().layout().minor_to_major();
     return std::vector<int64>(minor_to_major.begin(), minor_to_major.end());
   }
+
+  void ExpectLayoutIs(const Shape& shape,
+                      absl::Span<const int64> minor_to_major) {
+    const Layout expected = LayoutUtil::MakeLayout(minor_to_major);
+    EXPECT_TRUE(LayoutUtil::Equal(shape.layout(), expected))
+        << "Expected layout " << expected << ", actual " << shape.layout();
+  }
+
+  void ExpectTupleLayoutIs(
+      const Shape& shape,
+      std::initializer_list<absl::Span<const int64>> minor_to_majors) {
+    int i = 0;
+    for (const absl::Span<const int64> minor_to_major : minor_to_majors) {
+      const Layout expected = LayoutUtil::MakeLayout(minor_to_major);
+      const Layout& actual = ShapeUtil::GetTupleElementShape(shape, i).layout();
+      EXPECT_TRUE(LayoutUtil::Equal(actual, expected))
+          << "Expected tuple element " << i << " layout " << expected
+          << ", actual " << actual;
+      ++i;
+    }
+  }
 };
 
 TEST_F(LayoutAssignmentTest, ComputationLayout) {
@@ -1102,5 +1123,174 @@ TEST_F(LayoutAssignmentTest, TupleCopyOnLayoutMismatch) {
   EXPECT_THAT(LayoutOf(&module(), "next_buf"), ElementsAre(1, 0));
 }
 
+TEST_F(LayoutAssignmentTest, CustomCallNotLayoutConstrained) {
+  const char* module_str = R"(
+HloModule CustomCallNotLayoutConstrained
+
+ENTRY %CustomCallWithNotLayoutConstrained (p: f32[42,2,3]) -> f32[1,2,3,4] {
+  %p = f32[42,2,3] parameter(0)
+  ROOT %custom-call = f32[1,2,3,4] custom-call(f32[42,2,3] %p), custom_call_target="baz"
+}
+)";
+  // Try with a couple different layouts. In each case the custom calls operand
+  // and result layout should match that of the computation.
+  {
+    TF_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<VerifiedHloModule> module,
+        ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
+    ComputationLayout computation_layout = module->entry_computation_layout();
+    *computation_layout.mutable_parameter_layout(0) =
+        ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {42, 2, 3}, {0, 2, 1}));
+    *computation_layout.mutable_result_layout() = ShapeLayout(
+        ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {3, 2, 0, 1}));
+    AssignLayouts(module.get(), &computation_layout);
+
+    HloInstruction* root = module->entry_computation()->root_instruction();
+    ASSERT_THAT(root, op::CustomCall(op::Parameter()));
+    ExpectLayoutIs(root->shape(), {3, 2, 0, 1});
+    ExpectLayoutIs(root->operand(0)->shape(), {0, 2, 1});
+  }
+  {
+    TF_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<VerifiedHloModule> module,
+        ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
+    ComputationLayout computation_layout = module->entry_computation_layout();
+    *computation_layout.mutable_parameter_layout(0) =
+        ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {42, 2, 3}, {0, 1, 2}));
+    *computation_layout.mutable_result_layout() = ShapeLayout(
+        ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {0, 2, 3, 1}));
+    AssignLayouts(module.get(), &computation_layout);
+
+    HloInstruction* root = module->entry_computation()->root_instruction();
+    ASSERT_THAT(root, op::CustomCall(op::Parameter()));
+    ExpectLayoutIs(root->shape(), {0, 2, 3, 1});
+    ExpectLayoutIs(root->operand(0)->shape(), {0, 1, 2});
+  }
+}
+
+TEST_F(LayoutAssignmentTest, CustomCallLayoutConstrained) {
+  const char* module_str = R"(
+HloModule CustomCallLayoutConstrained
+
+ENTRY %CustomCallWithLayoutConstraints (p0: f32[4,4], p1: f32[2,3]) -> f32[1,2,3,4] {
+  %p0 = f32[4,4] parameter(0)
+  %p1 = f32[2,3] parameter(1)
+  ROOT %custom-call = f32[1,2,3,4]{3,2,0,1} custom-call(f32[4,4] %p0, f32[2,3] %p1), custom_call_target="baz", operand_layout_constraints={f32[4,4]{0,1}, f32[2,3]{1,0}}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<VerifiedHloModule> module,
+      ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
+  ComputationLayout computation_layout = module->entry_computation_layout();
+  *computation_layout.mutable_parameter_layout(0) =
+      ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}));
+  *computation_layout.mutable_parameter_layout(1) =
+      ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0}));
+  *computation_layout.mutable_result_layout() = ShapeLayout(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {2, 1, 0, 3}));
+  AssignLayouts(module.get(), &computation_layout);
+
+  // The custom call should be partially encapsulated in kCopy instructions
+  // because of the layout mismatches.
+  ASSERT_THAT(module->entry_computation()->root_instruction(),
+              op::Copy(op::CustomCall(op::Copy(), op::Parameter())));
+
+  const HloInstruction* custom_call =
+      module->entry_computation()->root_instruction()->operand(0);
+  ExpectLayoutIs(custom_call->shape(), {3, 2, 0, 1});
+  ExpectLayoutIs(custom_call->operand(0)->shape(), {0, 1});
+  ExpectLayoutIs(custom_call->operand(1)->shape(), {1, 0});
+}
+
+TEST_F(LayoutAssignmentTest, CustomCallLayoutConstrainedZeroOperands) {
+  const char* module_str = R"(
+HloModule CustomCallLayoutConstrainedZeroOperands
+
+ENTRY %CustomCallLayoutConstrainedZeroOperands () -> f32[1,2,3,4] {
+  ROOT %custom-call = f32[1,2,3,4]{3,2,0,1} custom-call(), custom_call_target="baz", operand_layout_constraints={}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<VerifiedHloModule> module,
+      ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
+  ComputationLayout computation_layout = module->entry_computation_layout();
+  *computation_layout.mutable_result_layout() = ShapeLayout(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {2, 1, 0, 3}));
+  AssignLayouts(module.get(), &computation_layout);
+
+  ASSERT_THAT(module->entry_computation()->root_instruction(),
+              op::Copy(op::CustomCall()));
+
+  const HloInstruction* custom_call =
+      module->entry_computation()->root_instruction()->operand(0);
+  ExpectLayoutIs(custom_call->shape(), {3, 2, 0, 1});
+}
+
+TEST_F(LayoutAssignmentTest, CustomCallLayoutConstrainedTupleOperand) {
+  const char* module_str = R"(
+HloModule CustomCallLayoutConstrainedTupleOperand
+
+ENTRY %CustomCallLayoutConstrainedTupleOperand (p0: f32[4,4], p1: f32[2,3]) -> f32[1,2,3,4] {
+  %p0 = f32[4,4] parameter(0)
+  %p1 = f32[2,3] parameter(1)
+  %tuple = (f32[4,4], f32[2,3]) tuple(%p0, %p1)
+  ROOT %custom-call = f32[1,2,3,4]{3,2,0,1} custom-call(%tuple), custom_call_target="baz", operand_layout_constraints={(f32[4,4]{1,0}, f32[2,3]{0,1})}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<VerifiedHloModule> module,
+      ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
+  ComputationLayout computation_layout = module->entry_computation_layout();
+  *computation_layout.mutable_parameter_layout(0) =
+      ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}));
+  *computation_layout.mutable_parameter_layout(1) =
+      ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0}));
+  *computation_layout.mutable_result_layout() = ShapeLayout(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {2, 1, 0, 3}));
+  AssignLayouts(module.get(), &computation_layout);
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  ExpectLayoutIs(root->shape(), {2, 1, 0, 3});
+
+  ASSERT_THAT(module->entry_computation()->root_instruction(),
+              op::Copy(op::CustomCall(op::Tuple())));
+
+  const HloInstruction* custom_call =
+      module->entry_computation()->root_instruction()->operand(0);
+  ExpectLayoutIs(custom_call->shape(), {3, 2, 0, 1});
+  ExpectTupleLayoutIs(custom_call->operand(0)->shape(), {{1, 0}, {0, 1}});
+}
+
+TEST_F(LayoutAssignmentTest, CustomCallLayoutConstrainedTupleResult) {
+  const char* module_str = R"(
+HloModule CustomCallLayoutConstrainedTupleResult
+
+ENTRY %CustomCallLayoutConstrainedTupleResult (p0: f32[4,4]) -> (f32[4,4]{1,0}, f32[2,3]{0,1}) {
+  %p0 = f32[4,4] parameter(0)
+  ROOT %custom-call = (f32[4,4]{1,0}, f32[2,3]{0,1}) custom-call(%p0), custom_call_target="baz", operand_layout_constraints={f32[4,4]{1,0}}
+}
+)";
+  // Try with a couple different layouts. In each case the custom calls operand
+  // and result layout should match that of the computation.
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<VerifiedHloModule> module,
+      ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
+  ComputationLayout computation_layout = module->entry_computation_layout();
+  *computation_layout.mutable_parameter_layout(0) =
+      ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}));
+  *computation_layout.mutable_result_layout() =
+      ShapeLayout(ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}),
+           ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0})}));
+  AssignLayouts(module.get(), &computation_layout);
+
+  ExpectTupleLayoutIs(module->entry_computation()->root_instruction()->shape(),
+                      {{1, 0}, {1, 0}});
+
+  const HloInstruction* custom_call =
+      FindInstruction(module.get(), "custom-call");
+  ExpectTupleLayoutIs(custom_call->shape(), {{1, 0}, {0, 1}});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index d244923532..7f0201942b 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -1645,7 +1645,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 }
 
 std::ostream& operator<<(std::ostream& out, const Shape& shape) {
-  out << ShapeUtil::HumanString(shape);
+  out << ShapeUtil::HumanStringWithLayout(shape);
   return out;
 }
 
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index a693fa3595..001490c6a8 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -105,8 +105,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) {
   LiteralTestUtil::ExpectR0Near<float>(10.0f, result, error_spec_);
 }
 
-XLA_TEST_F(CustomCallTest,
-           DISABLED_ON_GPU(CustomCall_UsedInOtherComputations)) {
+XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(UsedInOtherComputations)) {
   auto module = CreateNewModule();
   auto b = HloComputation::Builder(TestName());
 
@@ -130,6 +129,53 @@ XLA_TEST_F(CustomCallTest,
       Array3D<float>{{{2, 3}, {4, 5}}, {{3, 4}, {5, 6}}}, result);
 }
 
+XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(InputAndOutputLayoutDiffer)) {
+  auto module = CreateNewModule();
+  auto b = HloComputation::Builder(TestName());
+
+  auto input =
+      b.AddInstruction(HloInstruction::CreateParameter(0, r2f32_, "p"));
+  b.AddInstruction(
+      HloInstruction::CreateCustomCall(r2f32_, {input}, "Add1ToValues"));
+
+  module->AddEntryComputation(b.Build());
+  ForceParameterLayout(module.get(), 0, LayoutUtil::MakeLayout({1, 0}));
+  ForceResultLayout(module.get(), LayoutUtil::MakeLayout({0, 1}));
+
+  Literal argument = LiteralUtil::CreateR2<float>({{1.f, 2.f}, {3.f, 4.f}});
+
+  // Note, the expected result is transposed! This is because the input and
+  // output layouts of the custom call differ and the called function just
+  // blindly adds one to each element.
+  Literal result = ExecuteAndTransfer(std::move(module), {&argument});
+  LiteralTestUtil::ExpectR2Equal<float>({{2.f, 4.f}, {3.f, 5.f}}, result);
+}
+
+XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(LayoutConstrained)) {
+  // The argument and result of the computation are set to different layouts,
+  // but the custom call is layout constrained to a fixed operand and result
+  // layout, so the correct result should be produced.
+  auto module = CreateNewModule();
+  auto b = HloComputation::Builder(TestName());
+
+  auto input =
+      b.AddInstruction(HloInstruction::CreateParameter(0, r2f32_, "p"));
+
+  const Shape& r2f32_dim0_major =
+      ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0});
+  b.AddInstruction(HloInstruction::CreateCustomCall(
+      r2f32_dim0_major, {input}, "Add1ToValues", {r2f32_dim0_major}));
+
+  module->AddEntryComputation(b.Build());
+  ForceParameterLayout(module.get(), 0, LayoutUtil::MakeLayout({1, 0}));
+  ForceResultLayout(module.get(), LayoutUtil::MakeLayout({0, 1}));
+
+  Literal argument = LiteralUtil::CreateR2<float>({{1.f, 2.f}, {3.f, 4.f}});
+
+  Literal result = ExecuteAndTransfer(std::move(module), {&argument});
+  LiteralTestUtil::ExpectR2Equal<float>({{2.f, 3.f}, {4.f, 5.f}}, result);
+}
+
 class CustomCallClientAPITest : public ClientLibraryTestBase {};
 
 // When using the client API, CustomCall targets can't begin with '$' -- these
-- 
GitLab


From af5b714179ff5e279ba27c024f453e2d75636ac9 Mon Sep 17 00:00:00 2001
From: Tim Shen <timshen@google.com>
Date: Mon, 8 Oct 2018 14:43:55 -0700
Subject: [PATCH 547/570] Add more logging to the convolution transformations.

PiperOrigin-RevId: 216252980
---
 .../xla/service/gpu/cudnn_convolution_algorithm_picker.cc      | 3 +++
 .../compiler/xla/service/gpu/cudnn_convolution_rewriter.cc     | 3 +++
 .../xla/service/gpu/cudnn_fused_convolution_rewriter.cc        | 3 ++-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 590c0a7d54..6d4a72038f 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -360,6 +360,9 @@ StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
   HloInstruction* new_call = computation->AddInstruction(
       instr->CloneWithNewOperands(new_call_shape, instr->operands()));
 
+  VLOG(1) << "Replacing convolution " << instr->ToString() << " with "
+          << new_call->ToString();
+
   TF_RETURN_IF_ERROR(new_call->set_backend_config(backend_config));
 
   // Repackage new_call so it has the same shape as the original call, namely
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc
index ef29237301..437d25727e 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc
@@ -525,6 +525,9 @@ StatusOr<bool> RunOnInstruction(HloInstruction* conv) {
   TF_RETURN_IF_ERROR(
       custom_call->set_backend_config(GetDefaultBackendConfig()));
 
+  VLOG(1) << "Replacing convolution " << conv->ToString() << " with "
+          << custom_call->ToString();
+
   // The CustomCall returns a tuple (conv_result, scratch_memory).  Extract out
   // the conv result and replace `conv` with it.
   TF_RETURN_IF_ERROR(conv->parent()->ReplaceWithNewInstruction(
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_fused_convolution_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_fused_convolution_rewriter.cc
index 3761c19cfc..d508cbc2e1 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_fused_convolution_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_fused_convolution_rewriter.cc
@@ -234,7 +234,8 @@ StatusOr<std::unique_ptr<HloInstruction>> TryRewriteToCudnnForwardRelu(
   config.set_side_input_scale(alpha_side_input);
   TF_RETURN_IF_ERROR(new_conv->set_backend_config(config));
 
-  VLOG(1) << "Rewriting " << conv->name() << " to " << new_conv->name();
+  VLOG(1) << "Replacing convolution " << conv->ToString() << " with "
+          << new_conv->ToString();
   return HloInstruction::CreateGetTupleElement(conv->shape().tuple_shapes(0),
                                                new_conv, 0);
 }
-- 
GitLab


From b3bd7b378d00190fef831092836a5df62e39e7ed Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Mon, 8 Oct 2018 14:44:37 -0700
Subject: [PATCH 548/570] Ignore args and kwargs for defun's get_concrete_fn if
 `PolymorphicFunction` was created with an input_signature.

PiperOrigin-RevId: 216253122
---
 tensorflow/python/eager/function.py      | 14 ++++++++++++++
 tensorflow/python/eager/function_test.py |  9 ++++-----
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 99bf375ea7..ff138cad1e 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -664,6 +664,11 @@ class Function(object):
 
     return self._build_call_outputs(outputs)
 
+  @property
+  def name(self):
+    """Function name."""
+    return self._inference_function.name
+
   @property
   def graph(self):
     """Returns the graph from which this function was constructed."""
@@ -721,6 +726,10 @@ class Function(object):
     return nest.map_structure(lambda x: x.dtype if x is not None else None,
                               self._func_graph.structured_outputs)
 
+  def add_to_graph(self, g):
+    """Adds this function into the graph g."""
+    return self._inference_function.add_to_graph(g)
+
   def _construct_backprop_function(self):
     """Constructs the backprop function object for this function."""
     backwards_graph = FuncGraph(_backward_name(self._func_graph.name))
@@ -1133,6 +1142,8 @@ class PolymorphicFunction(object):
       *args: inputs to specialize on.
       **kwargs: inputs to specialize on.
     """
+    if self._input_signature:
+      args, kwargs = None, None
     graph_function, _ = self._maybe_define_function(args, kwargs)
     return graph_function
 
@@ -1322,6 +1333,9 @@ def register(func, *args, **kwargs):
   function definition into graph. Register function with different input param
   will result into multiple version of functions registered in graph.
 
+  Also, `args` and `kwargs` are ignored if this `PolymorphicFunction` was
+  created with an `input_signature`.
+
   Args:
     func: the PolymorphicFunction instance that generated by a @defun
     *args: input arguments for the Python function.
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index e46bde098b..953f4300cf 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -1841,11 +1841,10 @@ class FunctionTest(test.TestCase):
         # pylint: disable=protected-access
         self.assertEqual(len(graph._functions), 3)
 
-        # Test input param shape mismatch
-        t2 = constant_op.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
-        with self.assertRaisesRegexp(
-            ValueError, 'Python inputs incompatible with input_signature'):
-          function.register(defun_matmul, t2, t2)
+        # Test register function with cache, note inputs are ignored.
+        function.register(defun_matmul)
+        graph = ops.get_default_graph()
+        self.assertEqual(len(graph._functions), 3)
 
   def testRegisterFunctionWithCache(self):
     def matmul(x, y):
-- 
GitLab


From 220c0f90af05ed1ca86831258888cc80757654fd Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Mon, 8 Oct 2018 15:00:36 -0700
Subject: [PATCH 549/570] [XLA] Simplify loop nesting in HandleConvolution

The calculation of a spatial coordinate in the kernel and activations is not
dependent on which part of the contracted dimension (input feature) we are in.

Rather than nesting the loops, the loops can be siblings:
- One loop over spatial dimensions
- One loop over the input feature group

This reduces the nesting depth which makes the code a little more readable and
might be slightly faster due work invariant in the spatial loop getting hoisted
out.

PiperOrigin-RevId: 216255839
---
 .../xla/service/hlo_evaluator_typed_visitor.h | 96 +++++++++----------
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index a450dc6ff5..84fbbd3e0c 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -1072,66 +1072,66 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
       // Convolve input feature with kernel.
       do {
+        // Find corresponding spatial dimension index for input (lhs).
+        int64 lhs_linear_spatial_index = 0;
+        int64 rhs_linear_spatial_index = 0;
+        for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
+          // Spatial dimension number for input (lhs) and output.
+          const int64 input_spatial_dim = dnums.input_spatial_dimensions(ki);
+          const int64 output_spatial_dim = dnums.output_spatial_dimensions(ki);
+
+          // Calculate lhs (input) index without taking base dilation into
+          // account.
+          const auto& window_dim = window.dimensions(ki);
+          const int64 undilated_index =
+              out_index[output_spatial_dim] * window_dim.stride() -
+              window_dim.padding_low() +
+              rhs_spatial_index[ki] * window_dim.window_dilation();
+          // Skip if the lhs (input) index is to be dilated.  As an
+          // optimization, skip this mod if there's no dilation.
+          if (window_dim.base_dilation() > 1 &&
+              undilated_index % window_dim.base_dilation() != 0) {
+            goto cnt;
+          }
+
+          // Calculate the actual lhs (input) index after dilation.  As an
+          // optimization, skip this integer divide if there's no dilation.
+          int64 lhs_spatial_index;
+          if (window_dim.base_dilation() > 1) {
+            lhs_spatial_index = undilated_index / window_dim.base_dilation();
+          } else {
+            lhs_spatial_index = undilated_index;
+          }
+
+          // Skip if input index is not in bounds.
+          if (!(lhs_spatial_index >= 0 &&
+                lhs_spatial_index < lhs_shape.dimensions(input_spatial_dim))) {
+            goto cnt;
+          }
+
+          lhs_linear_spatial_index +=
+              lhs_spatial_index * lhs_dim_multipliers[input_spatial_dim];
+          rhs_linear_spatial_index +=
+              (window_dim.window_reversal()
+                   ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
+                   : rhs_spatial_index[ki]) *
+              rhs_dim_multipliers[dnums.kernel_spatial_dimensions(ki)];
+        }
+
         for (int64 rhs_iz = 0; rhs_iz < input_feature_group_size; ++rhs_iz) {
           const int64 iz =
               feature_group_index * input_feature_group_size + rhs_iz;
 
-          int64 lhs_linear_index = 0;
+          int64 lhs_linear_index = lhs_linear_spatial_index;
           lhs_linear_index += out_index[output_batch_dim] *
                               lhs_dim_multipliers[input_batch_dim];
           lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim];
 
-          int64 rhs_linear_index = 0;
+          int64 rhs_linear_index = rhs_linear_spatial_index;
           rhs_linear_index += out_index[output_z_dim] *
                               rhs_dim_multipliers[kernel_output_z_dim];
           rhs_linear_index += rhs_iz * rhs_dim_multipliers[kernel_input_z_dim];
 
-          // Find corresponding spatial dimension index for input (lhs).
-          for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
-            // Spatial dimension number for input (lhs) and output.
-            const int64 input_spatial_dim = dnums.input_spatial_dimensions(ki);
-            const int64 output_spatial_dim =
-                dnums.output_spatial_dimensions(ki);
-
-            // Calculate lhs (input) index without taking base dilation into
-            // account.
-            const auto& window_dim = window.dimensions(ki);
-            const int64 undilated_index =
-                out_index[output_spatial_dim] * window_dim.stride() -
-                window_dim.padding_low() +
-                rhs_spatial_index[ki] * window_dim.window_dilation();
-            // Skip if the lhs (input) index is to be dilated.  As an
-            // optimization, skip this mod if there's no dilation.
-            if (window_dim.base_dilation() > 1 &&
-                undilated_index % window_dim.base_dilation() != 0) {
-              goto cnt;
-            }
-
-            // Calculate the actual lhs (input) index after dilation.  As an
-            // optimization, skip this integer divide if there's no dilation.
-            int64 lhs_spatial_index;
-            if (window_dim.base_dilation() > 1) {
-              lhs_spatial_index = undilated_index / window_dim.base_dilation();
-            } else {
-              lhs_spatial_index = undilated_index;
-            }
-            lhs_linear_index +=
-                lhs_spatial_index * lhs_dim_multipliers[input_spatial_dim];
-
-            // Skip if input index is not in bounds.
-            if (!(lhs_spatial_index >= 0 &&
-                  lhs_spatial_index <
-                      lhs_shape.dimensions(input_spatial_dim))) {
-              goto cnt;
-            }
-
-            rhs_linear_index +=
-                (window_dim.window_reversal()
-                     ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
-                     : rhs_spatial_index[ki]) *
-                rhs_dim_multipliers[dnums.kernel_spatial_dimensions(ki)];
-          }
-
           result_val +=
               static_cast<ElementwiseT>(lhs_literal_data[lhs_linear_index]) *
               static_cast<ElementwiseT>(rhs_literal_data[rhs_linear_index]);
-- 
GitLab


From 5da3cebe00111aa43e34b5a3fc12d1a97b838ba7 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 8 Oct 2018 15:02:13 -0700
Subject: [PATCH 550/570] Automated rollback of commit
 09b0fc199129e0f487a39741bdf674cf09035cbc

PiperOrigin-RevId: 216256115
---
 .../core/kernels/data/shuffle_dataset_op.cc   |  2 +-
 .../data/experimental/kernel_tests/BUILD      | 13 ------
 .../kernel_tests/random_dataset_test.py       | 45 -------------------
 .../kernel_tests/shuffle_and_repeat_test.py   | 21 +--------
 .../data/experimental/ops/random_ops.py       | 21 ++-------
 .../data/experimental/ops/shuffle_ops.py      | 21 ++-------
 tensorflow/python/data/kernel_tests/BUILD     |  1 -
 .../kernel_tests/shuffle_dataset_op_test.py   | 25 +----------
 tensorflow/python/data/ops/dataset_ops.py     | 22 ++-------
 tensorflow/python/data/util/BUILD             |  1 -
 tensorflow/python/data/util/random_seed.py    |  5 +--
 .../python/data/util/random_seed_test.py      | 13 +-----
 12 files changed, 16 insertions(+), 174 deletions(-)
 delete mode 100644 tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py

diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index 9f54c381a9..66466d6a36 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -485,7 +485,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
                      int64 buffer_size, int64 seed, int64 seed2, int64 count)
         : ShuffleDatasetBase(ctx, input, buffer_size, count),
           seed_(seed),
-          seed2_(seed2) {}
+          seed2_(seed) {}
 
     string DebugString() const override {
       return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index a67f6ff031..4eef9580ad 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -453,18 +453,6 @@ cuda_py_test(
     tags = ["no_windows_gpu"],
 )
 
-py_test(
-    name = "random_dataset_test",
-    srcs = ["random_dataset_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python/data/experimental/ops:random_ops",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "@absl_py//absl/testing:parameterized",
-    ],
-)
-
 py_library(
     name = "reader_dataset_ops_test_base",
     testonly = 1,
@@ -574,7 +562,6 @@ py_test(
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py
deleted file mode 100644
index d403a575ec..0000000000
--- a/tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for `tf.data.experimental.RandomDataset()`."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-
-from tensorflow.python.data.experimental.ops import random_ops
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-
-
-class RandomDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  @parameterized.named_parameters(
-      ("NoSeed", None),
-      ("WithSeed", 42),
-  )
-  def testZipRandomDataset(self, seed):
-    dataset = random_ops.RandomDataset(seed=seed).take(30)
-    dataset = dataset_ops.Dataset.zip((dataset, dataset))
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(30):
-        x, y = sess.run(next_element)
-        self.assertEqual(x, y)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
index 883169495f..c208963a86 100644
--- a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import shuffle_ops
@@ -28,7 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
 
 
-class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase):
+class ShuffleAndRepeatTest(test_base.DatasetTestBase):
 
   def _build_ds(self, seed, count=5, num_elements=20):
     return dataset_ops.Dataset.range(num_elements).apply(
@@ -111,24 +110,6 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase):
       with self.session(graph=g) as sess:
         sess.run(get_next_op)
 
-  @parameterized.named_parameters(
-      ("NoSeed", None),
-      ("WithSeed", 42),
-  )
-  def testShuffleAndRepeatAndZipDataset(self, seed):
-    dataset = dataset_ops.Dataset.range(10).apply(
-        shuffle_ops.shuffle_and_repeat(10, count=3, seed=seed))
-    dataset = dataset_ops.Dataset.zip((dataset, dataset))
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(30):
-        x, y = sess.run(next_element)
-        self.assertEqual(x, y)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/ops/random_ops.py b/tensorflow/python/data/experimental/ops/random_ops.py
index 25d7fbf691..e3a2aeab31 100644
--- a/tensorflow/python/data/experimental/ops/random_ops.py
+++ b/tensorflow/python/data/experimental/ops/random_ops.py
@@ -33,26 +33,13 @@ class RandomDataset(dataset_ops.DatasetSource):
   def __init__(self, seed=None):
     """A `Dataset` of pseudorandom values."""
     super(RandomDataset, self).__init__()
-
-    # NOTE(mrry): We generate the seed-pair once per graph in which the dataset
-    # is iterated over, and cache it in `self._graph_seed_map`. This supports
-    # two features: iterating over the same `ShuffleDataset` twice in the same
-    # pipeline and observing the same order (by tying the seeds together with
-    # a randomly-generated seed), and using `Dataset.make_one_shot_iterator()`,
-    # which requires the stateful RNG op to be created inside the same graph as
-    # the dataset.
-    self._original_seed = seed
-    self._graph_seed_map = {}
+    self._seed, self._seed2 = random_seed.get_seed(seed)
 
   def _as_variant_tensor(self):
-    try:
-      seed, seed2 = self._graph_seed_map[ops.get_default_graph()]
-    except KeyError:
-      seed, seed2 = random_seed.get_seed(self._original_seed)
-      self._graph_seed_map[ops.get_default_graph()] = (seed, seed2)
-
     return gen_dataset_ops.random_dataset(
-        seed=seed, seed2=seed2, **dataset_ops.flat_structure(self))
+        seed=self._seed,
+        seed2=self._seed2,
+        **dataset_ops.flat_structure(self))
 
   @property
   def output_classes(self):
diff --git a/tensorflow/python/data/experimental/ops/shuffle_ops.py b/tensorflow/python/data/experimental/ops/shuffle_ops.py
index a82e4b7d09..a4307212da 100644
--- a/tensorflow/python/data/experimental/ops/shuffle_ops.py
+++ b/tensorflow/python/data/experimental/ops/shuffle_ops.py
@@ -39,32 +39,17 @@ class _ShuffleAndRepeatDataset(dataset_ops.UnaryDataset):
     else:
       self._count = ops.convert_to_tensor(
           count, dtype=dtypes.int64, name="count")
-
-    # NOTE(mrry): We generate the seed-pair once per graph in which the dataset
-    # is iterated over, and cache it in `self._graph_seed_map`. This supports
-    # two features: iterating over the same `ShuffleDataset` twice in the same
-    # pipeline and observing the same order (by tying the seeds together with
-    # a randomly-generated seed), and using `Dataset.make_one_shot_iterator()`,
-    # which requires the stateful RNG op to be created inside the same graph as
-    # the dataset.
-    self._original_seed = seed
-    self._graph_seed_map = {}
+    self._seed, self._seed2 = random_seed.get_seed(seed)
 
   def _as_variant_tensor(self):
-    try:
-      seed, seed2 = self._graph_seed_map[ops.get_default_graph()]
-    except KeyError:
-      seed, seed2 = random_seed.get_seed(self._original_seed)
-      self._graph_seed_map[ops.get_default_graph()] = (seed, seed2)
-
     # pylint: disable=protected-access
     input_resource = self._input_dataset._as_variant_tensor()
     return gen_dataset_ops.shuffle_and_repeat_dataset(
         input_resource,
         buffer_size=self._buffer_size,
         count=self._count,
-        seed=seed,
-        seed2=seed2,
+        seed=self._seed,
+        seed2=self._seed2,
         **dataset_ops.flat_structure(self))
     # pylint: enable=protected-access
 
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index ecb24103b3..c7295d6e69 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -443,7 +443,6 @@ tf_py_test(
     srcs = ["shuffle_dataset_op_test.py"],
     additional_deps = [
         ":test_base",
-        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
index 6001721726..347af18576 100644
--- a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 
 import collections
 
-from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.data.kernel_tests import test_base
@@ -32,7 +31,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class ShuffleDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+class ShuffleDatasetTest(test_base.DatasetTestBase):
 
   def testShuffleDataset(self):
     components = (
@@ -210,27 +209,5 @@ class ShuffleDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
-  @parameterized.named_parameters(
-      ("ReshuffleEachIterationNoSeed", None, True),
-      ("ReshuffleEachIterationWithSeed", 42, True),
-      ("NoReshuffleEachIterationNoSeed", None, False),
-      ("NoReshuffleEachIterationWithSeed", 42, False),
-  )
-  def testShuffleAndZipDataset(self, seed, reshuffle):
-    dataset = (dataset_ops.Dataset.range(10)
-               .shuffle(10, seed=seed, reshuffle_each_iteration=reshuffle)
-               .repeat(3))
-    dataset = dataset_ops.Dataset.zip((dataset, dataset))
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(30):
-        x, y = sess.run(next_element)
-        self.assertEqual(x, y)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 2d036fd0d6..b7e19055f2 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -2254,34 +2254,18 @@ class ShuffleDataset(UnaryDataset):
     self._input_dataset = input_dataset
     self._buffer_size = ops.convert_to_tensor(
         buffer_size, dtype=dtypes.int64, name="buffer_size")
-
-    # NOTE(mrry): We generate the seed-pair once per graph in which the dataset
-    # is iterated over, and cache it in `self._graph_seed_map`. This supports
-    # two features: iterating over the same `ShuffleDataset` twice in the same
-    # pipeline and observing the same order (by tying the seeds together with
-    # a randomly-generated seed), and using `Dataset.make_one_shot_iterator()`,
-    # which requires the stateful RNG op to be created inside the same graph as
-    # the dataset.
-    self._original_seed = seed
-    self._graph_seed_map = {}
-
+    self._seed, self._seed2 = random_seed.get_seed(seed)
     if reshuffle_each_iteration is None:
       self._reshuffle_each_iteration = True
     else:
       self._reshuffle_each_iteration = reshuffle_each_iteration
 
   def _as_variant_tensor(self):
-    try:
-      seed, seed2 = self._graph_seed_map[ops.get_default_graph()]
-    except KeyError:
-      seed, seed2 = random_seed.get_seed(self._original_seed)
-      self._graph_seed_map[ops.get_default_graph()] = (seed, seed2)
-
     return gen_dataset_ops.shuffle_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         buffer_size=self._buffer_size,
-        seed=seed,
-        seed2=seed2,
+        seed=self._seed,
+        seed2=self._seed2,
         reshuffle_each_iteration=self._reshuffle_each_iteration,
         **flat_structure(self))
 
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index 95bf3209d7..39082ce370 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -142,7 +142,6 @@ py_test(
         ":random_seed",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:random_ops",
         "//tensorflow/python:util",
     ],
 )
diff --git a/tensorflow/python/data/util/random_seed.py b/tensorflow/python/data/util/random_seed.py
index d24df6d957..d5169f7a53 100644
--- a/tensorflow/python/data/util/random_seed.py
+++ b/tensorflow/python/data/util/random_seed.py
@@ -24,7 +24,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
 
 
 def get_seed(seed):
@@ -38,7 +37,7 @@ def get_seed(seed):
 
   Returns:
     A tuple of two `tf.int64` scalar tensors that should be used for the local
-    seeds of the calling dataset.
+    seed of the calling dataset.
   """
   seed, seed2 = random_seed.get_seed(seed)
   if seed is None:
@@ -46,7 +45,7 @@ def get_seed(seed):
   else:
     seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
   if seed2 is None:
-    seed2 = random_ops.random_uniform([], 1, 2**63 - 1, dtype=dtypes.int64)
+    seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
   else:
     with ops.name_scope("seed2") as scope:
       seed2 = ops.convert_to_tensor(seed2, dtype=dtypes.int64)
diff --git a/tensorflow/python/data/util/random_seed_test.py b/tensorflow/python/data/util/random_seed_test.py
index 5df2e38c62..a809151e6e 100644
--- a/tensorflow/python/data/util/random_seed_test.py
+++ b/tensorflow/python/data/util/random_seed_test.py
@@ -41,6 +41,7 @@ class RandomSeedTest(test.TestCase):
         # (input_graph_seed, input_op_seed)
         # and output from get_seed:
         # (output_graph_seed, output_op_seed)
+        ((None, None), (0, 0)),
         ((None, 1), (random_seed.DEFAULT_GRAPH_SEED, 1)),
         ((1, 1), (1, 1)),
         ((0, 0), (0, 2**31 - 1)),  # Avoid nondeterministic (0, 0) output
@@ -77,18 +78,6 @@ class RandomSeedTest(test.TestCase):
       self.assertEqual((g_seed, op_seed), toutput, msg=msg)
       random_seed.set_random_seed(None)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testNondeterministicRandomSeed(self):
-    random_seed.set_random_seed(None)
-    op_seeds = []
-    for _ in range(50):
-      g_seed, op_seed = data_random_seed.get_seed(None)
-      g_seed = self.evaluate(g_seed)
-      op_seed = self.evaluate(op_seed)
-      self.assertEqual(0, g_seed)
-      self.assertNotEqual(0, op_seed)
-      op_seeds.append(op_seed)
-    self.assertGreater(len(set(op_seeds)), 1)
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From b055d78b0edbf117ec5f7f2662d3bb2781ae02b3 Mon Sep 17 00:00:00 2001
From: Jared Duke <jdduke@google.com>
Date: Mon, 8 Oct 2018 15:09:57 -0700
Subject: [PATCH 551/570] Fix issue with type inference for ops with fixed
 output types

Use the ArgDef::type field when available for propagating
the output types from a given unsupported operator.

PiperOrigin-RevId: 216257741
---
 tensorflow/contrib/lite/toco/import_tensorflow.cc |  7 +++++--
 .../contrib/lite/toco/import_tensorflow_test.cc   | 15 +++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 133ef79a34..32f22e1ea0 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -1151,11 +1151,14 @@ tensorflow::Status ConvertUnsupportedOperator(
     op->output_data_types.push_back(ConvertDataType(output_type));
   } else if (op_def != nullptr) {
     for (const auto& output_arg : op_def->output_arg()) {
-      if (HasAttr(node, output_arg.type_attr())) {
+      if (output_arg.type() != tensorflow::DT_INVALID) {
+        op->output_data_types.push_back(ConvertDataType(output_arg.type()));
+      } else if (HasAttr(node, output_arg.type_attr())) {
         op->output_data_types.push_back(
             ConvertDataType(GetDataTypeAttr(node, output_arg.type_attr())));
       } else {
-        LOG(INFO) << "Op node missing output type attribute: " << node.name();
+        LOG(WARNING) << "Op node missing output type attribute: "
+                     << node.name();
         op->output_data_types.clear();
         break;
       }
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow_test.cc b/tensorflow/contrib/lite/toco/import_tensorflow_test.cc
index 8a236d4444..cd9a144b52 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow_test.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow_test.cc
@@ -235,6 +235,21 @@ TEST_P(TypeImportTest, BasicTypeInference) {
 INSTANTIATE_TEST_CASE_P(BasicTypeInference, TypeImportTest,
                         ::testing::ValuesIn(UnaryTestTypes()));
 
+TEST(ImportTest, TypeInferenceWithFixedOutputType) {
+  // Create an op that has a fixed output type (bool).
+  Model model;
+  EXPECT_TRUE(ImportNode(BuildNode("IsFinite", {{1, 2}, {2, 3}}), &model).ok());
+  ASSERT_THAT(model.operators.size(), ::testing::Ge(1));
+  ASSERT_EQ(model.operators[0]->type, OperatorType::kUnsupported);
+  const TensorFlowUnsupportedOperator* op =
+      static_cast<const TensorFlowUnsupportedOperator*>(
+          model.operators[0].get());
+
+  // The static output type should be indicated in the imported op.
+  ASSERT_THAT(op->output_data_types,
+              ::testing::ElementsAre(ArrayDataType::kBool));
+}
+
 TEST(ImportTest, FailedTypeInference) {
   // Create a unary op with no Type ("T") annotation.
   NodeDef node;
-- 
GitLab


From 0b13d0806b061deaec0e96cfdca1ae4509174f89 Mon Sep 17 00:00:00 2001
From: Ruoxin Sang <rxsang@google.com>
Date: Mon, 8 Oct 2018 15:24:56 -0700
Subject: [PATCH 552/570] Simple comment fix in CheckpointInputPipelineHook.

PiperOrigin-RevId: 216260216
---
 tensorflow/python/data/experimental/ops/iterator_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/data/experimental/ops/iterator_ops.py b/tensorflow/python/data/experimental/ops/iterator_ops.py
index 72d7d58f06..5eb2563977 100644
--- a/tensorflow/python/data/experimental/ops/iterator_ops.py
+++ b/tensorflow/python/data/experimental/ops/iterator_ops.py
@@ -198,7 +198,7 @@ class CheckpointInputPipelineHook(session_run_hook.SessionRunHook):
     # is run *after* this hook. That is troublesome because
     # 1. If a checkpoint exists and this hook restores it, the initializer hook
     #    will override it.
-    # 2. If no checkpoint exists, this hook will try to save an initialized
+    # 2. If no checkpoint exists, this hook will try to save an uninitialized
     #    iterator which will result in an exception.
     #
     # As a temporary fix we enter the following implicit contract between this
-- 
GitLab


From a991acba07ce6c5903ee84e4a72d3d59e22b77fc Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Mon, 8 Oct 2018 15:26:34 -0700
Subject: [PATCH 553/570] Internal Change.

PiperOrigin-RevId: 216260437
---
 tensorflow/contrib/__init__.py | 8 --------
 tensorflow/python/__init__.py  | 7 -------
 2 files changed, 15 deletions(-)

diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index e71b0e0ae3..f52a1a7bab 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -21,14 +21,6 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.tools import component_api_helper
-component_api_helper.package_hook(
-    parent_package_str=(
-        "tensorflow.contrib"),
-    child_package_str=(
-        "tensorflow_estimator.contrib.estimator"))
-del component_api_helper
-
 # Add projects here, they will show up under tf.contrib.
 from tensorflow.contrib import autograph
 from tensorflow.contrib import batching
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 4921ecc43c..a2ab63bb48 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -48,13 +48,6 @@ import numpy as np
 
 from tensorflow.python import pywrap_tensorflow
 
-from tensorflow.python.tools import component_api_helper
-component_api_helper.package_hook(
-    parent_package_str='tensorflow.python',
-    child_package_str=(
-        'tensorflow_estimator.python.estimator'))
-del component_api_helper
-
 # Protocol buffers
 from tensorflow.core.framework.graph_pb2 import *
 from tensorflow.core.framework.node_def_pb2 import *
-- 
GitLab


From eb0f862ba60f41e8d0f06ceb6fc65f7f9905a25a Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 8 Oct 2018 15:27:40 -0700
Subject: [PATCH 554/570] Automated rollback of commit
 13b47e6c4f9d7b295948b1057139bf676e394b6f

PiperOrigin-RevId: 216260575
---
 tensorflow/core/kernels/data/iterator_ops.cc  |  4 +++
 .../kernels/data/map_and_batch_dataset_op.cc  |  9 +++----
 .../core/kernels/data/model_dataset_op.cc     | 10 +++----
 .../data/parallel_interleave_dataset_op.cc    | 27 ++++++++-----------
 .../kernels/data/parallel_map_iterator.cc     |  9 +++----
 .../core/kernels/data/prefetch_dataset_op.cc  | 10 +++----
 tensorflow/core/kernels/data/writer_ops.cc    | 12 ++++-----
 7 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 8acd6cc724..7a833668ac 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -16,8 +16,10 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/common_runtime/renamed_device.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/resource_op_kernel.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
@@ -25,11 +27,13 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/data/optional_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 namespace data {
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index 0fb721cd7c..f45a239793 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -445,10 +445,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
           auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
-          runner_thread_ =
-              MakeUnique<BackgroundWorker>(ctx->env(), "runner_thread");
-          runner_thread_->Schedule(
-              std::bind(&Iterator::RunnerThread, this, ctx_copy));
+          runner_thread_.reset(ctx->env()->StartThread(
+              {}, "runner_thread",
+              std::bind(&Iterator::RunnerThread, this, ctx_copy)));
         }
       }
 
@@ -704,7 +703,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_;
       // Buffer for storing the (intermediate) batch results.
       std::deque<std::shared_ptr<BatchResult>> batch_results_ GUARDED_BY(*mu_);
-      std::unique_ptr<BackgroundWorker> runner_thread_ GUARDED_BY(*mu_);
+      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
       bool cancelled_ GUARDED_BY(*mu_) = false;
     };
 
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc
index 859df57962..9aa505f4f1 100644
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -127,10 +126,9 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         if (!optimize_thread_) {
           std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-          optimize_thread_ =
-              MakeUnique<BackgroundWorker>(ctx->env(), "optimize_thread");
-          optimize_thread_->Schedule(
-              [this, new_ctx]() { OptimizeThread(new_ctx); });
+          optimize_thread_.reset(ctx->env()->StartThread(
+              {}, "optimize_thread",
+              [this, new_ctx]() { OptimizeThread(new_ctx); }));
         }
         return Status::OK();
       }
@@ -169,7 +167,7 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
       mutex mu_;
       condition_variable cond_var_;
       std::shared_ptr<model::Model> model_;
-      std::unique_ptr<BackgroundWorker> optimize_thread_ GUARDED_BY(mu_);
+      std::unique_ptr<Thread> optimize_thread_ GUARDED_BY(mu_);
       bool cancelled_ GUARDED_BY(mu_) = false;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
     };
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 9c836b836e..6b6b3d6ab9 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -482,10 +481,9 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
           worker_threads_.reserve(dataset()->num_threads());
           for (size_t i = 0; i < dataset()->num_threads(); ++i) {
             std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(
-                MakeUnique<BackgroundWorker>(ctx->env(), "worker_thread"));
-            worker_threads_.back()->Schedule(
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); });
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, "worker_thread",
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
           }
         }
         return Status::OK();
@@ -582,10 +580,9 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
             }
             workers_[i].SetInputs(s, std::move(args));
             std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(
-                MakeUnique<BackgroundWorker>(ctx->env(), "worker_thread"));
-            worker_threads_.back()->Schedule(
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); });
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, "worker_thread",
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
             if (i < dataset()->cycle_length_) {
               interleave_indices_.push_back(i);
             } else {
@@ -1050,8 +1047,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       // The worker threads. This must be last to ensure the
       // threads have exited before any other members are deallocated.
       // TODO(b/65178177): Avoid allocating additional threads.
-      std::vector<std::unique_ptr<BackgroundWorker>> worker_threads_
-          GUARDED_BY(mu_);
+      std::vector<std::unique_ptr<Thread>> worker_threads_ GUARDED_BY(mu_);
     };
 
     const DatasetBase* const input_;
@@ -1393,10 +1389,9 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
           std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-          runner_thread_ =
-              MakeUnique<BackgroundWorker>(ctx->env(), "runner_thread");
-          runner_thread_->Schedule(
-              [this, new_ctx]() { RunnerThread(new_ctx); });
+          runner_thread_.reset(ctx->env()->StartThread(
+              {}, "runner_thread",
+              [this, new_ctx]() { RunnerThread(new_ctx); }));
         }
       }
 
@@ -1650,7 +1645,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       int64 num_calls_ GUARDED_BY(*mu_) = 0;
 
       std::unique_ptr<thread::ThreadPool> thread_pool_;
-      std::unique_ptr<BackgroundWorker> runner_thread_ GUARDED_BY(*mu_);
+      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
 
       // Identifies whether background activity should be cancelled.
       bool cancelled_ GUARDED_BY(*mu_) = false;
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index e69274e4f2..ebf41925c9 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -181,10 +181,9 @@ class ParallelMapIterator : public DatasetBaseIterator {
       EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
     if (!runner_thread_) {
       auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
-      runner_thread_ =
-          MakeUnique<BackgroundWorker>(ctx->env(), "runner_thread");
-      runner_thread_->Schedule(
-          std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy));
+      runner_thread_.reset(ctx->env()->StartThread(
+          {}, "runner_thread",
+          std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy)));
     }
   }
 
@@ -332,7 +331,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   // Buffer for storing the invocation results.
   std::deque<std::shared_ptr<InvocationResult>> invocation_results_
       GUARDED_BY(*mu_);
-  std::unique_ptr<BackgroundWorker> runner_thread_ GUARDED_BY(*mu_);
+  std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
   bool cancelled_ GUARDED_BY(*mu_) = false;
 };
 
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index e9c38eb8a0..754ed772db 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -257,11 +256,10 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     Status EnsurePrefetchThreadStarted(IteratorContext* ctx)
         EXCLUSIVE_LOCKS_REQUIRED(mu_) {
       if (!prefetch_thread_) {
-        prefetch_thread_ =
-            MakeUnique<BackgroundWorker>(ctx->env(), "prefetch_thread");
         std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-        prefetch_thread_->Schedule(
-            [this, new_ctx]() { PrefetchThread(new_ctx); });
+        prefetch_thread_.reset(ctx->env()->StartThread(
+            {}, "prefetch_thread",
+            [this, new_ctx]() { PrefetchThread(new_ctx); }));
       }
       return Status::OK();
     }
@@ -365,7 +363,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     string prefix_end_;
     PrefetchAutotuner auto_tuner_ GUARDED_BY(mu_);
     std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
-    std::unique_ptr<BackgroundWorker> prefetch_thread_ GUARDED_BY(mu_);
+    std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
     bool cancelled_ GUARDED_BY(mu_) = false;
     bool prefetch_thread_finished_ GUARDED_BY(mu_) = false;
   };
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc
index 7bb2077b62..3f76695bb1 100644
--- a/tensorflow/core/kernels/data/writer_ops.cc
+++ b/tensorflow/core/kernels/data/writer_ops.cc
@@ -29,10 +29,10 @@ class ToTFRecordOp : public AsyncOpKernel {
  public:
   explicit ToTFRecordOp(OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx),
-        background_worker_(
-            ctx->env(),
-            strings::StrCat("to_tf_record_op_", SanitizeThreadSuffix(name()))) {
-  }
+        thread_pool_(new thread::ThreadPool(
+            ctx->env(), ThreadOptions(),
+            strings::StrCat("to_tf_record__op_", SanitizeThreadSuffix(name())),
+            1 /* num_threads */, false /* low_latency_hint */)) {}
 
   template <typename T>
   Status ParseScalarArgument(OpKernelContext* ctx,
@@ -50,7 +50,7 @@ class ToTFRecordOp : public AsyncOpKernel {
     // The call to `iterator->GetNext()` may block and depend on an
     // inter-op thread pool thread, so we issue the call from the
     // owned thread pool.
-    background_worker_.Schedule([this, ctx, done]() {
+    thread_pool_->Schedule([this, ctx, done]() {
       string filename;
       OP_REQUIRES_OK_ASYNC(
           ctx, ParseScalarArgument<string>(ctx, "filename", &filename), done);
@@ -97,7 +97,7 @@ class ToTFRecordOp : public AsyncOpKernel {
   }
 
  private:
-  BackgroundWorker background_worker_;
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("DatasetToTFRecord").Device(DEVICE_CPU),
-- 
GitLab


From cb057ea64032e551027c8f9058a9d28a258c9d6b Mon Sep 17 00:00:00 2001
From: Chris Leary <leary@google.com>
Date: Mon, 8 Oct 2018 15:42:17 -0700
Subject: [PATCH 555/570] [XLA] Make overly-specific ShapeUtil predicate a
 little more general.

PiperOrigin-RevId: 216263039
---
 tensorflow/compiler/xla/service/hlo_instruction_test.cc | 3 ++-
 tensorflow/compiler/xla/service/hlo_query.cc            | 2 +-
 tensorflow/compiler/xla/shape_util.cc                   | 5 +++--
 tensorflow/compiler/xla/shape_util.h                    | 5 ++++-
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index c1b7c3832b..d93351fe04 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -135,7 +135,8 @@ TEST_F(HloInstructionTest, BasicProperties) {
   auto parameter = HloInstruction::CreateParameter(1, r0f32_, "foo");
 
   EXPECT_EQ(HloOpcode::kParameter, parameter->opcode());
-  EXPECT_TRUE(ShapeUtil::IsScalarF32(parameter->shape()));
+  EXPECT_TRUE(ShapeUtil::IsScalarWithElementType(parameter->shape(), F32));
+  EXPECT_FALSE(ShapeUtil::IsScalarWithElementType(parameter->shape(), S32));
   EXPECT_EQ(0, parameter->operand_count());
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_query.cc b/tensorflow/compiler/xla/service/hlo_query.cc
index 2a07b6fcbc..2d5197be9e 100644
--- a/tensorflow/compiler/xla/service/hlo_query.cc
+++ b/tensorflow/compiler/xla/service/hlo_query.cc
@@ -24,7 +24,7 @@ namespace hlo_query {
 
 bool IsConstantR0F32(HloInstruction* instruction, float* out) {
   if (instruction->opcode() == HloOpcode::kConstant &&
-      ShapeUtil::IsScalarF32(instruction->shape())) {
+      ShapeUtil::IsScalarWithElementType(instruction->shape(), F32)) {
     *out = instruction->literal().Get<float>({});
     return true;
   }
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 7f0201942b..9267de3cfc 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -461,8 +461,9 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return ShapeUtil::IsArray(shape) && ElementsIn(shape) == 0;
 }
 
-/* static */ bool ShapeUtil::IsScalarF32(const Shape& shape) {
-  return shape.element_type() == F32 && Rank(shape) == 0;
+/* static */ bool ShapeUtil::IsScalarWithElementType(
+    const Shape& shape, PrimitiveType element_type) {
+  return IsScalar(shape) && shape.element_type() == element_type;
 }
 
 namespace {
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index d8bb27beae..73f541d505 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -312,7 +312,10 @@ class ShapeUtil {
   static bool IsEffectiveScalar(const Shape& shape) {
     return IsArray(shape) && TrueRank(shape) == 0;
   }
-  static bool IsScalarF32(const Shape& shape);
+
+  // Returns whether "shape" is a scalar (array) with the given element_type.
+  static bool IsScalarWithElementType(const Shape& shape,
+                                      PrimitiveType element_type);
 
   // Extracts the size of the shape's dimension at dimension number
   // GetDimensionNumber(dimension_number).
-- 
GitLab


From 783627bf63cdfa467e7811f2bf8330555d66f313 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 15:55:18 -0700
Subject: [PATCH 556/570] Convert TensorFlow's aws dependency to new third
 party import method.

PiperOrigin-RevId: 216265275
---
 tensorflow/workspace.bzl                   | 14 +++-----------
 third_party/aws/BUILD                      |  1 +
 third_party/{aws.BUILD => aws/BUILD.bazel} |  0
 third_party/aws/workspace.bzl              | 15 +++++++++++++++
 4 files changed, 19 insertions(+), 11 deletions(-)
 create mode 100644 third_party/aws/BUILD
 rename third_party/{aws.BUILD => aws/BUILD.bazel} (100%)
 create mode 100644 third_party/aws/workspace.bzl

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index adeac62e43..40c226a861 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -20,12 +20,15 @@ load(
     "//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl",
     "def_file_filter_configure",
 )
+load("//third_party/aws:workspace.bzl", aws = "repo")
 load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
 load("//third_party/icu:workspace.bzl", icu = "repo")
 load("//third_party/jpeg:workspace.bzl", jpeg = "repo")
 load("//third_party/nasm:workspace.bzl", nasm = "repo")
 
 def initialize_third_party():
+    """ Load third party repositories.  See above load() statements. """
+    aws()
     flatbuffers()
     icu()
     jpeg()
@@ -585,17 +588,6 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         ],
     )
 
-    tf_http_archive(
-        name = "aws",
-        build_file = clean_dep("//third_party:aws.BUILD"),
-        sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
-        strip_prefix = "aws-sdk-cpp-1.3.15",
-        urls = [
-            "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
-            "https://github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
-        ],
-    )
-
     java_import_external(
         name = "junit",
         jar_sha256 = "59721f0805e223d84b90677887d9ff567dc534d7c502ca903c0c2b17f05c116a",
diff --git a/third_party/aws/BUILD b/third_party/aws/BUILD
new file mode 100644
index 0000000000..2f5d02becb
--- /dev/null
+++ b/third_party/aws/BUILD
@@ -0,0 +1 @@
+# Dummy BUILD file to make this directory a package.
diff --git a/third_party/aws.BUILD b/third_party/aws/BUILD.bazel
similarity index 100%
rename from third_party/aws.BUILD
rename to third_party/aws/BUILD.bazel
diff --git a/third_party/aws/workspace.bzl b/third_party/aws/workspace.bzl
new file mode 100644
index 0000000000..c216638154
--- /dev/null
+++ b/third_party/aws/workspace.bzl
@@ -0,0 +1,15 @@
+"""loads the aws library, used by TF."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "aws",
+        urls = [
+            "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
+            "https://github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
+        ],
+        sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
+        strip_prefix = "aws-sdk-cpp-1.3.15",
+        build_file = "//third_party/aws:BUILD.bazel",
+    )
-- 
GitLab


From 46d296b2d03ddbb6f0723d213fdfa9c5226e1e2a Mon Sep 17 00:00:00 2001
From: Jared Duke <jdduke@google.com>
Date: Mon, 8 Oct 2018 16:24:49 -0700
Subject: [PATCH 557/570] Internal change

PiperOrigin-RevId: 216270385
---
 tensorflow/contrib/lite/build_def.bzl | 40 +++++++++++++++++++++++----
 tensorflow/contrib/lite/testing/BUILD |  4 +--
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 7ef26de69f..b9e933a8b6 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -212,7 +212,8 @@ def json_to_tflite(name, src, out):
 
 # This is the master list of generated examples that will be made into tests. A
 # function called make_XXX_tests() must also appear in generate_examples.py.
-# Disable a test by commenting it out. If you do, add a link to a bug or issue.
+# Disable a test by adding it to the blacklists specified in
+# generated_test_models_failing().
 def generated_test_models():
     return [
         "add",
@@ -291,12 +292,38 @@ def generated_test_models():
         "tile",
         "topk",
         "transpose",
-        #"transpose_conv",   # disabled due to b/111213074
+        "transpose_conv",
         "unpack",
         "where",
         "zeros_like",
     ]
 
+# List of models that fail generated tests for the conversion mode.
+# If you have to disable a test, please add here with a link to the appropriate
+# bug or issue.
+def generated_test_models_failing(conversion_mode):
+    if not conversion_mode:
+        return [
+            "transpose_conv",  # disabled due to b/111213074
+        ]
+
+    if conversion_mode == "toco-flex":
+        # TODO(b/117328698): Fix and enable the known flex failures.
+        return [
+            "arg_min_max",
+            "div",
+            "floor_div",
+            "gather ",
+            "lstm ",
+            "resize_bilinear",
+            "space_to_batch_nd",
+            "split",
+            "transpose",
+            "unpack",
+        ]
+
+    return []
+
 def generated_test_conversion_modes():
     """Returns a list of conversion modes."""
 
@@ -313,10 +340,14 @@ def generated_test_models_all():
     tests = generated_test_models()
     options = []
     for conversion_mode in conversion_modes:
+        failing_tests = generated_test_models_failing(conversion_mode)
         for test in tests:
+            tags = []
+            if test in failing_tests:
+                tags.append("notap")
             if conversion_mode:
                 test += "_%s" % conversion_mode
-            options.append((conversion_mode, test))
+            options.append((conversion_mode, test, tags))
     return options
 
 def gen_zip_test(name, test_name, conversion_mode, **kwargs):
@@ -336,9 +367,6 @@ def gen_zip_test(name, test_name, conversion_mode, **kwargs):
         # if conversion_mode == "pb2lite":
         #     toco = "//tensorflow/contrib/lite/experimental/pb2lite:pb2lite"
         flags = "--ignore_toco_errors --run_with_flex"
-        kwargs["tags"].append("skip_already_failing")
-        kwargs["tags"].append("no_oss")
-        kwargs["tags"].append("notap")
 
     gen_zipped_test_file(
         name = "zip_%s" % test_name,
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index f0bfec2338..45baad782a 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -35,7 +35,7 @@ load(
         ":zip_%s" % test_name,
     ],
     shard_count = 20,
-    tags = [
+    tags = tags + [
         "gen_zip_test",
         "no_oss",
         "tflite_not_portable_intentional",
@@ -61,7 +61,7 @@ load(
             "//tensorflow/core:android_tensorflow_test_lib",
         ],
     }),
-) for conversion_mode, test_name in generated_test_models_all()]
+) for conversion_mode, test_name, tags in generated_test_models_all()]
 
 test_suite(
     name = "generated_zip_tests",
-- 
GitLab


From 8815f34385eb28f1cfcb53bebd526c11573f3027 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Mon, 8 Oct 2018 16:25:40 -0700
Subject: [PATCH 558/570] Avoid calling get_default_graph() during
 tf.enable_eager_execution()

PiperOrigin-RevId: 216270497
---
 tensorflow/python/framework/ops.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 77c2bc930e..140bd098a6 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -5457,8 +5457,7 @@ def enable_eager_execution_internal(config=None,
         "tf.contrib.eager.ASYNC")
   if context.default_execution_mode == context.GRAPH_MODE:
     graph_mode_has_been_used = (
-        _default_session_stack.stack
-        or len(get_default_graph().get_operations()) > 0)  # pylint: disable=g-explicit-length-test
+        _default_graph_stack._global_default_graph is not None) # pylint: disable=protected-access
     if graph_mode_has_been_used:
       raise ValueError(
           "tf.enable_eager_execution must be called at program startup.")
-- 
GitLab


From 49643265c3f1f279a93bd8bc3a126e11e979bc44 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 8 Oct 2018 17:14:47 -0700
Subject: [PATCH 559/570] Remove deprecations for some of the endpoints in
 ApiDef files. These changes are made according to
 https://github.com/tensorflow/community/pull/16.

I am keeping a few symbols deprecated not mentioned in the doc:
tf.diag - it seems best to keep it next to tf.linalg.diag, so that the two are easy to compare and decide which one to use. The plan is to rename tf.diag to tf.tensor_diag.
tf.is_nan - similar to tf.is_inf, tf.is_finite, tf.is_numeric_tensor which are all getting deprecated and replaced by symbols in tf.debugging.
tf.string_to_number - other string endpoints in root namespace are getting deprecated: for e.g. tf.substr, tf.string_join.
tf.dequantize - all quantization ops should be under tf.quantize. I probably missed this one.
tf.check_numerics - similar to other debugging ops that are getting moved to tf.debugging.
tf.squared_difference - moved to tf.math namespace and not as popular as some other math ops such as tf.add to justify keeping endpoint in root.
tf.decode_raw - similar to other ops such as tf.decode_csv that are getting moved to tf.io.decode_csv.

PiperOrigin-RevId: 216278010
---
 tensorflow/core/api_def/python_api/api_def_Acos.pbtxt         | 1 -
 tensorflow/core/api_def/python_api/api_def_Acosh.pbtxt        | 1 -
 tensorflow/core/api_def/python_api/api_def_Add.pbtxt          | 1 -
 tensorflow/core/api_def/python_api/api_def_AsString.pbtxt     | 1 -
 tensorflow/core/api_def/python_api/api_def_Asin.pbtxt         | 1 -
 tensorflow/core/api_def/python_api/api_def_Asinh.pbtxt        | 1 -
 tensorflow/core/api_def/python_api/api_def_Atan.pbtxt         | 1 -
 tensorflow/core/api_def/python_api/api_def_Atan2.pbtxt        | 1 -
 tensorflow/core/api_def/python_api/api_def_Atanh.pbtxt        | 1 -
 tensorflow/core/api_def/python_api/api_def_Cos.pbtxt          | 1 -
 tensorflow/core/api_def/python_api/api_def_Cosh.pbtxt         | 1 -
 tensorflow/core/api_def/python_api/api_def_Equal.pbtxt        | 1 -
 tensorflow/core/api_def/python_api/api_def_Exp.pbtxt          | 1 -
 tensorflow/core/api_def/python_api/api_def_Floor.pbtxt        | 1 -
 tensorflow/core/api_def/python_api/api_def_Greater.pbtxt      | 1 -
 tensorflow/core/api_def/python_api/api_def_GreaterEqual.pbtxt | 1 -
 tensorflow/core/api_def/python_api/api_def_Less.pbtxt         | 1 -
 tensorflow/core/api_def/python_api/api_def_LessEqual.pbtxt    | 1 -
 tensorflow/core/api_def/python_api/api_def_Log.pbtxt          | 1 -
 tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt        | 1 -
 tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt   | 1 -
 tensorflow/core/api_def/python_api/api_def_LogicalNot.pbtxt   | 1 -
 tensorflow/core/api_def/python_api/api_def_LogicalOr.pbtxt    | 1 -
 tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt      | 1 -
 tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt      | 1 -
 tensorflow/core/api_def/python_api/api_def_NotEqual.pbtxt     | 1 -
 tensorflow/core/api_def/python_api/api_def_Sin.pbtxt          | 1 -
 tensorflow/core/api_def/python_api/api_def_Sinh.pbtxt         | 1 -
 tensorflow/core/api_def/python_api/api_def_Tan.pbtxt          | 1 -
 29 files changed, 29 deletions(-)

diff --git a/tensorflow/core/api_def/python_api/api_def_Acos.pbtxt b/tensorflow/core/api_def/python_api/api_def_Acos.pbtxt
index 1fd8baf05f..f4d7f498b2 100644
--- a/tensorflow/core/api_def/python_api/api_def_Acos.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Acos.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "acos"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Acosh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Acosh.pbtxt
index f7946652ef..e921f26d1e 100644
--- a/tensorflow/core/api_def/python_api/api_def_Acosh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Acosh.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "acosh"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Add.pbtxt b/tensorflow/core/api_def/python_api/api_def_Add.pbtxt
index fb505a91ac..4c6f387ebd 100644
--- a/tensorflow/core/api_def/python_api/api_def_Add.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Add.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "add"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt b/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
index ea65543a76..d51defc376 100644
--- a/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "as_string"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Asin.pbtxt b/tensorflow/core/api_def/python_api/api_def_Asin.pbtxt
index eedf4553c6..b13f5c398f 100644
--- a/tensorflow/core/api_def/python_api/api_def_Asin.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Asin.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "asin"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Asinh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Asinh.pbtxt
index 10c2fb356e..89a3f9da44 100644
--- a/tensorflow/core/api_def/python_api/api_def_Asinh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Asinh.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "asinh"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Atan.pbtxt b/tensorflow/core/api_def/python_api/api_def_Atan.pbtxt
index 03dd5dc848..4403a2379c 100644
--- a/tensorflow/core/api_def/python_api/api_def_Atan.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Atan.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "atan"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Atan2.pbtxt b/tensorflow/core/api_def/python_api/api_def_Atan2.pbtxt
index 85b27bd881..56eed0f0fb 100644
--- a/tensorflow/core/api_def/python_api/api_def_Atan2.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Atan2.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "atan2"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Atanh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Atanh.pbtxt
index ee7c0600d6..a8f5e792f0 100644
--- a/tensorflow/core/api_def/python_api/api_def_Atanh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Atanh.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "atanh"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Cos.pbtxt b/tensorflow/core/api_def/python_api/api_def_Cos.pbtxt
index 1af8c0c2c9..db52d25ff2 100644
--- a/tensorflow/core/api_def/python_api/api_def_Cos.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Cos.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "cos"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Cosh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Cosh.pbtxt
index 2de87df40d..74bf573565 100644
--- a/tensorflow/core/api_def/python_api/api_def_Cosh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Cosh.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "cosh"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Equal.pbtxt b/tensorflow/core/api_def/python_api/api_def_Equal.pbtxt
index 78aa1b3bc5..34717e74bc 100644
--- a/tensorflow/core/api_def/python_api/api_def_Equal.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Equal.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "equal"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt b/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt
index 70323fe5b4..38a9078d9f 100644
--- a/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "exp"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Floor.pbtxt b/tensorflow/core/api_def/python_api/api_def_Floor.pbtxt
index 9b93caa0b1..14accd2b20 100644
--- a/tensorflow/core/api_def/python_api/api_def_Floor.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Floor.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "floor"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Greater.pbtxt b/tensorflow/core/api_def/python_api/api_def_Greater.pbtxt
index 7de60d44c4..7926deaa3b 100644
--- a/tensorflow/core/api_def/python_api/api_def_Greater.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Greater.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "greater"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_GreaterEqual.pbtxt b/tensorflow/core/api_def/python_api/api_def_GreaterEqual.pbtxt
index 9c8975c2a9..21bbb1b094 100644
--- a/tensorflow/core/api_def/python_api/api_def_GreaterEqual.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_GreaterEqual.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "greater_equal"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Less.pbtxt b/tensorflow/core/api_def/python_api/api_def_Less.pbtxt
index 055df2922a..0b5f06e99f 100644
--- a/tensorflow/core/api_def/python_api/api_def_Less.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Less.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "less"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_LessEqual.pbtxt b/tensorflow/core/api_def/python_api/api_def_LessEqual.pbtxt
index d2803ddb69..afc4f2a8c9 100644
--- a/tensorflow/core/api_def/python_api/api_def_LessEqual.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_LessEqual.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "less_equal"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Log.pbtxt b/tensorflow/core/api_def/python_api/api_def_Log.pbtxt
index 26d2473b9c..ac4a4454c7 100644
--- a/tensorflow/core/api_def/python_api/api_def_Log.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Log.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "log"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt b/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt
index d85b6dccec..5a2d77a417 100644
--- a/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "log1p"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt b/tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt
index 80bd98b740..d4e6a7a380 100644
--- a/tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "logical_and"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_LogicalNot.pbtxt b/tensorflow/core/api_def/python_api/api_def_LogicalNot.pbtxt
index b2244c44b1..49068738a4 100644
--- a/tensorflow/core/api_def/python_api/api_def_LogicalNot.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_LogicalNot.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "logical_not"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_LogicalOr.pbtxt b/tensorflow/core/api_def/python_api/api_def_LogicalOr.pbtxt
index cf78b52e07..a5133962dc 100644
--- a/tensorflow/core/api_def/python_api/api_def_LogicalOr.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_LogicalOr.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "logical_or"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt b/tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt
index bcff379b71..130729ece1 100644
--- a/tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "maximum"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt b/tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt
index 9aae74226a..8aded1f154 100644
--- a/tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "minimum"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_NotEqual.pbtxt b/tensorflow/core/api_def/python_api/api_def_NotEqual.pbtxt
index f37317854f..07fe3b6af1 100644
--- a/tensorflow/core/api_def/python_api/api_def_NotEqual.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_NotEqual.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "not_equal"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Sin.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sin.pbtxt
index 9c19a1a177..a2b776ee0c 100644
--- a/tensorflow/core/api_def/python_api/api_def_Sin.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Sin.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "sin"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Sinh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sinh.pbtxt
index 155e58e6d5..38c7c729bf 100644
--- a/tensorflow/core/api_def/python_api/api_def_Sinh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Sinh.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "sinh"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Tan.pbtxt b/tensorflow/core/api_def/python_api/api_def_Tan.pbtxt
index ffa92f5580..20cfac05fd 100644
--- a/tensorflow/core/api_def/python_api/api_def_Tan.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Tan.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "tan"
-    deprecated: true
   }
 }
-- 
GitLab


From 03d097bc96080981098ffdbaf1b3465e6e153a6a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 17:33:22 -0700
Subject: [PATCH 560/570] Consolidate device parameter arguments into a shared
 DeviceInfo struct

PiperOrigin-RevId: 216280197
---
 tensorflow/core/grappler/costs/cost_estimator.h           | 5 +++++
 tensorflow/core/grappler/costs/op_level_cost_estimator.cc | 2 +-
 tensorflow/core/grappler/costs/op_level_cost_estimator.h  | 6 ------
 tensorflow/python/grappler/cluster.i                      | 2 +-
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index e91f0cc9da..569d9da683 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -30,6 +30,11 @@ struct GrapplerItem;
 constexpr int64 kMemoryUnknown = -1ll;
 constexpr int64 kZeroMemory = 0ll;
 
+struct DeviceInfo {
+  double gigaops;     // Billions of operations executed per second.
+  double gb_per_sec;  // Bandwidth to main memory in GB per second.
+};
+
 // Holds the set of things we might want to estimate or measure in Grappler.
 // Always produce execution time. Other fields are optional depending on the
 // estimator being used.
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 71f4d9fd05..f363f2915f 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -372,7 +372,7 @@ Costs OpLevelCostEstimator::PredictCosts(const OpContext& op_context) const {
   return costs;
 }
 
-OpLevelCostEstimator::DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
+DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
     const DeviceProperties& device) const {
   double gflops = -1;
   double gb_per_sec = -1;
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index a277dfdf65..dd1ee39cb2 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -40,12 +40,6 @@ class OpLevelCostEstimator {
 
   virtual Costs PredictCosts(const OpContext& op_context) const;
 
-  // Basic device performance info, sufficient for roofline estimate.
-  struct DeviceInfo {
-    double gigaops;     // Billions of operations executed per second.
-    double gb_per_sec;  // Bandwidth to main memory in GB per second.
-  };
-
   // Returns basic device performance info.
   virtual DeviceInfo GetDeviceInfo(const DeviceProperties& device) const;
 
diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 6816e20407..87795ffcfb 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -308,7 +308,7 @@ static PyObject* TF_GetSupportedDevices(GCluster cluster, GItem item) {
 
 static double TF_EstimatePerformance(const tensorflow::NamedDevice& device) {
   tensorflow::grappler::OpLevelCostEstimator estimator;
-  tensorflow::grappler::OpLevelCostEstimator::DeviceInfo info =
+  tensorflow::grappler::DeviceInfo info =
       estimator.GetDeviceInfo(device.properties());
   return info.gigaops;
 }
-- 
GitLab


From 4ff7b81514ea1b86295bc74b620e3c1d3e127e6f Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 8 Oct 2018 17:37:44 -0700
Subject: [PATCH 561/570] Fix the seeding for `Dataset.shuffle(...,
 reshuffle_each_iteration=False)`.

Previously, we were passing the first (graph-level) seed for both the
graph-level and op-level seeds when creating a C++ dataset. This
change passes the op-level seed to the appropriate point, and adds a test
for the behavior with graph-but-not-op-level seeds.

PiperOrigin-RevId: 216280641
---
 .../core/kernels/data/shuffle_dataset_op.cc   |  2 +-
 tensorflow/python/data/kernel_tests/BUILD     |  3 ++
 .../kernel_tests/shuffle_dataset_op_test.py   | 35 ++++++++++++++++++-
 3 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index 66466d6a36..9f54c381a9 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -485,7 +485,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
                      int64 buffer_size, int64 seed, int64 seed2, int64 count)
         : ShuffleDatasetBase(ctx, input, buffer_size, count),
           seed_(seed),
-          seed2_(seed) {}
+          seed2_(seed2) {}
 
     string DebugString() const override {
       return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index c7295d6e69..671b7ca1bb 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -443,12 +443,15 @@ tf_py_test(
     srcs = ["shuffle_dataset_op_test.py"],
     additional_deps = [
         ":test_base",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
     ],
diff --git a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
index 347af18576..8694f58a24 100644
--- a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import collections
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.data.kernel_tests import test_base
@@ -27,11 +28,13 @@ from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class ShuffleDatasetTest(test_base.DatasetTestBase):
+class ShuffleDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def testShuffleDataset(self):
     components = (
@@ -209,5 +212,35 @@ class ShuffleDatasetTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  @parameterized.named_parameters(
+      ("ReshuffleGraphLevelSeed", True, 38, None),
+      ("ReshuffleOpLevelSeed", True, None, 42),
+      ("ReshuffleGraphAndOpLevelSeed", True, 38, 42),
+      ("NoReshuffleGraphLevelSeed", False, 38, None),
+      ("NoReshuffleOpLevelSeed", False, None, 42),
+      ("NoReshuffleGraphAndOpLevelSeed", False, 38, 42),
+  )
+  def testShuffleSeed(self, reshuffle, graph_level_seed, op_level_seed):
+    results = []
+    for _ in range(2):
+      with ops.Graph().as_default() as g:
+        random_seed.set_random_seed(graph_level_seed)
+        dataset = dataset_ops.Dataset.range(10).shuffle(
+            10, seed=op_level_seed, reshuffle_each_iteration=reshuffle).repeat(
+                3)
+        iterator = dataset.make_one_shot_iterator()
+        next_element = iterator.get_next()
+
+        run_results = []
+        with self.session(graph=g) as sess:
+          for _ in range(30):
+            run_results.append(sess.run(next_element))
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(next_element)
+        results.append(run_results)
+
+    self.assertAllEqual(results[0], results[1])
+
+
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 934fde5b8c60987db36438ab4f70f8a91bce306b Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Mon, 8 Oct 2018 17:40:07 -0700
Subject: [PATCH 562/570] Register int64 SUM GPU kernel.

PiperOrigin-RevId: 216280913
---
 tensorflow/core/kernels/reduction_ops_sum.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc
index 5318d8c133..cf0d0f5c71 100644
--- a/tensorflow/core/kernels/reduction_ops_sum.cc
+++ b/tensorflow/core/kernels/reduction_ops_sum.cc
@@ -51,6 +51,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .HostMemory("reduction_indices"),                                    \
       ReductionOp<GPUDevice, type, int64, Eigen::internal::SumReducer<type>>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_int64(REGISTER_GPU_KERNELS);
 TF_CALL_complex64(REGISTER_GPU_KERNELS);
 TF_CALL_complex128(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
-- 
GitLab


From d58712b7fc8de0e1f87fe2ea5221bc3c85230ed3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 18:12:42 -0700
Subject: [PATCH 563/570] Add a tracing::ScopedActivity event to track the
 duration of a Session::Run() call for better xprof tracing. Also annotate
 synchronous op execution with the session-run id (or step_id) as metadata
 leveraging the support introduced in cl/215985561. This should enable
 highlighting the duration of a Session::Run and all the ops that ran in it
 for visualizing latency regressions in the case of CPU inference.

PiperOrigin-RevId: 216284682
---
 tensorflow/core/common_runtime/direct_session.cc |  4 ++++
 tensorflow/core/common_runtime/executor.cc       | 12 ++++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 458e133b68..52c1cd2691 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -64,6 +64,7 @@ limitations under the License.
 #include "tensorflow/core/platform/device_tracer.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/env_var.h"
@@ -453,6 +454,9 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
                                   CallFrameInterface* call_frame,
                                   ExecutorsAndKeys* executors_and_keys,
                                   RunMetadata* run_metadata) {
+  string session_id_meta = strings::StrCat("SessionRun #id=", step_id, "#");
+  tracing::ScopedActivity activity(session_id_meta);
+
   const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1);
 
   std::unique_ptr<DebuggerStateInterface> debugger_state;
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 40ec1502da..eb69d1991c 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1771,14 +1771,18 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
             // The OpKernel may create child activities (such as GPU kernel
             // launches), so use a `ScopedAnnotation` to relate these activities
             // in the trace.
-            tracing::ScopedAnnotation activity(op_name,
-                                               op_kernel->type_string());
+            tracing::ScopedAnnotation activity(
+                op_name, strings::StrCat(op_kernel->type_string(),
+                                         "#id=", step_id_, "#"));
             device->Compute(op_kernel, &ctx);
           } else {
             // Use the cheaper `ScopedActivity` to trace just the OpKernel
             // execution.
-            tracing::ScopedActivity activity(op_name, op_kernel->type_string(),
-                                             item.kernel_is_expensive);
+            tracing::ScopedActivity activity(
+                op_name,
+                strings::StrCat(op_kernel->type_string(), "#id=", step_id_,
+                                "#"),
+                item.kernel_is_expensive);
             device->Compute(op_kernel, &ctx);
           }
         } else {
-- 
GitLab


From 375c109659d2d0e6265447dffdeb460693b3cccf Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Mon, 8 Oct 2018 21:18:36 -0700
Subject: [PATCH 564/570] [XLA] Introduce input/output alias config.

- This CL intruduces input/output alias config in HLO module that allows any HLO pass to configure it. Once the alias_config is set, each backend needs to follow the contract during execution time to make sure the input and output are indeed aliased.

- Copy insertion / buffer assignment and alias analysis has been updated to correctly honor the config and avoid any possible liveness interference.

PiperOrigin-RevId: 216299501
---
 tensorflow/compiler/xla/service/BUILD         |  21 ++
 .../compiler/xla/service/buffer_assignment.cc |  34 ++--
 .../compiler/xla/service/buffer_value.h       |   3 +
 .../compiler/xla/service/copy_insertion.cc    |  85 +++++++-
 .../xla/service/copy_insertion_test.cc        | 183 +++++++++++++++++
 tensorflow/compiler/xla/service/hlo.proto     |  29 +++
 .../xla/service/hlo_alias_analysis.cc         |  46 ++++-
 .../xla/service/hlo_alias_analysis_test.cc    | 175 +++++++++++++++++
 .../xla/service/hlo_dataflow_analysis.cc      |   2 +-
 .../service/hlo_input_output_alias_config.cc  | 172 ++++++++++++++++
 .../service/hlo_input_output_alias_config.h   | 101 ++++++++++
 .../hlo_input_output_alias_config_test.cc     | 184 ++++++++++++++++++
 tensorflow/compiler/xla/service/hlo_module.cc |   9 +
 tensorflow/compiler/xla/service/hlo_module.h  |  14 ++
 .../compiler/xla/service/hlo_verifier.cc      |   2 +
 tensorflow/compiler/xla/shape_util.h          |   2 +-
 16 files changed, 1037 insertions(+), 25 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
 create mode 100644 tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
 create mode 100644 tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 2b292ed053..26ebb88e96 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -294,6 +294,7 @@ cc_library(
     srcs = [
         "dfs_hlo_visitor.cc",
         "hlo_computation.cc",
+        "hlo_input_output_alias_config.cc",
         "hlo_instruction.cc",
         "hlo_instructions.cc",
         "hlo_module.cc",
@@ -308,6 +309,7 @@ cc_library(
         "hlo_clone_context.h",
         "hlo_computation.h",
         "hlo_domain_metadata.h",
+        "hlo_input_output_alias_config.h",
         "hlo_instruction.h",
         "hlo_instructions.h",
         "hlo_module.h",
@@ -1268,6 +1270,25 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "hlo_input_output_alias_config_test",
+    srcs = ["hlo_input_output_alias_config_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_dce",
+        ":hlo_memory_scheduler",
+        ":hlo_ordering",
+        ":hlo_parser",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
 cc_library(
     name = "hlo_memory_scheduler",
     srcs = ["hlo_memory_scheduler.cc"],
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 2c2d1626c2..d5d6a044a8 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -239,7 +239,7 @@ BufferAllocation::Slice BufferAllocation::GetSlice(
 
 void BufferAllocation::AddAssignment(const LogicalBuffer& buffer, int64 offset,
                                      int64 size) {
-  VLOG(4) << "Trying to add " << buffer << " to " << this;
+  VLOG(4) << "Trying to add " << buffer << " to allocation #" << index();
   CHECK(assigned_buffers_.count(&buffer) == 0)
       << "LogicalBuffer " << buffer << " already assigned to allocation "
       << index_;
@@ -784,21 +784,6 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
     }
   }
 
-  if (allow_input_output_aliasing_ && allocation->maybe_live_out()) {
-    const HloComputation* entry_computation =
-        assignment->module_->entry_computation();
-    for (auto param : entry_computation->parameter_instructions()) {
-      for (auto& param_buffer :
-           assignment->points_to_analysis().GetBuffersDefinedByInstruction(
-               param)) {
-        if (assignment->liveness().MayInterfere(*param_buffer, buffer)) {
-          VLOG(4) << "Can't assign: Parameter interference with result";
-          return false;
-        }
-      }
-    }
-  }
-
   // If the buffer is live out of the computation then it should only be
   // assigned a buffer which exactly fits the result to avoid wasting memory
   // (result buffers can have arbitrary lifetimes).
@@ -1434,13 +1419,28 @@ BufferAssigner::MergeColocatedBufferSets(
 
 // Builds sets of buffers in 'colocated_buffer_sets' which should be colocated
 // in the same allocation (currently just supports kWhile, kCall, and
-// kConditional).
+// kConditional and input output aliasing).
 void BufferAssigner::BuildColocatedBufferSets(
     const HloModule* module, const BufferLiveness& buffer_liveness,
     const LogicalBuffer::SizeFunction& buffer_size,
     std::vector<ColocatedBufferSet>* colocated_buffer_sets) {
   const TuplePointsToAnalysis& points_to_analysis =
       buffer_liveness.points_to_analysis();
+
+  // Set up colocated buffer set for input and output.
+  module->input_output_alias_config().ForEachAlias(
+      [&](const ShapeIndex& output_index, int64 param_number,
+          const ShapeIndex& param_index) {
+        std::vector<const LogicalBuffer*> colocated_set;
+        AddBufferToColocatedSet(module->entry_computation()->root_instruction(),
+                                output_index, points_to_analysis,
+                                &colocated_set);
+        AddBufferToColocatedSet(
+            module->entry_computation()->parameter_instruction(param_number),
+            param_index, points_to_analysis, &colocated_set);
+        AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
+      });
+
   for (const HloComputation* computation : module->MakeComputationPostOrder()) {
     if (computation->IsFusionComputation()) {
       continue;
diff --git a/tensorflow/compiler/xla/service/buffer_value.h b/tensorflow/compiler/xla/service/buffer_value.h
index 69b3646356..11d8abc5ba 100644
--- a/tensorflow/compiler/xla/service/buffer_value.h
+++ b/tensorflow/compiler/xla/service/buffer_value.h
@@ -141,6 +141,9 @@ class BufferValue {
   // operator< is required for std::set.
   bool operator<(const BufferValue& other) const { return id_ < other.id_; }
 
+  bool operator==(const BufferValue& other) const { return id_ == other.id_; }
+  bool operator!=(const BufferValue& other) const { return id_ != other.id_; }
+
   virtual string ToString() const = 0;
 
   // TODO(lauj) rename LogicalBufferProto to BufferValueProto.
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index f35324aa35..cfe025fdd1 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -40,10 +40,12 @@ namespace {
 
 using absl::StrAppend;
 
-bool IsEntryParameterValue(const HloValue& value) {
+bool IsReadonlyEntryParameterValue(const HloValue& value) {
   const HloComputation* computation = value.defining_instruction()->parent();
   return value.defining_instruction()->opcode() == HloOpcode::kParameter &&
-         computation == computation->parent()->entry_computation();
+         computation == computation->parent()->entry_computation() &&
+         !computation->parent()->input_output_alias_config().ParameterHasAlias(
+             value.defining_instruction()->parameter_number());
 }
 
 bool IsConstantValue(const HloValue& value) {
@@ -51,7 +53,7 @@ bool IsConstantValue(const HloValue& value) {
 }
 
 bool ValueIsReadOnly(const HloValue& value) {
-  return IsConstantValue(value) || IsEntryParameterValue(value);
+  return IsConstantValue(value) || IsReadonlyEntryParameterValue(value);
 }
 
 // Data structure describing the action which should be taken on parts of a
@@ -332,6 +334,81 @@ Status AddCopiesForConditional(const HloAliasAnalysis& alias_analysis,
   return Status::OK();
 }
 
+// Conservatively adds copies before root instruction of entry computation and
+// each aliased parameter to resolve interference of aliased input and output
+// buffer. We later rely on the CopyRemover to drop the unnecessary ones.
+Status AddCopiesForAliasedInputOutputs(HloModule* module) {
+  HloComputation* entry = module->entry_computation();
+  HloInstruction* root = entry->root_instruction();
+
+  ShapeTree<bool> output_indices_to_copy(root->shape());
+  std::vector<ShapeTree<HloInstruction*>> copied_parameters;
+  bool has_alias = false;
+  for (auto* param : entry->parameter_instructions()) {
+    bool param_has_alias = false;
+    ShapeTree<bool> param_indices_to_copy(param->shape());
+
+    module->input_output_alias_config().ForEachAlias(
+        [&](const ShapeIndex& output_index, int64 param_number,
+            const ShapeIndex& param_index) {
+          if (param_number == param->parameter_number()) {
+            param_has_alias = true;
+            *(param_indices_to_copy.mutable_element(param_index)) = true;
+            *(output_indices_to_copy.mutable_element(output_index)) = true;
+          }
+        });
+
+    if (!param_has_alias) {
+      continue;
+    }
+
+    has_alias = true;
+    // Store a snapshot of users before DeepCopyInstruction, as
+    // DeepCopyInstruction introduces new users of the instruction.
+    std::vector<HloInstruction*> users = param->users();
+    ShapeTree<HloInstruction*> param_copy_tree(param->shape(),
+                                               /*init_value=*/nullptr);
+    TF_ASSIGN_OR_RETURN(HloInstruction * copied,
+                        entry->DeepCopyInstruction(
+                            param, &param_indices_to_copy, &param_copy_tree));
+    for (HloInstruction* user : users) {
+      TF_RETURN_IF_ERROR(param->ReplaceUseWith(user, copied));
+    }
+
+    copied_parameters.push_back(param_copy_tree);
+  }
+
+  if (!has_alias) {
+    return Status::OK();
+  }
+
+  // Add copies before root instruction.
+  ShapeTree<HloInstruction*> output_copy_tree(root->shape(),
+                                              /*init_value=*/nullptr);
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * root_copied,
+                      root->parent()->DeepCopyInstruction(
+                          root, &output_indices_to_copy, &output_copy_tree));
+
+  // Add control dependencies between the input/output copies.
+  TF_RETURN_IF_ERROR(module->input_output_alias_config().ForEachAliasWithStatus(
+      [&](const ShapeIndex& output_index, int64 param_number,
+          const ShapeIndex& input_index) -> Status {
+        HloInstruction* from =
+            copied_parameters[param_number].element(input_index);
+        HloInstruction* to = output_copy_tree.element(output_index);
+
+        TF_RET_CHECK(from != nullptr);
+        TF_RET_CHECK(to != nullptr);
+        TF_RETURN_IF_ERROR(from->AddControlDependencyTo(to));
+        return Status::OK();
+      }));
+
+  entry->set_root_instruction(root_copied);
+
+  return Status::OK();
+}
+
 // Removes any control dependencies to or from the given instruction.
 Status StripControlDependenciesFrom(HloInstruction* instruction) {
   while (!instruction->control_successors().empty()) {
@@ -953,6 +1030,8 @@ Status CopyInsertion::AddCopiesToResolveInterference(HloModule* module) {
       }
     }
   }
+
+  TF_RETURN_IF_ERROR(AddCopiesForAliasedInputOutputs(module));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index 892d0d7b54..3096206c34 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -1351,6 +1351,189 @@ TEST_F(CopyInsertionTest, SwizzlingWhile) {
   EXPECT_THAT(xla_while->operand(0), op::Tuple(op::Copy(), op::Copy()));
 }
 
+TEST_F(CopyInsertionTest, CrossingParameters) {
+  // Test a case where two parameters' dataflow cross with each other while
+  // input and output are aliased with same index:
+  //
+  //  (p0 ,  p1)
+  //   | \   /|
+  //   |  \ / |
+  // alias X  alias
+  //   |  / \ |
+  //   | /   \|
+  //  (p1  ,  p0)
+  auto module = CreateNewModule();
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+  builder.AddInstruction(HloInstruction::CreateTuple({gte1, gte0}));
+  module->AddEntryComputation(builder.Build());
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 4);
+}
+
+TEST_F(CopyInsertionTest, ParametersAliasing) {
+  // Test a case where two parameters' dataflow don't interfere with each other
+  // while aliased.
+  //
+  //  (p0 ,  p1)
+  //   |      |
+  //   |      |
+  // alias   alias
+  //   |      |
+  //   |      |
+  //  (p0 ,  p1)
+  auto module = CreateNewModule();
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+  builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
+  module->AddEntryComputation(builder.Build());
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+  InsertCopies(module.get());
+
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(op::GetTupleElement(param, 0)),
+                        op::Copy(op::GetTupleElement(param, 1))));
+
+  EXPECT_EQ(CountCopies(*module), 2);
+}
+
+TEST_F(CopyInsertionTest, ParameterWithPartialAliasing) {
+  // Test a case where one parameter is aliased with result while another one
+  // isn't.
+  //
+  //  (p0 ,  p1)
+  //   |      |
+  //   |      |
+  // alias    |
+  //   |      |
+  //   |      |
+  //  (p0 ,  p1)
+  auto module = CreateNewModule();
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+  builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
+  module->AddEntryComputation(builder.Build());
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  InsertCopies(module.get());
+
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(op::GetTupleElement(param, 0)),
+                        op::Copy(op::GetTupleElement(param, 1))));
+
+  EXPECT_EQ(CountCopies(*module), 2);
+}
+
+TEST_F(CopyInsertionTest, ParameterAndParallelOpsWithPartialAliasing) {
+  // Test a case where one parameter is aliased with result while another one
+  // isn't.
+  //
+  //   +-- (p0 ,  p1)
+  //   |    |      |
+  //   |    |      |
+  // alias Negate  Negate
+  //   |    |      |
+  //   |    |      |
+  //   +-- (p0 ,  p1)
+  auto module = CreateNewModule();
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+
+  auto negate0 = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, gte0));
+
+  auto negate1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, gte1));
+  builder.AddInstruction(HloInstruction::CreateTuple({negate0, negate1}));
+  module->AddEntryComputation(builder.Build());
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 0);
+}
+
+TEST_F(CopyInsertionTest, ParameterAndOpsWithPartialAliasing) {
+  // Test a case where one parameter is aliased with result while another one
+  // isn't.
+  //
+  //   +-- (p0 ,  p1)
+  //   |    |      |
+  //   |    |      |
+  // alias Negate  Negate
+  //   |    |      |
+  //   |    Add----+
+  //   |    |      |
+  //   +-- (p0 ,  p1)
+  auto module = CreateNewModule();
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+
+  auto negate0 = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, gte0));
+
+  auto negate1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, gte1));
+
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, negate0, negate1));
+  builder.AddInstruction(HloInstruction::CreateTuple({add, negate1}));
+  module->AddEntryComputation(builder.Build());
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 0);
+}
+
 TEST_F(CopyInsertionTest, SwizzlingWhileWithOneOp) {
   // Test a while instruction with a body which permutes its tuple parameter
   // elements and applies one operation to one of the elements. The addition of
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index a0eb9e6ddc..82c8fb1904 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -225,6 +225,32 @@ message HloScheduleProto {
   map<int64, InstructionSequence> sequences = 1;
 }
 
+message HloInputOutputAliasProto {
+  // The following proto describes a pair of aliased an input
+  // (described by parameter number and a ShapeIndex of the parameter)
+  // and an output (described by a ShapeIndex of the root
+  // instruction). For example:
+  //
+  // entry = {
+  //  output_shape_index={1},
+  //  parameter_number=0,
+  //  parameter_shape_index={1, 2},
+  // }
+  //
+  // This entry indicates that the first paremter's {1, 2} element is
+  // aliased with the {1} element of the root instruction.
+  message AliasEntryProto {
+    // ShapeIndex of the root hlo.
+    repeated int64 output_shape_index = 1;
+    // Number of the parameter in entry computation.
+    int64 parameter_number = 2;
+    // ShapeIndex of the parameter instruction.
+    repeated int64 parameter_shape_index = 3;
+  }
+
+  repeated AliasEntryProto entries = 1;
+}
+
 // Serialization of HloModule.
 message HloModuleProto {
   string name = 1;
@@ -243,6 +269,9 @@ message HloModuleProto {
 
   // The schedule for this module.
   HloScheduleProto schedule = 7;
+
+  // Describes alias information between inputs and outputs.
+  HloInputOutputAliasProto input_output_alias = 8;
 }
 
 // Serialization of LogicalBuffer.
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index c3da12e273..cf8e6594cb 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -59,8 +59,9 @@ class BufferValueMap {
   // construction process.
   using BufferNumber = int64;
 
-  explicit BufferValueMap(const HloDataflowAnalysis& dataflow)
-      : dataflow_(dataflow) {
+  explicit BufferValueMap(HloModule* module,
+                          const HloDataflowAnalysis& dataflow)
+      : module_(module), dataflow_(dataflow) {
     buffers_.reserve(dataflow_.values().size());
     value_to_buffer_number_.reserve(dataflow_.values().size());
     for (const HloValue* value : dataflow_.values()) {
@@ -171,6 +172,42 @@ class BufferValueMap {
     return value_to_buffer_number_.at(&value);
   }
 
+  void ComputeInputOutputAliasedBuffers(
+      const HloValue& value, std::vector<BufferNumber>* aliased_buffers) {
+    // Get parameter value from an aliased_input object.
+    const auto get_parameter_value =
+        [this](const std::pair<int64, ShapeIndex>& aliased_input)
+        -> const HloValue& {
+      int64 param_number = aliased_input.first;
+      const ShapeIndex& param_index = aliased_input.second;
+      return dataflow_.GetUniqueValueAt(
+          module_->entry_computation()->parameter_instruction(param_number),
+          param_index);
+    };
+
+    // If the value shows up in a root instruction, alias it with parameter
+    // intruction.
+    for (const HloPosition& pos : value.positions()) {
+      if (pos.instruction == module_->entry_computation()->root_instruction()) {
+        ShapeIndex output_index = pos.index;
+
+        auto aliased_input =
+            module_->input_output_alias_config().GetAliasedParameter(
+                output_index);
+        if (aliased_input) {
+          aliased_buffers->push_back(
+              GetBufferForValue(get_parameter_value(*aliased_input)));
+        }
+      }
+    }
+
+    // If the value is parameter instruction itself, alias it with itself.
+    if (value.instruction()->opcode() == HloOpcode::kParameter &&
+        value.instruction()->parent() == module_->entry_computation()) {
+      aliased_buffers->push_back(GetBufferForValue(value));
+    }
+  }
+
   void ComputeWhileAliasedBuffers(const HloValue& value,
                                   std::vector<BufferNumber>* aliased_buffers) {
     VLOG(3) << "Compute kWhile aliases";
@@ -278,6 +315,7 @@ class BufferValueMap {
       VLOG(2) << "Use of value " << value.ToShortString() << ": " << use;
     }
     std::vector<BufferNumber> aliased_buffers;
+    ComputeInputOutputAliasedBuffers(value, &aliased_buffers);
     ComputeWhileAliasedBuffers(value, &aliased_buffers);
     ComputeConditionalAliasedBuffers(value, &aliased_buffers);
     // Uniquify aliased buffers.
@@ -288,6 +326,8 @@ class BufferValueMap {
     return aliased_buffers;
   }
 
+  HloModule* module_;
+
   // Dataflow analysis used to construct the buffer map.
   const HloDataflowAnalysis& dataflow_;
 
@@ -461,7 +501,7 @@ StatusOr<std::unique_ptr<HloAliasAnalysis>> HloAliasAnalysis::Run(
                                                /*bitcast_defines_value=*/false,
                                                fusion_can_share_buffer));
 
-  BufferValueMap buffer_map(alias_analysis->dataflow_analysis());
+  BufferValueMap buffer_map(module, alias_analysis->dataflow_analysis());
   buffer_map.MergeAliasedBuffers();
 
   // Create a vector of HloBuffers, one for each set of values in the
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
index 0cd0ab36fc..5c8d97b2d1 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -217,6 +217,181 @@ TEST_F(HloAliasAnalysisTest, NondistinctTuple) {
   EXPECT_FALSE(AnyValuesInSameBufferInterfere());
 }
 
+TEST_F(HloAliasAnalysisTest, ParametersWithAliasing) {
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+
+  auto negate0 = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, gte0));
+  auto negate1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, gte1));
+
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({negate0, negate1}));
+  module_->AddEntryComputation(builder.Build());
+  TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+
+  // Cannot alias an output twice.
+  ASSERT_IS_NOT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0}));
+
+  const HloAliasAnalysis& analysis = RunAnalysis();
+
+  EXPECT_EQ(analysis.GetUniqueBufferAt(gte0),
+            analysis.GetUniqueBufferAt(tuple, /*index=*/{0}));
+
+  EXPECT_EQ(analysis.GetUniqueBufferAt(gte1),
+            analysis.GetUniqueBufferAt(tuple, /*index=*/{1}));
+}
+
+TEST_F(HloAliasAnalysisTest, ParametersWithCrossAliasing) {
+  // parameter 0 aliased with output 1 and parameter 1 aliased with output 0.
+  //
+  //  (p0 ,  p1)
+  //     \   /
+  //      \ /
+  // alias X
+  //      / \
+  //     /   \
+  //  (p0  ,  p1)
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
+  module_->AddEntryComputation(builder.Build());
+  TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{1}));
+  TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0}));
+
+  // Cannot alias an output twice.
+  ASSERT_IS_NOT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+
+  const HloAliasAnalysis& analysis = RunAnalysis();
+
+  // Every Ops in this graph are aliased with each other.
+  EXPECT_EQ(analysis.GetUniqueBufferAt(gte0),
+            analysis.GetUniqueBufferAt(tuple, /*index=*/{0}));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(gte0),
+            analysis.GetUniqueBufferAt(tuple, /*index=*/{1}));
+
+  EXPECT_EQ(analysis.GetUniqueBufferAt(gte1),
+            analysis.GetUniqueBufferAt(tuple, /*index=*/{0}));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(gte1),
+            analysis.GetUniqueBufferAt(tuple, /*index=*/{1}));
+}
+
+TEST_F(HloAliasAnalysisTest, InputOutputAliasingWithWhile) {
+  // Test a simple single while instruction can be aliased with input and output
+  // of the computation.
+  //
+  // body((F32[], F32[]) %tuple_param):
+  //   %add = Add(%tuple_param{0}, %tuple_param{1})
+  //   return Tuple(%tuple_param{0}, %add)
+  //
+  // condition((F32[], F32[]) %tuple_param):
+  //   return Constant(false)
+  //
+  // entry:
+  //   %param1 = param1
+  //   %while = While(%param1, body, condition)
+  //   %while_1 = GTE(%while, 0)
+  //   %while_2 = GTE(%while, 1)
+  //   %negate_1 = Negate(%while_1)
+  //   %negate_2 = Negate(%while_2)
+  //   return Tuple(negate_1, negate_2)
+  //
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  // Element 0 passes transparently through the body.
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  auto body_element_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0));
+  auto body_element_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
+  auto add = body_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1));
+  auto body_tuple = body_builder.AddInstruction(
+      HloInstruction::CreateTuple({body_element_0, add}));
+  HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
+
+  // Condition computation trivially returns a constant "false".
+  auto cond_builder = HloComputation::Builder("condition");
+  auto cond_param = cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  HloComputation* condition =
+      module_->AddEmbeddedComputation(cond_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple_shape, condition, body, param));
+  auto while_element_1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, xla_while, 0));
+  auto while_element_2 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, xla_while, 1));
+  auto negate_1 = builder.AddInstruction(HloInstruction::CreateUnary(
+      scalar_shape_, HloOpcode::kNegate, while_element_1));
+  auto negate_2 = builder.AddInstruction(HloInstruction::CreateUnary(
+      scalar_shape_, HloOpcode::kNegate, while_element_2));
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({negate_1, negate_2}));
+  module_->AddEntryComputation(builder.Build());
+  TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+
+  const HloAliasAnalysis& analysis = RunAnalysis();
+
+  EXPECT_THAT(
+      GetValuesInBuffer(analysis.GetUniqueBufferAt(xla_while, /*index=*/{1})),
+      UnorderedElementsAre(GetValueDefinedAt(param, {1}),
+                           GetValueDefinedAt(xla_while, /*index=*/{1}),
+                           GetValueDefinedAt(body_param, {1}),
+                           GetValueDefinedAt(cond_param, {1}),
+                           GetValueDefinedAt(add),
+                           GetValueDefinedAt(negate_2)));
+
+  EXPECT_THAT(
+      analysis.GetUniqueBufferAt(xla_while, /*index=*/{1}).ComputePositions(),
+      UnorderedElementsAre(
+          HloPosition{param, {1}}, HloPosition{xla_while, {1}},
+          HloPosition{while_element_2, {}}, HloPosition{body_param, {1}},
+          HloPosition{body_element_1, {}}, HloPosition{add, {}},
+          HloPosition{body_tuple, {1}}, HloPosition{tuple, {1}},
+          HloPosition{cond_param, {1}}, HloPosition{negate_2, {}}));
+
+  EXPECT_FALSE(AnyValuesInSameBufferInterfere());
+}
+
 TEST_F(HloAliasAnalysisTest, SingleCall) {
   // Test a single call of a subcomputation. The subcomputation adds its two
   // array-shaped parameters.
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index c22adcdd8d..f401eac016 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -126,7 +126,7 @@ bool HloDataflowAnalysis::ValueIsDefinedAt(const HloInstruction* instruction,
 
 const HloValue& HloDataflowAnalysis::GetValueDefinedAt(
     const HloInstruction* instruction, const ShapeIndex& index) const {
-  CHECK(ValueIsDefinedAt(instruction, index));
+  CHECK(ValueIsDefinedAt(instruction, index)) << instruction->ToString();
   return GetUniqueValueAt(instruction, index);
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
new file mode 100644
index 0000000000..9ad98e5038
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
@@ -0,0 +1,172 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+namespace xla {
+Status HloInputOutputAliasConfig::SetUpAlias(const ShapeIndex& output_index,
+                                             int64 param_number,
+                                             const ShapeIndex& param_index) {
+  // Output can't be aliased with multiple parameters.
+  TF_RET_CHECK(!alias_.element(output_index));
+  (*alias_.mutable_element(output_index)) =
+      std::make_pair(param_number, param_index);
+  return Status::OK();
+}
+
+HloInputOutputAliasProto HloInputOutputAliasConfig::ToProto() const {
+  HloInputOutputAliasProto result;
+  alias_.ForEachElement(
+      [&](const ShapeIndex& index,
+          const absl::optional<std::pair<int64, ShapeIndex>>& data) {
+        if (data) {
+          HloInputOutputAliasProto::AliasEntryProto entry;
+          for (int64 i : index) {
+            entry.add_output_shape_index(i);
+          }
+          entry.set_parameter_number(data->first);
+          for (int64 i : data->second) {
+            entry.add_parameter_shape_index(i);
+          }
+          result.add_entries()->Swap(&entry);
+        }
+      });
+  return result;
+}
+
+StatusOr<HloInputOutputAliasConfig> HloInputOutputAliasConfig::CreateFromProto(
+    const HloModule* module, const HloInputOutputAliasProto& proto) {
+  HloInputOutputAliasConfig result(
+      module->entry_computation()->root_instruction()->shape());
+  for (const HloInputOutputAliasProto::AliasEntryProto& entry :
+       proto.entries()) {
+    ShapeIndex output_index(entry.output_shape_index().begin(),
+                            entry.output_shape_index().end());
+
+    int64 param_number = entry.parameter_number();
+    ShapeIndex param_index(entry.parameter_shape_index().begin(),
+                           entry.parameter_shape_index().end());
+    TF_RETURN_IF_ERROR(
+        result.SetUpAlias(output_index, param_number, param_index));
+  }
+
+  return result;
+}
+
+string HloInputOutputAliasConfig::ToString() const {
+  std::vector<string> pieces;
+  pieces.push_back("HloInputOutputAliasConfig");
+
+  ForEachAlias([&](const ShapeIndex& output_index, int64 param_number,
+                   const ShapeIndex& param_index) {
+    pieces.push_back(absl::StrFormat(
+        "  OutputIndex %s is aliased with parameter %lld at %s:",
+        output_index.ToString(), param_number, param_index.ToString()));
+  });
+
+  return absl::StrJoin(pieces, "\n");
+}
+
+bool HloInputOutputAliasConfig::ParameterHasAlias(int64 param_number) const {
+  bool output = false;
+  alias_.ForEachElement(
+      [&](const xla::ShapeIndex&,
+          absl::optional<std::pair<int64, ShapeIndex>> alias) {
+        if (alias && alias->first == param_number) {
+          output = true;
+        }
+      });
+  return output;
+}
+
+absl::optional<ShapeIndex> HloInputOutputAliasConfig::GetAliasedOutput(
+    int64 param_number, const ShapeIndex& param_index) const {
+  absl::optional<ShapeIndex> output;
+  alias_.ForEachElement(
+      [&](const xla::ShapeIndex& output_index,
+          absl::optional<std::pair<int64, ShapeIndex>> alias) {
+        if (alias && alias->first == param_number &&
+            alias->second == param_index) {
+          output = output_index;
+        }
+      });
+  return output;
+}
+
+absl::optional<std::pair<int64, ShapeIndex>>
+HloInputOutputAliasConfig::GetAliasedParameter(
+    const ShapeIndex& output_index) const {
+  CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index));
+  return alias_.element(output_index);
+}
+
+void HloInputOutputAliasConfig::ForEachAlias(AliasFn fn) const {
+  alias_.ForEachElement(
+      [&](const ShapeIndex& output_index,
+          absl::optional<std::pair<int64, ShapeIndex>> aliased) {
+        if (aliased) {
+          fn(output_index, aliased->first, aliased->second);
+        }
+      });
+}
+
+Status HloInputOutputAliasConfig::ForEachAliasWithStatus(
+    AliasFnWithStatus fn) const {
+  return alias_.ForEachElementWithStatus(
+      [&](const ShapeIndex& output_index,
+          absl::optional<std::pair<int64, ShapeIndex>> aliased) {
+        if (aliased) {
+          TF_RETURN_IF_ERROR(fn(output_index, aliased->first, aliased->second));
+        }
+        return Status::OK();
+      });
+}
+
+Status HloInputOutputAliasConfig::Verify(const HloModule& module) const {
+  std::vector<ShapeTree<bool>> param_has_seen;
+  const HloComputation* entry = module.entry_computation();
+  for (int64 i = 0; i < entry->num_parameters(); ++i) {
+    HloInstruction* param = entry->parameter_instruction(i);
+    param_has_seen.emplace_back(param->shape());
+  }
+  return ForEachAliasWithStatus([&](const ShapeIndex& output_index,
+                                    int64 param_number,
+                                    const ShapeIndex& param_index) -> Status {
+    const HloInstruction* root = entry->root_instruction();
+
+    const Shape& param_shape =
+        entry->parameter_instruction(param_number)->shape();
+    const Shape& output_shape = root->shape();
+    TF_RET_CHECK(entry->num_parameters() > param_number);
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(param_shape, param_index));
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(output_shape, output_index));
+
+    // Check each param_number and param_index pair only show up once. No
+    // input can be aliased with output buffers.
+    TF_RET_CHECK(param_has_seen[param_number].element(param_index) == false);
+
+    *(param_has_seen[param_number].mutable_element(param_index)) = true;
+
+    return Status::OK();
+  });
+}
+
+std::ostream& operator<<(std::ostream& out,
+                         const HloInputOutputAliasConfig& config) {
+  out << config.ToString();
+  return out;
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
new file mode 100644
index 0000000000..02c46f65c8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
@@ -0,0 +1,101 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INPUT_OUTPUT_ALIAS_CONFIG_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INPUT_OUTPUT_ALIAS_CONFIG_H_
+
+#include <utility>
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+class HloModule;
+
+// This class specifies the alias map from output index to parameter number and
+// parameter index in the entry computation.
+class HloInputOutputAliasConfig {
+ public:
+  HloInputOutputAliasConfig() = default;
+
+  explicit HloInputOutputAliasConfig(Shape shape) : alias_(shape) {}
+
+  virtual ~HloInputOutputAliasConfig() = default;
+
+  // Sets up alias config from `output_index` to `param_index` at
+  // `param_number`.
+  Status SetUpAlias(const ShapeIndex& output_index, int64 param_number,
+                    const ShapeIndex& param_index);
+
+  // Returns true if the given parameter is aliased with one of the output
+  // buffers.
+  bool ParameterHasAlias(int64 param_number) const;
+
+  // (De)Serializes an HloInputOutoutAliasConfig to/from an
+  // HloInputOutoutAliasProto.
+  HloInputOutputAliasProto ToProto() const;
+
+  static StatusOr<HloInputOutputAliasConfig> CreateFromProto(
+      const HloModule* module, const HloInputOutputAliasProto& proto);
+
+  // Returns the output index that the given parameter and parameter index is
+  // aliased with. A nullopt is returned if there is no output that is aliased
+  // with the parameter number and index.
+  absl::optional<ShapeIndex> GetAliasedOutput(
+      int64 param_number, const ShapeIndex& param_index) const;
+
+  // Returns the number of parameter and index of the parameter buffer that the
+  // given output buffer index is aliased with. A nullopt is returned if there
+  // is no parameter is aliased with the specific output.
+  absl::optional<std::pair<int64, ShapeIndex>> GetAliasedParameter(
+      const ShapeIndex& output_index) const;
+
+  using AliasFn =
+      std::function<void(const ShapeIndex& output_index, int64 param_number,
+                         const ShapeIndex& param_index)>;
+
+  // Iterates through each aliased output and input.
+  void ForEachAlias(AliasFn fn) const;
+
+  using AliasFnWithStatus =
+      std::function<Status(const ShapeIndex& output_index, int64 param_number,
+                           const ShapeIndex& param_index)>;
+
+  // Verifies that the given config is valid for the given module.
+  // Specifically, the config's input and output should be in-bound and size of
+  // the aliased buffers should match.
+  Status Verify(const HloModule& module) const;
+
+  Status ForEachAliasWithStatus(AliasFnWithStatus fn) const;
+
+  string ToString() const;
+
+ private:
+  // A ShapeTree which indicates the list of buffers that's expected to be
+  // aliased. The key on this shape tree represents the output index. The value
+  // is a pair of parameter number and index into the buffer. If the value is
+  // nullopt, it means there is no parameter aliasing for this output.
+  ShapeTree<absl::optional<std::pair<int64, ShapeIndex>>> alias_;
+};
+
+std::ostream& operator<<(std::ostream& out,
+                         const HloInputOutputAliasConfig& config);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INPUT_OUTPUT_ALIAS_CONFIG_H_
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
new file mode 100644
index 0000000000..3b61ff04e6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
@@ -0,0 +1,184 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
+
+#include <memory>
+#include <string>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+class HloInputOutputAliasConfigTest : public HloTestBase {
+ protected:
+  void expect_aliased(const ShapeIndex& output_index, int64 param_number,
+                      const ShapeIndex& param_index,
+                      const HloInputOutputAliasConfig& config) {
+    absl::optional<ShapeIndex> aliased_output =
+        config.GetAliasedOutput(param_number, param_index);
+
+    EXPECT_TRUE(aliased_output);
+    EXPECT_EQ(aliased_output.value(), output_index);
+
+    absl::optional<std::pair<int64, ShapeIndex>> aliased_param =
+        config.GetAliasedParameter(output_index);
+
+    EXPECT_TRUE(aliased_param);
+    EXPECT_EQ(aliased_param.value(), std::make_pair(param_number, param_index));
+  }
+
+  void expect_not_aliased(const ShapeIndex& output_index, int64 param_number,
+                          const ShapeIndex& param_index,
+                          const HloInputOutputAliasConfig& config) {
+    absl::optional<ShapeIndex> aliased_output =
+        config.GetAliasedOutput(param_number, param_index);
+
+    EXPECT_FALSE(aliased_output && aliased_output == output_index);
+
+    absl::optional<std::pair<int64, ShapeIndex>> aliased_param =
+        config.GetAliasedParameter(output_index);
+
+    EXPECT_FALSE(aliased_param && aliased_param->first == param_number &&
+                 aliased_param->second == param_index);
+  }
+};
+
+TEST_F(HloInputOutputAliasConfigTest, SimpleAliasing) {
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[] parameter(1)
+  ROOT root = (f32[], f32[]) tuple(%a, %b)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  HloInputOutputAliasConfig config(
+      module->entry_computation()->root_instruction()->shape());
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/1,
+                                 /*param_index=*/{}));
+
+  expect_aliased(/*output_index=*/{0}, /*param_number=*/1,
+                 /*param_index=*/{}, config);
+
+  expect_not_aliased(/*output_index=*/{1}, /*param_number=*/1,
+                     /*param_index=*/{}, config);
+
+  expect_not_aliased(/*output_index=*/{0}, /*param_number=*/0,
+                     /*param_index=*/{}, config);
+}
+
+TEST_F(HloInputOutputAliasConfigTest, SimpleAliasingWithTupleInput) {
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  param = (f32[], f32[]) parameter(0)
+  gte1 = f32[] get-tuple-element(%param), index=0
+  gte2 = f32[] get-tuple-element(%param), index=1
+  ROOT root = (f32[], f32[]) tuple(%gte1, %gte2)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  HloInputOutputAliasConfig config(
+      module->entry_computation()->root_instruction()->shape());
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
+                                 /*param_index=*/{0}));
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
+                                 /*param_index=*/{1}));
+
+  expect_aliased(/*output_index=*/{0}, /*param_number=*/0,
+                 /*param_index=*/{0}, config);
+
+  expect_aliased(/*output_index=*/{1}, /*param_number=*/0,
+                 /*param_index=*/{1}, config);
+
+  expect_not_aliased(/*output_index=*/{1}, /*param_number=*/1,
+                     /*param_index=*/{}, config);
+
+  expect_not_aliased(/*output_index=*/{0}, /*param_number=*/0,
+                     /*param_index=*/{}, config);
+}
+
+TEST_F(HloInputOutputAliasConfigTest, InputDoNotAliasTwice) {
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[] parameter(1)
+  ROOT root = (f32[], f32[]) tuple(%a, %b)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  HloInputOutputAliasConfig config(
+      module->entry_computation()->root_instruction()->shape());
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
+                                 /*param_index=*/{}));
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
+                                 /*param_index=*/{}));
+
+  ASSERT_IS_NOT_OK(config.Verify(*module));
+}
+
+TEST_F(HloInputOutputAliasConfigTest, OutputDoNotAliasTwice) {
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[] parameter(1)
+  ROOT root = (f32[], f32[]) tuple(%a, %b)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  HloInputOutputAliasConfig config(
+      module->entry_computation()->root_instruction()->shape());
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
+                                 /*param_index=*/{}));
+
+  ASSERT_IS_NOT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/1,
+                                     /*param_index=*/{}));
+}
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 93e04eb3db..547f74a0ed 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -73,6 +73,8 @@ HloComputation* HloModule::AddComputationInternal(
       config_.SetDefaultComputationLayout(
           entry_computation_->ComputeProgramShape());
     }
+    input_output_alias_config_ = HloInputOutputAliasConfig(
+        entry_computation_->root_instruction()->shape());
   }
 
   if (uniquify_identifiers) {
@@ -252,6 +254,9 @@ HloModuleProto HloModule::ToProto() const {
   if (has_schedule()) {
     *proto.mutable_schedule() = schedule().ToProto().ValueOrDie();
   }
+
+  *proto.mutable_input_output_alias() = input_output_alias_config().ToProto();
+
   return proto;
 }
 
@@ -328,6 +333,10 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   }
   TF_RET_CHECK(module->entry_computation_ != nullptr);
 
+  TF_ASSIGN_OR_RETURN(module->input_output_alias_config_,
+                      HloInputOutputAliasConfig::CreateFromProto(
+                          module.get(), proto.input_output_alias()));
+
   // Because we didn't uniquify the names or the ids, double-check that the
   // instruction and computation names and ids are unique from the proto.
   absl::flat_hash_set<string> computation_names;
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 735804e827..9b9dc3ba9f 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_clone_context.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/hlo_schedule.h"
@@ -212,6 +213,15 @@ class HloModule {
     return result;
   }
 
+  // input_output_alias_config indicates the list of aliased buffers that are
+  // expected from the module.
+  HloInputOutputAliasConfig& input_output_alias_config() {
+    return input_output_alias_config_;
+  }
+  const HloInputOutputAliasConfig& input_output_alias_config() const {
+    return input_output_alias_config_;
+  }
+
   // Returns the number of unique intruction ids given out.  All ids up to
   // this point are guaranteed to be in the range [0..NumUniqueInstructionIds())
   int NumUniqueInstructionIds() const { return next_unique_id_; }
@@ -284,6 +294,10 @@ class HloModule {
   // sequential order of instructions for each non-fusion computation in the
   // module.
   absl::optional<HloSchedule> schedule_;
+
+  // alias_config indicates the alias information of input/output buffers that
+  // are expected from the module.
+  HloInputOutputAliasConfig input_output_alias_config_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index be3bee5975..2902a11a42 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -1220,6 +1220,8 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
     TF_RETURN_IF_ERROR(module->schedule().Verify());
   }
 
+  TF_RETURN_IF_ERROR(module->input_output_alias_config().Verify(*module));
+
   return false;
 }
 
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 73f541d505..51cedce7f0 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -72,7 +72,7 @@ class ShapeIndex {
   void push_back(int64 value) { indices_.push_back(value); }
   void pop_back() { indices_.pop_back(); }
 
-  // push_front is O(n^2), but shapes don't usually have a ton of dimensions.
+  // push_front is O(n), but shapes don't usually have a ton of dimensions.
   void push_front(int64 value) { indices_.insert(indices_.begin(), value); }
 
   using container_type = absl::InlinedVector<int64, 2>;
-- 
GitLab


From a593c6885bec8c545665ec2f25d794777be55ba9 Mon Sep 17 00:00:00 2001
From: Reed Wanderman-Milne <reedwm@google.com>
Date: Mon, 8 Oct 2018 21:23:08 -0700
Subject: [PATCH 565/570] Automated rollback of commit
 07df147ab20c4a5329148e5fb5f7f6b187cb73a4

PiperOrigin-RevId: 216299809
---
 tensorflow/core/grappler/optimizers/meta_optimizer.cc | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 7488cedec5..225c0a91e3 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -114,8 +114,7 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
   MK_OPT("scoped_allocator",
          new ScopedAllocatorOptimizer(cfg_.scoped_allocator_optimization(),
                                       cfg_.scoped_allocator_opts()));
-  MK_OPT("pin_to_host",
-         new PinToHostOptimizer(cfg_.pin_to_host_optimization()));
+  MK_OPT("small_op", new PinToHostOptimizer(cfg_.pin_to_host_optimization()));
 
   return std::unique_ptr<GraphOptimizer>();
 }
@@ -162,7 +161,7 @@ Status MetaOptimizer::InitializeOptimizers(
   if (cfg_.remapping() != RewriterConfig::OFF) {
     optimizers->push_back(MakeUnique<Remapper>(cfg_.remapping()));
   }
-  if (cfg_.pin_to_host_optimization() != RewriterConfig::OFF) {
+  if (cfg_.pin_to_host_optimization() == RewriterConfig::ON) {
     optimizers->push_back(MakeUnique<PinToHostOptimizer>());
   }
   if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
@@ -592,7 +591,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
          cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
          cfg.debug_stripper() == RewriterConfig::ON ||
          cfg.scoped_allocator_optimization() == RewriterConfig::ON ||
-         cfg.pin_to_host_optimization() != RewriterConfig::OFF ||
+         cfg.pin_to_host_optimization() == RewriterConfig::ON ||
          !cfg.optimizers().empty() || !cfg.custom_optimizers().empty();
 }
 
-- 
GitLab


From d1f0494b89a31298df7743018c0a3fa388ac16a2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 22:13:04 -0700
Subject: [PATCH 566/570] Add Floor_mod to schema.

PiperOrigin-RevId: 216303340
---
 tensorflow/contrib/lite/builtin_ops.h         |   1 +
 .../lite/core/api/flatbuffer_conversions.cc   |   1 +
 tensorflow/contrib/lite/nnapi_delegate.cc     |   1 +
 tensorflow/contrib/lite/schema/schema.fbs     |   5 +
 .../contrib/lite/schema/schema_generated.h    | 124 +++++++++++++++++-
 5 files changed, 126 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index 7809d114e2..6117cbf9f1 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -120,6 +120,7 @@ typedef enum {
   kTfLiteBuiltinSquare = 92,
   kTfLiteBuiltinZerosLike = 93,
   kTfLiteBuiltinFill = 94,
+  kTfLiteBuiltinFloorMod = 95,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc b/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc
index b092e5ee54..890d9c04bb 100644
--- a/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc
@@ -651,6 +651,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_SQUARE:
     case BuiltinOperator_ZEROS_LIKE:
     case BuiltinOperator_FILL:
+    case BuiltinOperator_FLOOR_MOD:
       break;
   }
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index f23a0ccb80..c7005eb53e 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -679,6 +679,7 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_SQUARE:
       case tflite::BuiltinOperator_ZEROS_LIKE:
       case tflite::BuiltinOperator_FILL:
+      case tflite::BuiltinOperator_FLOOR_MOD:
         logError("Op code %d is currently not delegated to NNAPI", builtin);
         return kTfLiteError;
         break;
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index cb7a282743..2b36209e5f 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -176,6 +176,7 @@ enum BuiltinOperator : byte {
   SQUARE = 92,
   ZEROS_LIKE = 93,
   FILL = 94,
+  FLOOR_MOD = 95,
 }
 
 // Options for the builtin operators.
@@ -251,6 +252,7 @@ union BuiltinOptions {
   BidirectionalSequenceLSTMOptions,
   BidirectionalSequenceRNNOptions,
   UnidirectionalSequenceLSTMOptions,
+  FloorModOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -618,6 +620,9 @@ table ZerosLikeOptions {
 table FillOptions {
 }
 
+table FloorModOptions {
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index e7b7a59def..3aaa99ec55 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -241,6 +241,9 @@ struct ZerosLikeOptionsT;
 struct FillOptions;
 struct FillOptionsT;
 
+struct FloorModOptions;
+struct FloorModOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -401,11 +404,12 @@ enum BuiltinOperator {
   BuiltinOperator_SQUARE = 92,
   BuiltinOperator_ZEROS_LIKE = 93,
   BuiltinOperator_FILL = 94,
+  BuiltinOperator_FLOOR_MOD = 95,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_FILL
+  BuiltinOperator_MAX = BuiltinOperator_FLOOR_MOD
 };
 
-inline const BuiltinOperator (&EnumValuesBuiltinOperator())[94] {
+inline const BuiltinOperator (&EnumValuesBuiltinOperator())[95] {
   static const BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -500,7 +504,8 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[94] {
     BuiltinOperator_REDUCE_ANY,
     BuiltinOperator_SQUARE,
     BuiltinOperator_ZEROS_LIKE,
-    BuiltinOperator_FILL
+    BuiltinOperator_FILL,
+    BuiltinOperator_FLOOR_MOD
   };
   return values;
 }
@@ -602,6 +607,7 @@ inline const char * const *EnumNamesBuiltinOperator() {
     "SQUARE",
     "ZEROS_LIKE",
     "FILL",
+    "FLOOR_MOD",
     nullptr
   };
   return names;
@@ -685,11 +691,12 @@ enum BuiltinOptions {
   BuiltinOptions_BidirectionalSequenceLSTMOptions = 69,
   BuiltinOptions_BidirectionalSequenceRNNOptions = 70,
   BuiltinOptions_UnidirectionalSequenceLSTMOptions = 71,
+  BuiltinOptions_FloorModOptions = 72,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_UnidirectionalSequenceLSTMOptions
+  BuiltinOptions_MAX = BuiltinOptions_FloorModOptions
 };
 
-inline const BuiltinOptions (&EnumValuesBuiltinOptions())[72] {
+inline const BuiltinOptions (&EnumValuesBuiltinOptions())[73] {
   static const BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -762,7 +769,8 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[72] {
     BuiltinOptions_FillOptions,
     BuiltinOptions_BidirectionalSequenceLSTMOptions,
     BuiltinOptions_BidirectionalSequenceRNNOptions,
-    BuiltinOptions_UnidirectionalSequenceLSTMOptions
+    BuiltinOptions_UnidirectionalSequenceLSTMOptions,
+    BuiltinOptions_FloorModOptions
   };
   return values;
 }
@@ -841,6 +849,7 @@ inline const char * const *EnumNamesBuiltinOptions() {
     "BidirectionalSequenceLSTMOptions",
     "BidirectionalSequenceRNNOptions",
     "UnidirectionalSequenceLSTMOptions",
+    "FloorModOptions",
     nullptr
   };
   return names;
@@ -1139,6 +1148,10 @@ template<> struct BuiltinOptionsTraits<UnidirectionalSequenceLSTMOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_UnidirectionalSequenceLSTMOptions;
 };
 
+template<> struct BuiltinOptionsTraits<FloorModOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_FloorModOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1738,6 +1751,14 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_UnidirectionalSequenceLSTMOptions ?
       reinterpret_cast<const UnidirectionalSequenceLSTMOptionsT *>(value) : nullptr;
   }
+  FloorModOptionsT *AsFloorModOptions() {
+    return type == BuiltinOptions_FloorModOptions ?
+      reinterpret_cast<FloorModOptionsT *>(value) : nullptr;
+  }
+  const FloorModOptionsT *AsFloorModOptions() const {
+    return type == BuiltinOptions_FloorModOptions ?
+      reinterpret_cast<const FloorModOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -6241,6 +6262,46 @@ inline flatbuffers::Offset<FillOptions> CreateFillOptions(
 
 flatbuffers::Offset<FillOptions> CreateFillOptions(flatbuffers::FlatBufferBuilder &_fbb, const FillOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct FloorModOptionsT : public flatbuffers::NativeTable {
+  typedef FloorModOptions TableType;
+  FloorModOptionsT() {
+  }
+};
+
+struct FloorModOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef FloorModOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  FloorModOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(FloorModOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<FloorModOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const FloorModOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct FloorModOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit FloorModOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  FloorModOptionsBuilder &operator=(const FloorModOptionsBuilder &);
+  flatbuffers::Offset<FloorModOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<FloorModOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<FloorModOptions> CreateFloorModOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  FloorModOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<FloorModOptions> CreateFloorModOptions(flatbuffers::FlatBufferBuilder &_fbb, const FloorModOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -6587,6 +6648,9 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const UnidirectionalSequenceLSTMOptions *builtin_options_as_UnidirectionalSequenceLSTMOptions() const {
     return builtin_options_type() == BuiltinOptions_UnidirectionalSequenceLSTMOptions ? static_cast<const UnidirectionalSequenceLSTMOptions *>(builtin_options()) : nullptr;
   }
+  const FloorModOptions *builtin_options_as_FloorModOptions() const {
+    return builtin_options_type() == BuiltinOptions_FloorModOptions ? static_cast<const FloorModOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -6902,6 +6966,10 @@ template<> inline const UnidirectionalSequenceLSTMOptions *Operator::builtin_opt
   return builtin_options_as_UnidirectionalSequenceLSTMOptions();
 }
 
+template<> inline const FloorModOptions *Operator::builtin_options_as<FloorModOptions>() const {
+  return builtin_options_as_FloorModOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -9286,6 +9354,29 @@ inline flatbuffers::Offset<FillOptions> CreateFillOptions(flatbuffers::FlatBuffe
       _fbb);
 }
 
+inline FloorModOptionsT *FloorModOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new FloorModOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void FloorModOptions::UnPackTo(FloorModOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<FloorModOptions> FloorModOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const FloorModOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateFloorModOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<FloorModOptions> CreateFloorModOptions(flatbuffers::FlatBufferBuilder &_fbb, const FloorModOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const FloorModOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateFloorModOptions(
+      _fbb);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -9759,6 +9850,10 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const UnidirectionalSequenceLSTMOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_FloorModOptions: {
+      auto ptr = reinterpret_cast<const FloorModOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -10061,6 +10156,10 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const UnidirectionalSequenceLSTMOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_FloorModOptions: {
+      auto ptr = reinterpret_cast<const FloorModOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -10351,6 +10450,10 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const UnidirectionalSequenceLSTMOptionsT *>(value);
       return CreateUnidirectionalSequenceLSTMOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_FloorModOptions: {
+      auto ptr = reinterpret_cast<const FloorModOptionsT *>(value);
+      return CreateFloorModOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -10641,6 +10744,10 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new UnidirectionalSequenceLSTMOptionsT(*reinterpret_cast<UnidirectionalSequenceLSTMOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_FloorModOptions: {
+      value = new FloorModOptionsT(*reinterpret_cast<FloorModOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -11003,6 +11110,11 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_FloorModOptions: {
+      auto ptr = reinterpret_cast<FloorModOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
-- 
GitLab


From e27ee15fa45a5f4e43e10ed1fe0eb3a1feb4253a Mon Sep 17 00:00:00 2001
From: Peter Ma <pcma@google.com>
Date: Mon, 8 Oct 2018 23:12:08 -0700
Subject: [PATCH 567/570] Refactor CalculateOutputSize() from VirtualScheduler
 protected member function to utils; Refactor EstimateSize() from
 memory_optimizer.cc to utils; some small changes for readability improvement

PiperOrigin-RevId: 216307257
---
 tensorflow/core/grappler/costs/BUILD          |   1 +
 tensorflow/core/grappler/costs/utils.cc       |  40 ++++++-
 tensorflow/core/grappler/costs/utils.h        |  11 ++
 tensorflow/core/grappler/costs/utils_test.cc  | 112 +++++++++++++-----
 .../core/grappler/costs/virtual_scheduler.cc  |  48 ++------
 .../core/grappler/costs/virtual_scheduler.h   |  22 ++--
 .../grappler/costs/virtual_scheduler_test.cc  |  48 +-------
 tensorflow/core/grappler/optimizers/BUILD     |   1 +
 .../grappler/optimizers/memory_optimizer.cc   |  26 +---
 9 files changed, 161 insertions(+), 148 deletions(-)

diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index f3dc2c2091..46eacd3a06 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -236,6 +236,7 @@ tf_cc_test(
     name = "virtual_scheduler_test",
     srcs = ["virtual_scheduler_test.cc"],
     deps = [
+        ":utils",
         ":virtual_placer",
         ":virtual_scheduler",
         "//tensorflow/cc:cc_ops",
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index 5415324b48..2fcadf1de3 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -74,7 +74,8 @@ static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
       }
       break;
     }
-    default: {}
+    default: {
+    }
   }
   return tensors;
 }
@@ -201,6 +202,43 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
   return inputs;
 }
 
+int64 CalculateTensorSize(const OpInfo::TensorProperties& prop) {
+  int64 size = DataTypeSize(BaseType(prop.dtype()));
+  TensorShapeProto shape = prop.shape();
+
+  // Can't infer the size if the rank is unknown. It has to be at least a
+  // scalar though.
+  if (shape.unknown_rank()) {
+    LOG(WARNING) << "CalculateTensorSize() -- unknown rank";
+    return size;
+  }
+
+  // If one of the dimensions is unknown statically, assume it's at least one.
+  for (int i = 0; i < shape.dim_size(); ++i) {
+    if (shape.dim(i).size() < 0) {
+      shape.mutable_dim(i)->set_size(1);
+      LOG(WARNING) << "CalculateTensorSize() -- unknown dim: " << i;
+    }
+  }
+
+  int64 num_elems = TensorShape(shape).num_elements();
+  return num_elems * size;
+}
+
+int64 CalculateOutputSize(
+    const std::vector<OpInfo::TensorProperties>& output_properties,
+    const int port_num) {
+  if (port_num < 0) return 4;  // 4B for control dependency.
+
+  if (port_num >= output_properties.size()) {
+    LOG(ERROR) << "CalculateOutputSize() -- port_num: " << port_num
+               << " >= output_properties.size(): " << output_properties.size();
+    return 0;
+  }
+
+  return CalculateTensorSize(output_properties[port_num]);
+}
+
 DeviceProperties GetDeviceInfo(const string& device_str) {
   DeviceProperties unknown;
   unknown.set_type("UNKNOWN");
diff --git a/tensorflow/core/grappler/costs/utils.h b/tensorflow/core/grappler/costs/utils.h
index 5fd6717712..ea64e5a41d 100644
--- a/tensorflow/core/grappler/costs/utils.h
+++ b/tensorflow/core/grappler/costs/utils.h
@@ -43,6 +43,17 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
     const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
     const std::unordered_map<string, const NodeDef*>& name_to_node);
 
+// Returns the size of tensor (unit: bytes). For tensor shape with unknown rank,
+// it assumes the tensor to be scalar. For any unknown dimension, it assumes
+// size one.
+int64 CalculateTensorSize(const OpInfo::TensorProperties& prop);
+
+// Returns the size of output at port_num (unit: bytes). A special case is
+// port_num -1, which is for control dependency and assumed to be 4 bytes.
+int64 CalculateOutputSize(
+    const std::vector<OpInfo::TensorProperties>& output_properties,
+    int port_num);
+
 // Returns the DeviceProperties of the device on which 'node' runs.
 DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node);
 DeviceProperties GetDeviceInfo(const string& device_str);
diff --git a/tensorflow/core/grappler/costs/utils_test.cc b/tensorflow/core/grappler/costs/utils_test.cc
index baa654f475..db5c11f0fe 100644
--- a/tensorflow/core/grappler/costs/utils_test.cc
+++ b/tensorflow/core/grappler/costs/utils_test.cc
@@ -26,36 +26,42 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-class UtilsTest : public ::testing::Test {
- public:
-  void CreateConstOp(const string& name, std::initializer_list<int64> dims,
-                     NodeDef* node) {
-    Tensor tensor(DT_FLOAT, TensorShape(dims));
-    for (int64 i = 0; i < tensor.NumElements(); ++i) {
-      tensor.flat<float>()(i) = i / 10.0f;
-    }
-    TF_CHECK_OK(NodeDefBuilder(name, "Const")
-                    .Attr("dtype", DT_FLOAT)
-                    .Attr("value", tensor)
-                    .Finalize(node));
-  }
+namespace {
 
-  void CreateConstSizesOp(const string& name, const std::vector<int32>& sizes,
-                          NodeDef* node) {
-    TensorShape shape;
-    shape.AddDim(sizes.size());
-    Tensor tensor(DT_INT32, shape);
-    for (int64 i = 0; i < tensor.NumElements(); ++i) {
-      tensor.flat<int32>()(i) = sizes[i];
-    }
-    TF_CHECK_OK(NodeDefBuilder(name, "Const")
-                    .Attr("dtype", DT_INT32)
-                    .Attr("value", tensor)
-                    .Finalize(node));
-  }
-};
+void CreateConstOp(const string& name, std::initializer_list<int64> dims,
+                   NodeDef* node) {
+  Tensor tensor(DT_FLOAT, TensorShape(dims));
+  for (int64 i = 0; i < tensor.NumElements(); ++i)
+    tensor.flat<float>()(i) = i / 10.0f;
+  TF_CHECK_OK(NodeDefBuilder(name, "Const")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("value", tensor)
+                  .Finalize(node));
+}
 
-TEST_F(UtilsTest, ConvOpInfo) {
+void CreateConstSizesOp(const string& name, const std::vector<int32>& sizes,
+                        NodeDef* node) {
+  TensorShape shape;
+  shape.AddDim(sizes.size());
+  Tensor tensor(DT_INT32, shape);
+  for (int64 i = 0; i < tensor.NumElements(); ++i)
+    tensor.flat<int32>()(i) = sizes[i];
+  TF_CHECK_OK(NodeDefBuilder(name, "Const")
+                  .Attr("dtype", DT_INT32)
+                  .Attr("value", tensor)
+                  .Finalize(node));
+}
+
+// Helper method for converting shapes vector to TensorProperty.
+OpInfo::TensorProperties ShapeToTensorProperty(const std::vector<int>& shapes,
+                                               const DataType& data_type) {
+  OpInfo::TensorProperties prop;
+  prop.set_dtype(data_type);
+  for (int shape : shapes) prop.mutable_shape()->add_dim()->set_size(shape);
+  return prop;
+}
+
+TEST(UtilsTest, ConvOpInfo) {
   int batch = 32;
   int rows = 7;
   int cols = 9;
@@ -146,7 +152,7 @@ TEST_F(UtilsTest, ConvOpInfo) {
   }
 }
 
-TEST_F(UtilsTest, TestSkipControlInput) {
+TEST(UtilsTest, TestSkipControlInput) {
   GraphDef graph;
   TF_CHECK_OK(NodeDefBuilder("constant", "Const")
                   .Attr("dtype", DT_INT32)
@@ -172,6 +178,52 @@ TEST_F(UtilsTest, TestSkipControlInput) {
   EXPECT_TRUE(node_found);
 }
 
+TEST(UtilsTest, CalculateTensorSize) {
+  // Test normal usage.
+  EXPECT_EQ(DataTypeSize(DT_FLOAT) * 1,
+            CalculateTensorSize(ShapeToTensorProperty({1}, DT_FLOAT)));
+  EXPECT_EQ(DataTypeSize(DT_FLOAT) * 4 * 4,
+            CalculateTensorSize(ShapeToTensorProperty({4, 4}, DT_FLOAT)));
+  EXPECT_EQ(DataTypeSize(DT_HALF) * 10 * 10 * 10,
+            CalculateTensorSize(ShapeToTensorProperty({10, 10, 10}, DT_HALF)));
+  EXPECT_EQ(
+      DataTypeSize(DT_FLOAT) * 100 * 7 * 8 * 99,
+      CalculateTensorSize(ShapeToTensorProperty({100, 7, 8, 99}, DT_FLOAT)));
+
+  // Test unknown rank: assumes the tensor to be a scalar.
+  OpInfo::TensorProperties t = ShapeToTensorProperty({100, 7, 8, 99}, DT_FLOAT);
+  t.mutable_shape()->set_unknown_rank(true);
+  EXPECT_EQ(DataTypeSize(DT_FLOAT) * 1, CalculateTensorSize(t));
+
+  // Test unknown shape: assumes unknown shape (-1) to have size 1.
+  EXPECT_EQ(
+      DataTypeSize(DT_FLOAT) * 1 * 7 * 8 * 99,
+      CalculateTensorSize(ShapeToTensorProperty({-1, 7, 8, 99}, DT_FLOAT)));
+  EXPECT_EQ(
+      DataTypeSize(DT_FLOAT) * 1 * 7 * 1 * 99,
+      CalculateTensorSize(ShapeToTensorProperty({-1, 7, -1, 99}, DT_FLOAT)));
+}
+
+TEST(UtilsTest, CalculateOutputSize) {
+  // Create a set of tensor properties.
+  std::vector<OpInfo::TensorProperties> output = {
+      ShapeToTensorProperty({4, 4}, DT_FLOAT),          // 0
+      ShapeToTensorProperty({-1, 7, -1, 99}, DT_FLOAT)  // 1
+  };
+
+  // Test valid outputs.
+  EXPECT_EQ(DataTypeSize(DT_FLOAT) * 4 * 4, CalculateOutputSize(output, 0));
+  EXPECT_EQ(DataTypeSize(DT_FLOAT) * 1 * 7 * 1 * 99,
+            CalculateOutputSize(output, 1));
+
+  // port_num -1 is for control dependency: hard coded 4B.
+  EXPECT_EQ(4, CalculateOutputSize(output, -1));
+
+  // Invalid port_num (though it may be an error) shall yield zero
+  // output size.
+  EXPECT_EQ(0, CalculateOutputSize(output, 2));
+}
+
 // Class for testing TensorSizeHistogram.
 class TestTensorSizeHistogram : public TensorSizeHistogram {
  public:
@@ -285,5 +337,7 @@ TEST(DeviceClassTest, GetDeviceClassForNonChannelDevice) {
   EXPECT_EQ("//GPU", GetDeviceClassForNonChannelDevice("/device:GPU:7"));
 }
 
+}  // namespace
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 037a823096..5b93fb128f 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -473,6 +473,7 @@ Status VirtualScheduler::Init() {
     VLOG(1) << "Some feed nodes were not consumed by the fetch fanin: "
             << str_util::Join(feed_nodes, ",");
   }
+
   initialized_ = true;
   return Status::OK();
 }
@@ -695,38 +696,6 @@ NodeState& VirtualScheduler::GetNodeStateOrCreateIt(const NodeDef* node) {
   return it->second;
 }
 
-int64 VirtualScheduler::CalculateOutputSize(
-    const std::vector<OpInfo::TensorProperties>& output_properties,
-    const int port_num) const {
-  if (port_num < 0) {
-    return 4;  // 4B for control dependency.
-  }
-
-  if (port_num >= output_properties.size()) {
-    VLOG(3) << "VirtualScheduler::CalculateOutputSize() -- "
-            << "port_num: " << port_num
-            << " >= output_properties.size(): " << output_properties.size();
-    return 0;
-  }
-
-  const auto& output = output_properties[port_num];
-  int64 output_size = DataTypeSize(BaseType(output.dtype()));
-
-  for (const auto& dim : output.shape().dim()) {
-    auto dim_size = dim.size();
-    if (dim_size < 0) {
-      // Zero output size if there's any unknown dim.
-      output_size = 0;
-      VLOG(3) << "VirtualScheduler::CalculateOutputSize() -- "
-              << "unknown dim: " << output_size;
-      break;
-    }
-    output_size *= dim_size;
-  }
-
-  return output_size;
-}
-
 Costs& VirtualScheduler::FindOrCreateZero(const string& op_name,
                                           std::map<string, Costs>* op_cost) {
   auto it = op_cost->find(op_name);
@@ -744,7 +713,10 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   const NodeDef* node = ready_nodes_->GetCurrNode();
   const string& op_name = node->op();
 
-  // Also keep track of op counts and times per op (with their shapes).
+  auto& op_cost = FindOrCreateZero(op_name, &op_to_cost_);
+  op_cost = CombineCosts(op_cost, node_costs);
+
+  // Also keep track of op counts and costs per op (with their shapes).
   OpContext op_context = GetCurrNode();
   string node_description = GetOpDescription(op_context.op_info);
   op_counts_[node_description] += 1;
@@ -752,9 +724,6 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
       std::make_pair(node_costs.execution_time.asMicroSeconds().count(),
                      !node_costs.inaccurate);
 
-  auto& op_cost = FindOrCreateZero(op_name, &op_to_cost_);
-  op_cost = CombineCosts(op_cost, node_costs);
-
   // Update node and device states.
   auto& node_state = node_map_[node];
   auto& device = device_[node_state.device_name];
@@ -795,7 +764,7 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
           << ", scheduled: " << node_state.time_scheduled.count()
           << ", finished: " << node_state.time_finished.count();
 
-  // Increment num_inputs_ready of the output nodes
+  // Increment num_inputs_ready of the output nodes and maybe add to ready nodes
   for (const auto& port_num_output_pair : node_state.outputs) {
     for (auto* output_node : port_num_output_pair.second) {
       auto& output_state = node_map_[output_node];
@@ -812,7 +781,7 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
     }
   }
 
-  // Increment num_outputs_executed of the input nodes.
+  // Increment num_outputs_executed of the input nodes and maybe update memory.
   for (const auto& input_port : node_state.inputs) {
     auto* input = input_port.first;
     auto port = input_port.second;
@@ -841,7 +810,6 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
     }
   }
 
-  // Remove the current node; assume FIFO.
   ready_nodes_->RemoveCurrNode();
 
   return !ready_nodes_->Empty();
@@ -1007,7 +975,7 @@ Costs VirtualScheduler::Summary(RunMetadata* metadata) {
     return Summary();
   }
 
-  // Fill RunMetadata.
+  // Fill RunMetadata's step_stats and partition_graphs fields.
   StepStats* stepstats = metadata->mutable_step_stats();
   for (const auto& device : device_) {
     GraphDef* device_partition_graph = metadata->add_partition_graphs();
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 0e66e8a463..bead84af29 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -107,10 +107,10 @@ struct DeviceState {
       mem_usage_snapshot_at_peak;
 
   Costs device_costs;
-  std::map<string, Costs> op_to_cost;    // Per-op cost.
-  std::map<string, int64> op_to_memory;  // Per-op memory usage at peak usage.
-  int64 memory_usage;
-  int64 max_memory_usage;
+  std::map<string, Costs> op_to_cost;  // Per-op cost.
+
+  int64 memory_usage;      // Current temporary memory usage
+  int64 max_memory_usage;  // Max temporary memory usage
 
   DeviceState() {
     device_costs = Costs::ZeroCosts();
@@ -283,13 +283,6 @@ class VirtualScheduler {
     return &node_map_;
   }
 
- protected:
-  // Returns the size of output at port_num (unit: bytes). A special case is
-  // port_num -1, which is for control dependency and assumed to be 4 bytes.
-  int64 CalculateOutputSize(
-      const std::vector<OpInfo::TensorProperties>& output_properties,
-      const int port_num) const;
-
  private:
   // Constants.
   const string kAttrInputSrc = "input_source_";
@@ -321,8 +314,11 @@ class VirtualScheduler {
   std::vector<std::unique_ptr<NodeDef>> additional_nodes_;
 
   // Stats:
-  std::map<string, int> op_counts_;  // Op counts with key with input shape.
-  // Individual op costs (with input shapes).
+  // Op counts with key with input shape.
+  // Example key: "[Op=AssignSub, input_shapes=[[7,1,160,160][7,1,160,160]]"
+  std::map<string, int> op_counts_;
+  // Individual op costs with key with input shape.
+  // Integer field for execution time in micro seconds.
   // Boolean field for whether the cost is accurate.
   std::map<string, std::pair<int, bool>> op_costs_;
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index 80889afc86..99272dd7e9 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -19,12 +19,14 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/costs/virtual_placer.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace grappler {
+
 // Class for testing virtual scheduler.
 class TestVirtualScheduler : public VirtualScheduler {
  public:
@@ -33,7 +35,6 @@ class TestVirtualScheduler : public VirtualScheduler {
       : VirtualScheduler(grappler_item, use_static_shapes, cluster,
                          &ready_node_manager_) {}
 
-  FRIEND_TEST(VirtualSchedulerTest, CalculateOutputSize);
   FRIEND_TEST(VirtualSchedulerTest, MemoryUsage);
   FRIEND_TEST(VirtualSchedulerTest, ControlDependency);
   FRIEND_TEST(VirtualSchedulerTest, ComplexDependency);
@@ -1034,17 +1035,6 @@ versions {
     }
   }
 
-  // Helper method for converting shape vector to TensorProperty.
-  OpInfo::TensorProperties ShapeToTensorProperty(
-      const std::vector<int> shape, const DataType& data_type) const {
-    OpInfo::TensorProperties tensor_property;
-    tensor_property.set_dtype(data_type);
-    for (const auto& x : shape) {
-      tensor_property.mutable_shape()->add_dim()->set_size(x);
-    }
-    return tensor_property;
-  }
-
   // SetUp() inits cluster_ and placer_.
   std::unique_ptr<VirtualCluster> cluster_;
   std::unique_ptr<VirtualPlacer> placer_;
@@ -1729,38 +1719,6 @@ TEST_F(VirtualSchedulerTest, InitAndBasicScheduling) {
   EXPECT_EQ(2, ops_executed["c1"].op_info.inputs_size());
 }
 
-TEST_F(VirtualSchedulerTest, CalculateOutputSize) {
-  // Init.
-  CreateGrapplerItemWithAddN();
-  InitScheduler();
-
-  // Create a set of tensor properties.
-  std::vector<OpInfo::TensorProperties> output;
-  output.push_back(ShapeToTensorProperty({4, 4}, DT_FLOAT));           // 0
-  output.push_back(ShapeToTensorProperty({1}, DT_FLOAT));              // 1
-  output.push_back(ShapeToTensorProperty({10, 10, 10}, DT_HALF));      // 2
-  output.push_back(ShapeToTensorProperty({100, 7, 8, 99}, DT_FLOAT));  // 3
-  output.push_back(ShapeToTensorProperty({-1, 7, 8, 99}, DT_FLOAT));   // 4
-  output.push_back(ShapeToTensorProperty({-1, 7, -1, 99}, DT_FLOAT));  // 4
-
-  // port_num -1 is for control dependency: hard coded 4B.
-  EXPECT_EQ(4, scheduler_->CalculateOutputSize(output, -1));
-
-  // Test valid outputs.
-  EXPECT_EQ(4 * 4 * 4, scheduler_->CalculateOutputSize(output, 0));
-  EXPECT_EQ(4 * 1, scheduler_->CalculateOutputSize(output, 1));
-  EXPECT_EQ(2 * 10 * 10 * 10, scheduler_->CalculateOutputSize(output, 2));
-  EXPECT_EQ(4 * 100 * 7 * 8 * 99, scheduler_->CalculateOutputSize(output, 3));
-
-  // Any unknown shape (-1) shall yield zero output size.
-  EXPECT_EQ(0, scheduler_->CalculateOutputSize(output, 4));
-  EXPECT_EQ(0, scheduler_->CalculateOutputSize(output, 5));
-
-  // Invalid port_num (though it may be an error) shall yield zero
-  // output size.
-  EXPECT_EQ(0, scheduler_->CalculateOutputSize(output, 6));
-}
-
 TEST_F(VirtualSchedulerTest, MemoryUsage) {
   // Init.
   CreateGrapplerItemWithAddN();
@@ -2041,7 +1999,7 @@ TEST_F(VirtualSchedulerTest, InterDeviceTransfer) {
     for (const auto& output_property : output_properties_) {
       output_properties.push_back(output_property);
     }
-    return scheduler_->CalculateOutputSize(output_properties, 0);
+    return CalculateOutputSize(output_properties, 0);
   };
 
   // Validate transfer size.
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index c708f84948..e898377ded 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -423,6 +423,7 @@ cc_library(
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/costs:graph_memory",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/costs:utils",
         "//tensorflow/core/grappler/utils:topological_sort",
         "//tensorflow/core/grappler/utils:traversal",
     ],
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index c775a26914..73f0977242 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/graph_memory.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
@@ -43,6 +44,8 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+
 // Prefix added to nodes which are recomputed.
 const char* kRecomputedNodePrefix = "Recomputed";
 const char* kRecomputeTriggerNodePrefix = "RecomputeTrigger";
@@ -744,25 +747,6 @@ Status BuildSwapPair(NodeDef* node, int input_to_swap,
   return Status::OK();
 }
 
-static int64 EstimateSize(const OpInfo::TensorProperties& t) {
-  DataType dtype = t.dtype();
-  int64 size = DataTypeSize(dtype);
-  TensorShapeProto shape = t.shape();
-  if (shape.unknown_rank()) {
-    // Can't infer the size if the rank is unknown. It has to be at least a
-    // scalar though.
-    return size;
-  }
-  // If one of the dimensions is unknown statically, assume it's at least one.
-  for (int i = 0; i < shape.dim_size(); ++i) {
-    if (shape.dim(i).size() < 0) {
-      shape.mutable_dim(i)->set_size(1);
-    }
-  }
-  int64 num_elems = TensorShape(shape).num_elements();
-  return num_elems * size;
-}
-
 struct SwapInfo {
   std::vector<int> inputs_to_swap;
   Costs::NanoSeconds time_to_swap = 0;
@@ -1149,7 +1133,7 @@ bool SwappingPass(RewriterConfig::MemOptType optimization_level,
     int64 bytes_to_swap = 0;
     for (int64 input_id : swap_info.inputs_to_swap) {
       const OpInfo::TensorProperties& t = props[input_id];
-      bytes_to_swap += EstimateSize(t);
+      bytes_to_swap += CalculateTensorSize(t);
     }
     // Let's assume we're going to swap over PCIe running at 16 GBps.
     swap_info.time_to_swap = bytes_to_swap / 16;
@@ -1299,6 +1283,8 @@ Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
   return Status::OK();
 }
 
+}  // namespace
+
 Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* optimized_graph) {
   *optimized_graph = item.graph;
-- 
GitLab


From 129bb5e845ccb2ab6339e85d39545800dac6ca33 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Oct 2018 23:42:02 -0700
Subject: [PATCH 568/570] Automated rollback of commit
 5f308cb408eb46ec9af0546be6b9ae1d5166b185

PiperOrigin-RevId: 216309111
---
 tensorflow/core/grappler/op_types.cc          |  22 +--
 .../optimizers/pin_to_host_optimizer.cc       | 162 ++++++------------
 .../optimizers/pin_to_host_optimizer.h        |   4 +-
 .../optimizers/pin_to_host_optimizer_test.cc  |  76 +++-----
 4 files changed, 85 insertions(+), 179 deletions(-)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index cbf5c8e038..1b5a215987 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -102,19 +102,15 @@ bool IsConjugateTranspose(const NodeDef& node) {
 }
 
 bool IsControlFlow(const NodeDef& node) {
-  // TODO(williamchan): Add a microbenchmark to compare FlatSet vs. iterative
-  // string comparison.
-  static const gtl::FlatSet<string>* const kControFlowOps =
-      CHECK_NOTNULL((new gtl::FlatSet<string>{
-          "ControlTrigger",
-          "Enter",
-          "Exit",
-          "LoopCond",
-          "Merge",
-          "NextIteration",
-          "Switch",
-      }));
-  return kControFlowOps->count(node.op()) > 0;
+  // clang-format off
+  return node.op() == "ControlTrigger" ||
+         node.op() == "Enter" ||
+         node.op() == "Exit" ||
+         node.op() == "LoopCond" ||
+         node.op() == "Merge" ||
+         node.op() == "NextIteration" ||
+         node.op() == "Switch";
+  // clang-format on
 }
 
 bool IsConv2D(const NodeDef& node) { return node.op() == "Conv2D"; }
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
index 29a3b2b74c..8ed4271fa4 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
@@ -25,29 +25,16 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace grappler {
-
 namespace internal {
 
-namespace {
 // TODO(williamchan): Change this constant to be something smarter, maybe
 // dynamically determined.
 constexpr int64 kTensorMaxSize = 64;
 
-struct OpDevicePortHasher {
-  std::size_t operator()(const std::tuple<string, string, int>& x) const {
-    uint64 code = Hash64Combine(Hash64(std::get<0>(x)), Hash64(std::get<1>(x)));
-
-    return Hash64Combine(code, hash<int>()(std::get<2>(x)));
-  }
-};
-using OpDevicePortOnHostMap =
-    gtl::FlatMap<std::tuple<string, string, int>, bool, OpDevicePortHasher>;
-
 // All the nodes that should be blacklisted and not swapped.
 bool IsBlacklisted(const NodeDef& node) {
   return
@@ -95,10 +82,10 @@ Status TryFindKernelDef(const std::vector<DeviceType>& devices,
 
 // Checks if a node's output port is host friendly.
 // Roughly this means checking if the output port is on Host memory.
-Status IsNodeOutputPortHostFriendly(
-    const GraphView& graph, GraphProperties* properties, const NodeDef& node,
-    int port_id, OpDevicePortOnHostMap* op_device_outport_pinned_to_host_cache,
-    bool* is_candidate) {
+Status IsNodeOutputPortHostFriendly(const GraphView& graph,
+                                    GraphProperties* properties,
+                                    const NodeDef& node, int port_id,
+                                    bool* is_candidate) {
   *is_candidate = false;
 
   // Make sure we are not a blacklisted op.
@@ -130,8 +117,7 @@ Status IsNodeOutputPortHostFriendly(
     for (const auto& fanin : graph.GetFanins(node, false)) {
       bool fanin_candidate = false;
       TF_RETURN_IF_ERROR(IsNodeOutputPortHostFriendly(
-          graph, properties, *fanin.node, fanin.port_id,
-          op_device_outport_pinned_to_host_cache, &fanin_candidate));
+          graph, properties, *fanin.node, fanin.port_id, &fanin_candidate));
       if (!fanin_candidate) {
         return Status::OK();
       }
@@ -146,22 +132,11 @@ Status IsNodeOutputPortHostFriendly(
     return Status::OK();
   }
 
-  // Check `op_device_outport_pinned_to_host_cache` for our
-  // {op, device, port_id} combo to see if the arg is pinned on Host.
-  const std::tuple<string, string, int> cache_key(node.op(), node.device(),
-                                                  port_id);
-  auto it = op_device_outport_pinned_to_host_cache->find(cache_key);
-  if (it != op_device_outport_pinned_to_host_cache->end()) {
-    *is_candidate = it->second;
-    return Status::OK();
-  }
-
   // Check if op's output port is pinned to HostMemory.
   const OpDef* op = nullptr;
   Status s = OpRegistry::Global()->LookUpOpDef(node.op(), &op);
   if (!s.ok()) {
     LOG(WARNING) << "Could not find OpDef for : " << node.op();
-    op_device_outport_pinned_to_host_cache->emplace(cache_key, false);
     return Status::OK();
   }
 
@@ -171,7 +146,6 @@ Status IsNodeOutputPortHostFriendly(
     LOG(WARNING) << "Invalid port: " << port_id << "!\n"
                  << node.DebugString() << "\n"
                  << op->DebugString();
-    op_device_outport_pinned_to_host_cache->emplace(cache_key, false);
     return Status::OK();
   }
 
@@ -181,7 +155,6 @@ Status IsNodeOutputPortHostFriendly(
                        &kernel);
   if (!s.ok()) {
     LOG(INFO) << "Could not find KernelDef for: " << node.op();
-    op_device_outport_pinned_to_host_cache->emplace(cache_key, false);
     return Status::OK();
   }
 
@@ -193,35 +166,22 @@ Status IsNodeOutputPortHostFriendly(
     }
   }
 
-  op_device_outport_pinned_to_host_cache->emplace(cache_key, *is_candidate);
-
   return Status::OK();
 }
 
 // Checks if a node's input port is Host friendly.
 // Roughly this means checking if the input port is on Host memory.
-bool IsNodeInputPortHostFriendly(
-    const NodeDef& node, int port_id,
-    OpDevicePortOnHostMap* op_device_inport_pinned_to_host_cache) {
+bool IsNodeInputPortHostFriendly(const NodeDef& node, int port_id) {
   // If node is on Host, assume its inputs are Host friendly.
   if (str_util::StrContains(node.device(), DEVICE_CPU)) {
     return true;
   }
 
-  // Check `op_device_inport_pinned_to_host_cache` for our
-  // {op, device, port_id} combo to see if the arg is pinned on Host.
-  std::tuple<string, string, int> cache_key(node.op(), node.device(), port_id);
-  auto it = op_device_inport_pinned_to_host_cache->find(cache_key);
-  if (it != op_device_inport_pinned_to_host_cache->end()) {
-    return it->second;
-  }
-
   // Check if op's input port is pinned to HostMemory.
   const OpDef* op = nullptr;
   Status s = OpRegistry::Global()->LookUpOpDef(node.op(), &op);
   if (!s.ok()) {
     LOG(WARNING) << "Could not find OpDef for : " << node.op();
-    op_device_inport_pinned_to_host_cache->emplace(cache_key, false);
     return false;
   }
   const int input_arg_id = OpInputPortIdToArgId(node, *op, port_id);
@@ -232,20 +192,16 @@ bool IsNodeInputPortHostFriendly(
       {node.device().c_str(), DEVICE_GPU, DEVICE_CPU}, node, &kernel);
   if (!s.ok()) {
     LOG(INFO) << "Could not find KernelDef for: " << node.op();
-    op_device_inport_pinned_to_host_cache->emplace(cache_key, false);
     return false;
   }
 
   // Check if the input_arg is pinned to Host.
   for (const string& host_memory_arg : kernel->host_memory_arg()) {
     if (op->input_arg(input_arg_id).name() == host_memory_arg) {
-      op_device_inport_pinned_to_host_cache->emplace(cache_key, true);
       return true;
     }
   }
 
-  op_device_inport_pinned_to_host_cache->emplace(cache_key, false);
-
   return false;
 }
 
@@ -255,29 +211,38 @@ bool IsNodeInputPortHostFriendly(
 // 2] Check if node can run on Host.
 // 3] Check all input/outputs are Host "friendly" (atm, friendly means small,
 //    ints, and pinned to Host).
-Status IsNodeHostCandidate(
-    const GraphView& graph, GraphProperties* properties, const NodeDef& node,
-    OpDevicePortOnHostMap* op_device_outport_pinned_to_host_cache,
-    bool* is_candidate) {
+Status IsNodeHostCandidate(const GraphView& graph, GraphProperties* properties,
+                           const NodeDef& node, bool* is_candidate) {
   *is_candidate = false;
 
-  // Skip these node types.
-  if (IsBlacklisted(node)) {
-    return Status::OK();
-  }
-
   // Check if node already on CPU.
   if (str_util::StrContains(node.device(), DEVICE_CPU)) {
     *is_candidate = true;
     return Status::OK();
   }
 
+  // Skip these node types.
+  if (IsBlacklisted(node)) {
+    return Status::OK();
+  }
+
   // Check the node can be run on CPU.
   Status s = TryFindKernelDef({DEVICE_CPU}, node, nullptr);
   if (!s.ok()) {
     return Status::OK();
   }
 
+  // Check all inputs are Host friendly.
+  for (const GraphView::OutputPort& fanin :
+       graph.GetFanins(node, /*include_controlling_nodes=*/false)) {
+    bool fanin_candidate = false;
+    TF_RETURN_IF_ERROR(IsNodeOutputPortHostFriendly(
+        graph, properties, *fanin.node, fanin.port_id, &fanin_candidate));
+    if (!fanin_candidate) {
+      return Status::OK();
+    }
+  }
+
   // Check all outputs are Host friendly.
   if (!properties->has_properties()) {
     // This is an expensive call, call it lazily.
@@ -290,42 +255,16 @@ Status IsNodeHostCandidate(
     }
   }
 
-  // Check all inputs are Host friendly.
-  for (const GraphView::OutputPort& fanin :
-       graph.GetFanins(node, /*include_controlling_nodes=*/false)) {
-    bool fanin_candidate = false;
-    TF_RETURN_IF_ERROR(IsNodeOutputPortHostFriendly(
-        graph, properties, *fanin.node, fanin.port_id,
-        op_device_outport_pinned_to_host_cache, &fanin_candidate));
-    if (!fanin_candidate) {
-      return Status::OK();
-    }
-  }
-
   *is_candidate = true;
   return Status::OK();
 }
 
-bool IsTPUGraphDef(const GraphDef& def) {
-  for (const auto& node : def.node()) {
-    if (node.op() == "TPUCompile" || node.op() == "TPUExecute" ||
-        node.op() == "TPUPartitionedCall") {
-      return true;
-    }
-  }
-  return false;
-}
-}  // end namespace
-
-// Tries to swap `device` to a Host device from `devices`. Returns true iff
-// there was a swap.
-bool TrySwapToHostDevice(const gtl::FlatSet<string>& devices,
-                         bool has_device_cpu, string* device) {
+string TryFindHostDevice(const gtl::FlatSet<string>& devices,
+                         bool has_device_cpu, const string& device) {
   // Force this node onto the CPU.
-  if (device->empty() && has_device_cpu) {
-    *device = "/device:CPU:0";
-    return true;
-  } else if (str_util::StrContains(*device, DEVICE_GPU)) {
+  if (device.empty() && has_device_cpu) {
+    return "/device:CPU:0";
+  } else if (str_util::StrContains(device, DEVICE_GPU)) {
     // Sometimes the cluster can have:
     //   devices = {"/device:CPU:0", "/device:XLA_GPU:0"}
     // and we need to handle them properly.
@@ -333,19 +272,27 @@ bool TrySwapToHostDevice(const gtl::FlatSet<string>& devices,
          {std::pair<string, string>("GPU", "CPU:0"),
           std::pair<string, string>("/device", "/device:CPU:0")}) {
       const string device_host =
-          strings::StrCat(device->substr(0, device->rfind(device_match.first)),
+          strings::StrCat(device.substr(0, device.rfind(device_match.first)),
                           device_match.second);
       if (devices.find(device_host) != devices.end()) {
-        *device = device_host;
-        return true;
+        return device_host;
       }
     }
   }
 
-  // We couldn't find an appropriate Host device, return false.
-  return false;
+  // We couldn't find an appropriate Host device, return original device.
+  return device;
 }
 
+bool IsTPUGraphDef(const GraphDef& def) {
+  for (const auto& node : def.node()) {
+    if (node.op() == "TPUCompile" || node.op() == "TPUExecute" ||
+        node.op() == "TPUPartitionedCall") {
+      return true;
+    }
+  }
+  return false;
+}
 }  // end namespace internal
 
 Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
@@ -377,26 +324,20 @@ Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   // All the Const nodes, and their original devices in topological order.
   std::vector<std::pair<NodeDef*, string>> const_nodes;
 
-  // Cache to map {op, device, port} -> bool on whether it is pinned to host.
-  internal::OpDevicePortOnHostMap op_device_outport_pinned_to_host_cache;
-  internal::OpDevicePortOnHostMap op_device_inport_pinned_to_host_cache;
-
   for (auto& node : *optimized_graph->mutable_node()) {
     bool is_candidate = false;
-    TF_RETURN_IF_ERROR(internal::IsNodeHostCandidate(
-        graph, &properties, node, &op_device_outport_pinned_to_host_cache,
-        &is_candidate));
+    TF_RETURN_IF_ERROR(
+        internal::IsNodeHostCandidate(graph, &properties, node, &is_candidate));
     if (!is_candidate) {
       continue;
     }
 
-    const string original_device = node.device();
-    const bool swapped = internal::TrySwapToHostDevice(devices, has_device_cpu,
-                                                       node.mutable_device());
-    // Keep track of all Const nodes that we swapped.
-    if (swapped && IsConstant(node)) {
-      const_nodes.emplace_back(&node, original_device);
+    if (IsConstant(node)) {
+      const_nodes.emplace_back(&node, node.device());
     }
+    // Try and swap the device to Host.
+    node.set_device(
+        internal::TryFindHostDevice(devices, has_device_cpu, node.device()));
   }
 
   // Traverse all `const_nodes`, and map them back to GPU greedily.
@@ -408,9 +349,8 @@ Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     // this node back onto the original device.
     for (const GraphView::InputPort& fanout : graph.GetFanouts(*node, false)) {
       // The consumer is not Host friendly, swap it back to the original device.
-      if (!internal::IsNodeInputPortHostFriendly(
-              *fanout.node, fanout.port_id,
-              &op_device_inport_pinned_to_host_cache)) {
+      if (!internal::IsNodeInputPortHostFriendly(*fanout.node,
+                                                 fanout.port_id)) {
         node->set_device(device);
         break;
       }
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h
index bed4a9ef95..d557a03463 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h
@@ -26,8 +26,8 @@ namespace tensorflow {
 namespace grappler {
 namespace internal {
 // Try and find an appropriate Host device in `devices` given `device`.
-bool TrySwapToHostDevice(const gtl::FlatSet<string>& devices,
-                         bool has_device_cpu, string* device);
+string TryFindHostDevice(const gtl::FlatSet<string>& devices,
+                         bool has_device_cpu, const string& device);
 }  // end namespace internal
 
 // Optimize TensorFlow ops that should be swapped into the CPU to avoid
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
index 9bb030b220..7c64529441 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
@@ -28,60 +28,30 @@ namespace {
 
 class PinToHostOptimizerTest : public GrapplerTest {};
 
-TEST_F(PinToHostOptimizerTest, TrySwapToHostDeviceNoDevices) {
+TEST_F(PinToHostOptimizerTest, TryFindHostDevice) {
   gtl::FlatSet<string> devices = {};
-
-  string device = "ABC";
-  EXPECT_FALSE(internal::TrySwapToHostDevice(devices, false, &device));
-  EXPECT_EQ(device, "ABC");
-}
-
-TEST_F(PinToHostOptimizerTest, TrySwapToHostDeviceCpuXlaGpu) {
-  gtl::FlatSet<string> devices = {"/device:CPU:0", "/device:XLA_GPU:0"};
-
-  string device = "";
-  EXPECT_TRUE(internal::TrySwapToHostDevice(devices, true, &device));
-  EXPECT_EQ(device, "/device:CPU:0");
-
-  device = "/device:XLA_GPU:0";
-  EXPECT_TRUE(internal::TrySwapToHostDevice(devices, true, &device));
-  EXPECT_EQ(device, "/device:CPU:0");
-
-  device = "/device:XLA_GPU:*";
-  EXPECT_TRUE(internal::TrySwapToHostDevice(devices, true, &device));
-  EXPECT_EQ(device, "/device:CPU:0");
-}
-
-TEST_F(PinToHostOptimizerTest, TrySwapToHostDeviceXlaCpuXlaGpu) {
-  gtl::FlatSet<string> devices = {"/device:XLA_CPU:0", "/device:XLA_GPU:0"};
-
-  string device = "";
-  EXPECT_FALSE(internal::TrySwapToHostDevice(devices, false, &device));
-  EXPECT_TRUE(device.empty());
-
-  device = "/device:XLA_GPU:0";
-  EXPECT_TRUE(internal::TrySwapToHostDevice(devices, false, &device));
-  EXPECT_EQ(device, "/device:XLA_CPU:0");
-
-  device = "/device:XLA_GPU:*";
-  EXPECT_TRUE(internal::TrySwapToHostDevice(devices, false, &device));
-  EXPECT_EQ(device, "/device:XLA_CPU:0");
-}
-
-TEST_F(PinToHostOptimizerTest, TrySwapToHostDeviceXlaGpu) {
-  gtl::FlatSet<string> devices = {"/device:XLA_GPU:0"};
-
-  string device = "";
-  EXPECT_FALSE(internal::TrySwapToHostDevice(devices, false, &device));
-  EXPECT_TRUE(device.empty());
-
-  device = "/device:XLA_GPU:0";
-  EXPECT_FALSE(internal::TrySwapToHostDevice(devices, false, &device));
-  EXPECT_EQ(device, "/device:XLA_GPU:0");
-
-  device = "/device:XLA_GPU:*";
-  EXPECT_FALSE(internal::TrySwapToHostDevice(devices, false, &device));
-  EXPECT_EQ(device, "/device:XLA_GPU:*");
+  EXPECT_EQ("ABC", internal::TryFindHostDevice(devices, false, "ABC"));
+
+  devices = {"/device:CPU:0", "/device:XLA_GPU:0"};
+  EXPECT_EQ(internal::TryFindHostDevice(devices, true, ""), "/device:CPU:0");
+  EXPECT_EQ(internal::TryFindHostDevice(devices, true, "/device:XLA_GPU:0"),
+            "/device:CPU:0");
+  EXPECT_EQ(internal::TryFindHostDevice(devices, true, "/device:XLA_GPU:*"),
+            "/device:CPU:0");
+
+  devices = {"/device:XLA_CPU:0", "/device:XLA_GPU:0"};
+  EXPECT_EQ(internal::TryFindHostDevice(devices, false, ""), "");
+  EXPECT_EQ(internal::TryFindHostDevice(devices, false, "/device:XLA_GPU:0"),
+            "/device:XLA_CPU:0");
+  EXPECT_EQ(internal::TryFindHostDevice(devices, false, "/device:XLA_GPU:*"),
+            "/device:XLA_CPU:0");
+
+  devices = {"/device:XLA_GPU:0"};
+  EXPECT_EQ(internal::TryFindHostDevice(devices, false, ""), "");
+  EXPECT_EQ(internal::TryFindHostDevice(devices, false, "/device:XLA_GPU:0"),
+            "/device:XLA_GPU:0");
+  EXPECT_EQ(internal::TryFindHostDevice(devices, false, "/device:XLA_GPU:*"),
+            "/device:XLA_GPU:*");
 }
 
 TEST_F(PinToHostOptimizerTest, OptimizeSmallOpsToHost) {
-- 
GitLab


From a198ca7d9bbc752a322c59b9a30519eab1b6730c Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Tue, 9 Oct 2018 00:56:23 -0700
Subject: [PATCH 569/570] Enable support for PRED values in KeyValueSort for
 the HloEvaluator.

PiperOrigin-RevId: 216315110
---
 tensorflow/compiler/xla/service/hlo_evaluator.cc | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index eec8d242fa..6cba46135c 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/index_util.h"
@@ -1279,7 +1280,9 @@ StatusOr<Literal> EvaluateSortInternal(HloInstruction* sort,
                     return SafeLess<KeyType>(a.first, b.first);
                   });
         std::vector<KeyType> result_keys;
-        std::vector<ValueType> result_values;
+        // We use a InlinedVector here because we need to convert it to an
+        // absl::Span later, and this would not work with std::vector<bool>.
+        absl::InlinedVector<ValueType, 10> result_values;
         for (const auto& key_value : key_value_vector) {
           result_keys.push_back(key_value.first);
           result_values.push_back(key_value.second);
@@ -1316,6 +1319,9 @@ StatusOr<Literal> EvaluateSortCurried(HloInstruction* sort,
                                       const Literal& keys_literal,
                                       const Literal& values_literal) {
   switch (sort->operand(1)->shape().element_type()) {
+    case PRED:
+      return EvaluateSortInternal<KeyType, bool>(sort, keys_literal,
+                                                 values_literal);
     case F32:
       return EvaluateSortInternal<KeyType, float>(sort, keys_literal,
                                                   values_literal);
-- 
GitLab


From 69f60d4c8cb5edb6fdc63b837b6db29562d28744 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Oct 2018 02:09:06 -0700
Subject: [PATCH 570/570] compat: Update forward compatibility horizon to
 2018-10-09

PiperOrigin-RevId: 216323343
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 349c84e13c..0e14c0e044 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 8)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 9)
 
 
 @tf_export("compat.forward_compatible")
-- 
GitLab